In [1]:
import os

import pandas as pd
from langchain_community.llms import Ollama
from sklearn.datasets import fetch_20newsgroups
from tqdm import tqdm

# Dataset

In [2]:
# Fetch the test split of 20 Newsgroups with headers, footers and quoted replies removed
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

# One row per post; integer targets are translated to their newsgroup names up front
news_test = pd.DataFrame({
    'text': newsgroups_test.data,
    'label': [newsgroups_test.target_names[target_id] for target_id in newsgroups_test.target],
})

In [3]:
# Peek at the first two rows to confirm the frame carries the `text` and `label` columns
news_test.head(2)

Unnamed: 0,text,label
0,I am a little confused on all of the models of...,rec.autos
1,I'm not familiar at all with the format of the...,comp.windows.x


# Modeling

In [4]:
# Model handle used for every classification call in this notebook.
# NOTE(review): the recorded cell output echoes this line, which looks like a warning was
# emitted here — presumably a deprecation notice for `langchain_community.llms.Ollama`; confirm.
llm = Ollama(model="gemma2")

  llm = Ollama(model="gemma2")


### Step 0 - Defining Topics according to the dataset

In [5]:
# Candidate topics shown to the model. Each entry has the form
# "<newsgroup name> (<short description>)"; the order is kept fixed because the
# list is rendered into the prompt as-is.
all_topics = [
    # rec.* / soc.*
    "rec.autos (Discussions about cars and automobiles)",
    "rec.motorcycles (Discussions about motorcycles and related topics)",
    "rec.sport.baseball (Baseball teams, players, and games)",
    "rec.sport.hockey (Hockey leagues, teams, and players)",
    "soc.religion.christian (Christianity, its doctrines, and practices)",
    # comp.*
    "comp.sys.ibm.pc.hardware (IBM PC-compatible hardware and troubleshooting)",
    "comp.graphics (Computer graphics, including rendering and 3D modeling)",
    "comp.windows.x (The X Window System for graphical user interfaces on UNIX-like systems)",
    "comp.sys.mac.hardware (Apple Macintosh hardware and troubleshooting)",
    "comp.os.ms-windows.misc (Miscellaneous topics about Microsoft Windows)",
    # talk.*
    "talk.politics.guns (Gun politics, legislation, and rights)",
    "talk.politics.misc (General political discussions)",
    "talk.politics.mideast (Politics and current events in the Middle East)",
    "talk.religion.misc (General religious discussions)",
    # sci.*
    "sci.med (Topics about medical science, health, and treatments)",
    "sci.space (Space exploration, astronomy, and related science)",
    "sci.crypt (Cryptography, including encryption and security techniques)",
    "sci.electronics (Electronics, circuit design, and troubleshooting)",
    # misc.* / alt.*
    "misc.forsale (Items for sale and related discussions)",
    "alt.atheism (Debates and discussions about atheism and related topics)",
]

### Step 1 - Assigning each news item to one of the topics defined in Step 0

In [6]:
def assign_topics(input_df, topics, llm=None, checkpoint_interval=300, output_dir='outputs'):
    """
    Assign a topic to every news item, one LLM call per row.

    Works on a copy of ``input_df``. Rows whose LLM call or result clean-up
    fails keep the placeholder 'Unknown' and processing continues with the
    next row. Checkpoint CSVs are written every ``checkpoint_interval`` rows
    and a final CSV is written once all rows have been visited.

    Parameters:
    input_df (pd.DataFrame): DataFrame with a 'text' column holding the news items
    topics (str): Topics to choose from, already rendered as a string for the prompt
    llm: Language model instance exposing ``invoke(prompt, temperature=...)``;
        the call is expected to return a string
    checkpoint_interval (int): Number of rows to process between checkpoints;
        0 or None disables checkpointing
    output_dir (str): Directory receiving the checkpoint and final CSV files;
        created if it does not exist

    Returns:
    pd.DataFrame: Copy of ``input_df`` with an added 'Predicted Topic' column

    Raises:
    ValueError: If ``llm`` is None
    """
    if llm is None:
        raise ValueError("LLM instance must be provided")

    # Create the output directory up front: without it every checkpoint fails and
    # the final save raises only after the whole (potentially hours-long) run.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Create a copy of the DataFrame to avoid modifying the original
    df = input_df.copy()

    # Initialize Topic column with 'Unknown'; failed rows keep this placeholder
    df['Predicted Topic'] = 'Unknown'
    topic_col = df.columns.get_loc('Predicted Topic')  # loop-invariant, looked up once

    # Process each news item individually with progress bar
    for idx, news_item in enumerate(tqdm(df['text'], total=len(df), desc="Assigning topics")):
        # Generate prompt for current item
        prompt_assigning_prompt = f'''You are provided with news and helping to cluster them based on the topics.
Please assign the news to the topics provided. Return only the name of the topic for the respective news.
News can be found in tripletick block: ```{news_item}```
Topics to choose from: {topics}
Please return only the topic name, without any additional text, quotes, or formatting.'''

        # Reset per row so the error message below never reports a stale value
        # from a previous row (or hits an unbound name on the very first row).
        result = None
        try:
            # Get assignment for current item
            result = llm.invoke(prompt_assigning_prompt, temperature=0.0)

            # Clean the result: trim whitespace and unwrap a surrounding ``` fence
            result = result.strip()
            if result.startswith('```') and result.endswith('```'):
                result = result[3:-3].strip()

            # Update the DataFrame with the assigned topic
            df.iloc[idx, topic_col] = result
        except Exception as e:
            # Best effort: log and move on, the row stays 'Unknown'
            print(f"\nError processing item {idx}: {str(e)}")
            print(f"Result received: {result}")

        # Checkpointing lives outside the try block above so that a failed row
        # landing exactly on the interval boundary does not skip the checkpoint.
        if checkpoint_interval and (idx + 1) % checkpoint_interval == 0:
            checkpoint_filename = os.path.join(output_dir, f'news_assigned_checkpoint_{idx + 1}.csv')
            try:
                df.to_csv(checkpoint_filename, index=False)
                print(f"\nCheckpoint saved: {checkpoint_filename}")
            except OSError as e:
                # A failed checkpoint must not abort the run; the final save still happens
                print(f"\nCould not save checkpoint {checkpoint_filename}: {str(e)}")

    # Save final results (errors here propagate: the caller must know nothing was saved)
    final_filename = os.path.join(output_dir, 'news_assigned_final.csv')
    df.to_csv(final_filename, index=False)
    print(f"\nFinal results saved: {final_filename}")

    return df

In [8]:
# Classify the whole test split; the topic list is handed over in its Python-list string form.
# The recorded run needed several hours, so the periodic checkpoints are the recovery path
# if the cell is interrupted.
news_assigned = assign_topics(input_df=news_test, topics=str(all_topics), llm=llm)

Assigning topics:   0%|          | 0/7532 [00:00<?, ?it/s]

Assigning topics:   4%|▍         | 300/7532 [14:23<5:22:34,  2.68s/it] 


Checkpoint saved: outputs/news_assigned_checkpoint_300.csv


Assigning topics:   8%|▊         | 600/7532 [29:55<5:35:59,  2.91s/it] 


Checkpoint saved: outputs/news_assigned_checkpoint_600.csv


Assigning topics:  12%|█▏        | 900/7532 [44:26<4:52:58,  2.65s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_900.csv


Assigning topics:  16%|█▌        | 1200/7532 [1:00:15<5:21:52,  3.05s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_1200.csv


Assigning topics:  20%|█▉        | 1500/7532 [1:14:59<4:07:12,  2.46s/it] 


Checkpoint saved: outputs/news_assigned_checkpoint_1500.csv


Assigning topics:  24%|██▍       | 1800/7532 [1:29:44<3:44:04,  2.35s/it] 


Checkpoint saved: outputs/news_assigned_checkpoint_1800.csv


Assigning topics:  28%|██▊       | 2100/7532 [1:45:15<3:57:41,  2.63s/it] 


Checkpoint saved: outputs/news_assigned_checkpoint_2100.csv


Assigning topics:  32%|███▏      | 2400/7532 [2:00:36<5:03:19,  3.55s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_2400.csv


Assigning topics:  36%|███▌      | 2700/7532 [2:15:59<3:23:08,  2.52s/it] 


Checkpoint saved: outputs/news_assigned_checkpoint_2700.csv


Assigning topics:  40%|███▉      | 3000/7532 [2:31:47<3:15:04,  2.58s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_3000.csv


Assigning topics:  44%|████▍     | 3300/7532 [2:46:14<3:40:25,  3.13s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_3300.csv


Assigning topics:  48%|████▊     | 3600/7532 [3:01:54<3:44:47,  3.43s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_3600.csv


Assigning topics:  52%|█████▏    | 3900/7532 [3:18:17<2:52:23,  2.85s/it] 


Checkpoint saved: outputs/news_assigned_checkpoint_3900.csv


Assigning topics:  56%|█████▌    | 4200/7532 [3:34:04<3:43:51,  4.03s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_4200.csv


Assigning topics:  60%|█████▉    | 4500/7532 [3:49:54<2:55:10,  3.47s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_4500.csv


Assigning topics:  64%|██████▎   | 4800/7532 [4:04:56<2:10:30,  2.87s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_4800.csv


Assigning topics:  68%|██████▊   | 5100/7532 [4:20:50<1:47:58,  2.66s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_5100.csv


Assigning topics:  72%|███████▏  | 5400/7532 [4:36:07<1:39:32,  2.80s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_5400.csv


Assigning topics:  76%|███████▌  | 5700/7532 [4:51:46<1:18:07,  2.56s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_5700.csv


Assigning topics:  80%|███████▉  | 6000/7532 [5:07:35<1:06:12,  2.59s/it]


Checkpoint saved: outputs/news_assigned_checkpoint_6000.csv


Assigning topics:  82%|████████▏ | 6157/7532 [5:29:25<1:13:34,  3.21s/it]


KeyboardInterrupt: 

In [18]:
# The full run above was interrupted, so recover the partial results from the last checkpoint
# written (6000 rows processed); rows past that point still carry the 'Unknown' placeholder.
news_assigned = pd.read_csv('outputs/news_assigned_checkpoint_6000.csv')

In [19]:
# Show the recovered frame (pandas truncates the middle rows) to compare predictions with labels
news_assigned

Unnamed: 0,text,label,Predicted Topic
0,I am a little confused on all of the models of...,rec.autos,rec.autos
1,I'm not familiar at all with the format of the...,comp.windows.x,comp.windows.x
2,"\nIn a word, yes.\n",alt.atheism,comp.sys.ibm.pc.hardware
3,\nThey were attacking the Iraqis to drive them...,talk.politics.mideast,talk.politics.mideast \ntalk.religion.misc
4,\nI've just spent two solid months arguing tha...,talk.religion.misc,alt.atheism
...,...,...,...
7527,"\n Henry, if I read you correctly, you may b...",sci.space,Unknown
7528,"about\nthem on\n\nActually, I thought Macs wer...",comp.sys.mac.hardware,Unknown
7529,"I sent a version of this post out a while ago,...",rec.sport.baseball,Unknown
7530,I have this kit which includes the following :...,misc.forsale,Unknown


In [None]:
# Persist the recovered frame under the same file name assign_topics uses for a completed run
news_assigned.to_csv('outputs/news_assigned_final.csv', index=False)

In [9]:
# Reload the saved assignments so the evaluation below can be re-run without repeating the
# LLM pass. The file name matches what this notebook actually writes (the save cell and
# assign_topics both produce 'news_assigned_final.csv'); 'news_assigned.csv' is never created here.
news_assigned = pd.read_csv('outputs/news_assigned_final.csv')

### Accuracy

In [21]:
# Score only rows that received a prediction: 'Unknown' is the placeholder left on rows
# that were never processed or whose LLM call failed
news_assigned_cut = news_assigned.loc[news_assigned['Predicted Topic'].ne('Unknown')]

In [22]:
from sklearn.metrics import f1_score, precision_score, recall_score

# Ground truth and model output for the rows that received a prediction
y_true = news_assigned_cut['label']
y_pred = news_assigned_cut['Predicted Topic']

# Accuracy: share of rows where the predicted string equals the label exactly
accuracy = (y_true == y_pred).mean()

# All three scores use the same settings: weighted averaging across classes, and
# zero_division=0 so undefined per-class scores count as 0
weighted = dict(average='weighted', zero_division=0)
f1 = f1_score(y_true, y_pred, **weighted)
precision = precision_score(y_true, y_pred, **weighted)
recall = recall_score(y_true, y_pred, **weighted)

# Report everything as percentages, one metric per line
print(
    f"Accuracy: {accuracy * 100:.2f}%",
    f"F1 Score: {f1 * 100:.2f}%",
    f"Precision: {precision * 100:.2f}%",
    f"Recall: {recall * 100:.2f}%",
    sep="\n",
)

Accuracy: 60.25%
F1 Score: 64.69%
Precision: 77.62%
Recall: 60.25%
