In [2]:
import pandas as pd
from tqdm import tqdm
from langchain_community.llms import Ollama

# Dataset

In [2]:
bbc_df = pd.read_csv('../../datasets/BBC News/BBC News Train.csv')
bbc_df = bbc_df.sample(n=735, random_state=42).reset_index(drop=True) # 735 samples to imitate test set (which is unlabelled)
bbc_df

Unnamed: 0,ArticleId,Text,Category
0,2160,wal-mart is sued over rude lyrics the parents ...,entertainment
1,1360,howard taunts blair over splits tony blair s f...,politics
2,302,fox attacks blair s tory lies tony blair lie...,politics
3,864,online commons to spark debate online communit...,tech
4,2184,piero gives rugby perspective bbc sport unveil...,tech
...,...,...,...
730,512,big guns ease through in san jose top-seeded a...,sport
731,905,marsh executive in guilty plea an executive at...,business
732,351,sky takes over oscar night mantle sky has sign...,entertainment
733,603,pension hitch for long-living men male life ex...,business


# Modeling

In [3]:
llm = Ollama(model="gemma2")

  llm = Ollama(model="gemma2")


### Step 0 - Defining Topics according to the dataset

In [4]:
all_topics = [
'sport', 'business', 'politics', 'tech', 'entertainment' 
 ]

### Step 3 - Assigning batches of dataset to Topics Generated in Step 2 and sentiment of the review

In [5]:
def assign_topics_in_batches(input_df, topics, batch_size, llm=None):
    """
    Assign topics to News Group in batches and update DataFrame directly.

    Parameters:
    input_df (pd.DataFrame): DataFrame containing Amazon reviews
    topics (str): String of topics to assign from
    batch_size (int): Number of reviews to process in each batch
    llm: The language model instance to use for assigning topics

    Returns:
    pd.DataFrame: Updated DataFrame with topic and sentiment assignments
    """
    if llm is None:
        raise ValueError("LLM instance must be provided")

    # Create a copy of the DataFrame to avoid modifying the original
    df = input_df.copy()

    # Initialize Topic and Sentiment columns with 'Unknown'
    df['Predicted Topic'] = 'Unknown'

    total_df = len(df)

    # Process reviews in batches with progress bar
    for start_idx in tqdm(range(0, total_df, batch_size), desc="Assigning topics"):
        end_idx = min(start_idx + batch_size, total_df)
        batch_news_list = df.iloc[start_idx:end_idx]
        batch_news = " ".join([f"Item {i + 1}: {review}," for i, review in enumerate(batch_news_list['Text'])])
        
        # Generate prompt for current batch
        prompt_assigning_prompt = f'''You are provided with news and helping to cluster them based on the topics.
Please assign the news to the topics provided. Return only the name of the topic for the respective nwes.
News can be found in tripletick block: ```{batch_news}```
Topics to choose from: {topics}
Please return in CSV format only topics for respective reviews and nothing else. Do not use triple backtick blocks. Only output exactly as on the example below:
Example: Having an input of News1, News2, News3, News4, News5, ... NewsN
Output: Topic1, Topic2, Topic3, Topic4, Topic5, ... TopicN
'''
        # Get assignments for current batch
        result = llm.invoke(prompt_assigning_prompt, temperature=0.0)
        try:
            # Get assignments for current batch
            result = llm.invoke(prompt_assigning_prompt, temperature=0.0)
            
            # Clean and split the result
            result = result.strip()
            if result.startswith('```') and result.endswith('```'):
                result = result[3:-3].strip()
            batch_assigned_topics = [topic.strip() for topic in result.split(',')]
            
            # Make sure we have the right number of topics
            current_batch_size = len(batch_news)
            
            # Update each row individually to avoid alignment issues
            for idx, topic in enumerate(batch_assigned_topics):
                current_idx = start_idx + idx
                if current_idx < len(df):
                    df.iloc[current_idx, df.columns.get_loc('Predicted Topic')] = topic
                    
        except Exception as e:
            print(f"\nError processing batch {start_idx}-{end_idx}: {str(e)}")
            print(f"Result received: {result}")
            print(f"Batch size: {current_batch_size}")
            print(f"Number of topics received: {len(batch_assigned_topics) if 'batch_assigned_topics' in locals() else 'N/A'}")
            continue
    return df

In [6]:
news_assigned = assign_topics_in_batches(
    bbc_df,
    topics=str(all_topics),
    batch_size=1,
    llm=llm
)

Assigning topics:   0%|          | 0/735 [00:00<?, ?it/s]

Assigning topics: 100%|██████████| 735/735 [39:28<00:00,  3.22s/it]  


In [8]:
news_assigned.to_csv('outputs/bbc_news_assigned.csv', index=False)

In [3]:
news_assigned = pd.read_csv('outputs/bbc_news_assigned.csv')

### Accuracy

In [4]:
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score

def calculate_metrics(y_true, y_pred):
    """
    Calculate various classification metrics
    
    Parameters:
    y_true: True labels (actual categories)
    y_pred: Predicted labels
    
    Returns:
    dict: Dictionary containing various metrics
    """
    # Calculate individual metrics
    accuracy = (y_true == y_pred).mean()
    f1 = f1_score(y_true, y_pred, average='weighted')
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    
    # Get detailed classification report
    report = classification_report(y_true, y_pred)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true, y_pred)
    
    metrics = {
        'accuracy': accuracy * 100,
        'f1_score': f1 * 100,
        'precision': precision * 100,
        'recall': recall * 100,
        'detailed_report': report,
        'confusion_matrix': conf_matrix
    }
    
    return metrics

# Using your existing data
metrics = calculate_metrics(
    news_assigned['Category'],
    news_assigned['Predicted Topic']
)

# Print results
print(f"Accuracy: {metrics['accuracy']:.2f}%")
print(f"F1 Score: {metrics['f1_score']:.2f}%")
print(f"Precision: {metrics['precision']:.2f}%")
print(f"Recall: {metrics['recall']:.2f}%")
print("\nDetailed Classification Report:")
print(metrics['detailed_report'])

Accuracy: 94.97%
F1 Score: 94.93%
Precision: 95.16%
Recall: 94.97%

Detailed Classification Report:
               precision    recall  f1-score   support

     business       0.99      0.88      0.93       169
entertainment       0.94      0.98      0.96       128
     politics       0.90      0.98      0.94       140
        sport       0.96      1.00      0.98       162
         tech       0.96      0.91      0.94       136

     accuracy                           0.95       735
    macro avg       0.95      0.95      0.95       735
 weighted avg       0.95      0.95      0.95       735



In [9]:
news_assigned[news_assigned['Category'] != news_assigned['Predicted Topic']]

Unnamed: 0,ArticleId,Text,Category,Predicted Topic
3,864,online commons to spark debate online communit...,tech,politics
4,2184,piero gives rugby perspective bbc sport unveil...,tech,sport
8,140,rich grab half colombia poor fund half of the ...,business,politics
39,1969,brussels raps mobile call charges the european...,business,tech
79,1503,iran budget seeks state sell-offs iran s presi...,business,politics
110,182,blogger grounded by her airline a us airline a...,tech,business
132,510,dutch watch van gogh s last film the last film...,entertainment,politics
169,1336,musicians upbeat about the net musicians are...,tech,entertainment
172,1820,halo fans hope for sequel xbox video game hal...,tech,entertainment
187,548,ink helps drive democracy in asia the kyrgyz r...,tech,politics
