## **Importing Libraries**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from nltk.probability import FreqDist
import re
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import classification_report, f1_score

## **Dataset Preparation**

In [21]:
# Folders (relative to the notebook) that are scanned for per-app review CSVs.
train_dir, test_dir = 'Train/', 'Test/'

def import_data(directory_path):
    """Load every ``.csv`` file in a directory into one combined DataFrame.

    Each file is read with ``pd.read_csv`` and gets an extra ``App`` column
    holding the file name without its extension. Files that fail to load are
    reported and skipped.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        The concatenated DataFrame with a fresh 0..n-1 index, or ``None`` when
        the directory does not exist, contains no CSV files, or none of its
        CSV files could be read.
    """
    try:
        # os.listdir returns entries in arbitrary order; sort so the combined
        # row order (and any seeded split made from it) is reproducible.
        all_filenames = sorted(os.listdir(directory_path))
    except FileNotFoundError:
        print(f"Error: The directory '{directory_path}' was not found.")
        return None

    csv_files = [
        os.path.join(directory_path, filename)
        for filename in all_filenames
        if filename.endswith('.csv')
    ]

    if not csv_files:
        print(f"Warning: No CSV files found in the directory: {directory_path}")
        return None

    list_of_dfs = []
    print(f"Found {len(csv_files)} CSV files to process...")

    for file_path in csv_files:
        try:
            temp_df = pd.read_csv(file_path)
            filename = os.path.basename(file_path)
            app_name = os.path.splitext(filename)[0]
            temp_df['App'] = app_name
            list_of_dfs.append(temp_df)
            print(f"  - Processed {filename} and added '{app_name}' as App.")
        except Exception as e:
            # Best effort: one unreadable file should not discard the others.
            print(f"Error processing file {file_path}: {e}")

    if not list_of_dfs:
        # pd.concat([]) raises ValueError; keep the "print and return None"
        # convention used for the other failure modes instead.
        print(f"Error: None of the CSV files in '{directory_path}' could be read.")
        return None

    print("\nCombining all DataFrames...")
    combined_df = pd.concat(list_of_dfs, ignore_index=True)
    print("Done.")
    return combined_df

In [22]:
# Read the per-app CSVs of both folders into one frame each (train first, then test).
train_set, test_set = import_data(train_dir), import_data(test_dir)

Found 6 CSV files to process...
  - Processed claude.csv and added 'claude' as App.
  - Processed deepseek.csv and added 'deepseek' as App.
  - Processed gemini.csv and added 'gemini' as App.
  - Processed gpt.csv and added 'gpt' as App.
  - Processed grok.csv and added 'grok' as App.
  - Processed perplexity.csv and added 'perplexity' as App.

Combining all DataFrames...
Done.
Found 6 CSV files to process...
  - Processed claude.csv and added 'claude' as App.
  - Processed deepseek.csv and added 'deepseek' as App.
  - Processed gemini.csv and added 'gemini' as App.
  - Processed gpt.csv and added 'gpt' as App.
  - Processed grok.csv and added 'grok' as App.
  - Processed perplexity.csv and added 'perplexity' as App.

Combining all DataFrames...
Done.


In [None]:
def preprocess(df, text_column):
    """Return a copy of ``df`` with a normalised ``cleaned_text`` column.

    The text column is converted to strings, with missing values becoming the
    empty string. ``cleaned_text`` is the lower-cased text with every character
    other than ASCII letters and whitespace removed.

    Args:
        df: Input DataFrame; it is not modified.
        text_column: Name of the column holding the raw text.

    Returns:
        A new DataFrame with ``text_column`` as strings plus ``cleaned_text``.
    """
    df_copy = df.copy()

    # Record which cells are missing BEFORE the string cast: astype(str) can
    # turn NaN/None into the literal text 'nan'/'None', after which a fillna
    # has nothing left to fill and the placeholder leaks into the cleaned text.
    missing = df_copy[text_column].isna()
    df_copy[text_column] = df_copy[text_column].astype(str).mask(missing, '')

    df_copy['cleaned_text'] = df_copy[text_column].apply(
        lambda text: re.sub(r'[^a-zA-Z\s]', '', text.lower())
    )
    return df_copy

In [24]:
# Hold out 20% of the labelled rows for validation, stratified on the
# 'Sentiment' column; the fixed random_state keeps the split repeatable.
train, val = train_test_split(
    train_set,
    test_size=0.2,
    random_state=42,
    stratify=train_set['Sentiment'],
)

In [25]:
# Add a `cleaned_text` column to a copy of the training split.
# NOTE(review): `ret` is not referenced by the later cells visible here — confirm it is still needed.
ret = preprocess(df=train, text_column='Comment')


--- Starting Text Preprocessing ---


## **Modelling & Evaluation**

In [26]:
# --- Column names read from the review DataFrames ---
TEXT_COLUMN = 'Comment'
LABEL_COLUMN = 'Sentiment'
APP_VERSION_COLUMN = 'AppVersion'
AT_COLUMN = 'At'

# --- Pretrained checkpoint and output locations ---
BASE_MODEL_NAME = 'bert-base-uncased'
BASE_MODEL_PATH = './models/base_model_with_metadata2/'
SPECIALIST_MODELS_PATH = './models/specialist_models_with_metadata2/'

# --- Tokenisation and training hyperparameters ---
MAX_LEN = 128            # tokenizer max_length (inputs are padded/truncated to it)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
EPOCHS_BASE = EPOCHS_SPECIALIST = 3

class ReviewDataset(Dataset):
    """Dataset that turns one review row into tokenised model inputs.

    Each item combines the app version, the review's month and year, and the
    comment into a single sentence, tokenises it to a fixed length, and returns
    the token ids, attention mask and label as tensors.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Keep references to the rows, the tokenizer and the padded length."""
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.dataframe)

    def __getitem__(self, item):
        """Build the tensors for the row at positional index ``item``."""
        record = self.dataframe.iloc[item]

        review_body = str(record[TEXT_COLUMN])
        version_tag = str(record[APP_VERSION_COLUMN])
        # Only month and year are kept, formatted as full month name + year.
        review_month = pd.to_datetime(record[AT_COLUMN]).strftime('%B %Y')

        model_input = f"version {version_tag} date {review_month} comment: {review_body}"
        target = record[LABEL_COLUMN]

        tokenized = self.tokenizer.encode_plus(
            model_input,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch axis; flatten() drops it so
        # the DataLoader can stack items itself.
        sample = {}
        sample['input_ids'] = tokenized['input_ids'].flatten()
        sample['attention_mask'] = tokenized['attention_mask'].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample


In [27]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=0):
    """Wrap a review DataFrame in a ``ReviewDataset`` and return a DataLoader.

    Args:
        df: DataFrame of reviews handed to ``ReviewDataset``.
        tokenizer: Tokenizer handed to ``ReviewDataset``.
        max_len: Fixed token length handed to ``ReviewDataset``.
        batch_size: Number of rows per batch.
        shuffle: Whether the loader reshuffles rows every epoch. Defaults to
            ``False``, the behaviour of the previous fixed implementation;
            pass ``True`` for training loaders if per-epoch shuffling is wanted.
        num_workers: Number of loader worker processes. Defaults to ``0``
            (load in the main process), the previous fixed value.

    Returns:
        A ``torch.utils.data.DataLoader`` over the dataset.
    """
    ds = ReviewDataset(
        dataframe=df,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)

In [28]:
def train_model(model, data_loader, optimizer, device, n_epochs):
    """Run a plain supervised training loop and return the trained model.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes a ``loss`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer already bound to the model's parameters.
        device: Device the batch tensors are moved to.
        n_epochs: Number of full passes over ``data_loader``.

    Returns:
        The same model, left in training mode.
    """
    model = model.train()

    # Gradients used to be cleared only AFTER each step, so any gradient
    # already stored on the parameters when this function was entered was
    # added into the very first update. Start from a clean slate instead.
    optimizer.zero_grad()

    for epoch in range(n_epochs):
        print(f'Epoch {epoch + 1}/{n_epochs}')
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            loss.backward()
            optimizer.step()
            # Clear right after the step so no gradients linger between
            # batches or after the final batch.
            optimizer.zero_grad()
    return model

def train_base_model(train_df, device):
    """Fine-tune the pretrained base checkpoint on all training rows and save it.

    Loads the tokenizer and a 3-label sequence classifier from
    ``BASE_MODEL_NAME``, trains for ``EPOCHS_BASE`` epochs on ``train_df`` and
    writes both model and tokenizer to ``BASE_MODEL_PATH``.

    Args:
        train_df: Training rows covering every app.
        device: Device on which the model is trained.
    """
    print("\n--- Training Base Model with Metadata ---")

    base_tokenizer = BertTokenizer.from_pretrained(BASE_MODEL_NAME)
    base_model = BertForSequenceClassification.from_pretrained(
        BASE_MODEL_NAME,
        num_labels=3,
    ).to(device)

    loader = create_data_loader(train_df, base_tokenizer, MAX_LEN, BATCH_SIZE)
    adamw = AdamW(base_model.parameters(), lr=LEARNING_RATE)
    base_model = train_model(base_model, loader, adamw, device, EPOCHS_BASE)

    print(f"Base model training complete. Saving to {BASE_MODEL_PATH}")
    for artefact in (base_model, base_tokenizer):
        artefact.save_pretrained(BASE_MODEL_PATH)

def train_specialist_models(train_df, device):
    """Fine-tune one copy of the saved base model per app and save each one.

    For every distinct value in ``train_df['App']`` a fresh model and tokenizer
    are loaded from ``BASE_MODEL_PATH``, trained for ``EPOCHS_SPECIALIST``
    epochs on that app's rows only, and stored under
    ``SPECIALIST_MODELS_PATH/bert-specialist-<app>``.

    Args:
        train_df: Training rows; must contain an ``App`` column.
        device: Device on which the models are trained.
    """
    print("\n--- Training Specialist Models with Metadata ---")

    for app_name in train_df['App'].unique():
        print(f"\nFine-tuning for app: {app_name}")

        # Reload from disk each time so every specialist starts from the same
        # base weights rather than from the previous app's fine-tuned ones.
        tokenizer = BertTokenizer.from_pretrained(BASE_MODEL_PATH)
        specialist = BertForSequenceClassification.from_pretrained(BASE_MODEL_PATH).to(device)

        rows_for_app = train_df.loc[train_df['App'] == app_name]
        loader = create_data_loader(rows_for_app, tokenizer, MAX_LEN, BATCH_SIZE)
        adamw = AdamW(specialist.parameters(), lr=LEARNING_RATE)

        specialist = train_model(specialist, loader, adamw, device, EPOCHS_SPECIALIST)

        specialist_path = os.path.join(SPECIALIST_MODELS_PATH, f'bert-specialist-{app_name}')
        print(f"Specialist model for {app_name} training complete. Saving to {specialist_path}")
        for artefact in (specialist, tokenizer):
            artefact.save_pretrained(specialist_path)

def evaluate_models(test_df, device):
    """Score each app's specialist model on labelled rows and print a report.

    For every distinct ``App`` value the matching specialist checkpoint is
    loaded from ``SPECIALIST_MODELS_PATH`` and run row by row over that app's
    rows. Predictions from all apps are pooled into one classification report
    and one macro-F1 score, both printed.

    Args:
        test_df: Labelled rows with the ``App``, text, app-version, date and
            label columns named by the module constants.
        device: Device used for inference.

    Raises:
        Whatever ``from_pretrained`` raises when an app has no saved specialist.
    """
    print("\n--- Evaluating Specialist Models ---")
    # Assumes labels are encoded 0, 1, 2 in this order — TODO confirm against the data.
    class_ids = [0, 1, 2]
    class_names = ['Negative', 'Mixed', 'Positive']

    y_pred = []
    y_test = []

    app_names = test_df['App'].unique()

    for app_name in app_names:
        print(f"Evaluating on test data for app: {app_name}")
        specialist_path = os.path.join(SPECIALIST_MODELS_PATH, f'bert-specialist-{app_name}')

        tokenizer = BertTokenizer.from_pretrained(specialist_path)
        model = BertForSequenceClassification.from_pretrained(specialist_path)
        model = model.to(device)
        model = model.eval()

        app_test_df = test_df[test_df['App'] == app_name]

        for _, row in app_test_df.iterrows():
            true_label = row[LABEL_COLUMN]

            # Must mirror the text layout used when building training items.
            comment = str(row[TEXT_COLUMN])
            app_version = str(row[APP_VERSION_COLUMN])
            date_str = pd.to_datetime(row[AT_COLUMN]).strftime('%B %Y')
            review_text = f"version {app_version} date {date_str} comment: {comment}"

            encoded_review = tokenizer.encode_plus(
                review_text, max_length=MAX_LEN, add_special_tokens=True,
                return_token_type_ids=False, padding='max_length', truncation=True,
                return_attention_mask=True, return_tensors='pt'
            )

            input_ids = encoded_review['input_ids'].to(device)
            attention_mask = encoded_review['attention_mask'].to(device)

            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                prediction = torch.argmax(outputs.logits, dim=1)

            y_pred.append(prediction.item())
            y_test.append(true_label)

    print("\n--- Final Evaluation Report ---")
    # Pass `labels` explicitly: without it, classification_report raises when a
    # class is absent from both y_test and y_pred, because the three
    # target_names no longer match the number of classes it discovers.
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print(f"\nF1 Macro Score: {f1_macro:.4f}")
def predict_on_new_data(predict_df, device):
    """Predict a sentiment class for every row using its app's specialist model.

    Specialist checkpoints are looked up under ``SPECIALIST_MODELS_PATH`` once
    per distinct ``App`` value. Rows whose app has no saved specialist receive
    ``None`` and a warning is printed for that app.

    Args:
        predict_df: Rows to score. A ``predicted_sentiment`` column is written
            into this frame in place.
        device: Device used for inference.

    Returns:
        ``predict_df`` itself, with the ``predicted_sentiment`` column added.
    """
    print("\n--- Predicting on New Data ---")
    predictions = []
    app_names = predict_df['App'].unique()

    models = {}
    tokenizers = {}
    for app_name in app_names:
        specialist_path = os.path.join(SPECIALIST_MODELS_PATH, f'bert-specialist-{app_name}')
        if not os.path.exists(specialist_path):
            # Previously skipped silently, which left unexplained gaps in the output.
            print(
                f"Warning: no specialist model found at {specialist_path}; "
                f"predictions for app '{app_name}' will be None."
            )
            continue
        tokenizers[app_name] = BertTokenizer.from_pretrained(specialist_path)
        models[app_name] = BertForSequenceClassification.from_pretrained(specialist_path)
        models[app_name].to(device)
        models[app_name].eval()

    for _, row in predict_df.iterrows():
        app_name = row['App']

        if app_name not in models:
            # Nothing to run; skip building the text for rows that cannot be scored.
            predictions.append(None)
            continue

        tokenizer = tokenizers[app_name]
        model = models[app_name]

        # Must mirror the text layout used when building training items.
        comment = str(row[TEXT_COLUMN])
        app_version = str(row[APP_VERSION_COLUMN])
        date_str = pd.to_datetime(row[AT_COLUMN]).strftime('%B %Y')
        review_text = f"version {app_version} date {date_str} comment: {comment}"

        encoded_review = tokenizer.encode_plus(
            review_text, max_length=MAX_LEN, add_special_tokens=True,
            return_token_type_ids=False, padding='max_length', truncation=True,
            return_attention_mask=True, return_tensors='pt'
        )

        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            prediction = torch.argmax(outputs.logits, dim=1)

        predictions.append(prediction.item())

    predict_df['predicted_sentiment'] = predictions
    return predict_df

def format_for_submission(results_df):
    """Build the two-column submission table from the prediction results.

    Rows are ordered by app (gpt, claude, deepseek, gemini, grok, perplexity,
    then any other app alphabetically) and by ``CommentId`` within each app.
    The output ``CommentId`` is ``"<app>_<original CommentId>"`` and
    ``Sentiment`` is the ``predicted_sentiment`` value.

    Args:
        results_df: Frame with ``App``, ``CommentId`` and
            ``predicted_sentiment`` columns. It is not modified.

    Returns:
        A new DataFrame with ``CommentId`` and ``Sentiment`` columns, indexed
        by the sorted rows' original index labels.
    """
    print("\n--- Formatting for Submission ---")
    app_order = ['gpt', 'claude', 'deepseek', 'gemini', 'grok', 'perplexity']

    # Apps missing from the fixed order used to be coerced to NaN by the
    # Categorical, which wrote "nan_<id>" into the submission. Append them
    # after the known apps so their names survive.
    app_labels = results_df['App'].astype(str)
    extra_apps = sorted(set(app_labels.unique()) - set(app_order))
    app_key = pd.Categorical(app_labels, categories=app_order + extra_apps, ordered=True)

    # assign() returns a new frame, so the caller's 'App' column keeps its
    # original dtype and values instead of being overwritten in place.
    sorted_df = results_df.assign(App=app_key).sort_values(by=['App', 'CommentId'])

    submission_df = pd.DataFrame()
    submission_df['CommentId'] = sorted_df['App'].astype(str) + '_' + sorted_df['CommentId'].astype(str)
    submission_df['Sentiment'] = sorted_df['predicted_sentiment']
    return submission_df

In [29]:
if __name__ == '__main__':
    # End-to-end run: load data, train base + per-app specialist models,
    # evaluate on the held-out split, then write predictions for the test set.
    SEED = 42
    # Seed torch so classifier-head initialisation and dropout are repeatable
    # between runs; the same seed drives the train/validation split below.
    torch.manual_seed(SEED)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    train_set = import_data(train_dir)
    test_set = import_data(test_dir)
    # import_data reports problems and returns None; stop here with a clear
    # error instead of failing later with "'NoneType' object is not subscriptable".
    if train_set is None or test_set is None:
        raise RuntimeError(
            "Could not load the training and/or test CSV files; see the messages above."
        )

    train, val = train_test_split(
        train_set,
        test_size=0.2,
        random_state=SEED,
        stratify=train_set[LABEL_COLUMN],
    )

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(BASE_MODEL_PATH, exist_ok=True)
    os.makedirs(SPECIALIST_MODELS_PATH, exist_ok=True)

    train_base_model(train, device)
    train_specialist_models(train, device)
    evaluate_models(val, device)

    prediction_results_df = predict_on_new_data(test_set, device)
    submission = format_for_submission(prediction_results_df)
    submission.to_csv('submission.csv', index=False)
    print("\nSubmission file 'submission.csv' created successfully.")

Using device: cuda
Found 6 CSV files to process...
  - Processed claude.csv and added 'claude' as App.
  - Processed deepseek.csv and added 'deepseek' as App.
  - Processed gemini.csv and added 'gemini' as App.
  - Processed gpt.csv and added 'gpt' as App.
  - Processed grok.csv and added 'grok' as App.
  - Processed perplexity.csv and added 'perplexity' as App.

Combining all DataFrames...
Done.
Found 6 CSV files to process...
  - Processed claude.csv and added 'claude' as App.
  - Processed deepseek.csv and added 'deepseek' as App.
  - Processed gemini.csv and added 'gemini' as App.
  - Processed gpt.csv and added 'gpt' as App.
  - Processed grok.csv and added 'grok' as App.
  - Processed perplexity.csv and added 'perplexity' as App.

Combining all DataFrames...
Done.

--- Training Base Model with Metadata ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Base model training complete. Saving to ./models/base_model_with_metadata2/

--- Training Specialist Models with Metadata ---

Fine-tuning for app: grok
Epoch 1/3
Epoch 2/3
Epoch 3/3
Specialist model for grok training complete. Saving to ./models/specialist_models_with_metadata2/bert-specialist-grok

Fine-tuning for app: gemini
Epoch 1/3
Epoch 2/3
Epoch 3/3
Specialist model for gemini training complete. Saving to ./models/specialist_models_with_metadata2/bert-specialist-gemini

Fine-tuning for app: gpt
Epoch 1/3
Epoch 2/3
Epoch 3/3
Specialist model for gpt training complete. Saving to ./models/specialist_models_with_metadata2/bert-specialist-gpt

Fine-tuning for app: perplexity
Epoch 1/3
Epoch 2/3
Epoch 3/3
Specialist model for perplexity training complete. Saving to ./models/specialist_models_with_metadata2/bert-specialist-perplexity

Fine-tuning for app: deepseek
Epoch 1/3
Epoch 2/3
Epoch 3/3
Specialist model for deepseek training complete. Saving to ./m