In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from nltk.probability import FreqDist
import re
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
import torch
from sklearn.metrics import classification_report, f1_score

In [3]:
# Directories scanned by import_data for per-app CSV files. Both paths are
# relative, so they resolve against the notebook's working directory.
train_dir = 'Train/'
test_dir = 'Test/'

def import_data(directory_path):
    """Load every CSV file in a directory into one combined DataFrame.

    Each file contributes its rows plus an ``App`` column holding the file name
    without its extension (``Train/gpt.csv`` -> ``'gpt'``). Files that cannot
    be parsed are reported and skipped rather than aborting the whole load.

    Args:
        directory_path: Directory to scan (non-recursively) for ``*.csv`` files.

    Returns:
        The concatenated DataFrame with a fresh ``0..n-1`` index, or ``None``
        when the directory is missing, holds no CSV files, or none of its CSV
        files could be read.
    """
    try:
        # os.listdir returns names in arbitrary order; sorting keeps the row
        # order of the combined frame identical across machines and runs.
        csv_files = sorted(
            os.path.join(directory_path, filename)
            for filename in os.listdir(directory_path)
            if filename.endswith('.csv')
        )
    except FileNotFoundError:
        print(f"Error: The directory '{directory_path}' was not found.")
        return None

    if not csv_files:
        print(f"Warning: No CSV files found in the directory: {directory_path}")
        return None

    list_of_dfs = []
    print(f"Found {len(csv_files)} CSV files to process...")

    for file_path in csv_files:
        try:
            temp_df = pd.read_csv(file_path)
            filename = os.path.basename(file_path)
            app_name = os.path.splitext(filename)[0]
            temp_df['App'] = app_name
            list_of_dfs.append(temp_df)
            print(f"  - Processed {filename} and added '{app_name}' as App.")
        except Exception as e:
            # Deliberate best-effort: one bad file must not lose the others.
            print(f"Error processing file {file_path}: {e}")

    if not list_of_dfs:
        # pd.concat raises ValueError on an empty list; report it the same way
        # the other failure modes are reported instead.
        print(f"Warning: None of the CSV files in '{directory_path}' could be read.")
        return None

    print("\nCombining all DataFrames...")
    combined_df = pd.concat(list_of_dfs, ignore_index=True)
    print("Done.")
    return combined_df

In [4]:
# Load the CSVs under Train/ and Test/ into one DataFrame each; import_data adds
# an 'App' column taken from each file name.
# NOTE(review): import_data returns None on failure and that is not checked here.
train_set = import_data(train_dir)
test_set = import_data(test_dir)

Found 6 CSV files to process...
  - Processed claude.csv and added 'claude' as App.
  - Processed deepseek.csv and added 'deepseek' as App.
  - Processed gemini.csv and added 'gemini' as App.
  - Processed gpt.csv and added 'gpt' as App.
  - Processed grok.csv and added 'grok' as App.
  - Processed perplexity.csv and added 'perplexity' as App.

Combining all DataFrames...
Done.
Found 6 CSV files to process...
  - Processed claude.csv and added 'claude' as App.
  - Processed deepseek.csv and added 'deepseek' as App.
  - Processed gemini.csv and added 'gemini' as App.
  - Processed gpt.csv and added 'gpt' as App.
  - Processed grok.csv and added 'grok' as App.
  - Processed perplexity.csv and added 'perplexity' as App.

Combining all DataFrames...
Done.


In [5]:
def preprocess(df, text_column):
    """Return a copy of ``df`` with a lower-cased, letters-only ``cleaned_text`` column.

    The input frame is not modified. In the copy, ``text_column`` is normalised
    to strings (missing values become the empty string) and ``cleaned_text``
    holds that text lower-cased with every character other than ASCII letters
    and whitespace removed.

    Args:
        df: DataFrame containing the raw text.
        text_column: Name of the column in ``df`` that holds the text.

    Returns:
        A new DataFrame with the normalised ``text_column`` and the added
        ``cleaned_text`` column.
    """
    print("\n--- Starting Text Preprocessing ---")

    df_copy = df.copy()
    # Missing values must be replaced *before* the string conversion: casting
    # first turns NaN into the literal text 'nan', which a later fillna('')
    # can no longer detect.
    df_copy[text_column] = df_copy[text_column].map(
        lambda value: '' if pd.isna(value) else str(value)
    )

    print("Step 1: Cleaning and lowercasing text...")
    non_letters = re.compile(r'[^a-zA-Z\s]')
    df_copy['cleaned_text'] = df_copy[text_column].apply(
        lambda text: non_letters.sub('', text.lower())
    )

    # Tokenisation, stop-word removal, stemming and rare-word filtering used to
    # live here as commented-out code; BERT's own tokenizer is applied later in
    # ReviewDataset, so only the cleaning step remains.
    print("Preprocessing complete.")
    return df_copy

In [6]:
# Hold out 20% of train_set as a validation split, stratified on 'Sentiment',
# with a fixed random_state so the split is the same on every run.
train, val = train_test_split(train_set, test_size=0.2, random_state=42, stratify=train_set['Sentiment'])

In [None]:
# Build a copy of the training split with an added 'cleaned_text' column.
# NOTE(review): `ret` is not referenced by any later cell shown here; training
# and evaluation read the raw 'Comment' column instead. Confirm this is intended.
ret = preprocess(train, text_column='Comment')


--- Starting Text Preprocessing ---
Step 1: Cleaning and lowercasing text...
Preprocessing complete.


In [8]:
class ReviewDataset(Dataset):
    """Map-style dataset that tokenizes a single review on each lookup.

    Args:
        reviews: Indexable collection of review texts (each is passed through ``str``).
        labels: Indexable collection of labels, parallel to ``reviews``.
        tokenizer: Object exposing ``encode_plus``, called with ``return_tensors='pt'``.
        max_len: Length every encoded review is padded or truncated to.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews, self.labels = reviews, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of reviews held by the dataset."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Return the text, flattened encoder tensors and label tensor for one review."""
        text = str(self.reviews[item])
        target = self.labels[item]

        encode_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        sample = {'review_text': text}
        # flatten() is applied to each tensor returned by the tokenizer, as in
        # the original item layout.
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [9]:
def create_data_loader(df, tokenizer, max_len, batch_size, text_col, label_col,
                       shuffle=False, num_workers=0):
    """Wrap two DataFrame columns in a ``ReviewDataset`` and return a ``DataLoader`` over it.

    Args:
        df: DataFrame holding the text and label columns.
        tokenizer: Tokenizer forwarded to ``ReviewDataset``.
        max_len: Padded/truncated sequence length forwarded to ``ReviewDataset``.
        batch_size: Number of samples per batch.
        text_col: Name of the column with the review text.
        label_col: Name of the column with the labels.
        shuffle: Whether the loader reshuffles the samples every epoch. Defaults
            to ``False``, which keeps the previous fixed-order behaviour;
            training callers may want ``True``.
        num_workers: Number of loader worker processes. Defaults to ``0``
            (load in the main process), as before.

    Returns:
        A ``torch.utils.data.DataLoader`` yielding the dict batches produced by
        ``ReviewDataset``.
    """
    dataset = ReviewDataset(
        reviews=df[text_col].to_numpy(),
        labels=df[label_col].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [22]:
# Token length every review is padded or truncated to when it is encoded.
MAX_LEN = 128
# Number of reviews per DataLoader batch.
BATCH_SIZE = 3
# DataFrame columns holding the review text and the sentiment label.
TEXT_COLUMN = 'Comment'
LABEL_COLUMN = 'Sentiment'
# Passes over the data for the base model and for each per-app specialist.
EPOCHS_BASE = 3
EPOCHS_SPECIALIST = 3
# Learning rate handed to AdamW in both training stages.
LEARNING_RATE = 2e-5
# Where the base model is saved, and the parent directory that receives one
# 'bert-specialist-<app>' sub-directory per specialist model.
BASE_MODEL_PATH = 'base_model/'
SPECIALIST_MODELS_PATH = 'specialist_models/'

def train_model(model, data_loader, optimizer, device, n_epochs):
    """Run a plain supervised training loop and return the trained model.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose result exposes a ``.loss`` tensor.
        data_loader: Iterable of dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device each batch tensor is moved to before the forward pass.
        n_epochs: Number of full passes over ``data_loader``.

    Returns:
        The model, left in training mode.
    """
    model = model.train()
    for epoch in range(n_epochs):
        print(f'Epoch {epoch + 1}/{n_epochs}')
        for batch in data_loader:
            # Clear gradients before the backward pass rather than after the
            # step, so gradients already sitting on the parameters when this
            # function is entered cannot leak into the first update.
            optimizer.zero_grad()

            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=batch["labels"].to(device),
            )

            outputs.loss.backward()
            optimizer.step()

    # Leave no gradients behind, matching the state the loop used to end in.
    optimizer.zero_grad()
    return model
def train_base_model(train_df, device):
    """Fine-tune ``bert-base-uncased`` on all of ``train_df`` and save it as the base model.

    A 3-label sequence classifier is trained for ``EPOCHS_BASE`` epochs on the
    ``TEXT_COLUMN``/``LABEL_COLUMN`` columns; the resulting model and tokenizer
    are written to ``BASE_MODEL_PATH``.

    Args:
        train_df: DataFrame with the text and label columns.
        device: Device the model is trained on.
    """
    print("\n--- Step 2: Training Base Model (Domain Adaptation) ---")
    pretrained_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(pretrained_name)
    classifier = BertForSequenceClassification.from_pretrained(
        pretrained_name, num_labels=3
    ).to(device)

    loader = create_data_loader(
        train_df, tokenizer, MAX_LEN, BATCH_SIZE, TEXT_COLUMN, LABEL_COLUMN
    )
    adamw = AdamW(classifier.parameters(), lr=LEARNING_RATE)
    classifier = train_model(classifier, loader, adamw, device, EPOCHS_BASE)

    print(f"Base model training complete. Saving to {BASE_MODEL_PATH}")
    # Save model and tokenizer together so the directory can be reloaded as a unit.
    for artifact in (classifier, tokenizer):
        artifact.save_pretrained(BASE_MODEL_PATH)

# --- 6. Specialization: Fine-tune for Each App ---
def train_specialist_models(train_df, device):
    """Fine-tune one copy of the saved base model per app and save each copy.

    For every distinct value in ``train_df['App']`` the model stored at
    ``BASE_MODEL_PATH`` is reloaded, trained for ``EPOCHS_SPECIALIST`` epochs on
    that app's rows only, and written with its tokenizer to
    ``SPECIALIST_MODELS_PATH/bert-specialist-<app>``.

    Args:
        train_df: DataFrame with ``App``, text and label columns.
        device: Device the models are trained on.
    """
    print("\n--- Step 3: Training Specialist Models (Fine-tuning) ---")

    for app_name in train_df['App'].unique():
        print(f"\nFine-tuning for app: {app_name}")

        # Every specialist starts from the saved base checkpoint, not from the
        # previous app's weights.
        tokenizer = BertTokenizer.from_pretrained(BASE_MODEL_PATH)
        specialist = BertForSequenceClassification.from_pretrained(BASE_MODEL_PATH).to(device)

        # Restrict training to the rows that belong to this app.
        only_this_app = train_df['App'] == app_name
        loader = create_data_loader(
            train_df[only_this_app], tokenizer, MAX_LEN, BATCH_SIZE, TEXT_COLUMN, LABEL_COLUMN
        )
        adamw = AdamW(specialist.parameters(), lr=LEARNING_RATE)

        specialist = train_model(specialist, loader, adamw, device, EPOCHS_SPECIALIST)

        out_dir = os.path.join(SPECIALIST_MODELS_PATH, f'bert-specialist-{app_name}')
        print(f"Specialist model for {app_name} training complete. Saving to {out_dir}")
        for artifact in (specialist, tokenizer):
            artifact.save_pretrained(out_dir)

# --- 7. Evaluation ---
def evaluate_models(test_df, device):
    """Score each app's specialist model on that app's rows and print a combined report.

    For every distinct value in ``test_df['App']`` the matching specialist
    model is loaded from ``SPECIALIST_MODELS_PATH``, each row's ``TEXT_COLUMN``
    is classified, and the predictions of all apps are pooled into a single
    classification report and macro-F1 score, both printed to stdout.

    Args:
        test_df: Labelled DataFrame with ``App``, ``TEXT_COLUMN`` and
            ``LABEL_COLUMN`` columns.
        device: Device used for inference.
    """
    print("\n--- Step 4: Evaluating Specialist Models ---")
    # assumes labels are encoded 0/1/2 in exactly this order; TODO confirm against the data
    class_ids = [0, 1, 2]
    class_names = ['Negative', 'Mixed', 'Positive']
    y_pred = []
    y_test = []

    app_names = test_df['App'].unique()

    for app_name in app_names:
        print(f"Evaluating on test data for app: {app_name}")
        specialist_path = os.path.join(SPECIALIST_MODELS_PATH, f'bert-specialist-{app_name}')

        # Load the specialist model for this app
        tokenizer = BertTokenizer.from_pretrained(specialist_path)
        model = BertForSequenceClassification.from_pretrained(specialist_path)
        model = model.to(device)
        model = model.eval()

        app_test_df = test_df[test_df['App'] == app_name]

        for _, row in app_test_df.iterrows():
            # str() turns a missing (NaN) review into text the tokenizer accepts.
            review_text = str(row[TEXT_COLUMN])
            # Read the label through the shared constant rather than a
            # hard-coded column name, matching the training code.
            true_label = row[LABEL_COLUMN]

            encoded_review = tokenizer.encode_plus(
                review_text,
                max_length=MAX_LEN,
                add_special_tokens=True,
                return_token_type_ids=False,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )

            input_ids = encoded_review['input_ids'].to(device)
            attention_mask = encoded_review['attention_mask'].to(device)

            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                _, prediction = torch.max(outputs.logits, dim=1)

            y_pred.append(prediction.item())
            y_test.append(true_label)

    print("\n--- Final Evaluation Report ---")
    # Passing `labels` explicitly keeps the report working when a class is
    # absent from both y_test and y_pred; without it, classification_report
    # raises because target_names no longer matches the classes it infers.
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

    # Macro F1 is left exactly as before: averaged over the classes that
    # actually occur in y_test or y_pred.
    f1_macro = f1_score(y_test, y_pred, average='macro')
    print(f"\nF1 Macro Score: {f1_macro:.4f}")

In [23]:
# Evaluation entry point: scores the saved specialist models on the validation split.
if __name__ == '__main__':
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # NOTE(review): directory creation and both training steps below are
    # commented out, so this cell only works when the specialist models already
    # exist under SPECIALIST_MODELS_PATH. Re-enable them to train from scratch.
    # # Create directories for models if they don't exist
    # if not os.path.exists(BASE_MODEL_PATH):
    #     os.makedirs(BASE_MODEL_PATH)
    # if not os.path.exists(SPECIALIST_MODELS_PATH):
    #     os.makedirs(SPECIALIST_MODELS_PATH)
    
    # # 2. Train the base model on all training data
    # train_base_model(train, device)
    
    # # 3. Fine-tune specialist models for each app
    # train_specialist_models(train, device)
    
    # 4. Evaluate the specialist models on the held-out validation split (`val`),
    #    not on test_set.
    evaluate_models(val, device)

Using device: cuda

--- Step 4: Evaluating Specialist Models ---
Evaluating on test data for app: gemini
Evaluating on test data for app: gpt
Evaluating on test data for app: grok
Evaluating on test data for app: deepseek
Evaluating on test data for app: perplexity
Evaluating on test data for app: claude

--- Final Evaluation Report ---
              precision    recall  f1-score   support

    Negative       0.79      0.72      0.75      3504
       Mixed       0.24      0.10      0.14      1118
    Positive       0.93      0.97      0.95     21555

    accuracy                           0.90     26177
   macro avg       0.66      0.60      0.62     26177
weighted avg       0.88      0.90      0.89     26177


F1 Macro Score: 0.6175


In [14]:
def predict_on_new_data(predict_df, device):
    """Predict a sentiment class for every row using the matching per-app specialist model.

    Each distinct app's model is loaded once from
    ``SPECIALIST_MODELS_PATH/bert-specialist-<app>``. Rows whose app has no
    saved model get ``None`` as their prediction.

    Args:
        predict_df: DataFrame with ``App`` and ``TEXT_COLUMN`` columns. It is
            not modified.
        device: Device used for inference.

    Returns:
        A new DataFrame equal to ``predict_df`` plus a ``predicted_sentiment``
        column holding the predicted class index (or ``None``).
    """
    print("\n--- Step 5: Predicting on New Data ---")
    predictions = []
    app_names = predict_df['App'].unique()

    # Load each specialist model once to avoid reloading in the row loop.
    models = {}
    tokenizers = {}
    for app_name in app_names:
        print(f"Loading model for app: {app_name}")
        specialist_path = os.path.join(SPECIALIST_MODELS_PATH, f'bert-specialist-{app_name}')
        if os.path.exists(specialist_path):
            tokenizers[app_name] = BertTokenizer.from_pretrained(specialist_path)
            specialist = BertForSequenceClassification.from_pretrained(specialist_path)
            models[app_name] = specialist.to(device).eval()
        else:
            print(f"Warning: No specialist model found for app '{app_name}'. Skipping its predictions.")

    # Classify row by row, in the frame's own order.
    for _, row in predict_df.iterrows():
        # str() turns a missing (NaN) review into text the tokenizer accepts.
        review_text = str(row[TEXT_COLUMN])
        app_name = row['App']

        if app_name not in models:
            # Keep one entry per row so the predictions stay aligned.
            predictions.append(None)
            continue

        encoded_review = tokenizers[app_name].encode_plus(
            review_text,
            max_length=MAX_LEN,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)

        with torch.no_grad():
            outputs = models[app_name](input_ids=input_ids, attention_mask=attention_mask)
            _, prediction = torch.max(outputs.logits, dim=1)

        predictions.append(prediction.item())

    # Return a new frame instead of writing the column into the caller's
    # DataFrame, so re-running the cell never sees an already-modified input.
    return predict_df.assign(predicted_sentiment=predictions)

# --- Main Execution Block: predict on test_set with the saved specialist models ---
if __name__ == '__main__':
    # Use the GPU when one is available, otherwise fall back to the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Create the model directories if they don't exist.
    # NOTE(review): predict_on_new_data only reads from SPECIALIST_MODELS_PATH and
    # skips apps with no saved model, so freshly created empty directories yield
    # None predictions rather than an error.
    if not os.path.exists(BASE_MODEL_PATH):
        os.makedirs(BASE_MODEL_PATH)
    if not os.path.exists(SPECIALIST_MODELS_PATH):
        os.makedirs(SPECIALIST_MODELS_PATH)


    # Get predictions for every row of test_set.
    prediction_results_df = predict_on_new_data(test_set, device)
    
    print("\n--- Prediction Results ---")
    # Prints the frame's text repr (pandas truncates long frames).
    print(prediction_results_df)


Using device: cuda

--- Step 5: Predicting on New Data ---
Loading model for app: claude
Loading model for app: deepseek
Loading model for app: gemini
Loading model for app: gpt
Loading model for app: grok
Loading model for app: perplexity

--- Prediction Results ---
       CommentId                                            Comment  \
0              1  So this length limit issue has ruined Claude f...   
1              2                        Great Ai consultation tool.   
2              3  I am a pro user and I think there is a bug in ...   
3              4                                   I love Claude AI   
4              5  No access to live data without subscription (p...   
...          ...                                                ...   
45966       5347                                            amazing   
45967       5348  used this app for a while, enjoyed it a lot, g...   
45968       5349  it is for knowledge and helping for speech or ...   
45969       5350      

In [18]:
def format_for_submission(results_df):
    """Build the two-column submission table from the prediction results.

    Rows are ordered by app (``gpt``, ``claude``, ``deepseek``, ``gemini``,
    ``grok``, ``perplexity``, then any other app alphabetically) and, within an
    app, by ``CommentId``. The input frame is not modified.

    Args:
        results_df: DataFrame with ``App``, ``CommentId`` and
            ``predicted_sentiment`` columns.

    Returns:
        A DataFrame with ``CommentId`` (``"<app>_<original id>"``) and
        ``Sentiment`` (the predicted sentiment) columns.
    """
    print("\nFormatting data for submission...")

    # Define the desired order for the apps
    app_order = ['gpt', 'claude', 'deepseek', 'gemini', 'grok', 'perplexity']

    # Rank apps through a lookup table instead of casting the caller's 'App'
    # column to a Categorical in place. Apps missing from app_order are ranked
    # after the known ones; a Categorical would turn them into NaN and emit
    # ids such as 'nan_17'.
    app_labels = results_df['App'].astype(str)
    unlisted_apps = sorted(set(app_labels) - set(app_order))
    rank_of = {app: rank for rank, app in enumerate(app_order + unlisted_apps)}

    # Sort first by the custom app order, then by CommentId.
    sorted_df = (
        results_df
        .assign(_app_rank=app_labels.map(rank_of).to_numpy())
        .sort_values(by=['_app_rank', 'CommentId'])
    )

    submission_df = pd.DataFrame()
    # Create the 'CommentId' by combining 'App' and the original 'CommentId'
    submission_df['CommentId'] = sorted_df['App'].astype(str) + '_' + sorted_df['CommentId'].astype(str)
    # Expose 'predicted_sentiment' under the name 'Sentiment'
    submission_df['Sentiment'] = sorted_df['predicted_sentiment']

    print("Formatting complete.")
    return submission_df


In [20]:
# Turn the predictions into the CommentId/Sentiment table and write it to
# 'submission.csv' without the DataFrame index.
submission = format_for_submission(prediction_results_df)
submission.to_csv('submission.csv', index=False)


Formatting data for submission...
Formatting complete.
