In [1]:
import os
os.chdir('../')

In [2]:
import sys
sys.path.append(os.path.abspath('src'))

import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import TruncatedSVD

from scipy.sparse import hstack, csr_matrix
from sklearn.preprocessing import StandardScaler

from lightgbm import LGBMClassifier

from sentence_transformers import SentenceTransformer

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, ClassLabel

import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

from feature_engineering import FullFeatureExtractor, SelectStructured, SelectText
from src.nlp_models.sentence_bert_lr import SentenceBertStructuredLRClassifier, load_sentence_bert_lr
from src.nlp_models.fine_tuned_bert import FineTunedBertClassifier
from resources import blocklist, whitelist
from resources_lemmatization import trigram_group_mapping, bigram_group_mapping, trigram_list, bigram_list

In [3]:
feature_extractor = FullFeatureExtractor(blocklist, whitelist,
                                         trigram_group_mapping, bigram_group_mapping,
                                         trigram_list, bigram_list)
feature_extractor_bert = FullFeatureExtractor(blocklist, whitelist,
                                         trigram_group_mapping, bigram_group_mapping,
                                         trigram_list, bigram_list, use_light_clean_for_text=True)

In [4]:
df = pd.read_csv('data/processed/final_processed_train.csv')

In [5]:
# Data
df['final_text_enriched'] = df['final_text_enriched'].fillna('')
X_text = df['final_text_enriched']
X_structured_raw = df[['has_number', 'has_rating_number', 'text_length', 'n_tokens', 'sentiment']].values
y = df['target']

# Structured features scaling
scaler = StandardScaler()
X_structured = scaler.fit_transform(X_structured_raw)

In [6]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# First tier

## Dummy model

In [7]:
dummy = DummyClassifier(strategy='most_frequent')
y_pred_dummy = cross_val_predict(dummy, X_structured, y, cv=skf)

# Train on full
dummy.fit(X_structured, y)

In [8]:
print("Dummy Classifier Report:\n", classification_report(y, y_pred_dummy, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_dummy))

Dummy Classifier Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       153
         1.0       0.74      1.00      0.85       434

    accuracy                           0.74       587
   macro avg       0.37      0.50      0.43       587
weighted avg       0.55      0.74      0.63       587

Confusion Matrix:
 [[  0 153]
 [  0 434]]


## BOW + Logistic Regression

In [9]:
# BoW vectorization
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X_text)

# Combine
X_combined_bow = hstack([X_bow, csr_matrix(X_structured)])

# Model
lr_bow = LogisticRegression(max_iter=1000, class_weight='balanced')
y_pred_bow = cross_val_predict(lr_bow, X_combined_bow, y, cv=skf)

lr_bow.fit(X_combined_bow, y)

## TF-IDF + Structured + Logistic Regression

In [10]:
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(X_text)

# --- Combine structured + text ---
X_combined_tfidf = hstack([X_tfidf, csr_matrix(X_structured)])

# --- Model ---
lr_tfidf = LogisticRegression(max_iter=1000, class_weight='balanced')

# --- Cross-Validation ---
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_pred_tfidf = cross_val_predict(lr_tfidf, X_combined_tfidf, y, cv=skf)

# --- Fit on full data ---
lr_tfidf.fit(X_combined_tfidf, y)

## TF-IDF + SVD + LightGBM

In [11]:
# TF-IDF
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X_text)

# SVD
svd = TruncatedSVD(n_components=100, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

# Combine dense
X_combined_svd = np.concatenate([X_svd, X_structured], axis=1)

# LGBM
lgbm = LGBMClassifier(
    num_leaves=15,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
    verbose=-1
)

# Manual CV for LGBM
y_pred_lgbm = np.zeros_like(y)
for train_idx, test_idx in skf.split(X_combined_svd, y):
    X_train_fold, X_test_fold = X_combined_svd[train_idx], X_combined_svd[test_idx]
    y_train_fold = y.iloc[train_idx]
    lgbm.fit(X_train_fold, y_train_fold)
    y_pred_lgbm[test_idx] = lgbm.predict(X_test_fold)

lgbm.fit(X_combined_svd, y)



# Second tier

## LSA (TF-IDF + SVD) + Logistic Regression

In [12]:
# Same X_svd as above
X_combined_lsa = np.concatenate([X_svd, X_structured], axis=1)

lr_lsa = LogisticRegression(max_iter=1000, class_weight='balanced')
y_pred_lsa = cross_val_predict(lr_lsa, X_combined_lsa, y, cv=skf)

lr_lsa.fit(X_combined_lsa, y)

## BERT Sentence Embeddings + Logistic Regression

In [13]:
# Use raw 'bert_text'
X_text_bert = df['bert_text']

# Structured features
X_structured_raw = df[['has_number', 'has_rating_number', 'text_length', 'n_tokens', 'sentiment']].values
scaler = StandardScaler()
X_structured = scaler.fit_transform(X_structured_raw)

# BERT sentence embeddings (dense, one per review)
bert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
X_bert_emb = bert_model.encode(X_text_bert.tolist(), batch_size=32, show_progress_bar=True)

# Combine embeddings with structured
X_combined_bert = np.concatenate([X_bert_emb, X_structured], axis=1)

# Logistic Regression
lr_bert = LogisticRegression(max_iter=1000, class_weight='balanced')

# CV
y_pred_bert = cross_val_predict(lr_bert, X_combined_bert, y, cv=skf)

# Train on full data
lr_bert.fit(X_combined_bert, y)


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

# Third tier

## Fine-tuned bert

In [14]:
# 1. Prepare dataset
class ReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }
        

In [15]:
# 2. Data split
df_bert = df[['bert_text', 'target']].rename(columns={'bert_text': 'text', 'target': 'label'})
df_bert['label'] = df_bert['label'].astype(int)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_bert['text'].tolist(), df_bert['label'].tolist(), test_size=0.2, stratify=df_bert['label'], random_state=42
)

print(pd.Series(train_labels).sum()/len(train_labels))
print(pd.Series(val_labels).sum()/len(val_labels))

0.7398720682302772
0.7372881355932204


In [16]:
# 3. Tokenizer and datasets
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = ReviewDataset(train_texts, train_labels, tokenizer)
val_dataset = ReviewDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 4. Model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

# 5. Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# 6. Training loop
epochs = 3
model.train()
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Loss: {loss.item()}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Loss: 0.26706358790397644
Epoch 2/3
Loss: 0.022812960669398308
Epoch 3/3
Loss: 0.22133469581604004


In [32]:
# 7. Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        labels = batch['labels'].cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels)

# Evaluation

In [40]:
print("Dummy Classifier Report:\n", classification_report(y, y_pred_dummy, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_dummy, normalize='all'))

Dummy Classifier Report:
               precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       153
         1.0       0.74      1.00      0.85       434

    accuracy                           0.74       587
   macro avg       0.37      0.50      0.43       587
weighted avg       0.55      0.74      0.63       587

Confusion Matrix:
 [[0.         0.26064736]
 [0.         0.73935264]]


In [39]:
print("BoW + Structured + LR Report:\n", classification_report(y, y_pred_bow, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_bow, normalize='all'))

BoW + Structured + LR Report:
               precision    recall  f1-score   support

         0.0       0.61      0.63      0.62       153
         1.0       0.87      0.86      0.86       434

    accuracy                           0.80       587
   macro avg       0.74      0.74      0.74       587
weighted avg       0.80      0.80      0.80       587

Confusion Matrix:
 [[0.16354344 0.09710392]
 [0.10562181 0.63373083]]


In [38]:
print("BoW + Structured + LR Report:\n", classification_report(y, y_pred_tfidf, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_tfidf, normalize='all'))

BoW + Structured + LR Report:
               precision    recall  f1-score   support

         0.0       0.55      0.71      0.62       153
         1.0       0.89      0.80      0.84       434

    accuracy                           0.78       587
   macro avg       0.72      0.75      0.73       587
weighted avg       0.80      0.78      0.78       587

Confusion Matrix:
 [[0.18398637 0.07666099]
 [0.14821124 0.5911414 ]]


In [37]:
print("TF-IDF + SVD + Structured + LGBM Report:\n", classification_report(y, y_pred_lgbm, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_lgbm, normalize='all'))

TF-IDF + SVD + Structured + LGBM Report:
               precision    recall  f1-score   support

         0.0       0.62      0.55      0.58       153
         1.0       0.85      0.88      0.86       434

    accuracy                           0.79       587
   macro avg       0.73      0.71      0.72       587
weighted avg       0.79      0.79      0.79       587

Confusion Matrix:
 [[0.14310051 0.11754685]
 [0.08858603 0.65076661]]


In [36]:
print("LSA + Structured + LR Report:\n", classification_report(y, y_pred_lsa, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_lsa))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_lsa, normalize='all'))

LSA + Structured + LR Report:
               precision    recall  f1-score   support

         0.0       0.54      0.72      0.62       153
         1.0       0.89      0.79      0.84       434

    accuracy                           0.77       587
   macro avg       0.72      0.75      0.73       587
weighted avg       0.80      0.77      0.78       587

Confusion Matrix:
 [[110  43]
 [ 92 342]]
Confusion Matrix:
 [[0.18739353 0.07325383]
 [0.15672913 0.58262351]]


In [41]:
print("BERT Embeddings + Structured + LR Report:\n", classification_report(y, y_pred_bert, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y, y_pred_bert, normalize='all'))

BERT Embeddings + Structured + LR Report:
               precision    recall  f1-score   support

         0.0       0.77      0.82      0.79       153
         1.0       0.93      0.91      0.92       434

    accuracy                           0.89       587
   macro avg       0.85      0.87      0.86       587
weighted avg       0.89      0.89      0.89       587

Confusion Matrix:
 [[0.21294719 0.04770017]
 [0.06303237 0.67632027]]


Bert metrics below are calculated differently (train_test_split instead of cross-validation, because BERT training takes too long. In any case, target distrution in train and test in pretty much the same)

In [42]:
print("BERT Fine tuned:\n", classification_report(all_labels, all_preds))
print("Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))
print("Confusion Matrix:\n", confusion_matrix(all_labels, all_preds, normalize='all'))

BERT Fine tuned:
               precision    recall  f1-score   support

           0       0.82      1.00      0.90        31
           1       1.00      0.92      0.96        87

    accuracy                           0.94       118
   macro avg       0.91      0.96      0.93       118
weighted avg       0.95      0.94      0.94       118

Confusion Matrix:
 [[31  0]
 [ 7 80]]
Confusion Matrix:
 [[0.26271186 0.        ]
 [0.05932203 0.6779661 ]]


F1-scores for models using BERT were superior to regular models (bert embedding/ bert fine tuned: 0.84/0.96). Dummy had 0.85
For practical use the suggested model is the BERT Sentence Embeddings + Logistic Regression. If more time or processing is available, then the suggestions if to use Fine-tuned BERT