In [1]:
import os

# Switch to the project root so the relative paths used later in the notebook
# ('src', 'data/...', 'models/...') resolve. Presumably the notebook lives one
# level below that root -- confirm if the layout changes.
# The guard keeps the cell idempotent: once 'src' is visible from the current
# directory (cell re-run, or kernel already started at the root) we stay put
# instead of climbing one directory too far.
if not os.path.isdir('src'):
    os.chdir('../')

In [None]:
import sys
sys.path.append(os.path.abspath('src'))

import pandas as pd
import numpy as np
import joblib

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

from lightgbm import LGBMClassifier

from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertForSequenceClassification

from feature_engineering import FullFeatureExtractor, SelectStructured, SelectText
from src.nlp_models.sentence_bert_lr import SentenceBertStructuredLRClassifier, load_sentence_bert_lr
from src.nlp_models.fine_tuned_bert import FineTunedBertClassifier
from resources import blocklist, whitelist
from resources_lemmatization import trigram_group_mapping, bigram_group_mapping, trigram_list, bigram_list

In [4]:
# Both extractors receive the same lexical resources, in the same positional order.
_extractor_resources = (
    blocklist,
    whitelist,
    trigram_group_mapping,
    bigram_group_mapping,
    trigram_list,
    bigram_list,
)

# Default extractor, used by the scikit-learn pipelines below.
feature_extractor = FullFeatureExtractor(*_extractor_resources)

# Variant with use_light_clean_for_text=True, handed to the Sentence-BERT model as its preprocessor.
feature_extractor_bert = FullFeatureExtractor(*_extractor_resources, use_light_clean_for_text=True)

In [5]:
# Load the labelled dataset; the path is relative to the working directory set in the first cell.
df = pd.read_csv('data/processed/final_label.csv')

In [6]:
# Model input is the 'text' column; the label to predict is the 'target' column.
X, y = df['text'], df['target']

# First tier

## Dummy model

In [7]:
# Structured branch: the project's SelectStructured transformer followed by standard scaling.
structured_pipeline = Pipeline(steps=[
    ('select_structured', SelectStructured()),
    ('scaler', StandardScaler()),
])

# Majority-class baseline: with strategy='most_frequent' the classifier always
# predicts the most common training label, giving the floor every other model must beat.
dummy_steps = [
    ('feature_extractor', feature_extractor),
    ('structured', structured_pipeline),
    ('clf', DummyClassifier(strategy='most_frequent')),
]
pipeline_dummy = Pipeline(steps=dummy_steps)

In [8]:
# Fit on the full dataset and predict the same rows (no hold-out split here),
# so the report below describes training performance.
y_pred_dummy = pipeline_dummy.fit(X, y).predict(X)

dummy_report = classification_report(y, y_pred_dummy, zero_division=0)
print(dummy_report)

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       153
         1.0       0.74      1.00      0.85       434

    accuracy                           0.74       587
   macro avg       0.37      0.50      0.43       587
weighted avg       0.55      0.74      0.63       587



## BOW + Structured + Logistic Regression

In [9]:
# Structured branch (rebinds `structured_pipeline` to a fresh, unfitted instance).
structured_pipeline = Pipeline(steps=[
    ('select_structured', SelectStructured()),
    ('scaler', StandardScaler()),
])

# Text branch: bag-of-words token counts.
text_pipeline = Pipeline(steps=[
    ('select_text', SelectText()),
    ('bow', CountVectorizer()),
])

# Concatenate the two branches column-wise: structured features first, then text.
combined_features = FeatureUnion(transformer_list=[
    ('structured', structured_pipeline),
    ('text', text_pipeline),
])

# Full model: feature extraction -> combined features -> class-balanced logistic regression.
bow_lr_steps = [
    ('feature_extractor', feature_extractor),
    ('combined_features', combined_features),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced')),
]
pipeline_bow_lr = Pipeline(steps=bow_lr_steps)

In [10]:
# Train and score on the same rows (no hold-out split), so this is training performance.
y_pred_bow_lr = pipeline_bow_lr.fit(X, y).predict(X)

bow_lr_report = classification_report(y, y_pred_bow_lr, zero_division=0)
print(bow_lr_report)

              precision    recall  f1-score   support

         0.0       0.88      0.97      0.92       153
         1.0       0.99      0.95      0.97       434

    accuracy                           0.96       587
   macro avg       0.93      0.96      0.95       587
weighted avg       0.96      0.96      0.96       587



## TF-IDF + Structured + Logistic Regression

In [11]:
# Text branch: TF-IDF weighted terms instead of raw counts.
text_pipeline_tfidf = Pipeline(steps=[
    ('select_text', SelectText()),
    ('tfidf', TfidfVectorizer()),
])

# Reuses the `structured_pipeline` object defined in the BOW section, not a copy.
# NOTE(review): scikit-learn fits sub-estimators in place, so every pipeline sharing
# this object refits the same scaler. Harmless while all pipelines are fit on the
# same X; confirm before fitting any of them on different data.
combined_features_tfidf = FeatureUnion(transformer_list=[
    ('structured', structured_pipeline),
    ('text', text_pipeline_tfidf),
])

# Full model: feature extraction -> combined features -> class-balanced logistic regression.
tfidf_lr_steps = [
    ('feature_extractor', feature_extractor),
    ('combined_features', combined_features_tfidf),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced')),
]
pipeline_tfidf_lr = Pipeline(steps=tfidf_lr_steps)

In [12]:
# Train and score on the same rows (no hold-out split), so this is training performance.
y_pred_tfidf_lr = pipeline_tfidf_lr.fit(X, y).predict(X)

tfidf_lr_report = classification_report(y, y_pred_tfidf_lr, zero_division=0)
print(tfidf_lr_report)

              precision    recall  f1-score   support

         0.0       0.69      0.91      0.79       153
         1.0       0.96      0.86      0.91       434

    accuracy                           0.87       587
   macro avg       0.83      0.88      0.85       587
weighted avg       0.89      0.87      0.88       587



## TF-IDF + SVD + LightGBM

In [13]:
# Text branch: TF-IDF reduced to 100 dense components with a seeded truncated SVD.
text_pipeline_tfidf_svd = Pipeline(steps=[
    ('select_text', SelectText()),
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD(n_components=100, random_state=42)),
])

# Structured branch is the `structured_pipeline` object shared with the earlier pipelines.
combined_features_tfidf_svd = FeatureUnion(transformer_list=[
    ('structured', structured_pipeline),
    ('text', text_pipeline_tfidf_svd),
])

# Small, shallow boosted trees with balanced class weights; seeded, with logging silenced.
lgbm_params = dict(
    num_leaves=15,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
    verbose=-1,
)

pipeline_tfidf_svd_lgbm = Pipeline(steps=[
    ('feature_extractor', feature_extractor),
    ('combined_features', combined_features_tfidf_svd),
    ('clf', LGBMClassifier(**lgbm_params)),
])

In [14]:
# Train and score on the same rows (no hold-out split), so this is training performance.
pipeline_tfidf_svd_lgbm.fit(X, y)
y_pred_tfidf_svd_lgbm = pipeline_tfidf_svd_lgbm.predict(X)

# zero_division=0 matches the earlier evaluation cells: a class that is never
# predicted is reported as 0 without emitting an UndefinedMetricWarning.
print(classification_report(y, y_pred_tfidf_svd_lgbm, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98       153
         1.0       1.00      0.98      0.99       434

    accuracy                           0.99       587
   macro avg       0.98      0.99      0.98       587
weighted avg       0.99      0.99      0.99       587





# Second tier

## LSA (TF-IDF + SVD) + Logistic Regression

In [15]:
# LSA text branch: TF-IDF followed by a seeded 100-component truncated SVD.
text_pipeline_lsa = Pipeline([
    ('select_text', SelectText()),
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD(n_components=100, random_state=42))
])

# Structured branch is the `structured_pipeline` object shared with the earlier pipelines.
combined_features_lsa = FeatureUnion([
    ('structured', structured_pipeline),
    ('text', text_pipeline_lsa)
])

# Full model: feature extraction -> combined features -> class-balanced logistic regression.
pipeline_lsa_lr = Pipeline([
    ('feature_extractor', feature_extractor),
    ('combined_features', combined_features_lsa),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# This cell only defines the pipeline; fitting and scoring are done in the
# following cell, so the model is trained once rather than twice.

In [16]:
# Train and score on the same rows (no hold-out split), so this is training performance.
pipeline_lsa_lr.fit(X, y)
y_pred_lsa = pipeline_lsa_lr.predict(X)

# zero_division=0 matches the first-tier evaluation cells: a class that is never
# predicted is reported as 0 without emitting an UndefinedMetricWarning.
print(classification_report(y, y_pred_lsa, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.59      0.81      0.68       153
         1.0       0.92      0.80      0.86       434

    accuracy                           0.80       587
   macro avg       0.76      0.80      0.77       587
weighted avg       0.84      0.80      0.81       587



## BERT Sentence Embeddings + Logistic Regression

In [17]:
# Settings for the Sentence-BERT + structured-features classifier: the
# 'paraphrase-MiniLM-L6-v2' encoder, with the light-clean extractor as preprocessor.
sbert_lr_settings = dict(
    sentence_model_name_or_path='paraphrase-MiniLM-L6-v2',
    preprocessor=feature_extractor_bert,
)
model_bert_ls = SentenceBertStructuredLRClassifier(**sbert_lr_settings)

# Train on the full dataset.
model_bert_ls.fit(X, y)

In [18]:
# Predictions are made on the rows the model was trained on (no hold-out split),
# so the report describes training performance.
y_pred_bert_ls = model_bert_ls.predict(X)

# zero_division=0 matches the first-tier evaluation cells: a class that is never
# predicted is reported as 0 without emitting an UndefinedMetricWarning.
print(classification_report(y, y_pred_bert_ls, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96       153
         1.0       1.00      0.97      0.99       434

    accuracy                           0.98       587
   macro avg       0.96      0.99      0.97       587
weighted avg       0.98      0.98      0.98       587



In [19]:
# Persist the fitted Sentence-BERT + LR model via its own save() method,
# targeting 'models/bert_sentence_lr'.
model_bert_ls.save('models/bert_sentence_lr')

# Third tier

## Fine-tuned BERT

In [26]:
# Fine-tune a classifier built from the 'bert-base-uncased' checkpoint for three epochs.
# Saving is handled by the next cell.
bert_checkpoint = 'bert-base-uncased'
fine_tune_epochs = 3

bert_clf = FineTunedBertClassifier(bert_checkpoint)
bert_clf.fit(X, y, epochs=fine_tune_epochs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
# Persist the fine-tuned BERT classifier via its own save() method,
# targeting 'models/bert_finetuned_model'.
bert_clf.save('models/bert_finetuned_model')

In [28]:
# Predictions are made on the rows the model was fine-tuned on (no hold-out split),
# so the report describes training performance.
y_pred_bert_clf = bert_clf.predict(X)

# zero_division=0 matches the first-tier evaluation cells: a class that is never
# predicted is reported as 0 without emitting an UndefinedMetricWarning.
print(classification_report(y, y_pred_bert_clf, zero_division=0))

              precision    recall  f1-score   support

         0.0       0.99      0.97      0.98       153
         1.0       0.99      1.00      0.99       434

    accuracy                           0.99       587
   macro avg       0.99      0.98      0.99       587
weighted avg       0.99      0.99      0.99       587



# Save models

In [29]:
# scikit-learn pipelines are serialised with joblib, one file per pipeline.
pipelines_to_save = [
    (pipeline_bow_lr, 'models/pipeline_bow_lr.joblib'),
    (pipeline_tfidf_lr, 'models/pipeline_tfidf_lr.joblib'),
    (pipeline_tfidf_svd_lgbm, 'models/pipeline_tfidf_svd_lgbm.joblib'),
    (pipeline_lsa_lr, 'models/pipeline_lsa_lr.joblib'),
    (pipeline_dummy, 'models/pipeline_dummy.joblib'),
]
for fitted_pipeline, target_path in pipelines_to_save:
    joblib.dump(fitted_pipeline, target_path)

# The two BERT-based models are written through their own save() methods.
model_bert_ls.save('models/bert_sentence_lr')
bert_clf.save('models/bert_finetuned_model')

# Load models

In [None]:
# Reload the persisted scikit-learn pipelines, rebinding the in-memory names.
# joblib.load unpickles its input, so only point it at trusted files such as
# the ones written by the save cell above.
saved_pipeline_paths = [
    'models/pipeline_bow_lr.joblib',
    'models/pipeline_tfidf_lr.joblib',
    'models/pipeline_tfidf_svd_lgbm.joblib',
    'models/pipeline_lsa_lr.joblib',
    'models/pipeline_dummy.joblib',
]
(
    pipeline_bow_lr,
    pipeline_tfidf_lr,
    pipeline_tfidf_svd_lgbm,
    pipeline_lsa_lr,
    pipeline_dummy,
) = [joblib.load(saved_path) for saved_path in saved_pipeline_paths]

In [None]:
# Locations the two BERT-based models were saved to.
sbert_lr_dir = 'models/bert_sentence_lr'
bert_finetuned_dir = 'models/bert_finetuned_model'

# The Sentence-BERT + LR loader is given the light-clean extractor as its preprocessor.
model_bert_ls_loaded = load_sentence_bert_lr(sbert_lr_dir, preprocessor=feature_extractor_bert)

# The fine-tuned BERT classifier is restored through its own load() entry point.
bert_clf_loaded = FineTunedBertClassifier.load(bert_finetuned_dir)