In [None]:

import numpy as np
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    f1_score,
    accuracy_score
)
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV

from imblearn.over_sampling import SMOTE

df = pd.read_csv(/train.csv")



print(df.head())
print("Class distribution:\n", df['label'].value_counts())
print("Rows:", len(df))


   id  label                                              tweet
0   1      0   @user when a father is dysfunctional and is s...
1   2      0  @user @user thanks for #lyft credit i can't us...
2   3      0                                bihday your majesty
3   4      0  #model   i love u take with u all the time in ...
4   5      0             factsguide: society now    #motivation
Class distribution:
 label
0    29720
1     2242
Name: count, dtype: int64
Rows: 31962


## 2) Basic Cleaning



In [None]:
def clean_tweet(text):
    if pd.isna(text):
        return ""
    
    # lowercase
    text = text.lower()
    
    # remove mentions (@user, @someone123)
    text = re.sub(r'@\w+', ' ', text)
    
    # remove urls
    text = re.sub(r'http\S+|www\.\S+', ' ', text)
    
    # replace html entities
    text = text.replace('&amp;', ' and ')
    
    # remove hashtags symbol but keep the word ( #love -> love )
    text = re.sub(r'#', ' ', text)
    
    # remove numbers and punctuation and emojis (keep only letters + spaces)
    text = re.sub(r'[^a-z\s]', ' ', text)
    
    # collapse multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# apply to your df
df['clean_tweet'] = df['tweet'].apply(clean_tweet)

df[['tweet', 'clean_tweet']].head()

Unnamed: 0,tweet,clean_tweet
0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i can t use cause they ...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,factsguide: society now #motivation,factsguide society now motivation


## 3) Stopwards cleaning

In [None]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words]
    return " ".join(tokens)

df['clean_tweet_nostop'] = df['clean_tweet'].apply(remove_stopwords)

df[['tweet', 'clean_tweet', 'clean_tweet_nostop']].head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dimso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,tweet,clean_tweet,clean_tweet_nostop
0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i can t use cause they ...,thanks lyft credit use cause offer wheelchair ...
2,bihday your majesty,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in ur,model love u take u time ur
4,factsguide: society now #motivation,factsguide society now motivation,factsguide society motivation


## Data Split


In [8]:
X = df['clean_tweet_nostop']
y = df['label']

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# For collecting results
results = []

def evaluate_model(name, y_true, y_pred):
    """Print detailed metrics and store summary in results."""
    print(f"\n## {name} ##")
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))

    f1_c1 = f1_score(y_true, y_pred, pos_label=1, zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    acc = accuracy_score(y_true, y_pred)

    results.append({
        "model": name,
        "f1_class1": f1_c1,
        "f1_macro": f1_macro,
        "accuracy": acc
    })

# MODEL 1: NAIVE BAYES

In [9]:
tfidf_nb = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words='english'
)

X_train_nb = tfidf_nb.fit_transform(X_train)
X_valid_nb = tfidf_nb.transform(X_valid)

nb_clf = MultinomialNB()
nb_clf.fit(X_train_nb, y_train)

y_pred_nb = nb_clf.predict(X_valid_nb)
evaluate_model("Naive Bayes", y_valid, y_pred_nb)


## Naive Bayes ##
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5945
           1       0.97      0.25      0.40       448

    accuracy                           0.95      6393
   macro avg       0.96      0.63      0.69      6393
weighted avg       0.95      0.95      0.93      6393

Confusion matrix:
[[5942    3]
 [ 335  113]]


# MODEL 2: LOGISTIC REGRESSION (BALANCED)

In [10]:
tfidf_lr = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words='english'
)

X_train_lr = tfidf_lr.fit_transform(X_train)
X_valid_lr = tfidf_lr.transform(X_valid)

log_reg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    n_jobs=-1
)
log_reg.fit(X_train_lr, y_train)

y_pred_lr = log_reg.predict(X_valid_lr)
evaluate_model("Logistic Regression (balanced)", y_valid, y_pred_lr)


## Logistic Regression (balanced) ##
              precision    recall  f1-score   support

           0       0.98      0.95      0.97      5945
           1       0.54      0.78      0.64       448

    accuracy                           0.94      6393
   macro avg       0.76      0.86      0.80      6393
weighted avg       0.95      0.94      0.94      6393

Confusion matrix:
[[5651  294]
 [ 100  348]]


# MODEL 3: SVC + SMOTE

In [11]:
tfidf_svm_smote = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words='english'
)

X_train_svm = tfidf_svm_smote.fit_transform(X_train)
X_valid_svm = tfidf_svm_smote.transform(X_valid)

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_svm, y_train)

print("\n## SMOTE Info ##")
print("Before SMOTE class balance:\n", y_train.value_counts())
print("After SMOTE class balance:\n", pd.Series(y_train_res).value_counts())

svc_smote = LinearSVC()
svc_smote.fit(X_train_res, y_train_res)

y_pred_svm_smote = svc_smote.predict(X_valid_svm)
evaluate_model("SVC + SMOTE", y_valid, y_pred_svm_smote)


## SMOTE Info ##
Before SMOTE class balance:
 label
0    23775
1     1794
Name: count, dtype: int64
After SMOTE class balance:
 label
0    23775
1    23775
Name: count, dtype: int64





## SVC + SMOTE ##
              precision    recall  f1-score   support

           0       0.98      0.94      0.96      5945
           1       0.48      0.74      0.58       448

    accuracy                           0.92      6393
   macro avg       0.73      0.84      0.77      6393
weighted avg       0.94      0.92      0.93      6393

Confusion matrix:
[[5582  363]
 [ 118  330]]


# MODEL 4: SVC + CALIBRATED PROBABILITIES + THRESHOLD


In [12]:
tfidf_svc_thr = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    stop_words='english'
)

X_train_thr = tfidf_svc_thr.fit_transform(X_train)
X_valid_thr = tfidf_svc_thr.transform(X_valid)

svc_base = LinearSVC(class_weight='balanced', dual='auto')
svc_cal = CalibratedClassifierCV(svc_base, cv=3, method="sigmoid")
svc_cal.fit(X_train_thr, y_train)

proba_valid = svc_cal.predict_proba(X_valid_thr)[:, 1]

best_thr = 0.5
best_f1 = 0.0

for thr in np.linspace(0.2, 0.8, 25):
    y_pred_thr_loop = (proba_valid >= thr).astype(int)
    f1 = f1_score(y_valid, y_pred_thr_loop, pos_label=1, zero_division=0)
    if f1 > best_f1:
        best_f1 = f1
        best_thr = thr

print(f"\n## Best Threshold for SVC + Calibrated ##")
print(f"Best threshold: {best_thr:.3f}, best F1(class 1): {best_f1:.4f}")

y_pred_thr = (proba_valid >= best_thr).astype(int)
evaluate_model(f"SVC + Calibrated (thr={best_thr:.3f})", y_valid, y_pred_thr)


## Best Threshold for SVC + Calibrated ##
Best threshold: 0.350, best F1(class 1): 0.7179

## SVC + Calibrated (thr=0.350) ##
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      5945
           1       0.75      0.69      0.72       448

    accuracy                           0.96      6393
   macro avg       0.86      0.84      0.85      6393
weighted avg       0.96      0.96      0.96      6393

Confusion matrix:
[[5843  102]
 [ 140  308]]


# Summary Table 


In [None]:

summary_df = pd.DataFrame(results)
summary_df = summary_df.sort_values(by="f1_class1", ascending=False)

print("\n\n## SUMMARY TABLE (sorted by F1 for class 1) ##")
print(summary_df.to_string(index=False))



## SUMMARY TABLE (sorted by F1 for class 1) ##
                         model  f1_class1  f1_macro  accuracy
  SVC + Calibrated (thr=0.350)   0.717949  0.848830  0.962146
Logistic Regression (balanced)   0.638532  0.802423  0.938370
                   SVC + SMOTE   0.578440  0.768567  0.924761
                   Naive Bayes   0.400709  0.686527  0.947130
