In [5]:
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# Column names shared by every later step of this cell.
TEXT_COL = "text_clean"
LABEL_COL = "high_engagement"

# Raw input table.
df = pd.read_csv("../../csv_files/emoji_test.csv")

# Modelling frame: keep only the text/label pair, drop rows where either is
# missing, and add `post_length` = number of whitespace-separated tokens.
df_model = (
    df.loc[:, [TEXT_COL, LABEL_COL]]
    .dropna()
    .assign(post_length=lambda frame: frame[TEXT_COL].str.split().str.len())
)

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
split_frames = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
    stratify=df_model[LABEL_COL],
)
X_train_df, X_test_df = split_frames

# Label arrays in the same (train, test) order as the frames above.
y_train, y_test = (part[LABEL_COL].values for part in split_frames)


# Raw feature frames. Vectorizing and scaling happen INSIDE the pipeline below,
# so each cross-validation fold fits its own vocabulary / IDF weights and its
# own scale on that fold's training rows only. Fitting the transformers once on
# the whole training set before the search would let validation-fold rows shape
# the features, making the CV scores used to choose C optimistic.
NUM_COLS = ["post_length"]
X_train = X_train_df[[TEXT_COL] + NUM_COLS]
X_test = X_test_df[[TEXT_COL] + NUM_COLS]

# Unigram + bigram TF-IDF, capped at 8000 terms; terms must occur in >= 2 docs.
tfidf = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1, 2),
    min_df=2,
)

# with_mean=False: divide by the standard deviation only, no centering.
scaler = StandardScaler(with_mean=False)

features = ColumnTransformer(
    transformers=[
        # A scalar column name hands the vectorizer a 1-D series of documents.
        ("text", tfidf, TEXT_COL),
        # A list of column names hands the scaler a 2-D (n_rows, 1) block.
        ("num", scaler, NUM_COLS),
    ]
)

model = Pipeline(
    steps=[
        ("features", features),
        (
            "clf",
            LogisticRegression(
                max_iter=1000,
                solver="liblinear",
                class_weight="balanced",
                random_state=42,
            ),
        ),
    ]
)

# The `clf__` prefix routes the parameter to the pipeline's classifier step.
param_grid = {
    "clf__C": [0.01, 0.1, 0.5, 1, 5, 10]
}

# NOTE(review): selection metric is left as accuracy to match the original
# experiment; with an imbalanced label and class_weight="balanced", "f1" or
# "balanced_accuracy" may be a better fit -- confirm the intended objective.
grid = GridSearchCV(
    model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)

grid.fit(X_train, y_train)

# GridSearchCV fits clones, so the objects created above stay unfitted.
# Re-point the names at the fitted transformers of the refit best pipeline so
# code that inspects `tfidf` / `scaler` after this step sees fitted instances.
fitted_features = grid.best_estimator_.named_steps["features"]
tfidf = fitted_features.named_transformers_["text"]
scaler = fitted_features.named_transformers_["num"]


# Predict the held-out rows with the best estimator found by the grid search.
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)

# Per-class precision / recall / F1, then the raw confusion counts
# (rows = true label, columns = predicted label).
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")



Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.71      0.75      1218
           1       0.38      0.50      0.43       424

    accuracy                           0.66      1642
   macro avg       0.59      0.61      0.59      1642
weighted avg       0.69      0.66      0.67      1642

Confusion Matrix:
[[863 355]
 [210 214]]
