In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Names of the columns the rest of the notebook reads from the dataset:
# TEXT_COL feeds the vectorizer, LABEL_COL is the classification target.
TEXT_COL = "text_clean"
LABEL_COL = "high_engagement"

# Load the full dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("new_data_test.csv")

In [4]:

# Restrict to the text and label columns and discard any row missing either one.
# `.copy()` detaches the result from `df`.
df_model = df.loc[:, [TEXT_COL, LABEL_COL]].dropna(how="any").copy()

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions the same in both partitions; the fixed seed makes the
# split reproducible.
X_train_df, X_test_df = train_test_split(
    df_model,
    stratify=df_model[LABEL_COL],
    test_size=0.2,
    random_state=42,
)

# Target arrays, row-aligned with the corresponding split frames.
y_train, y_test = X_train_df[LABEL_COL].values, X_test_df[LABEL_COL].values


# TF-IDF over unigrams and bigrams. Terms appearing in fewer than 2 documents
# are ignored, and the vocabulary is capped at 8000 features.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=8000)

train_text = X_train_df[TEXT_COL]
test_text = X_test_df[TEXT_COL]

# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto them.
X_train = tfidf.fit_transform(train_text)
X_test = tfidf.transform(test_text)


# Logistic regression on the TF-IDF features. `class_weight="balanced"`
# reweights samples inversely to class frequency. `fit` returns the estimator
# itself, so construction and training are chained.
log_reg = LogisticRegression(
    solver="liblinear",
    class_weight="balanced",
    random_state=42,
    max_iter=1000,
).fit(X_train, y_train)


# Predict class labels for the held-out rows.
y_pred = log_reg.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

# Confusion matrix: rows are true labels, columns are predicted labels.
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1073
           1       0.67      0.75      0.71       368

    accuracy                           0.84      1441
   macro avg       0.79      0.81      0.80      1441
weighted avg       0.85      0.84      0.84      1441

Confusion Matrix:
[[936 137]
 [ 93 275]]
