In [7]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Load the metadata table (path is relative to the notebook's working directory).
df = pd.read_csv("../../yikyak_metadata.csv")

TEXT_COL = "text_clean"
LABEL_COL = "high_engagement"

# Numeric columns carried along with the text and the label.
# NOTE(review): these columns are cleaned below, but the classifier in this
# cell is fit on TF-IDF features only. Here they only influence which rows
# survive dropna() -- confirm whether they were meant to be model inputs.
NUM_COLS = [
    "text_length",
    "created_hour",
    "first_person_count",
    "second_person_count",
    "first_person_ratio",
    "second_person_ratio",
    "disagree_count",
    "has_disagree",
    "conflict_count",
    "has_conflict",
    "exclamations",
    "questions",
    "vader_neg",
    "vader_neu",
    "vader_pos",
    "vader_compound",
]

# Keep only rows where the text, the label and every numeric column are present.
use_cols = [TEXT_COL, LABEL_COL, *NUM_COLS]
df_model = df[use_cols].dropna().copy()

# Any +/-inf remaining in the numeric columns ends up as 0 (inf -> NaN -> 0).
df_model[NUM_COLS] = (
    df_model[NUM_COLS]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

# 80/20 split, stratified on the label so both parts keep the same class ratio.
X_train_df, X_test_df = train_test_split(
    df_model, test_size=0.2, stratify=df_model[LABEL_COL], random_state=42
)
y_train, y_test = (part[LABEL_COL].values for part in (X_train_df, X_test_df))

# The vocabulary is learned on the training split only; the test split is
# merely transformed with it.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=8000)
X_train = tfidf.fit_transform(X_train_df[TEXT_COL])
X_test = tfidf.transform(X_test_df[TEXT_COL])

# Linear SVM on the TF-IDF matrix; fit() returns the estimator itself.
svm = LinearSVC(
    C=1, class_weight="balanced", max_iter=5000, random_state=42
).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = svm.predict(X_test)

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.88      0.90      1758
           1       0.23      0.31      0.26       196

    accuracy                           0.82      1954
   macro avg       0.57      0.60      0.58      1954
weighted avg       0.85      0.82      0.84      1954

Confusion Matrix:
[[1551  207]
 [ 135   61]]


In [5]:
# Relative frequency of each label value, computed on `df` (not on `df_model`).
label_share = df[LABEL_COL].value_counts(normalize=True)
print(label_share)


high_engagement
0    0.899703
1    0.100297
Name: proportion, dtype: float64
