In [56]:
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.calibration import calibration_curve

import warnings
warnings.filterwarnings('ignore')

In [57]:
df = pl.read_csv('/kaggle/input/unlaveled-data/unlabeled_data_revised.csv', separator=";")
df

from_username,profile_link,account_age,is_buzzer,total_post,followers
str,str,i64,i64,i64,i64
"""Ayudihutankecil""","""https://twitter.com/Ayudihutan…",9,1,20100,641
"""bang_ouz71""","""https://twitter.com/bang_ouz71""",7,1,2292,15
"""KowuLaode341884""","""https://twitter.com/KowuLaode3…",2,1,10100,98
"""Manusia2B""","""https://twitter.com/Manusia2B""",4,1,89700,669
"""PutraMahes59""","""https://twitter.com/PutraMahes…",2,0,2586,58
…,…,…,…,…,…
"""suaradotcom""","""https://twitter.com/suaradotco…",0,3,0,33320
"""kba_news""","""https://twitter.com/kba_news""",0,3,0,9657
"""BustoniFikri4""","""https://twitter.com/BustoniFik…",5,1,4106,66
"""SETYAKI22965702""","""https://twitter.com/SETYAKI229…",4,1,108900,21413


## Process Dataframe

In [58]:
df_cleaned = df.filter((pl.col("is_buzzer") != 3) & (pl.col("is_buzzer") != 9))
df_cleaned = df_cleaned.with_columns([
    # Per year (what you already have)
    (pl.col("total_post")/pl.col("account_age")).alias("post_frequency_per_year"),
    
    # Per month (divide year by 12)
    (pl.col("total_post")/(pl.col("account_age") * 12)).alias("post_frequency_per_month"),
    
    # Per day (multiply year by 365)
    (pl.col("total_post")/(pl.col("account_age") * 365)).alias("post_frequency_per_day")
])

In [59]:
df_cleaned

from_username,profile_link,account_age,is_buzzer,total_post,followers,post_frequency_per_year,post_frequency_per_month,post_frequency_per_day
str,str,i64,i64,i64,i64,f64,f64,f64
"""Ayudihutankecil""","""https://twitter.com/Ayudihutan…",9,1,20100,641,2233.333333,186.111111,6.118721
"""bang_ouz71""","""https://twitter.com/bang_ouz71""",7,1,2292,15,327.428571,27.285714,0.897065
"""KowuLaode341884""","""https://twitter.com/KowuLaode3…",2,1,10100,98,5050.0,420.833333,13.835616
"""Manusia2B""","""https://twitter.com/Manusia2B""",4,1,89700,669,22425.0,1868.75,61.438356
"""PutraMahes59""","""https://twitter.com/PutraMahes…",2,0,2586,58,1293.0,107.75,3.542466
…,…,…,…,…,…,…,…,…
"""Zaenal58788427""","""https://twitter.com/Zaenal5878…",2,1,19800,1014,9900.0,825.0,27.123288
"""bharatasoim""","""https://twitter.com/bharatasoi…",11,1,200,82,18.181818,1.515152,0.049813
"""BustoniFikri4""","""https://twitter.com/BustoniFik…",5,1,4106,66,821.2,68.433333,2.249863
"""SETYAKI22965702""","""https://twitter.com/SETYAKI229…",4,1,108900,21413,27225.0,2268.75,74.589041


In [60]:
df_cleaned['is_buzzer'].value_counts()

is_buzzer,count
i64,u32
0,381
1,580


## Data Modelling

In [61]:
features_to_normalize = ["account_age","followers", "total_post", "post_frequency_per_year", "post_frequency_per_month"]
X = df_cleaned.select(features_to_normalize).to_pandas()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=features_to_normalize)

y=df_cleaned.select("is_buzzer").to_pandas()
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

### Decision Tree

In [62]:
dt = DecisionTreeClassifier(
    random_state=42,
    class_weight=None,
    criterion='entropy',
    max_depth=5,
    min_samples_leaf=4,
    min_samples_split=15    
)


# Fit GridSearchCV
dt.fit(X_train, y_train)


# Make predictions with best model
y_pred = dt.predict(X_test)

### Classification Report

In [63]:
from sklearn.metrics import confusion_matrix
# Print classification report
print("\nClassification Report with Best Parameters:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': features_to_normalize,
    'importance': dt.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report with Best Parameters:
              precision    recall  f1-score   support

           0       0.56      0.54      0.55        78
           1       0.69      0.71      0.70       115

    accuracy                           0.64       193
   macro avg       0.63      0.63      0.63       193
weighted avg       0.64      0.64      0.64       193


Feature Importance:
                    feature  importance
3   post_frequency_per_year    0.457674
2                total_post    0.277143
4  post_frequency_per_month    0.199352
1                 followers    0.042755
0               account_age    0.023075

Confusion Matrix:
[[42 36]
 [33 82]]


### Random Forest

In [64]:
# Create final model with best parameters
best_rf = RandomForestClassifier(
    class_weight=None,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=2,
    min_samples_split=15,
    n_estimators=200,
    random_state=42
)

# Train on full training data
best_rf.fit(X_train, y_train)

### Classification Report

In [65]:
# Make predictions
y_pred = best_rf.predict(X_test)
y_pred_proba = best_rf.predict_proba(X_test)

# Get performance metrics
from sklearn.metrics import classification_report, confusion_matrix
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance
feature_importance = pd.DataFrame({
    'feature': features_to_normalize,
    'importance': best_rf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.44      0.48        78
           1       0.66      0.74      0.70       115

    accuracy                           0.62       193
   macro avg       0.60      0.59      0.59       193
weighted avg       0.61      0.62      0.61       193


Feature Importance:
                    feature  importance
4  post_frequency_per_month    0.267261
3   post_frequency_per_year    0.256467
2                total_post    0.205086
1                 followers    0.183330
0               account_age    0.087856

Confusion Matrix:
[[34 44]
 [30 85]]


### XGBoost

In [66]:
# Create model with best parameters
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    colsample_bytree=0.8,
    gamma=0.2,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=3,
    n_estimators=100,
    subsample=0.9,
    use_label_encoder=False,
    eval_metric='logloss'  
)

# Try SMOTE for balancing
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Train with balanced data and multiple evaluation metrics
eval_set = [(X_train_balanced, y_train_balanced), (X_test, y_test)]
xgb_model.fit(
    X_train_balanced, 
    y_train_balanced,
    eval_set=eval_set,
    early_stopping_rounds=50,
    verbose=True
)

# Make predictions and evaluate
y_pred = xgb_model.predict(X_test)
y_pred_proba = xgb_model.predict_proba(X_test)

[0]	validation_0-logloss:0.69128	validation_1-logloss:0.69256
[1]	validation_0-logloss:0.68959	validation_1-logloss:0.69194
[2]	validation_0-logloss:0.68808	validation_1-logloss:0.69061
[3]	validation_0-logloss:0.68672	validation_1-logloss:0.68969
[4]	validation_0-logloss:0.68501	validation_1-logloss:0.68866
[5]	validation_0-logloss:0.68330	validation_1-logloss:0.68824
[6]	validation_0-logloss:0.68183	validation_1-logloss:0.68748
[7]	validation_0-logloss:0.68033	validation_1-logloss:0.68656
[8]	validation_0-logloss:0.67893	validation_1-logloss:0.68590
[9]	validation_0-logloss:0.67755	validation_1-logloss:0.68534
[10]	validation_0-logloss:0.67610	validation_1-logloss:0.68440
[11]	validation_0-logloss:0.67458	validation_1-logloss:0.68341
[12]	validation_0-logloss:0.67302	validation_1-logloss:0.68292
[13]	validation_0-logloss:0.67161	validation_1-logloss:0.68228
[14]	validation_0-logloss:0.67024	validation_1-logloss:0.68142
[15]	validation_0-logloss:0.66868	validation_1-logloss:0.68109
[1

In [67]:
# Print detailed evaluation
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.60      0.55        78
           1       0.69      0.59      0.64       115

    accuracy                           0.60       193
   macro avg       0.59      0.60      0.59       193
weighted avg       0.61      0.60      0.60       193


Confusion Matrix:
[[47 31]
 [47 68]]
