In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedGroupKFold, GroupKFold, train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [None]:
# Load the training transactions; the path is relative to the notebook's
# working directory.
train_csv = "../data/fake_users.csv"
df = pd.read_csv(train_csv)
df.head()

In [None]:
# Class balance of the loaded data: histogram of the `Fake` label column.
df.loc[:, "Fake"].hist()

In [None]:
# Number of distinct user ids that have at least one row with Fake == 1.
# dropna=False keeps the count identical to len(unique()), which counts NaN.
df.loc[df["Fake"] == 1, "UserId"].nunique(dropna=False)

In [None]:
# Undersampling: keep every fake row and an equally sized random subset of the
# real rows so that both classes are balanced.
UNDERSAMPLE_BY_USER = False  # by-user sampling gave worse results; kept for comparison
UNDERSAMPLE_SEED = 0  # single seed for every random draw in this cell

df_fake = df[df["Fake"] == 1]
df_real = df[df["Fake"] == 0]

if UNDERSAMPLE_BY_USER:
    # Keep all transactions of as many real users as there are fake users.
    # The ids are drawn from the *unique* real users: drawing from the raw
    # transaction rows can pick the same user several times and therefore
    # keep fewer users than requested.
    fake_cnt = df_fake["UserId"].nunique()
    sampled_real_user_ids = (
        df_real["UserId"].drop_duplicates().sample(fake_cnt, random_state=UNDERSAMPLE_SEED)
    )
    df_real_sampled = df_real[df_real["UserId"].isin(sampled_real_user_ids)]
else:
    # Undersample by transaction: as many real rows as there are fake rows.
    df_real_sampled = df_real.sample(df_fake.shape[0], random_state=UNDERSAMPLE_SEED)

# Shuffle the balanced frame. The shuffle is seeded so that "Restart & Run All"
# reproduces the same row order (and with it the same downstream fits).
df = pd.concat([df_fake, df_real_sampled], axis=0).sample(
    frac=1.0, random_state=UNDERSAMPLE_SEED
)

df["Fake"].hist()

In [None]:
# Set the user ids aside (they are passed as `groups` to the grid searches) and
# remove the non-feature columns from `df` in place.
# NOTE(review): this cell mutates `df`, so it cannot be re-run without
# re-running the cells above it first.
user_ids = df.pop("UserId")
# "Unnamed: 0" is presumably the index column written into the CSV - TODO confirm.
del df["Unnamed: 0"]

In [None]:
# Target is the `Fake` label; the feature frame is every remaining column.
y = df["Fake"]
X = df.drop(columns=["Fake"])

In [None]:
# Replace the raw user ids by integer codes; the fitted encoder is kept in
# `user_le` so the codes can be mapped back to the original ids.
user_le = LabelEncoder().fit(user_ids)
user_ids = user_le.transform(user_ids)
user_ids

In [None]:
# Integer-encode the two categorical feature columns. The fitted encoders are
# reused further down to encode the test set.
event_le = LabelEncoder()
cat_le = LabelEncoder()

# Fit on the untouched columns of `df` instead of the `X` columns this cell
# overwrites: re-running the cell then refits on the original values rather
# than on already-encoded integers (which would silently break the later
# `transform` calls on the test data). `X` was derived from `df` without
# reordering rows, so the encoded arrays line up positionally.
X["Event"] = event_le.fit_transform(df["Event"])
X["Category"] = cat_le.fit_transform(df["Category"])

# Preview only; the full frame is not needed to check the encoding.
X.head()

In [None]:
# Preview of the feature frame (first rows only instead of the whole frame).
X.head()

In [None]:
# One-hot encode every column of X. With handle_unknown="ignore", a category
# that was not seen during fit is encoded as all zeros for that feature
# instead of raising.
# NOTE(review): `X` is overwritten by its encoded form, so this cell cannot be
# re-run on its own.
ohe = OneHotEncoder(handle_unknown="ignore").fit(X)

X = ohe.transform(X)
X

In [None]:

# Logistic-regression baseline: exhaustive grid search scored by ROC AUC.
# Folds are built with GroupKFold on the user ids, so all rows of one user
# fall into the same fold.
# NOTE(review): `max_iter` is searched like a hyperparameter; the small values
# may stop the solver before convergence - confirm this is intended.
model = LogisticRegression()
p_grid = dict(
    C=[1, 2, 5, 7, 10, 20, 50],
    max_iter=[20, 50, 100, 200, 500],
    class_weight=["balanced", None],
)

gkf = GroupKFold(n_splits=5)
gscv = GridSearchCV(model, p_grid, scoring="roc_auc", cv=gkf)

# fit() returns the search object itself, so `lr` is the fitted GridSearchCV.
gscv.fit(X, y, groups=user_ids)
lr = gscv

lr

In [None]:
# Best mean cross-validated score and the parameter combination that produced it.
(lr.best_score_, lr.best_params_)

In [None]:
# Random-forest candidate: same search protocol as the logistic regression
# (grid search, ROC AUC scoring, GroupKFold on the user ids).
# RandomForestClassifier is imported in the notebook's import cell.
# NOTE(review): the grid only tries forests of 1, 2 or 5 trees - confirm that
# such small ensembles are intended.
model = RandomForestClassifier(random_state=0)
p_grid = {
    'n_estimators': [1, 2, 5],
    'max_depth': [2, 3, 5, 10, 20],
    'criterion': ["gini", "entropy"],
    'class_weight': ["balanced", None],
}

gkf = GroupKFold(n_splits=5)

gscv = GridSearchCV(estimator=model, param_grid=p_grid, cv=gkf, scoring='roc_auc')

# fit() returns the search object itself, so `rf` is the fitted GridSearchCV.
rf = gscv.fit(X, y, groups=user_ids)

rf

In [None]:
# Best mean cross-validated score and the parameter combination that produced it.
(rf.best_score_, rf.best_params_)

In [None]:
# Load the held-out test transactions; the path is relative to the notebook's
# working directory.
test_csv = "../data/fake_users_test.csv"
df_test = pd.read_csv(test_csv)
df_test.head()

In [None]:
# Encode the test columns with the encoders fitted on the training data.
# NOTE(review): LabelEncoder.transform raises on labels it has not seen during
# fit, and the columns are overwritten in place, so running this cell a second
# time fails as well.
for column, encoder in (("Event", event_le), ("Category", cat_le)):
    df_test[column] = encoder.transform(df_test[column])

df_test

In [None]:
# Spot-check 15 random rows of the encoded test frame (unseeded, so the rows
# differ from run to run).
df_test.sample(n=15)

In [None]:
# All test transactions of one hand-picked user, used below as a worked example.
is_test_user = df_test["UserId"].eq("03E7EE785DT")
test_user = df_test[is_test_user]
test_user

In [None]:
# One-hot encode the example user's transactions and show the class
# probabilities the tuned random forest assigns to each of them.
user_features = test_user[["Event", "Category"]]
X_test_user = ohe.transform(user_features)
rf.predict_proba(X_test_user)

In [None]:
# Mean probability of the positive class (column 1) over the example user's
# transactions. Taken directly from the probability column, so the model is
# run only once instead of a second time just to count the rows.
rf.predict_proba(X_test_user)[:, 1].mean()

In [None]:
# Score every test transaction in a single batch: `pred` is the probability of
# the positive class (column 1 of predict_proba). One vectorised call replaces
# the per-user model calls and the cell-by-cell writes, and leaves `pred` as a
# float column instead of an object column.
df_test["pred"] = rf.predict_proba(ohe.transform(df_test[["Event", "Category"]]))[:, 1]

# User-level score: mean transaction probability per user, as {UserId: score}.
output = df_test.groupby("UserId")["pred"].mean().to_dict()

output

In [None]:
# Spot-check 10 random scored rows (unseeded, so the rows differ from run to run).
df_test.sample(n=10)

In [None]:
# One row per user: the first non-null value of every column within each
# user's group, with `pred` blanked so it can be filled with the user-level
# score afterwards.
# NOTE(review): taking the first `Fake` value assumes the label is constant
# per user - confirm.
user_preds = df_test.groupby("UserId").first().assign(pred=None)
user_preds

In [None]:
# Fill in the user-level score by looking up each UserId (the frame's index)
# in `output`. Index.map replaces Series.iteritems(), which was removed in
# pandas 2.0.
user_preds["pred"] = user_preds.index.map(output)

user_preds.sample(20)

In [None]:
# User-level evaluation: a user is predicted fake when the mean transaction
# probability exceeds 0.5. `classification_report` is already imported in the
# notebook's import cell, so it is not imported again here.
user_labels = (user_preds["pred"] > 0.5).astype(int)
print(classification_report(user_preds["Fake"], user_labels))

In [None]:
# The estimator selected by the random-forest grid search.
rf.best_estimator_

In [None]:
# Transaction-level evaluation: each row is predicted fake when its own
# probability exceeds 0.5.
transaction_labels = (df_test["pred"] > 0.50).astype(int)
print(classification_report(df_test["Fake"], transaction_labels))