In [12]:
import pandas as pd

pd.set_option("display.max_columns", None)
import numpy as np
import os
from datetime import datetime, date

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

from xgboost import XGBClassifier

In [13]:
def day_of_week_func(x):
    """
        Translate a single date into its abbreviated weekday name.

        Input: One date value that ``pd.to_datetime`` can parse
               (for example a date string or a Timestamp)

        Return: One of 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'
    """
    # A scalar parsed by pd.to_datetime becomes a Timestamp, which exposes
    # `.dayofweek` directly; the `.dt` accessor only exists on a Series.
    # https://stackoverflow.com/questions/62803633/timestamp-object-has-no-attribute-dt#_=_
    weekday_index = pd.to_datetime(x).dayofweek

    # Timestamp.dayofweek counts Monday as 0 through Sunday as 6.
    names_by_index = dict(enumerate(('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')))
    return names_by_index[weekday_index]

def drop_column(df, columns_name):
    """
        Drop the given column(s) from ``df`` in place, ignoring absent ones.

        Input:
            df: DataFrame to modify. It is mutated in place.
            columns_name: A single column label, or a list / tuple / set /
                frozenset / pandas Index of labels. Labels that are not in
                ``df.columns`` are skipped instead of raising.

        Return: None
    """
    # A tuple can itself be ONE column label (always for MultiIndex columns,
    # or when it literally exists as a label). Only otherwise is it treated
    # as a collection of labels.
    tuple_is_label = isinstance(columns_name, tuple) and (
        isinstance(df.columns, pd.MultiIndex) or columns_name in df.columns
    )
    collection_types = (list, tuple, set, frozenset, pd.Index)
    if isinstance(columns_name, collection_types) and not tuple_is_label:
        requested = list(columns_name)
    else:
        requested = [columns_name]

    # dict.fromkeys removes repeated labels while keeping their order.
    present = [column for column in dict.fromkeys(requested) if column in df.columns]

    # One drop call for all matching columns instead of one per column.
    if present:
        df.drop(columns=present, inplace=True)

In [14]:
# The dataset folder depends on the machine: Windows ('nt') vs. anything else.
dataset_dir = (
    r'D:\Coding_pratice\_Data\GalaxyEducation\ICK\Dataset'
    if os.name == 'nt'
    else '/Users/admin/_Work/Data/icanKID/Dataset'
)

# Alternative input file: 'ICK_Active3thdays.csv'
dataset = '3days_cutoff.csv'
dataset_path = os.path.join(dataset_dir, dataset)
df_dataset = pd.read_csv(dataset_path)

# When the join date is present, keep only its weekday number
# (0 = Monday ... 6 = Sunday) and discard the raw date column.
if 'UserJoinedDate' in df_dataset.columns:
    join_dates = pd.to_datetime(df_dataset['UserJoinedDate'])
    df_dataset['DayJoined'] = join_dates.dt.dayofweek
    drop_column(df_dataset, 'UserJoinedDate')

In [15]:
# Show the column index, then summary statistics covering both numeric and
# non-numeric columns (include='all').
display(
    df_dataset.columns,
    df_dataset.describe(include='all'),
)

Index(['UserID', 'TotalChild', 'No.Female', 'No.Male', 'Age', 'TotalOpentime',
       'TotalUsageTime', 'TotalRecords', 'TotalPayscreentime',
       'OpentimeDiscovery', 'OpentimeEntertainment', 'OpentimeLearn',
       'PaymentScreenTimeDiscovery', 'PaymentScreenTimeEntertainment',
       'PaymentScreenTimeLearn', 'RecordsDiscovery', 'RecordsEntertainment',
       'RecordsLearn', 'UsageTimeDiscovery', 'UsageTimeEntertainment',
       'UsageTimeLearn', 'MostInterest', 'TotalInterestRatio', 'InterestTrend',
       'MostAccumulate', 'TotalAccuRatio', 'AccuTrend', 'Other brand', 'apple',
       'huawei', 'lenovo', 'nokia', 'oppo', 'realme', 'redmi', 'samsung',
       'vivo', 'vsmart', 'xiaomi', 'cutoffDay', 'lateUser', 'Label',
       'DayJoined'],
      dtype='object')

Unnamed: 0,UserID,TotalChild,No.Female,No.Male,Age,TotalOpentime,TotalUsageTime,TotalRecords,TotalPayscreentime,OpentimeDiscovery,OpentimeEntertainment,OpentimeLearn,PaymentScreenTimeDiscovery,PaymentScreenTimeEntertainment,PaymentScreenTimeLearn,RecordsDiscovery,RecordsEntertainment,RecordsLearn,UsageTimeDiscovery,UsageTimeEntertainment,UsageTimeLearn,MostInterest,TotalInterestRatio,InterestTrend,MostAccumulate,TotalAccuRatio,AccuTrend,Other brand,apple,huawei,lenovo,nokia,oppo,realme,redmi,samsung,vivo,vsmart,xiaomi,cutoffDay,lateUser,Label,DayJoined
count,158852,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852,158852.0,158852.0,158852,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0,158852.0
unique,158852,,,,,,,,,,,,,,,,,,,,,3,,,3,,,,,,,,,,,,,,,,,,
top,00002520-3fe9-4d4b-9e1f-e4a72386bf47,,,,,,,,,,,,,,,,,,,,,learn,,,learn,,,,,,,,,,,,,,,,,,
freq,1,,,,,,,,,,,,,,,,,,,,,131034,,,90593,,,,,,,,,,,,,,,,,,
mean,,1.083883,0.422255,0.727136,5.211889,2.542039,25.18016,23.517576,3.971716,0.616077,0.665468,1.260494,1.131563,1.008763,1.83139,1.804038,2.854084,18.859454,4.016547,5.388887,15.774726,,0.917723,-0.054575,,0.082456,0.052043,0.022392,0.292486,0.010588,0.006346,0.005622,0.159513,0.024935,0.039534,0.28949,0.048064,0.020384,0.024073,5.627716,0.110738,0.065646,3.173677
std,,0.354592,0.617174,0.72558,2.230672,5.093853,35.704112,30.287767,11.304519,1.626864,1.75631,2.282891,3.896933,3.632862,4.567992,3.898808,7.872758,26.264017,9.453013,16.981067,24.015362,,0.143138,0.357552,,0.439583,0.30247,0.147955,0.454906,0.102354,0.079406,0.074766,0.366155,0.155928,0.194861,0.453527,0.213901,0.14131,0.153275,18.890023,0.313809,0.247663,2.045592
min,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,-1.0,,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
25%,,1.0,0.0,0.0,4.0,0.0,3.82,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,2.2,,0.895483,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,1.0
50%,,1.0,0.0,1.0,5.0,1.0,12.97,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,8.18,,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0
75%,,1.0,1.0,1.0,6.0,3.0,31.93,30.0,3.0,0.0,1.0,2.0,0.0,0.0,2.0,2.0,2.0,24.0,3.35,2.37,19.62,,1.0,0.0,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0


In [16]:
# One-hot encode the categorical columns. get_dummies replaces the source
# columns with indicator columns, so only the ones still present are encoded:
# re-running this cell is then a no-op instead of a KeyError.
categorical_columns = [
    column
    for column in ('MostInterest', 'MostAccumulate')
    if column in df_dataset.columns
]
if categorical_columns:
    df_dataset = pd.get_dummies(df_dataset, columns=categorical_columns)

In [17]:
# Preview the first five rows to check the new indicator columns.
df_dataset.head(n=5)

Unnamed: 0,UserID,TotalChild,No.Female,No.Male,Age,TotalOpentime,TotalUsageTime,TotalRecords,TotalPayscreentime,OpentimeDiscovery,OpentimeEntertainment,OpentimeLearn,PaymentScreenTimeDiscovery,PaymentScreenTimeEntertainment,PaymentScreenTimeLearn,RecordsDiscovery,RecordsEntertainment,RecordsLearn,UsageTimeDiscovery,UsageTimeEntertainment,UsageTimeLearn,TotalInterestRatio,InterestTrend,TotalAccuRatio,AccuTrend,Other brand,apple,huawei,lenovo,nokia,oppo,realme,redmi,samsung,vivo,vsmart,xiaomi,cutoffDay,lateUser,Label,DayJoined,MostInterest_discovery,MostInterest_entertainment,MostInterest_learn,MostAccumulate_discovery,MostAccumulate_entertainment,MostAccumulate_learn
0,00002520-3fe9-4d4b-9e1f-e4a72386bf47,1,0.0,1.0,5.0,1.0,17.68,18.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,17.68,1.0,0.0,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,0,0,0,0,1,0,0,1
1,00007f9f-be36-4a0d-a2a8-167231588625,1,0.0,1.0,13.0,0.0,6.85,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,6.85,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,1,0,0,0,3,0,0,1,1,0,0,1,0,0
2,00015787-1ed9-45b9-9c5f-2613e2ca8409,1,1.0,0.0,7.0,13.0,133.59,118.0,67.0,6.0,1.0,6.0,29.0,5.0,33.0,30.0,16.0,72.0,57.4,10.03,66.16,0.57244,-0.385528,-0.031635,-0.397959,0,0,0,0,0,0,0,0,1,0,0,0,3,0,0,5,0,0,1,0,1,0
3,0001f256-edcf-4980-b84b-85d85745fd0f,1,0.0,1.0,11.0,0.0,42.56,61.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,60.0,0.63,0.0,41.93,0.999531,0.486409,1.0,0.486409,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,3,0,0,1,0,0,1
4,000234a9-a233-4476-9023-9b83f331b7f6,1,0.0,1.0,5.0,0.0,14.59,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,26.0,0.27,0.0,14.32,0.999275,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,3,0,0,6,0,0,1,1,0,0


In [7]:
"""
    Clean up dataframe
"""
# df_dataset = pd.concat([df_dataset, dummy], axis=1)
# dummy = pd.get_dummies(df_dataset['firstPaymentType'], prefix='paymentType')

'\n    Clean up dataframe\n'

In [18]:
"""
    Drop unnecessary columns and split data into train and validation
"""
User_ID = df_dataset[['UserID']]
drop_column(df_dataset, ['UserID','firstPaymentType', 'gapday'])
X = df_dataset.drop(columns=['Label'])
y = df_dataset['Label']

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0, test_size=0.3)

In [19]:
# Rows with at least one missing feature value; an empty frame means X has no NaNs.
X.loc[X.isna().any(axis="columns")]

Unnamed: 0,TotalChild,No.Female,No.Male,Age,TotalOpentime,TotalUsageTime,TotalRecords,TotalPayscreentime,OpentimeDiscovery,OpentimeEntertainment,OpentimeLearn,PaymentScreenTimeDiscovery,PaymentScreenTimeEntertainment,PaymentScreenTimeLearn,RecordsDiscovery,RecordsEntertainment,RecordsLearn,UsageTimeDiscovery,UsageTimeEntertainment,UsageTimeLearn,TotalInterestRatio,InterestTrend,TotalAccuRatio,AccuTrend,Other brand,apple,huawei,lenovo,nokia,oppo,realme,redmi,samsung,vivo,vsmart,xiaomi,cutoffDay,lateUser,DayJoined,MostInterest_discovery,MostInterest_entertainment,MostInterest_learn,MostAccumulate_discovery,MostAccumulate_entertainment,MostAccumulate_learn


In [20]:
# Class balance of the training target: (distinct labels, count of each label).
np.unique(ar=y_train, return_counts=True)

(array([0, 1]), array([103919,   7277]))

In [21]:
def evaluate_func(y_true, y_pred):
    """
        Print the classification report for a set of predictions.

        Input:
            y_true: Ground-truth labels.
            y_pred: Predicted labels, aligned with ``y_true``.

        Return: None (the report is written to stdout)
    """
    report_text = classification_report(y_true, y_pred)
    print(report_text)

In [22]:
from sklearn.model_selection import GridSearchCV
def GridSearchFunc(model, param_grid, training_data, validate_data,
                   scoring='f1', cv=5, n_jobs=-1, verbose=2):
    """
        Tune ``model`` with an exhaustive grid search and predict on the
        validation features with the best estimator found.

        Input:
            model: Estimator to tune.
            param_grid: Hyper-parameter grid passed to GridSearchCV.
            training_data: Tuple ``(X_train, y_train)`` used for the search.
            validate_data: Tuple ``(X_val, y_val)``. Only ``X_val`` is used
                here; scoring the predictions is left to the caller.
            scoring: Metric optimised by the search (default 'f1').
            cv: Cross-validation setting passed to GridSearchCV (default 5).
            n_jobs: Number of parallel jobs; -1 uses every available CPU
                core, so lower it to keep the machine responsive.
            verbose: Verbosity level passed to GridSearchCV (default 2).

        Return: Tuple ``(y_pred, best_cls_model)`` with the predictions for
                ``X_val`` and the best estimator found by the search.
    """
    X_train, y_train = training_data
    # Unpacking still checks that a 2-tuple was passed; the labels are unused.
    X_val, _y_val = validate_data

    # Create a GridSearchCV object
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
        verbose=verbose,
    )

    # Fit the GridSearchCV object to the data
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters
    print(grid_search.best_params_)

    # Use the best estimator to make predictions
    best_cls_model = grid_search.best_estimator_
    y_pred = best_cls_model.predict(X_val)

    return y_pred, best_cls_model

In [23]:
# Set to True to run the hyper-parameter grid search instead of fitting the
# fixed configuration below.
gridsearch = False

# Previously noted values (presumably the best parameters printed by an
# earlier grid search; TODO confirm):
#     'max_depth': 20, 'max_samples': 0.8, 'min_samples_split': 6, 'n_estimators': 500
RF_parameters = {
    'criterion': 'entropy',
    'max_depth': 20,
    'max_samples': 0.8,
    'min_samples_split': 6,
    'n_estimators': 500,
    # Reweight classes inversely to their frequency; the target is heavily imbalanced.
    'class_weight': 'balanced',
    # Fixed seed (same value as the train/validation split) so the forest,
    # and therefore the reported metrics, are reproducible across runs.
    'random_state': 0,
}
cls_model = RandomForestClassifier(**RF_parameters)

if gridsearch:
    # Define the parameter grid
    param_grid = {
        'n_estimators': [500, 1000, 1500],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 6, 12],
        'max_samples': [0.6, 0.7, 0.8]
    }
    y_pred, cls_model = GridSearchFunc(
        cls_model,
        param_grid,
        (X_train, y_train),
        (X_val, y_val)
    )
else:
    cls_model.fit(X_train, y_train)
    y_pred = cls_model.predict(X_val)
evaluate_func(y_val, y_pred)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     44505
           1       0.66      0.33      0.44      3151

    accuracy                           0.94     47656
   macro avg       0.80      0.66      0.71     47656
weighted avg       0.93      0.94      0.94     47656

