In [130]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

In [131]:
import pandas as pd
import numpy as np

import warnings
# Silences ALL warnings for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from pandas / scikit-learn
# that would flag API usage further down — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [132]:
# Widen pandas' display limits so wide/long frames are shown without truncation.
for option_name in ('display.max_columns', 'display.width', 'display.max_rows'):
    pd.set_option(option_name, 500)

In [133]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer

from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearnex.ensemble import RandomForestClassifier
import lightgbm
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score

In [134]:
# Load the training set; 'created_at' and 'signup_date' are parsed as datetimes.
# url = 'https://raw.githubusercontent.com/chethanbr86/my_DataScience_MachineLearning_projects/main/AV_Jobathon_June2022/train_wn75k28.csv'
train = pd.read_csv('/kaggle/input/jobathon-june-2022/train_wn75k28.csv', parse_dates=['created_at','signup_date'])
train.head(2)

In [135]:
# Load the test set with the same date parsing as the training set.
# NOTE(review): the commented URL below still points at the *train* CSV (copy-paste leftover).
# url = 'https://raw.githubusercontent.com/chethanbr86/my_DataScience_MachineLearning_projects/main/AV_Jobathon_June2022/train_wn75k28.csv'
test = pd.read_csv('/kaggle/input/jobathon-june-2022/test_Wf7sxXF.csv', parse_dates=['created_at','signup_date'])
test.head(2)

In [136]:
# Load the sample submission file (read here for reference only).
# url = 'https://raw.githubusercontent.com/chethanbr86/my_DataScience_MachineLearning_projects/main/AV_Jobathon_June2022/sample_submission_2zvVjBu.csv'
sample = pd.read_csv('/kaggle/input/jobathon-june-2022/sample_submission_2zvVjBu.csv')
sample.head(2)

In [137]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Integer columns are narrowed to the first of int8 / int16 / int32 / int64
    whose range contains the column's minimum and maximum (bounds inclusive,
    so a column spanning exactly -128..127 becomes int8).  float32 / float64
    columns are stored as float32 when their range fits (this may lose
    precision), otherwise as float64.  float16 columns are left untouched so
    they are never widened.  Columns of any other dtype are not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Pick the narrowest integer type that holds the observed range.
            # An empty column has NaN min/max, fails every comparison and is
            # therefore left as it is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif str(col_type) != 'float16':
            limits = np.finfo(np.float32)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out-of-range, infinite or all-NaN columns keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        # Guard against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))

    return df

In [138]:
# Downcast the numeric columns of the training frame to save memory.
train = reduce_mem_usage(train)

In [139]:
# Downcast the numeric columns of the test frame to save memory.
test = reduce_mem_usage(test)

In [140]:
def details(df):
    """Summarise every column of ``df``.

    Returns a DataFrame indexed by column name with four columns: the number
    of missing values, the percentage of missing values, the dtype and the
    number of distinct non-null values.
    """
    missing = df.isnull().sum()
    summary_parts = [
        missing,
        100* (missing/len(df)),
        df.dtypes,
        df.nunique(),
    ]
    headers = ['Missing Values', '% of Total Missing Values', 'Data_Type', 'Unique values']
    # Passing `keys` names the concatenated columns directly.
    return pd.concat(summary_parts, axis=1, keys=headers)

In [141]:
# Missing values, dtypes and cardinality per training column.
details(train)

In [142]:
# Missing values, dtypes and cardinality per test column.
details(test)

In [143]:
# Class balance of the target, with missing values counted as their own class.
train['buy'].value_counts(dropna=False)

In [144]:
# Share of each target class in the training data.
# value_counts() orders the classes by frequency, so the legend labels are
# taken from that same index instead of being hard-coded; a fixed ["1", "0"]
# legend mislabels the wedges whenever class 0 is the more frequent one.
buy_counts = train['buy'].value_counts()
buy_counts.plot(kind='pie', figsize=(6, 6), autopct='%1.2f%%')
plt.title("buy")
plt.legend([str(label) for label in buy_counts.index])
plt.show()

This is an imbalanced dataset where 0 means not buying and 1 means buying.

In [145]:
# Names of the columns stored with object dtype.
col_cat = [name for name, kind in train.dtypes.items() if kind == object]
col_cat

In [146]:
# Non-object columns, minus the identifier, the two date columns and the target.
col_num = [name for name, kind in train.dtypes.items() if kind != object]
for excluded in ('id', 'created_at', 'signup_date', 'buy'):
    col_num.remove(excluded)

In [147]:
# Show the feature columns selected above.
col_num

In [148]:
def catcol(df, col):
    """Print the value counts (NaN included) of each column of ``df`` named in ``col``."""
    for name in col:
        counts = df[name].value_counts(dropna=False)
        print(
            name,
            '--------------------------',
            counts,
            '=====================================================',
            sep='\n',
        )

In [149]:
# Value counts for every selected feature column of the training set.
catcol(train,col_num)

In [150]:
# Summary statistics with extra tail percentiles to expose extreme values.
train[col_num].describe(percentiles=(.01,.05,.25,.5,.75,.9,.95,.99))

Outliers

In [151]:
def outlier(df, num):
    """Draw a box plot and a KDE plot for each column of ``df`` listed in ``num``.

    Two figures are created: the first holds one box plot per column, the
    second one KDE curve per column.  Plots are laid out three per row; the
    grid has at least five rows and grows when more than 15 columns are
    passed, so a long column list no longer overflows a fixed 5x3 grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data to plot.
    num : iterable of str
        Column names to plot.
    """
    columns = list(num)  # materialise once: the names are walked twice
    n_cols = 3
    n_rows = max(5, -(-len(columns) // n_cols))  # ceiling division

    plt.figure(figsize=(15,30))
    for position, name in enumerate(columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        # Keyword form: the data is explicitly assigned to the x axis.
        sns.boxplot(x=df[name])

    plt.figure(figsize=(15,30))
    for position, name in enumerate(columns, start=1):
        plt.subplot(n_rows, n_cols, position)
        sns.kdeplot(df[name], color='orange')

In [152]:
# Box and KDE plots of the selected training features.
outlier(train,col_num)

Handling null values in the numerical column

In [153]:
# Purchase rate for rows with vs. without a recorded 'products_purchased'.
# Printed explicitly: only the last expression of a cell is displayed, so
# this result was previously computed and thrown away.
print(train.groupby(train['products_purchased'].isnull())['buy'].mean())

# Replace the column by a presence indicator (0 = missing, 1 = present).
# The indicator is built for both frames so that train and test keep the same
# feature set; `columns=` replaces the positional axis argument, which newer
# pandas versions no longer accept.
for frame in (train, test):
    frame['products_purchased_buy'] = np.where(frame['products_purchased'].isnull(), 0, 1)
    frame.drop(columns=['products_purchased'], inplace=True)
train

In [154]:
# Distribution of the presence indicator created above.
train['products_purchased_buy'].value_counts()

Handling null values in the date column by replacing them with the median date

In [155]:
# Fill missing sign-up dates in the training set with the median sign-up date.
# fillna() is the dedicated API for missing values and handles NaT in
# datetime columns directly.
median_impute_date = train['signup_date'].median()
train['signup_date'] = train['signup_date'].fillna(median_impute_date)
train

In [156]:
# Fill missing sign-up dates in the test set with the TRAINING median, so the
# test data goes through exactly the preprocessing learned from the training
# data instead of a statistic computed on the test set itself.
median_impute_date = train['signup_date'].median()
test['signup_date'] = test['signup_date'].fillna(median_impute_date)
test

Handling Dates

In [157]:
# Calendar features from the 'created_at' timestamp (training set).
train_created = train['created_at'].dt
train['created_week_day'] = train_created.day_name()
# train['created_day_no'] = train_created.day
train['created_month_name'] = train_created.month_name()
train['created_year'] = train_created.year

In [158]:
# Calendar features from the 'signup_date' timestamp (training set).
train_signup = train['signup_date'].dt
train['signup_week_day'] = train_signup.day_name()
# train['signup_day_no'] = train_signup.day
train['signup_month_name'] = train_signup.month_name()
train['signup_year'] = train_signup.year

In [159]:
# Calendar features from the 'created_at' timestamp (test set).
test_created = test['created_at'].dt
test['created_week_day'] = test_created.day_name()
# test['created_day_no'] = test_created.day
test['created_month_name'] = test_created.month_name()
test['created_year'] = test_created.year

In [160]:
# Calendar features from the 'signup_date' timestamp (test set).
test_signup = test['signup_date'].dt
test['signup_week_day'] = test_signup.day_name()
# test['signup_day_no'] = test_signup.day
test['signup_month_name'] = test_signup.month_name()
test['signup_year'] = test_signup.year

In [161]:
# Check the new date-derived columns on the training set.
train.head(2)

In [162]:
# Check the new date-derived columns on the test set.
test.head(2)

One-hot encoding (OHE) for all categorical columns

In [163]:
# Object-dtype columns of the training set (the inputs to the one-hot encoder).
col_cat_train = [name for name, kind in train.dtypes.items() if kind == object]
print(col_cat_train)

In [164]:
# Object-dtype columns of the test set.
col_cat_test = [name for name, kind in test.dtypes.items() if kind == object]
print(col_cat_test)

In [165]:
# Fit the one-hot encoder on the training categoricals and append the encoded
# columns to the frame.
# handle_unknown='ignore' makes a category that appears only in the test set
# encode as all zeros instead of raising when the encoder is applied there.
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe_df1 = pd.DataFrame(ohe.fit_transform(train[col_cat_train]),columns=ohe.get_feature_names())
# Align the indexes so the column-wise concat matches rows one to one.
train.index = ohe_df1.index
# `axis=1` / `columns=` are passed by keyword: newer pandas versions no
# longer accept the axis as a positional argument.
train = pd.concat([train, ohe_df1], axis=1)
# Drop the raw date columns and the intermediate calendar columns.
train.drop(columns=['created_at','signup_date','created_week_day','created_month_name','created_year','signup_week_day','signup_month_name','signup_year'], inplace=True)

In [166]:
# Apply the encoder fitted on the training set to the test set (no refit).
# The columns are selected with the training column list so they reach the
# encoder in the order it was fitted on, rather than relying on a separately
# derived test list happening to match.
ohe_df2 = pd.DataFrame(ohe.transform(test[col_cat_train]),columns=ohe.get_feature_names())
# Align the indexes so the column-wise concat matches rows one to one.
test.index = ohe_df2.index
# `axis=1` / `columns=` are passed by keyword: newer pandas versions no
# longer accept the axis as a positional argument.
test = pd.concat([test, ohe_df2], axis=1)
test.drop(columns=['created_at','signup_date','created_week_day','created_month_name','created_year','signup_week_day','signup_month_name','signup_year'], inplace=True)

In [167]:
# Training frame after one-hot encoding.
train

In [168]:
# Test frame after one-hot encoding.
test

Removing columns on the basis of the correlation heatmap

In [169]:
# Recompute the feature list after encoding: every non-object column except
# the identifier and the target.
col_num = [name for name, kind in train.dtypes.items() if kind != object]
for excluded in ('id', 'buy'):
    col_num.remove(excluded)
print(col_num)

In [170]:
# Annotated heatmap of the pairwise feature correlations.
# The target is not part of col_num, so it does not appear in the heatmap.
plt.figure(figsize=(20, 10))
feature_corr = train[col_num].corr()
sns.heatmap(feature_corr, annot=True)
plt.show()

In [171]:
# Absolute pairwise correlations between the features.
corr_matrix = train[col_num].corr().abs()

# Keep only the strict upper triangle so each pair is examined once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Features whose correlation with an earlier feature exceeds 0.8.
# NOTE(review): this list is only displayed; no visible cell drops these columns.
to_drop = [column for column in upper.columns if any(upper[column] > 0.8)]
to_drop

Removing columns on the basis of the chi-squared (chi2) test

In [172]:
# Number of distinct values (NaN counted as a value) per feature column.
# The table is built in a single step: growing a DataFrame row by row with
# DataFrame.append is quadratic, and that method was removed in pandas 2.0.
df = pd.DataFrame({
    'Column_Name': col_num,
    'Count': [int(train[cat].nunique(dropna=False)) for cat in col_num],
})
# Distinct cardinalities, ordered by how many columns share each one.
columns_unique_value = np.array(df.Count.value_counts().index)
columns_unique_value

In [173]:
# Within each group of columns that share the same number of distinct values,
# run a chi-squared test of every column against the columns that follow it,
# and mark the later column for removal when the p-value is <= 0.05.
columns_to_drop_cat = set()   # names of the columns flagged for removal
correlated_columns = dict()   # reference column -> columns flagged because of it
for unique_value_count in columns_unique_value:
    if unique_value_count <= 1:
        continue  # constant columns are not compared

    categorical_columns = df.loc[df.Count==unique_value_count,'Column_Name']
    categorical_columns = categorical_columns.reset_index(drop=True)
    columns_length=len(categorical_columns)
    for col in range(columns_length-1):
        column_to_compare = categorical_columns[col]
        # A column that is already flagged is not used as a reference; check
        # this before running the test so no chi2 call is wasted.
        if column_to_compare in columns_to_drop_cat:
            continue

        columns_compare_against = categorical_columns[(col+1):columns_length]
        # chi2 returns (statistics, p_values); the reference column is passed
        # as the target argument.
        chi_scores = chi2(train[columns_compare_against],train[column_to_compare])
        columns_to_be_dropped = [i for i in range(len(columns_compare_against)) if chi_scores[1][i]<=0.05]
        columns_to_drop_array = np.array(columns_compare_against)[columns_to_be_dropped]
        correlated_columns[column_to_compare]=columns_to_drop_array
        columns_to_drop_cat.update(columns_to_drop_array)

In [174]:
# How many columns the chi2 screening flagged, and which ones.
print(len(columns_to_drop_cat))
print(columns_to_drop_cat)

In [175]:
# Remove the columns flagged by the chi2 screening from the training frame.
# The labels are passed as a list through `columns=`: newer pandas versions
# no longer accept the axis as a positional argument.
train.drop(columns=list(columns_to_drop_cat), inplace=True)
train

In [176]:
# Columns that survived the feature screening.
train.columns

In [177]:
# Test features: exactly the training columns that remain after screening,
# minus the identifier and the target.  Deriving the list from `train` keeps
# the two frames in sync when the screening result changes, instead of
# relying on a hand-copied list of names.
# .copy() gives X_test its own data, so the in-place scaling applied to it
# later does not write into a slice of `test`.
X_test = test[train.columns.drop(['id', 'buy'])].copy()
X_test

Train Test Split

In [178]:
# Features / target, then a 70/30 train-validation split.
# stratify=y keeps the class ratio of the imbalanced target identical in both
# parts, so the validation metrics are computed on a representative share of
# buyers.
X,y = train.drop(['id','buy'],axis=1),train['buy']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.7, random_state=23, stratify=y)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

Power Transformer

In [179]:
# Non-object columns of the training split (the inputs to the power transform).
col_num_X_train = [name for name, kind in X_train.dtypes.items() if kind != object]
col_num_X_train

In [180]:
# Non-object columns of the validation split.
col_num_X_val = [name for name, kind in X_valid.dtypes.items() if kind != object]
col_num_X_val

In [181]:
# Fit the power transform on the training split only, then apply the fitted
# transform to the validation split so nothing is learned from validation data.
# NOTE(review): the one-hot indicator columns are non-object too, so they are
# transformed along with the continuous features — confirm this is intended.
pt = PowerTransformer(copy=False)
X_train[col_num_X_train] = pt.fit_transform(X_train[col_num_X_train])
X_valid[col_num_X_val] = pt.transform(X_valid[col_num_X_val])

In [182]:
# Module-level accumulators filled by the model_fit_evaluation* helpers:
# one entry per evaluated model, later assembled into a comparison table.
(
    model_list,
    resample,
    roc_train_list,
    roc_valid_list,
    auc_train_list,
    auc_valid_list,
    f1_score_train_list,
    f1_score_val_list,
) = ([] for _ in range(8))

In [183]:
# Baseline estimators with default hyper-parameters, all seeded with 23.
model_LR = LogisticRegression(random_state=23)
model_DT = DecisionTreeClassifier(random_state=23)
model_RF = RandomForestClassifier(bootstrap=True, oob_score=True, random_state=23)
model_LGBM = LGBMClassifier(random_state=23, objective='binary')
# The XGBoost model is configured to train and predict on the GPU.
model_XGB = XGBClassifier(
    use_label_encoder=False,
    predictor='gpu_predictor',
    tree_method='gpu_hist',
    random_state=23,
)
model_GB = GradientBoostingClassifier(random_state=23)

In [184]:
# Hyper-parameter search spaces, one dictionary per estimator.

# Logistic regression.
# C must be strictly positive, so the smallest candidate is 0.01 rather than 0
# (C=0 makes every fit that samples it fail).
# NOTE(review): many penalty/solver pairs in this grid are mutually
# incompatible and will produce failed fits during the search.
params_LR = {
    'penalty': ['none','l1','l2','elasticnet'],
    'C': [0.01, 0.2,0.4,0.6,0.8,1],
    'solver' : ['newton-cg', 'sag','lbfgs', 'liblinear', 'saga'], 
    'max_iter': [5, 10, 20, 50, 100, 200],
    'l1_ratio': [0, 0.2,0.4,0.6,0.8,1]
} 

# Decision tree.
params_DT = {
    'max_depth': [5, 10, 20, 50, 100, 200],
    'min_samples_leaf': [5, 10, 20, 50, 100, 200],
    'min_samples_split' : [5, 10, 20, 50, 100, 200]
} 

# Random forest.
# NOTE(review): integer max_samples values are absolute row counts per tree
# (5 to 200 rows here) — confirm fractions were not intended.
params_RF = {    
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, 50, 100, 200],
    'min_samples_leaf': [5, 10, 20, 50, 100, 200],
    'min_samples_split' : [5, 10, 20, 50, 100, 200],
    'max_features': ['sqrt','log2'],
    'max_samples': [5, 10, 20, 50, 100, 200]
}

# LightGBM.
params_LGBM = {
    'n_estimators': [50, 100, 200],
    'boosting_type': ['gbdt','dart'],
    'max_depth': [5, 10, 20, 50, 100, 200], 
    'min_child_samples': [5, 10, 20, 50, 100, 200],
    'subsample': [0.2,0.4,0.6,0.8,1],        
    'learning_rate': np.linspace(0.01, 0.3, 10)
}

# XGBoost.
params_XGB = {
    'n_estimators': [50, 100, 200],
    'max_depth': np.arange(1,31,3),
    'min_split_loss': [5, 10, 20, 50, 100, 200],
    'sampling_method': ['uniform','gradient_based'],
    'subsample': [0.2, 0.4, 0.5, 0.6, 0.8, 1],
    'learning_rate': np.linspace(0.01, 0.3, 10)
}

# Gradient boosting.
params_GB = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, 50, 100, 200],
    'learning_rate': np.linspace(0.01, 0.3, 10)
}


In [185]:
def model_fit_evaluation1(model_model, X_train, y_train, X_valid, y_valid, algo=None, sampling=None):
    """Fit ``model_model`` as given and report train / validation metrics.

    The estimator is fitted on the training data without any hyper-parameter
    search.  Weighted F1, ROC-AUC on hard predictions, ROC-AUC on the
    positive-class probability, confusion matrices and classification reports
    are printed for both splits, followed by the ROC curves of the fitted
    model.  The scalar metrics are appended to the module-level result lists
    (``model_list``, ``resample``, ``roc_*_list``, ``auc_*_list``,
    ``f1_score_*_list``).

    Parameters
    ----------
    model_model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train : training features and target.
    X_valid, y_valid : validation features and target.
    algo : str, optional
        Label stored in ``model_list``.
    sampling : str, optional
        Label stored in ``resample``.

    Returns
    -------
    None
    """
    rcv_best = model_model.fit(X_train, y_train)

    y_train_prob = rcv_best.predict_proba(X_train)
    y_train_pred = rcv_best.predict(X_train)
    y_val_prob = rcv_best.predict_proba(X_valid)
    y_val_pred = rcv_best.predict(X_valid)

    matrix_train = confusion_matrix(y_train, y_train_pred)
    matrix_val = confusion_matrix(y_valid, y_val_pred)
    report_train = classification_report(y_train, y_train_pred)
    report_val = classification_report(y_valid, y_val_pred)
    # roc_* is computed from the hard class predictions, auc_* from the
    # predicted probability of the positive class.
    roc_train = roc_auc_score(y_train, y_train_pred)
    roc_val = roc_auc_score(y_valid, y_val_pred)
    auc_train = roc_auc_score(y_train, y_train_prob[:,1])
    auc_val = roc_auc_score(y_valid, y_val_prob[:,1])
    f1score_train = f1_score(y_train, y_train_pred, average='weighted')
    f1score_val = f1_score(y_valid, y_val_pred, average='weighted')

    sections = [
        ('F1_score for train', f1score_train),
        ('F1_score for val', f1score_val),
        ('AUCROC for train', roc_train),
        ('AUCROC for Val', roc_val),
        ('Confusion Matrix for train', matrix_train),
        ('Confusion Matrix for val', matrix_val),
        ('Classification Report for train', report_train),
        ('Classification Report for val', report_val),
        ('AUC-ROC prob for train', auc_train),
        ('AUC-ROC prob for val', auc_val),
    ]
    for title, value in sections:
        print(title)
        print('='*60)
        print(value,"\n")

    # Plot the curve of the model fitted in this call; plotting the global
    # model_LR here would show the same logistic-regression curve for every
    # estimator.
    print('Roc-Auc-Curve for Train set')
    print('='*60)
    print(plot_roc_curve(rcv_best, X_train, y_train),'\n')
    print('Roc-Auc-Curve for Val set')
    print('='*60)
    print(plot_roc_curve(rcv_best, X_valid, y_valid),'\n')

    model_list.append(algo)
    resample.append(sampling)
    roc_train_list.append(roc_train)
    roc_valid_list.append(roc_val)
    auc_train_list.append(auc_train)
    auc_valid_list.append(auc_val)
    f1_score_train_list.append(f1score_train)
    f1_score_val_list.append(f1score_val)

In [186]:
def model_fit_evaluation2(model_model, params, X_train, y_train, X_valid, y_valid, algo=None, sampling=None):
    """Tune ``model_model`` with a randomized search and report metrics.

    A ``RandomizedSearchCV`` (3-fold, scored by weighted F1, seeded with 23)
    is run over ``params`` on the training data.  The best estimator, its
    parameters and its cross-validated score are printed, then the same
    train / validation report as the other evaluation helpers: weighted F1,
    ROC-AUC on hard predictions, ROC-AUC on the positive-class probability,
    confusion matrices, classification reports and the ROC curves of the best
    estimator.  The scalar metrics are appended to the module-level result
    lists.

    Parameters
    ----------
    model_model : estimator
        Classifier to tune.
    params : dict
        Parameter distributions passed to ``RandomizedSearchCV``.
    X_train, y_train : training features and target.
    X_valid, y_valid : validation features and target.
    algo : str, optional
        Label stored in ``model_list``.
    sampling : str, optional
        Label stored in ``resample``.

    Returns
    -------
    None
    """
    rcv = RandomizedSearchCV(model_model, params, cv=3, scoring='f1_weighted', n_jobs=-1, verbose=1, random_state=23)
    rcv.fit(X_train, y_train)
    rcv_best = rcv.best_estimator_

    print('\n')
    print('best estimator : ', rcv_best)
    print('best parameters: ', rcv.best_params_)
    print('best score: ', rcv.best_score_)
    print('\n')

    y_train_prob = rcv_best.predict_proba(X_train)
    y_train_pred = rcv_best.predict(X_train)
    y_val_prob = rcv_best.predict_proba(X_valid)
    y_val_pred = rcv_best.predict(X_valid)

    matrix_train = confusion_matrix(y_train, y_train_pred)
    matrix_val = confusion_matrix(y_valid, y_val_pred)
    report_train = classification_report(y_train, y_train_pred)
    report_val = classification_report(y_valid, y_val_pred)
    # roc_* is computed from the hard class predictions, auc_* from the
    # predicted probability of the positive class.
    roc_train = roc_auc_score(y_train, y_train_pred)
    roc_val = roc_auc_score(y_valid, y_val_pred)
    auc_train = roc_auc_score(y_train, y_train_prob[:,1])
    auc_val = roc_auc_score(y_valid, y_val_prob[:,1])
    f1score_train = f1_score(y_train, y_train_pred, average='weighted')
    f1score_val = f1_score(y_valid, y_val_pred, average='weighted')

    sections = [
        ('F1_score for train', f1score_train),
        ('F1_score for val', f1score_val),
        ('AUCROC for train', roc_train),
        ('AUCROC for Val', roc_val),
        ('Confusion Matrix for train', matrix_train),
        ('Confusion Matrix for val', matrix_val),
        ('Classification Report for train', report_train),
        ('Classification Report for val', report_val),
        ('AUC-ROC prob for train', auc_train),
        ('AUC-ROC prob for val', auc_val),
    ]
    for title, value in sections:
        print(title)
        print('='*60)
        print(value,"\n")

    # Plot the curve of the tuned estimator; plotting the global model_LR
    # here would show the untuned logistic regression for every model.
    print('Roc-Auc-Curve for Train set')
    print('='*60)
    print(plot_roc_curve(rcv_best, X_train, y_train),'\n')
    print('Roc-Auc-Curve for Val set')
    print('='*60)
    print(plot_roc_curve(rcv_best, X_valid, y_valid),'\n')

    model_list.append(algo)
    resample.append(sampling)
    roc_train_list.append(roc_train)
    roc_valid_list.append(roc_val)
    auc_train_list.append(auc_train)
    auc_valid_list.append(auc_val)
    f1_score_train_list.append(f1score_train)
    f1_score_val_list.append(f1score_val)

In [187]:
def model_fit_evaluation3(model_model, params, X_train, y_train, X_valid, y_valid, algo=None, sampling=None):
    """Tune ``model_model`` with a Bayesian search and report metrics.

    A ``BayesSearchCV`` (3-fold, scored by weighted F1, seeded with 23) is run
    over ``params`` on the training data.  The best estimator, its parameters
    and its cross-validated score are printed, then the same train /
    validation report as the other evaluation helpers: weighted F1, ROC-AUC
    on hard predictions, ROC-AUC on the positive-class probability, confusion
    matrices, classification reports and the ROC curves of the best
    estimator.  The scalar metrics are appended to the module-level result
    lists.

    Parameters
    ----------
    model_model : estimator
        Classifier to tune.
    params : dict
        Search space passed to ``BayesSearchCV``.
    X_train, y_train : training features and target.
    X_valid, y_valid : validation features and target.
    algo : str, optional
        Label stored in ``model_list``.
    sampling : str, optional
        Label stored in ``resample``.

    Returns
    -------
    None
    """
    rcv = BayesSearchCV(model_model, params, cv=3, scoring='f1_weighted', n_jobs=-1, verbose=1, random_state=23)
    rcv.fit(X_train, y_train)
    rcv_best = rcv.best_estimator_

    print('\n')
    print('best estimator : ', rcv_best)
    print('best parameters: ', rcv.best_params_)
    print('best score: ', rcv.best_score_)
    print('\n')

    y_train_prob = rcv_best.predict_proba(X_train)
    y_train_pred = rcv_best.predict(X_train)
    y_val_prob = rcv_best.predict_proba(X_valid)
    y_val_pred = rcv_best.predict(X_valid)

    matrix_train = confusion_matrix(y_train, y_train_pred)
    matrix_val = confusion_matrix(y_valid, y_val_pred)
    report_train = classification_report(y_train, y_train_pred)
    report_val = classification_report(y_valid, y_val_pred)
    # roc_* is computed from the hard class predictions, auc_* from the
    # predicted probability of the positive class.
    roc_train = roc_auc_score(y_train, y_train_pred)
    roc_val = roc_auc_score(y_valid, y_val_pred)
    auc_train = roc_auc_score(y_train, y_train_prob[:,1])
    auc_val = roc_auc_score(y_valid, y_val_prob[:,1])
    f1score_train = f1_score(y_train, y_train_pred, average='weighted')
    f1score_val = f1_score(y_valid, y_val_pred, average='weighted')

    sections = [
        ('F1_score for train', f1score_train),
        ('F1_score for val', f1score_val),
        ('AUCROC for train', roc_train),
        ('AUCROC for Val', roc_val),
        ('Confusion Matrix for train', matrix_train),
        ('Confusion Matrix for val', matrix_val),
        ('Classification Report for train', report_train),
        ('Classification Report for val', report_val),
        ('AUC-ROC prob for train', auc_train),
        ('AUC-ROC prob for val', auc_val),
    ]
    for title, value in sections:
        print(title)
        print('='*60)
        print(value,"\n")

    # Plot the curve of the tuned estimator; plotting the global model_LR
    # here would show the untuned logistic regression for every model.
    print('Roc-Auc-Curve for Train set')
    print('='*60)
    print(plot_roc_curve(rcv_best, X_train, y_train),'\n')
    print('Roc-Auc-Curve for Val set')
    print('='*60)
    print(plot_roc_curve(rcv_best, X_valid, y_valid),'\n')

    model_list.append(algo)
    resample.append(sampling)
    roc_train_list.append(roc_train)
    roc_valid_list.append(roc_val)
    auc_train_list.append(auc_train)
    auc_valid_list.append(auc_val)
    f1_score_train_list.append(f1score_train)
    f1_score_val_list.append(f1score_val)

In [188]:
# Baseline: logistic regression with default hyper-parameters.
model_fit_evaluation1(model_LR, X_train, y_train, X_valid, y_valid, 'Logistic Regression', 'without HPT')

In [189]:
# Baseline: decision tree with default hyper-parameters.
model_fit_evaluation1(model_DT, X_train, y_train, X_valid, y_valid, 'Decision Tree', 'without HPT')

In [190]:
# Baseline: LightGBM with default hyper-parameters.
model_fit_evaluation1(model_LGBM, X_train, y_train, X_valid, y_valid, 'LGBM', 'without HPT')

In [191]:
# Baseline: random forest with default hyper-parameters.
model_fit_evaluation1(model_RF, X_train, y_train, X_valid, y_valid, 'Random Forest', 'without HPT')

In [192]:
# Baseline: XGBoost with default hyper-parameters.
model_fit_evaluation1(model_XGB, X_train, y_train, X_valid, y_valid, 'XGB', 'without HPT')

In [193]:
# Baseline: gradient boosting with default hyper-parameters.
model_fit_evaluation1(model_GB, X_train, y_train, X_valid, y_valid, 'GB', 'without HPT')

In [194]:
# Randomized search: logistic regression.
model_fit_evaluation2(model_LR, params_LR, X_train, y_train, X_valid, y_valid, 'LR', 'with HPT')

In [195]:
# Randomized search: decision tree.
model_fit_evaluation2(model_DT, params_DT, X_train, y_train, X_valid, y_valid, 'DT', 'with HPT')

In [196]:
# Bayesian search: decision tree.  Recorded under its own label so this row
# can be told apart from the randomized-search run in the comparison table.
model_fit_evaluation3(model_DT, params_DT, X_train, y_train, X_valid, y_valid, 'DT', 'with HPT (Bayes)')

In [197]:
# Randomized search: LightGBM.
model_fit_evaluation2(model_LGBM, params_LGBM, X_train, y_train, X_valid, y_valid, 'LGBM', 'with HPT')

In [198]:
# Bayesian search: LightGBM.  Recorded under its own label so this row can be
# told apart from the randomized-search run in the comparison table.
model_fit_evaluation3(model_LGBM, params_LGBM, X_train, y_train, X_valid, y_valid, 'LGBM', 'with HPT (Bayes)')

In [199]:
# Randomized search: random forest.
model_fit_evaluation2(model_RF, params_RF, X_train, y_train, X_valid, y_valid, 'RF', 'with HPT')

In [200]:
# Bayesian search: random forest.  Recorded under its own label so this row
# can be told apart from the randomized-search run in the comparison table.
model_fit_evaluation3(model_RF, params_RF, X_train, y_train, X_valid, y_valid, 'RF', 'with HPT (Bayes)')

In [201]:
# Randomized search: XGBoost.
model_fit_evaluation2(model_XGB, params_XGB, X_train, y_train, X_valid, y_valid, 'XGB', 'with HPT')

In [203]:
# model_fit_evaluation3(model_XGB, params_XGB, X_train, y_train, X_valid, y_valid, 'XGB', 'with HPT')

In [204]:
# Randomized search: gradient boosting.
model_fit_evaluation2(model_GB, params_GB, X_train, y_train, X_valid, y_valid, 'GB', 'with HPT')

In [205]:
# Bayesian search: gradient boosting.  Recorded under its own label so this
# row can be told apart from the randomized-search run in the comparison table.
model_fit_evaluation3(model_GB, params_GB, X_train, y_train, X_valid, y_valid, 'GB', 'with HPT (Bayes)')

In [206]:
# Comparison table: one row per evaluation call, assembled from the
# module-level result lists in the order the calls were executed.
eval_df = pd.DataFrame({'model': model_list, 'resample':resample, 'roc_train_list':roc_train_list, 'roc_valid_list':roc_valid_list, 'auc_train_list': auc_train_list, 'auc_valid_list': auc_valid_list, 'f1_score_train_list':f1_score_train_list, 'f1_score_val_list':f1_score_val_list})
eval_df

In [214]:
# Stacked ensemble: five tree-based base learners feeding a logistic-regression
# meta-learner.  The step names ('gb', 'rf', ...) are the prefixes used when
# addressing the base learners' hyper-parameters.
estimators = [
    ('gb', GradientBoostingClassifier(random_state=23)),
    ('rf', RandomForestClassifier(bootstrap=True, oob_score=True, random_state=23)),
    ('lgbm', LGBMClassifier(random_state=23, objective='binary')),
    ('xgb', XGBClassifier(use_label_encoder=False, predictor='gpu_predictor', tree_method='gpu_hist', random_state=23)),
    ('dt', DecisionTreeClassifier(random_state=23)),
]
meta_learner = LogisticRegression(random_state=23)
sc = StackingClassifier(estimators=estimators, final_estimator=meta_learner)
# sc.get_params()

In [None]:
# LogisticRegression(C=0.4, l1_ratio=0.8, max_iter=50, penalty='none', random_state=23, solver='liblinear') 
# DecisionTreeClassifier(max_depth=10, min_samples_leaf=10, min_samples_split=100,random_state=23)
# DecisionTreeClassifier(max_depth=20, min_samples_leaf=10, min_samples_split=50,random_state=23) 
# LGBMClassifier(boosting_type='dart', learning_rate=0.20333333333333334, max_depth=20, objective='binary', random_state=23, subsample=0.6) 
# LGBMClassifier(boosting_type='dart', learning_rate=0.1711111111111111, max_depth=5, min_child_samples=5, n_estimators=200,objective='binary', random_state=23)
# RandomForestClassifier(max_depth=5, max_features='sqrt', max_samples=200, min_samples_leaf=50, min_samples_split=5, oob_score=True, random_state=23)
# RandomForestClassifier(max_depth=10, max_features='log2', max_samples=20, min_samples_leaf=10, min_samples_split=10, n_estimators=200, oob_score=True, random_state=23)
# xgb = {'subsample': 0.4, 'sampling_method': 'gradient_based', 'n_estimators': 50, 'min_split_loss': 5, 'max_depth': 19, 'learning_rate': 0.23555555555555557}
# GradientBoostingClassifier(learning_rate=0.3, max_depth=5, n_estimators=50,random_state=23)
# GradientBoostingClassifier(learning_rate=0.10666666666666666, max_depth=5,random_state=23)

In [207]:
# Shape of the full training feature matrix.
X.shape

In [208]:
# Shape of the test feature matrix; the column count should match X.
X_test.shape

In [209]:
# Non-object columns of the full training feature matrix.
col_num_X = [name for name, kind in X.dtypes.items() if kind != object]
print(col_num_X)

In [210]:
# Non-object columns of the test feature matrix.
col_num_X_test = [name for name, kind in X_test.dtypes.items() if kind != object]
print(col_num_X_test)

In [212]:
# Refit the power transform on the full training features and apply the
# fitted transform to the test features.
# NOTE(review): this overwrites X in place, so running the cell a second time
# transforms already-transformed data; `pt` is also refitted here, replacing
# the fit made on the training split.
X[col_num_X] = pt.fit_transform(X[col_num_X])
X_test[col_num_X_test] = pt.transform(X_test[col_num_X_test])

In [215]:
# Search space for the stacked ensemble.  Keys prefixed with a base learner's
# step name ('gb__', 'rf__', ...) address that learner's hyper-parameters;
# 'final_estimator' and 'passthrough' configure the stacking layer itself.
parameters = {
    'gb__n_estimators': [50, 100],
    'gb__max_depth': [5],
    'gb__learning_rate': [0.3,0.10666666666666666],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [5,10],
    'rf__max_features': ['sqrt','log2'],
    'rf__max_samples': [20,200],
    'rf__min_samples_leaf': [10,50],
    'rf__min_samples_split': [5,10],
    'xgb__n_estimators': [50],
    'xgb__subsample': [0.4],
    'xgb__sampling_method': ['gradient_based'],
    'xgb__min_split_loss': [5],
    'xgb__max_depth': [19],
    'xgb__learning_rate': [0.23555555555555557],
    'dt__max_depth': [10,20],
    'dt__min_samples_leaf': [10],
    'dt__min_samples_split': [100,50],
    'lgbm__n_estimators': [100,200],
    'lgbm__boosting_type': ['dart'],
    'lgbm__learning_rate': [0.20333333333333334,0.1711111111111111],
    'lgbm__max_depth': [20,5],
    'lgbm__min_child_samples': [5,20],
    'lgbm__subsample': [0.2,1],
    'final_estimator': [LogisticRegression(C=0.4),
                        LogisticRegression(l1_ratio=0.8),
                        LogisticRegression(max_iter=50),
                        LogisticRegression(penalty='none'),
                        LogisticRegression(solver='liblinear')],
    'passthrough': [True, False]
}

# Randomized search (5-fold, weighted F1) over the stack, fitted on the full
# training data.
rcv = RandomizedSearchCV(sc, parameters, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1, random_state=23)
rcv.fit(X, y)
rcv_best = rcv.best_estimator_

print('\n')
print('best estimator : ', rcv_best)
print('best parameters: ', rcv.best_params_)
print('best score: ', rcv.best_score_)
print('\n')

# Predictions of the tuned stack on the training data and on the test data.
y_train_prob = rcv_best.predict_proba(X)
y_train_pred = rcv_best.predict(X)
y_test_prob = rcv_best.predict_proba(X_test)
y_test_pred = rcv_best.predict(X_test)

# Training-set metrics only: the model was fitted on all of X, so there is no
# held-out validation split at this stage.
matrix_train = confusion_matrix(y, y_train_pred)
report_train = classification_report(y, y_train_pred)
# roc_train uses the hard class predictions, auc_train the predicted
# probability of the positive class.
roc_train = roc_auc_score(y, y_train_pred)
auc_train = roc_auc_score(y, y_train_prob[:,1])
f1score_train = f1_score(y, y_train_pred, average='weighted')

for title, value in [
    ('F1_score for train', f1score_train),
    ('AUCROC for train', roc_train),
    ('Confusion Matrix for train', matrix_train),
    ('Classification Report for train', report_train),
    ('AUC-ROC prob for train', auc_train),
]:
    print(title)
    print('='*60)
    print(value,"\n")

# Plot the ROC curve of the tuned stack; model_LR is the baseline logistic
# regression fitted earlier on the training split, not the model evaluated here.
print('Roc-Auc-Curve for Train set')
print('='*60)
print(plot_roc_curve(rcv_best, X, y),'\n')

In [228]:
#train
y_prob_train = [i[1] for i in y_train_prob]
print(len(y_prob_train))

In [229]:
#test
y_prob = [i[1] for i in y_test_prob]
print(len(y_prob))

In [230]:
# ROC curve on the training predictions, plotted as |FPR + TPR - 1| against
# the threshold.  The quantity is zero where TPR equals 1 - FPR, i.e. where
# sensitivity and specificity are equal.
# NOTE(review): the curve is built from predictions on the data the model was
# fitted on, so the selected threshold may be optimistic.
fpr, tpr, thresholds = metrics.roc_curve(y, y_prob_train)
plt.scatter(thresholds,np.abs(fpr+tpr-1))
plt.xlabel("Threshold")
plt.ylabel("|FPR + TPR - 1|")
plt.show()

In [231]:
# Decision threshold: the ROC threshold at which |FPR + TPR - 1| is smallest.
balance_gap = np.abs(fpr+tpr-1)
k = thresholds[np.argmin(balance_gap)]
k

In [232]:
# Submission frame: test ids paired with the predicted positive-class
# probabilities (converted to class labels in a later step).
predictions = y_prob # y_test_pred  #y_prob
submission = pd.DataFrame(dict(id=test['id'], buy=predictions), index=None)
submission

In [233]:
# Convert the probabilities to class labels: 0 below the threshold k, else 1.
# The labels are computed from the untouched probabilities in y_prob rather
# than from the column being overwritten, so re-running this cell gives the
# same result instead of thresholding already-binarised values.
submission['buy'] = np.where(np.asarray(y_prob) < k, 0, 1)
submission

In [234]:
# Number of rows and columns in the submission frame.
submission.shape

In [235]:
# Share of each predicted class in the submission.
# value_counts() orders the classes by frequency, so the legend labels are
# taken from that same index instead of being hard-coded; a fixed ["1", "0"]
# legend mislabels the wedges whenever class 0 is the more frequent one.
predicted_counts = submission['buy'].value_counts()
predicted_counts.plot(kind='pie', figsize=(6, 6), autopct='%1.2f%%')
plt.title("buy")
plt.legend([str(label) for label in predicted_counts.index])
plt.show()

In [224]:
# submission.to_csv(r'C:\Users\getch\Documents\GitHub\my_DataScience_MachineLearning_projects\AV_Jobathon_June2022\submission.csv', index = False)