In [7]:
# importing libraries
import pandas            as pd                       # data science essentials
import matplotlib.pyplot as plt                      # data visualization
import seaborn           as sns                      # enhanced data viz
from sklearn.model_selection import train_test_split # train-test split
from sklearn.linear_model import LogisticRegression  # logistic regression
import statsmodels.formula.api as smf                # logistic regression
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.neighbors import KNeighborsRegressor    # KNN for regression
from sklearn.preprocessing import StandardScaler     # standard scaler
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import plot_tree                   # tree plots

# new packages
from sklearn.model_selection import RandomizedSearchCV     # hyperparameter tuning
from sklearn.metrics import make_scorer              # customizable scorer


# new tools
from sklearn.ensemble import RandomForestClassifier     # random forest
from sklearn.ensemble import GradientBoostingClassifier # gbm

# loading data

file = "./A2data2.csv"
got = pd.read_csv(file)


# setting pandas print options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)


# displaying the head of the dataset
got.head(n = 10)


Unnamed: 0,customer_id,sex_at_birth,marital_status,country,age,occupation,employment_type,type_of_client,inv_quan,sum_qua,total_spent
0,108008,F,Not_Married,United-States,22,Prof-specialty,Private,Personal,1,9,22.95
1,4176935,F,Not_Married,Canada,38,Sales,Private,Wholesaler,4,169,590.71
2,8285744,F,Married,Mexico,25,else_occu,Private,Wholesaler,1,124,186.41
3,8775662,M,Not_Married,United-States,37,Craft-repair,self,Wholesaler,4,289,610.77
4,10976223,F,Not_Married,Cuba,40,Exec-managerial,Private,Wholesaler,11,440,1008.83
5,12116759,F,Not_Married,United-States,26,Sales,Private,Wholesaler,2,147,292.34
6,19115873,M,Not_Married,United-States,40,Craft-repair,Private,Wholesaler,3,466,690.03
7,22181073,M,Married,United-States,48,else_occu,Private,Wholesaler,1,108,322.69
8,27288358,M,Married,United-States,53,else_occu,Private,Wholesaler,19,1199,5353.73
9,29824104,M,Not_Married,United-States,38,Other-service,gov,Wholesaler,2,461,1219.31


In [2]:
#Fill the 'NaN' values in the Dataset's variables for '0'...
NaN_fill =['customer_id', 'sex_at_birth', 'marital_status' , 'country' ,,'occupation', 'employment_type', 
           'type_of_client']



for col in got[NaN_fill]:
    if got[col].isnull().astype(int).sum() > 0:
        got[col] = got[col].fillna(0)
    got[col] = got[col].apply(lambda x: 1 if type(x) == str else x)
        
# for col in got[NaN_fill]: 
#      if got[col] is chr: got[col] = 1 

In [3]:
got.head(n = 10)

Unnamed: 0,S.No,name,title,male,culture,dateOfBirth,mother,father,heir,house,spouse,book1_A_Game_Of_Thrones,book2_A_Clash_Of_Kings,book3_A_Storm_Of_Swords,book4_A_Feast_For_Crows,book5_A_Dance_with_Dragons,isAliveFather,isAliveMother,isAliveSpouse,isAliveHeir,age,popularity,isAlive
0,1,1,1,0,1,0.0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.016722,1
1,2,1,1,1,1,0.0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.0,0.0,0.050167,1
2,3,1,0,0,1,0.0,0,0,0,0,1,0,1,1,1,1,0.0,0.0,0.0,1.0,0.0,0.117057,1
3,4,1,0,0,1,0.0,0,0,0,1,0,0,1,1,1,1,0.0,0.0,0.0,0.0,0.0,0.745819,1
4,5,1,0,1,1,0.0,0,0,0,1,0,0,0,1,0,1,0.0,0.0,0.0,0.0,64.0,0.076923,0
5,6,1,1,1,1,0.0,0,0,0,1,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0301,0
6,7,1,1,1,1,0.0,0,0,0,0,0,0,1,1,0,1,0.0,0.0,0.0,0.0,0.0,0.103679,0
7,8,1,1,1,1,0.0,0,0,0,0,0,0,0,1,0,1,0.0,0.0,0.0,0.0,0.0,0.023411,1
8,9,1,1,1,1,0.0,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.013378,0
9,10,1,0,0,1,0.0,0,0,0,0,0,0,1,0,0,0,0.0,0.0,0.0,0.0,50.0,0.010033,1


In [4]:
#find the isAlive corr with the others 
df_corr = got.corr(method = 'pearson').round(decimals = 2)

df_corr['isAlive'].sort_values(ascending = False)

isAlive                       1.00
book2_A_Clash_Of_Kings        0.26
age                           0.05
book5_A_Dance_with_Dragons    0.01
book3_A_Storm_Of_Swords      -0.01
isAliveHeir                  -0.02
isAliveMother                -0.04
house                        -0.05
title                        -0.05
culture                      -0.05
dateOfBirth                  -0.06
spouse                       -0.06
isAliveSpouse                -0.08
book4_A_Feast_For_Crows      -0.08
isAliveFather                -0.12
S.No                         -0.12
heir                         -0.13
father                       -0.13
mother                       -0.14
male                         -0.16
book1_A_Game_Of_Thrones      -0.17
popularity                   -0.22
name                           NaN
Name: isAlive, dtype: float64

In [5]:
got.loc[ : ,'isAlive'].value_counts(normalize = True).round(decimals = 2)

1    0.73
0    0.27
Name: isAlive, dtype: float64

In [6]:
# decla-dataring explanatory variables
got_data = got.drop('isAlive', axis = 1)


# declaring response variable
got_target = got.loc[ : ,'isAlive']

In [7]:
# train-test split with stratification
x_train, x_test, y_train, y_test = train_test_split(
            got_data ,  # x
            got_target, # y
            test_size    = 0.10,
            random_state = 219,
            stratify     = got_target) # preserving balance


# merging training data for statsmodels
got_train = pd.concat([x_train, y_train], axis = 1)
#got_test = pd.concat([x_test, y_test], axis = 1)

In [8]:
for val in got_data:
    print(f" {val} + ")

 S.No + 
 name + 
 title + 
 male + 
 culture + 
 dateOfBirth + 
 mother + 
 father + 
 heir + 
 house + 
 spouse + 
 book1_A_Game_Of_Thrones + 
 book2_A_Clash_Of_Kings + 
 book3_A_Storm_Of_Swords + 
 book4_A_Feast_For_Crows + 
 book5_A_Dance_with_Dragons + 
 isAliveFather + 
 isAliveMother + 
 isAliveSpouse + 
 isAliveHeir + 
 age + 
 popularity + 


In [9]:
# instantiating a logistic regression model object
logistic_full = smf.logit(formula = """isAlive ~
 title + 
 male + 
 culture + 
 dateOfBirth +
 mother + 
 father + 
 heir + 
 house + 
 spouse + 
 book1_A_Game_Of_Thrones + 
 book2_A_Clash_Of_Kings + 
 book3_A_Storm_Of_Swords + 
 book4_A_Feast_For_Crows + 
 book5_A_Dance_with_Dragons + 
 isAliveFather + 
 isAliveMother + 
 isAliveSpouse + 
 isAliveHeir + 
 age + 
 popularity """, 
 data    = got_train)


# fitting the model object
results_full = logistic_full.fit()


# checking the results SUMMARY
results_full.summary2()

         Current function value: 0.488880
         Iterations: 35




0,1,2,3
Model:,Logit,Pseudo R-squared:,0.161
Dependent Variable:,isAlive,AIC:,1657.2594
Date:,2022-03-27 19:33,BIC:,1770.864
No. Observations:,1652,Log-Likelihood:,-807.63
Df Model:,20,LL-Null:,-962.51
Df Residuals:,1631,LLR p-value:,8.1917e-54
Converged:,0.0000,Scale:,1.0
No. Iterations:,35.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,1.7592,0.1919,9.1665,0.0000,1.3830,2.1353
title,-0.0940,0.1330,-0.7066,0.4798,-0.3547,0.1667
male,-0.6823,0.1386,-4.9214,0.0000,-0.9540,-0.4106
culture,-0.0950,0.1363,-0.6974,0.4856,-0.3621,0.1721
dateOfBirth,-0.0018,0.0006,-2.7158,0.0066,-0.0030,-0.0005
mother,-3.8065,2.6784,-1.4212,0.1553,-9.0561,1.4431
father,0.9168,1.5085,0.6077,0.5434,-2.0399,3.8735
heir,-18.2233,4866.2455,-0.0037,0.9970,-9555.8891,9519.4426
house,-0.5684,0.1711,-3.3229,0.0009,-0.9037,-0.2331


In [10]:
#build the candidate_dict 
candidate_dict = {

 # full model  'title', 'house','culture', 'dateOfBirth', 'mother', father', 'heir', 'spouse',
 'logit_full'   : [ 'male',  'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings', 'dateOfBirth',
                    'title', 'house','culture', 'dateOfBirth', 'mother', 'father', 'heir', 'spouse',
                    'book3_A_Storm_Of_Swords', 'book4_A_Feast_For_Crows', 'book5_A_Dance_with_Dragons',  
                    'age', 'popularity'],
 

 # significant variables only (set 1)  'house', 'book4_A_Feast_For_Crows',
 'logit_sig'    : ['male','house', 'dateOfBirth', 'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings', 
                   'book3_A_Storm_Of_Swords',  'popularity'
                   ],

 #
    
    
    
 # significant variables only (set 2) 'book4_A_Feast_For_Crows', 
 'logit_sig_2'  : ['book1_A_Game_Of_Thrones', 
                   'book3_A_Storm_Of_Swords', 'book5_A_Dance_with_Dragons', 
                   'dateOfBirth']

}

In [11]:

# printing candidate variable sets
print(f"""
/--------------------------\\
|Explanatory Variable Sets |
\\--------------------------/

Full Model:
-----------
{candidate_dict['logit_full']}


First Significant p-value Model:
--------------------------------
{candidate_dict['logit_sig']}


Second Significant p-value Model:
---------------------------------
{candidate_dict['logit_sig_2']}
""")


/--------------------------\
|Explanatory Variable Sets |
\--------------------------/

Full Model:
-----------
['male', 'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings', 'dateOfBirth', 'title', 'house', 'culture', 'dateOfBirth', 'mother', 'father', 'heir', 'spouse', 'book3_A_Storm_Of_Swords', 'book4_A_Feast_For_Crows', 'book5_A_Dance_with_Dragons', 'age', 'popularity']


First Significant p-value Model:
--------------------------------
['male', 'house', 'dateOfBirth', 'book1_A_Game_Of_Thrones', 'book2_A_Clash_Of_Kings', 'book3_A_Storm_Of_Swords', 'popularity']


Second Significant p-value Model:
---------------------------------
['book1_A_Game_Of_Thrones', 'book3_A_Storm_Of_Swords', 'book5_A_Dance_with_Dragons', 'dateOfBirth']



In [12]:
# decla-dataring explanatory variables
# got_data = got.drop('isAlive', axis = 1)
got_data   =  got.loc[ : , candidate_dict['logit_sig']]

# declaring response variable
got_target = got.loc[ : ,'isAlive']

In [13]:
# train-test split with stratification
x_train, x_test, y_train, y_test = train_test_split(
            got_data ,  # x
            got_target, # y
            test_size    = 0.10,
            random_state = 219,
            stratify     = got_target) # preserving balance


# merging training data for statsmodels
got_train = pd.concat([x_train, y_train], axis = 1)
#got_test = pd.concat([x_test, y_test], axis = 1)

In [14]:
print(f"""

Response Variable Proportions (Training Set)
--------------------------------------------
{y_train.value_counts(normalize = True).round(decimals = 2)}



Response Variable Proportions (Testing Set)
--------------------------------------------
{y_test.value_counts(normalize = True).round(decimals = 2)}
""")



Response Variable Proportions (Training Set)
--------------------------------------------
1    0.73
0    0.27
Name: isAlive, dtype: float64



Response Variable Proportions (Testing Set)
--------------------------------------------
1    0.73
0    0.27
Name: isAlive, dtype: float64



In [15]:
# instantiating a logistic regression model object
logistic_small = smf.logit(formula = """isAlive ~  book2_A_Clash_Of_Kings""",
                           data    = got_train)


# fitting the model object
results_logistic = logistic_small.fit()


# checking the results SUMMARY
results_logistic.summary2() # summary2() has AIC and BIC

Optimization terminated successfully.
         Current function value: 0.546511
         Iterations 5


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.062
Dependent Variable:,isAlive,AIC:,1809.6732
Date:,2022-03-27 19:33,BIC:,1820.4927
No. Observations:,1652,Log-Likelihood:,-902.84
Df Model:,1,LL-Null:,-962.51
Df Residuals:,1650,LLR p-value:,8.814899999999999e-28
Converged:,1.0000,Scale:,1.0
No. Iterations:,5.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.3609,0.0774,4.6606,0.0000,0.2091,0.5127
book2_A_Clash_Of_Kings,1.2374,0.1158,10.6820,0.0000,1.0103,1.4644


In [16]:
for val in got_data:
    print(f" {val} + ")

 male + 
 house + 
 dateOfBirth + 
 book1_A_Game_Of_Thrones + 
 book2_A_Clash_Of_Kings + 
 book3_A_Storm_Of_Swords + 
 popularity + 


In [17]:
# INSTANTIATING the model object without hyperparameters
full_gbm_default = GradientBoostingClassifier(loss          = 'deviance',
                                              learning_rate = 0.1,
                                              n_estimators  = 500,
                                              criterion     = 'friedman_mse',
                                              max_depth     = 4,
                                              warm_start    = False,
                                              random_state  = 219)


# FIT step is needed as we are not using .best_estimator
full_gbm_default_fit = full_gbm_default.fit(x_train, y_train)


# PREDICTING based on the testing set
full_gbm_default_pred = full_gbm_default_fit.predict(x_test)


# SCORING the results
print('Training ACCURACY:', full_gbm_default_fit.score(x_train, y_train).round(4))
print('Testing ACCURACY :', full_gbm_default_fit.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = full_gbm_default_pred).round(4))

Training ACCURACY: 0.905
Testing ACCURACY : 0.7609
AUC Score        : 0.6478


In [18]:
# unpacking the confusion matrix
gbm_default_tn, \
gbm_default_fp, \
gbm_default_fn, \
gbm_default_tp = confusion_matrix(y_true = y_test, y_pred = full_gbm_default_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {gbm_default_tn}
False Positives: {gbm_default_fp}
False Negatives: {gbm_default_fn}
True Positives : {gbm_default_tp}
""")



True Negatives : 20
False Positives: 30
False Negatives: 14
True Positives : 120



In [19]:
# declaring a hyperparameter space
learn_range        = np.arange(0.1, 2.2, 0.4)
estimator_range    = np.arange(100, 1001, 25)
depth_range        = np.arange(2, 10, 2)
warm_start_range   = [True, False]

# # declaring a hyperparameter space
# learn_range        = np.arange(0.1, 2.2, 0.4)
# estimator_range    = np.arange(100, 801, 20)
# depth_range        = np.arange(2, 10, 2)
# criterion_range    = ['friedman_mse']
# warm_start_range   = [True, False]

# creating a hyperparameter grid
param_grid = {'learning_rate' : learn_range,
              'max_depth'     : depth_range,
              'n_estimators'  : estimator_range,
              'warm_start'    : warm_start_range}


# INSTANTIATING the model object without hyperparameters
full_gbm_grid = GradientBoostingClassifier(random_state = 219)


# GridSearchCV object
full_gbm_cv = RandomizedSearchCV(estimator     = full_gbm_grid,
                           param_distributions = param_grid,
                           cv                  = 3,
                           n_iter              = 1000,
                           random_state        = 219,
                           scoring             = make_scorer(roc_auc_score,
                                                 needs_threshold = False))


# FITTING to the FULL DATASET (due to cross-validation)
full_gbm_cv.fit(got_data, got_target)


# PREDICT step is not needed


# printing the optimal parameters and best score
print("Tuned Parameters  :", full_gbm_cv.best_params_)
print("Tuned Training AUC:", full_gbm_cv.best_score_.round(4))

Tuned Parameters  : {'warm_start': True, 'n_estimators': 775, 'max_depth': 8, 'learning_rate': 1.3000000000000003}
Tuned Training AUC: 0.6516


In [21]:
# checking the best estimator for the model
full_gbm_cv.best_estimator_

GradientBoostingClassifier(learning_rate=1.3000000000000003, max_depth=8,
                           n_estimators=775, random_state=219, warm_start=True)

In [111]:
# INSTANTIATING with best_estimator
gbm_tuned = GradientBoostingClassifier(learning_rate = 1.3000000000000003,
                                       max_depth     = 8,
                                       n_estimators  = 775,
                                       warm_start    = True,
                                       random_state  = 219)


# FITTING to the FULL DATASET (due to cross-validation)
gbm_tuned_fit = gbm_tuned.fit(got_data, got_target)


# PREDICTING based on the testing set
gbm_tuned_pred = gbm_tuned_fit.predict(x_test)


# SCORING the results
print('Training ACCURACY:', gbm_tuned_fit.score(x_train, y_train).round(4))
print('Testing  ACCURACY:', gbm_tuned_fit.score(x_test, y_test).round(4))
print('AUC Score        :', roc_auc_score(y_true  = y_test,
                                          y_score = gbm_tuned_pred).round(4))

Training ACCURACY: 0.9098
Testing  ACCURACY: 0.9022
AUC Score        : 0.8701


In [112]:
# unpacking the confusion matrix
gbm_tuned_tn, \
gbm_tuned_fp, \
gbm_tuned_fn, \
gbm_tuned_tp = confusion_matrix(y_true = y_test, y_pred = gbm_tuned_pred).ravel()


# printing each result one-by-one
print(f"""
True Negatives : {gbm_tuned_tn}
False Positives: {gbm_tuned_fp}
False Negatives: {gbm_tuned_fn}
True Positives : {gbm_tuned_tp}
""")


True Negatives : 40
False Positives: 10
False Negatives: 8
True Positives : 126



In [113]:
# declaring model performance objects
gbm_train_acc = gbm_tuned_fit.score(x_train, y_train).round(4)
gbm_test_acc  = gbm_tuned_fit.score(x_test, y_test).round(4)
gbm_auc       = roc_auc_score(y_true  = y_test,
                              y_score = gbm_tuned_pred).round(4)

# appending to model_performance
model_performance = {
    
    'Model Name'    : ['Tuned GBM'],
           
    'AUC Score' : [gbm_auc],
    
    'Training Accuracy' : [gbm_train_acc],
           
    'Testing Accuracy'  : [gbm_test_acc],

    'Confusion Matrix'  : [(gbm_tuned_tn,
                            gbm_tuned_fp,
                            gbm_tuned_fn,
                            gbm_tuned_tp)]}


# converting model_performance into a DataFrame
model_performance = pd.DataFrame(model_performance)

In [114]:
model_performance

Unnamed: 0,Model Name,AUC Score,Training Accuracy,Testing Accuracy,Confusion Matrix
0,Tuned GBM,0.8701,0.9098,0.9022,"(40, 10, 8, 126)"


In [115]:
print(f"""
          my model, Tuned GBM
          AUC Score          is  {gbm_auc}, 
          Training Accuracy  is  {gbm_train_acc}
          Testing Accuracy   is  {gbm_test_acc}
          'Confusion Matrix' is  {(gbm_tuned_tn,
                            gbm_tuned_fp,
                            gbm_tuned_fn,
                            gbm_tuned_tp)}
      
      """)


          my model, Tuned GBM
          AUC Score          is  0.8701, 
          Training Accuracy  is  0.9098
          Testing Accuracy   is  0.9022
          'Confusion Matrix' is  (40, 10, 8, 126)
      
      
