In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error, root_mean_squared_error

from catboost import CatBoostClassifier

import xgboost as xgb

from lightgbm import LGBMClassifier

# Disable pandas' row/column truncation so displayed frames are shown in full.
# NOTE(review): with max_rows=None a bare `df` in a cell prints every row; prefer .head()/.sample().
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [14]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Replace the library-wide warning hook so every warning raised through
# ``warnings.warn`` from here on is silently discarded.
warnings.warn = warn

# **FUNCTIONS**

## **Scatter Plot**

In [17]:
def PlotScatter(x, y, title, xunit, yunit, format_x, x_size):
    """Draw a styled scatter plot of ``y`` against ``x``, show it, then close it.

    Parameters
    ----------
    x, y : array-like
        Point coordinates, passed straight to ``Axes.scatter``.
    title : str
        Title drawn above the axes.
    xunit, yunit : str
        Axis labels; they are placed at the far end of each axis.
    format_x : int
        When equal to 1, x tick labels are rendered as thousands-separated
        integers (``{:,.0f}``); otherwise the default x formatting is kept.
    x_size : float
        Figure width in inches; 0 selects the default width of 6.
        The height is always 5 inches.

    Returns
    -------
    None
    """
    fig = plt.figure()
    fig.patch.set_facecolor("0.85")
    ax = plt.axes()
    ax.set(facecolor="0.85")

    ax.set_title(title)
    ax.scatter(x, y, alpha=0.7)

    ax.grid(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color("black")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    ax.set_xlabel(xunit, fontsize=10)
    ax.xaxis.set_label_coords(1.05, -0.025)
    ax.set_ylabel(yunit, fontsize=10)
    ax.yaxis.set_label_coords(-0.16, .98)

    # Use a formatter instead of freezing label strings with set_*ticklabels:
    # fixed labels are matched to ticks by position, so they end up on the
    # wrong ticks as soon as the locator picks new positions (e.g. after the
    # resize below).
    ax.yaxis.set_major_formatter("{x:,.0f}")
    if format_x == 1:
        ax.xaxis.set_major_formatter("{x:,.0f}")

    fig.set_size_inches(6 if x_size == 0 else x_size, 5)

    plt.show()
    plt.close(fig)

## **Box Plot**

In [19]:
def PlotBoxPlot(df_x, df_y, title, yunit):
    """Draw a styled 6x5 inch box plot of ``df_y`` grouped by ``df_x`` and show it.

    Parameters
    ----------
    df_x : array-like
        Grouping variable for the x axis (passed to ``sns.boxplot`` as ``x``).
    df_y : array-like
        Values summarised by the boxes (passed to ``sns.boxplot`` as ``y``).
    title : str
        Title drawn above the axes.
    yunit : str
        Label for the y axis, placed near the top of the axis.

    Returns
    -------
    None
    """
    # A single figure: the previous version called plt.figure() twice, which
    # left an empty 6x5 figure behind and drew on a second, default-sized one.
    fig = plt.figure(figsize=(6, 5))
    fig.patch.set_facecolor("0.85")
    ax = plt.axes()
    ax.set(facecolor="0.85")

    ax.set_title(title)
    sns.boxplot(x=df_x, y=df_y, palette="mako", ax=ax)

    ax.grid(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color("black")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    ax.set_ylabel(yunit, fontsize=10)
    ax.yaxis.set_label_coords(-0.16, .98)

    # The x axis label is intentionally blank.
    ax.set_xlabel("", fontsize=1)

    # Formatter rather than fixed label strings, so labels always match the
    # tick positions actually drawn.
    ax.yaxis.set_major_formatter("{x:,.0f}")

    plt.show()
    plt.close(fig)

In [20]:
def PlotBoxPlotV2(df_x, df_y, title, yunit, size_x, size_y):
    """Draw a styled box plot of ``df_y`` grouped by ``df_x`` at a caller-chosen size.

    Parameters
    ----------
    df_x : array-like
        Grouping variable for the x axis (passed to ``sns.boxplot`` as ``x``).
    df_y : array-like
        Values summarised by the boxes (passed to ``sns.boxplot`` as ``y``).
    title : str
        Title drawn above the axes.
    yunit : str
        Label for the y axis, placed near the top of the axis.
    size_x, size_y : float
        Figure width and height in inches.

    Returns
    -------
    None
    """
    fig = plt.figure()
    fig.patch.set_facecolor("0.85")
    ax = plt.axes()
    ax.set(facecolor="0.85")

    ax.set_title(title)
    sns.boxplot(x=df_x, y=df_y, palette="mako", ax=ax)

    ax.grid(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color("black")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    ax.set_ylabel(yunit, fontsize=10)
    ax.yaxis.set_label_coords(-0.16, .98)

    # The x axis label is intentionally blank.
    ax.set_xlabel("", fontsize=1)

    # Formatter instead of frozen label strings: the figure is resized right
    # after this, which can move the ticks and leave fixed labels mismatched.
    ax.yaxis.set_major_formatter("{x:,.0f}")

    fig.set_size_inches(size_x, size_y)

    plt.show()
    plt.close(fig)

## **Regression Plot**

In [22]:
def PlotRegPlot(df_x, df_y, title, xunit, yunit):
    """Draw a styled 6x5 inch scatter plot with a fitted regression line and show it.

    Parameters
    ----------
    df_x, df_y : array-like
        Predictor and response, passed to ``sns.regplot`` as ``x`` and ``y``.
    title : str
        Title drawn above the axes.
    xunit, yunit : str
        Axis labels; they are placed at the far end of each axis.

    Returns
    -------
    None
    """
    # A single figure: the previous version called plt.figure() twice, which
    # left an empty 6x5 figure behind and drew on a second, default-sized one.
    fig = plt.figure(figsize=(6, 5))
    fig.patch.set_facecolor("0.85")
    ax = plt.axes()
    ax.set(facecolor="0.85")

    ax.set_title(title)
    sns.regplot(x=df_x, y=df_y, line_kws={"color": "#31273F"}, ax=ax)

    ax.grid(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color("black")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    ax.set_ylabel(yunit, fontsize=10)
    ax.yaxis.set_label_coords(-0.16, .98)

    ax.set_xlabel(xunit, fontsize=10)
    ax.xaxis.set_label_coords(1.05, -0.025)

    # Thousands-separated integer tick labels on both axes, via formatters so
    # the labels always correspond to the ticks actually drawn.
    ax.yaxis.set_major_formatter("{x:,.0f}")
    ax.xaxis.set_major_formatter("{x:,.0f}")

    plt.show()
    plt.close(fig)

## **Histogram**

In [24]:
def HistPlot(x, title, xunit, x_size):
    """Show a styled histogram of ``x``.

    ``title`` is drawn above the axes, ``xunit`` labels the x axis (placed at
    its right end) and the y axis is labelled "Count". ``x_size`` is the
    figure width in inches, with 0 meaning the default of 6; the height is
    fixed at 5 inches. Nothing is returned; the figure is shown and closed.
    """
    figure = plt.figure()
    figure.patch.set_facecolor("0.85")
    axes = plt.axes()
    axes.set(facecolor="0.85")

    axes.set_title(title)
    sns.histplot(x=x, ax=axes)

    axes.grid(False)
    visibility = {"bottom": True, "left": True, "top": False, "right": False}
    for side, shown in visibility.items():
        axes.spines[side].set_visible(shown)
    for side in ("bottom", "left"):
        axes.spines[side].set_color("black")

    axes.set_xlabel(xunit, fontsize=10)
    axes.xaxis.set_label_coords(0.98, -0.08)
    axes.set_ylabel("Count")

    width = 6 if x_size == 0 else x_size
    figure.set_size_inches(width, 5)

    plt.show()
    plt.close()

## **Predicted vs Actual Values**

In [26]:
def PlotFitted(y_test, yhat, title):
    """Scatter predicted against actual values and print MSE / RMSE.

    Both inputs are exponentiated before plotting (presumably they are
    log-scale targets -- confirm against the caller). The printed error
    metrics are computed on the inputs exactly as passed, i.e. before
    exponentiation.

    Parameters
    ----------
    y_test : array-like
        Actual target values (Series, DataFrame column, ndarray or list).
    yhat : array-like
        Predicted values, same length as ``y_test``.
    title : str
        Title drawn above the axes.

    Returns
    -------
    None
    """
    fig = plt.figure()
    fig.patch.set_facecolor("0.85")
    ax = plt.axes()
    ax.set(facecolor="0.85")

    ax.set_title(title)

    # Flatten through numpy so Series, single-column frames and plain arrays
    # are all accepted (the earlier .to_numpy() call failed on ndarrays).
    actual = np.asarray(np.exp(y_test)).ravel()
    predicted = np.asarray(np.exp(yhat)).ravel()

    sns.scatterplot(x=actual, y=predicted, alpha=0.6, ax=ax)

    # Dashed identity line: points on it are perfect predictions.
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'r--')

    ax.grid(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color("black")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    ax.set_xlabel("Actual Values")
    ax.set_ylabel("Predicted Values")

    fig.set_size_inches(7, 5)

    # Anchor both axes at zero; the upper limits stay automatic.
    ax.set_ylim(bottom=0)
    ax.set_xlim(left=0)

    plt.show()
    plt.close(fig)

    print("\n- Mean Squared Error:", mean_squared_error(y_test, yhat))
    print("- Root Mean Squared Error:", root_mean_squared_error(y_test, yhat))

In [27]:
def PlotFittedV(yhat, y_test, model, x_data, y_data, title, xlabel):
    """Overlay density curves of actual and predicted values and print fit metrics.

    Parameters
    ----------
    yhat : array-like
        Predicted values (blue curve).
    y_test : array-like
        Actual values (red curve).
    model : estimator
        Estimator handed to ``cross_val_score`` for the printed CV score.
    x_data, y_data : array-like
        Features and target used for the 5-fold cross-validation.
    title : str
        Title drawn above the axes.
    xlabel : str
        Label for the x axis.

    Returns
    -------
    None
        MSE, RMSE and the mean 5-fold cross-validation score are printed.
    """
    fig = plt.figure()
    fig.patch.set_facecolor("0.85")
    ax = plt.axes()
    ax.set(facecolor="0.85")

    ax.set_title(title)

    # sns.distplot is deprecated; kdeplot is the documented replacement for
    # distplot(hist=False). Inputs are flattened to 1-D float arrays first.
    sns.kdeplot(x=np.asarray(y_test, dtype=float).ravel(), color="r", label="Actual Value", ax=ax)
    sns.kdeplot(x=np.asarray(yhat, dtype=float).ravel(), color="b", label="Predicted Value", ax=ax)

    ax.grid(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color("black")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    ax.set_ylabel("Density")
    ax.set_xlabel(xlabel)
    ax.legend(['Actual Value', 'Predicted Value'], framealpha=0)

    fig.set_size_inches(7, 5)

    plt.show()
    plt.close(fig)

    print("\n- Mean Squared Error:", mean_squared_error(y_test, yhat))
    print("- Root Mean Squared Error:", root_mean_squared_error(y_test, yhat))
    print("- Cross-Validation Score:", cross_val_score(model, x_data, y_data, cv=5).mean())

In [28]:
def PlotFittedV2(yhat, y_test, title, xlabel):
    """Overlay density curves of exponentiated actual and predicted values.

    Both inputs are passed through ``np.exp`` before plotting (presumably
    they are log-scale targets -- confirm against the caller). The printed
    MSE / RMSE are computed on the inputs as passed, before exponentiation.

    Parameters
    ----------
    yhat : array-like
        Predicted values (blue curve).
    y_test : array-like
        Actual values (red curve).
    title : str
        Title drawn above the axes.
    xlabel : str
        Label for the x axis.

    Returns
    -------
    None
    """
    fig = plt.figure()
    fig.patch.set_facecolor("0.85")
    ax = plt.axes()
    ax.set(facecolor="0.85")

    ax.set_title(title)

    # sns.distplot is deprecated; kdeplot is the documented replacement for
    # distplot(hist=False). Inputs are flattened to 1-D float arrays first.
    actual = np.exp(np.asarray(y_test, dtype=float).ravel())
    predicted = np.exp(np.asarray(yhat, dtype=float).ravel())
    sns.kdeplot(x=actual, color="r", label="Actual Value", ax=ax)
    sns.kdeplot(x=predicted, color="b", label="Predicted Value", ax=ax)

    ax.grid(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color("black")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    ax.set_ylabel("Density")
    ax.set_xlabel(xlabel)
    ax.legend(['Actual Value', 'Predicted Value'], framealpha=0)

    fig.set_size_inches(7, 5)

    plt.show()
    plt.close(fig)

    print("\n- Mean Squared Error:", mean_squared_error(y_test, yhat))
    print("- Root Mean Squared Error:", root_mean_squared_error(y_test, yhat))

## **Line Plot**

In [30]:
def PlotLine(y, title, yunit, x_size, y_size):
    """Draw a styled line plot of ``y`` against its index and show it.

    Parameters
    ----------
    y : array-like
        Values to plot, passed straight to ``Axes.plot``.
    title : str
        Title drawn above the axes.
    yunit : str
        Label for the y axis, placed near the top of the axis.
    x_size, y_size : float
        Figure width and height in inches.

    Returns
    -------
    None
    """
    fig = plt.figure()
    fig.patch.set_facecolor("0.85")
    ax = plt.axes()
    ax.set(facecolor="0.85")

    ax.set_title(title)
    ax.plot(y)

    ax.grid(False)
    for side in ("bottom", "left"):
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color("black")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    # The x axis label is intentionally blank.
    ax.set_xlabel("", fontsize=0)
    ax.set_ylabel(yunit, fontsize=10)
    ax.yaxis.set_label_coords(-0.16, .98)

    # Formatter instead of frozen label strings: the figure is resized right
    # after this, which can move the ticks and leave fixed labels mismatched.
    ax.yaxis.set_major_formatter("{x:,.0f}")

    fig.set_size_inches(x_size, y_size)

    plt.show()
    plt.close(fig)

## **K-Fold Target Encoding**

In [32]:
def KFTE(df, df_test, categ_f, target):
    """K-fold target-encode the categorical column ``categ_f``, in place.

    A new column ``categ_f + "_E"`` is added to ``df``. For every row its value
    is the mean of ``target`` for that row's category, computed only on the
    other folds of a 5-fold shuffled split (``random_state=13``). Categories
    absent from the training folds receive the training folds' overall mean.

    ``df_test`` is encoded with per-category means taken over the whole of
    ``df``; unseen categories receive the overall mean of ``target`` in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame containing ``categ_f`` and ``target``; modified in place.
    df_test : pandas.DataFrame
        Test frame containing ``categ_f``; modified in place.
    categ_f : str
        Name of the categorical column to encode.
    target : str
        Name of the target column in ``df``.

    Returns
    -------
    None
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=13)

    nome = categ_f + "_E"

    df[nome] = 0.0
    encoded_col = df.columns.get_loc(nome)

    for train_idx, val_idx in kf.split(df):
        fold_train = df.iloc[train_idx]
        fold_valid = df.iloc[val_idx]

        category_means = fold_train.groupby(categ_f)[target].mean()
        fold_mean = fold_train[target].mean()

        feature_encoded = fold_valid[categ_f].map(category_means).fillna(fold_mean)

        # KFold yields *positions*, so write back positionally. The previous
        # df.loc[val_idx, ...] treated them as index labels, which is only
        # correct when df carries a default 0..n-1 index.
        df.iloc[val_idx, encoded_col] = feature_encoded.to_numpy()

    mapping = df.groupby(categ_f)[target].mean().to_dict()
    global_mean = df[target].mean()

    test_encoded = df_test[categ_f].map(mapping).fillna(global_mean)

    # Give the test frame the same "<col>_E" column the training frame gets,
    # so a feature list built on the train columns also works on the test set.
    df_test[nome] = test_encoded

    # NOTE(review): the original column is still overwritten with the encoded
    # values, as before, in case existing callers rely on it -- confirm whether
    # this destructive overwrite is wanted and drop it if not.
    df_test[categ_f] = test_encoded

## **Stratified K-Fold**

In [202]:
def StratKFold(df, features, target, n_splits, metric, model):
    """Cross-validate ``model`` with a stratified K-fold split and print mean scores.

    The split is shuffled with ``random_state=13``. ``model`` is re-fitted in
    place on every fold, so after the call it holds the fit from the last fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    features : list of str
        Columns of ``df`` used as predictors.
    target : str
        Column of ``df`` used as the label.
    n_splits : int
        Number of folds.
    metric : bool
        Truthy: print mean accuracy, precision, recall, F1 and ROC AUC.
        Falsy: print mean accuracy only.
    model : estimator
        Object with ``fit`` / ``predict``; ``predict_proba`` is required only
        when ``metric`` is truthy.

    Returns
    -------
    None
        Scores are printed, not returned.
    """
    x_data = df[features]
    y_data = df[target]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=13)

    detailed = bool(metric)
    if detailed:
        metrics = {
            'accuracy': [],
            'precision': [],
            'recall': [],
            'f1': [],
            'auc': []
        }
    else:
        metrics = []

    for train_idx, val_idx in skf.split(x_data, y_data):
        x_train, x_test = x_data.iloc[train_idx].copy(), x_data.iloc[val_idx].copy()
        y_train, y_test = y_data.iloc[train_idx], y_data.iloc[val_idx]

        model.fit(x_train, y_train)
        yhat = model.predict(x_test)

        if detailed:
            # Probabilities are needed for the AUC only, so compute them here
            # rather than on every fold of an accuracy-only run.
            y_proba = model.predict_proba(x_test)[:, 1]

            metrics['accuracy'].append(accuracy_score(y_test, yhat))
            metrics['precision'].append(precision_score(y_test, yhat))
            metrics['recall'].append(recall_score(y_test, yhat))
            metrics['f1'].append(f1_score(y_test, yhat))
            metrics['auc'].append(roc_auc_score(y_test, y_proba))
        else:
            metrics.append(accuracy_score(y_test, yhat))

    if detailed:
        for k, v in metrics.items():
            print(f"{k}: {np.mean(v)}")
    else:
        print(f"Acurracy: {np.mean(metrics)}")

In [221]:
def StratKFold_Select(df, features, target, n_splits, model, verbose):
    """Return the leading slice of ``features`` with the best mean CV accuracy.

    For ``i = 1 .. len(features)`` the first ``i`` features are evaluated with
    a stratified K-fold split (shuffled, ``random_state=13``). The slice with
    the highest mean accuracy is returned; on ties the shorter slice wins
    because only a strict improvement replaces the current best. The order of
    ``features`` therefore matters: features are never skipped, only truncated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    features : list of str
        Candidate features, in the order they should be added.
    target : str
        Column of ``df`` used as the label.
    n_splits : int
        Number of folds.
    model : estimator
        Object with ``fit`` / ``predict``; re-fitted in place on every fold.
    verbose : bool
        When truthy, print a line each time a better slice is found.

    Returns
    -------
    list of str
        The best-scoring leading slice of ``features``; an empty list when
        ``features`` is empty or no slice scores above 0.
    """
    y_data = df[target]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=13)

    # Initialised up front so the return below can never hit an unbound name.
    features_selected = []
    accuracy = 0

    for i in range(len(features)):
        f = features[0: i+1]
        x_data = df[f]
        acc = []

        for train_idx, val_idx in skf.split(x_data, y_data):
            x_train, x_test = x_data.iloc[train_idx].copy(), x_data.iloc[val_idx].copy()
            y_train, y_test = y_data.iloc[train_idx], y_data.iloc[val_idx]

            model.fit(x_train, y_train)
            yhat = model.predict(x_test)
            acc.append(accuracy_score(y_test, yhat))

        mean_acc = np.mean(acc)

        if mean_acc > accuracy:
            features_selected = f
            if verbose:
                print("Old Ac:", accuracy, "\nNew Ac:", mean_acc, "\nFeature:", features[i], "\nFeatures:", f, "\n")
            accuracy = mean_acc

    return features_selected

# **DATASET LOADING**

In [51]:
# Load the processed train and test CSVs (paths are relative to the notebook's
# working directory) and preview the first rows of the training frame.
df = pd.read_csv("../data/processed/2.EDA_train.csv")
df_test = pd.read_csv("../data/processed/2.EDA_test.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_log,Pclass_1,Pclass_2,Pclass_3,Pclass_E,IsMale,Embarked_C,Embarked_Q,Embarked_S,Embarked_E,IsAgeEstimated,Stage,Stage_E,FreeFare,IsSingle,FarePerPerson,FamilySize,IsAlone,Title,Title_E,Deck,Deck_E,HasCabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2.110213,0.0,0.0,1.0,0.251948,1.0,0.0,0.0,1.0,0.339114,0.0,Adult,0.370884,0.0,0.0,7.25,2,0,Mr,0.167866,U,0.302198,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,4.280593,1.0,0.0,0.0,0.652439,0.0,1.0,0.0,0.0,0.592,0.0,Adult,0.369863,0.0,0.0,71.2833,2,0,Mrs,0.79,C,0.577778,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2.188856,0.0,0.0,1.0,0.245478,0.0,0.0,0.0,1.0,0.33925,0.0,Adult,0.363946,0.0,1.0,7.925,1,1,Miss,0.72028,U,0.298535,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,3.990834,1.0,0.0,0.0,0.620112,0.0,0.0,0.0,1.0,0.33925,0.0,Adult,0.363946,0.0,0.0,26.55,2,0,Mrs,0.80198,C,0.6,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2.202765,0.0,0.0,1.0,0.251948,1.0,0.0,0.0,1.0,0.339114,0.0,Adult,0.370884,0.0,1.0,8.05,1,1,Mr,0.167866,U,0.302198,0


# **MODEL DEVELOPMENT AND EVALUATION**

In [59]:
# Feature set that uses the "_E" encoded columns for Pclass and Embarked
# (Pclass_E, Embarked_E) alongside the numeric and flag columns.
features_KFold = ['IsMale', 'Title_E', 'FarePerPerson', 'Age', 'Pclass_E', 'Fare_log', 'SibSp', 'Deck_E', 'Embarked_E', 'FamilySize', 'Stage_E', 'Parch', 'HasCabin',
            'IsSingle', 'IsAgeEstimated', 'FreeFare', 'IsAlone']

# Alternative feature set: Pclass and Embarked appear as one-hot columns
# (Pclass_1..3, Embarked_C/Q/S) in place of Pclass_E / Embarked_E; Deck_E,
# Stage_E and Title_E are still the encoded versions.
Features_OHE = ['Age', 'Deck_E', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'FamilySize', 'Fare_log', 'FarePerPerson', 'FreeFare', 'HasCabin', 'IsAgeEstimated', 'IsAlone',
                'IsMale', 'IsSingle', 'Parch', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'SibSp', 'Stage_E', 'Title_E']

## **XGBOOST CLASSIFIER**

### **MODEL DEVELOPMENT**

#### **K-FOLD TARGET ENCODING**

In [207]:
# Baseline: default XGBoost classifier (fixed seed) on the features_KFold set,
# scored with 5-fold stratified CV; metric=False prints mean accuracy only.
xgbc = xgb.XGBClassifier(random_state=13)

StratKFold(df, features_KFold, 'Survived', 5, False, xgbc)

Acurracy: 0.8204506936162199


In [209]:
# Tabulate XGBoost's importance per feature, most important first.
# NOTE: StratKFold re-fits the estimator on every fold, so these importances
# come from whichever fit `xgbc` received last, not from a fit on the full data.
feature_importances = pd.DataFrame({'Feature': df[features_KFold].columns,
                                    'XGBC_Importance': xgbc.feature_importances_
                                   })
feature_importances = feature_importances.sort_values(by='XGBC_Importance', ascending=False)

feature_importances

Unnamed: 0,Feature,XGBC_Importance
1,Title_E,0.31469
9,FamilySize,0.149316
0,IsMale,0.082611
4,Pclass_E,0.06825
10,Stage_E,0.055283
7,Deck_E,0.04951
2,FarePerPerson,0.041254
14,IsAgeEstimated,0.040605
5,Fare_log,0.039984
8,Embarked_E,0.039087


In [211]:
# Feature names ordered from most to least important according to XGBoost.
features_xgbc = feature_importances.sort_values(by='XGBC_Importance', ascending=False)['Feature'].to_list()

In [225]:
# Fresh classifier: pick the best-scoring leading slice of the importance-ordered
# features (verbose=False), then report the full metric set for that slice.
xgbc = xgb.XGBClassifier(random_state=13)

features_xgbc_sel = StratKFold_Select(df, features_xgbc, 'Survived', 5, xgbc, False)
StratKFold(df, features_xgbc_sel, 'Survived', 5, True, xgbc)

accuracy: 0.8204569706860838
precision: 0.7754976706673467
recall: 0.7516624040920716
f1: 0.7623375889654943
auc: 0.8803014625003645


In [190]:
# Display the selected feature subset.
# NOTE(review): StratKFold_Select returns a leading slice of its input list, so a
# displayed list that skips an entry of features_xgbc comes from an earlier run -- re-execute.
features_xgbc_sel

['Title_E',
 'FamilySize',
 'IsMale',
 'Stage_E',
 'FarePerPerson',
 'Fare_log',
 'Age',
 'IsSingle',
 'SibSp']

In [153]:
xgbc = xgb.XGBClassifier(random_state=13)

# Print the CV accuracy of every leading slice of the importance-ordered features.
# StratKFold takes the data frame as its first argument; the call previously
# omitted it and raised a TypeError against the current signature.
for i in range(len(features_xgbc)):
    features = features_xgbc[0: i+1]
    print(features)
    StratKFold(df, features, 'Survived', 5, False, xgbc)
    print("\n")

['Title_E']
Acurracy: 0.7890


['Title_E', 'FamilySize']
Acurracy: 0.8059


['Title_E', 'FamilySize', 'IsMale']
Acurracy: 0.8103


['Title_E', 'FamilySize', 'IsMale', 'Pclass_E']
Acurracy: 0.8047


['Title_E', 'FamilySize', 'IsMale', 'Pclass_E', 'Stage_E']
Acurracy: 0.8104


['Title_E', 'FamilySize', 'IsMale', 'Pclass_E', 'Stage_E', 'Deck_E']
Acurracy: 0.8037


['Title_E', 'FamilySize', 'IsMale', 'Pclass_E', 'Stage_E', 'Deck_E', 'FarePerPerson']
Acurracy: 0.8081


['Title_E', 'FamilySize', 'IsMale', 'Pclass_E', 'Stage_E', 'Deck_E', 'FarePerPerson', 'IsAgeEstimated']
Acurracy: 0.8036


['Title_E', 'FamilySize', 'IsMale', 'Pclass_E', 'Stage_E', 'Deck_E', 'FarePerPerson', 'IsAgeEstimated', 'Fare_log']
Acurracy: 0.8092


['Title_E', 'FamilySize', 'IsMale', 'Pclass_E', 'Stage_E', 'Deck_E', 'FarePerPerson', 'IsAgeEstimated', 'Fare_log', 'Embarked_E']
Acurracy: 0.8036


['Title_E', 'FamilySize', 'IsMale', 'Pclass_E', 'Stage_E', 'Deck_E', 'FarePerPerson', 'IsAgeEstimated', 'Fare_log', 'Embarke

In [None]:
# Scratch check: print every leading slice of features_KFold, shortest first.
# NOTE(review): this cell duplicates the one that follows it.
for i in range(len(features_KFold)):
    print(features_KFold[0: i+1])

In [84]:
# Scratch check: print every leading slice of features_KFold, shortest first.
for i in range(len(features_KFold)):
    print(features_KFold[0: i+1])
# Disabled experiment: an inner loop that would print the shorter slices again.
#    for j in range(i):
#        print(features_KFold[0: j+1])

['IsMale']
['IsMale', 'Title_E']
['IsMale', 'Title_E', 'FarePerPerson']
['IsMale', 'Title_E', 'FarePerPerson', 'Age']
['IsMale', 'Title_E', 'FarePerPerson', 'Age', 'Pclass_E']
['IsMale', 'Title_E', 'FarePerPerson', 'Age', 'Pclass_E', 'Fare_log']
['IsMale', 'Title_E', 'FarePerPerson', 'Age', 'Pclass_E', 'Fare_log', 'SibSp']
['IsMale', 'Title_E', 'FarePerPerson', 'Age', 'Pclass_E', 'Fare_log', 'SibSp', 'Deck_E']
['IsMale', 'Title_E', 'FarePerPerson', 'Age', 'Pclass_E', 'Fare_log', 'SibSp', 'Deck_E', 'Embarked_E']
['IsMale', 'Title_E', 'FarePerPerson', 'Age', 'Pclass_E', 'Fare_log', 'SibSp', 'Deck_E', 'Embarked_E', 'FamilySize']
['IsMale', 'Title_E', 'FarePerPerson', 'Age', 'Pclass_E', 'Fare_log', 'SibSp', 'Deck_E', 'Embarked_E', 'FamilySize', 'Stage_E']
['IsMale', 'Title_E', 'FarePerPerson', 'Age', 'Pclass_E', 'Fare_log', 'SibSp', 'Deck_E', 'Embarked_E', 'FamilySize', 'Stage_E', 'Parch']
['IsMale', 'Title_E', 'FarePerPerson', 'Age', 'Pclass_E', 'Fare_log', 'SibSp', 'Deck_E', 'Embarked_E'

In [None]:
L = ["Michael Jackson", 10.1, 1982, "MJ", 1]

# Slices are half-open: L[3:5] returns the items at indices 3 and 4, i.e. ["MJ", 1].
# The stop index is one past the last element included.
L[3:5]