<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

<br><h2>Script 03 | Changing the Prediction Threshold</h2>
<br>
Written by Chase Kusterer<br>
<a href="https://github.com/chase-kusterer">GitHub</a> | <a href="https://www.linkedin.com/in/kusterer/">LinkedIn</a>
<br><br><br>

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

In [None]:
# importing libraries
import pandas            as pd                       # data science essentials
import matplotlib.pyplot as plt                      # data visualization
import seaborn           as sns                      # enhanced data viz
from sklearn.model_selection import train_test_split # train-test split
from sklearn.linear_model import LogisticRegression  # logistic regression
import statsmodels.formula.api as smf                # logistic regression
from sklearn.metrics import confusion_matrix         # confusion matrix
from sklearn.metrics import roc_auc_score            # auc score
from sklearn.neighbors import KNeighborsClassifier   # KNN for classification
from sklearn.neighbors import KNeighborsRegressor    # KNN for regression
from sklearn.preprocessing import StandardScaler     # standard scaler
from sklearn.tree import DecisionTreeClassifier      # classification trees
from sklearn.tree import plot_tree                   # tree plots


# reading the feature-rich Titanic workbook into a DataFrame
titanic = pd.read_excel(io = './datasets/titanic_feature_rich.xlsx')


# raising pandas display limits (rows, columns, total width, cell width)
for option_name, option_value in [('display.max_rows'    , 500 ),
                                  ('display.max_columns' , 500 ),
                                  ('display.width'       , 1000),
                                  ('display.max_colwidth', 100 )]:
    pd.set_option(option_name, option_value)


# previewing the first five rows
titanic.head(n = 5)

<br>

In [None]:
# stratified train-test split (25% of rows held out for testing)
# NOTE(review): titanic_data and titanic_target are not defined in any cell
# visible above this one - confirm they are created before this cell runs,
# otherwise Restart & Run All stops here with a NameError
x_train, x_test, y_train, y_test = train_test_split(
    titanic_data,
    titanic_target,
    test_size    = 0.25,
    random_state = 219,             # fixed seed for a repeatable split
    stratify     = titanic_target,  # preserving class balance in both sets
)


# placing the training features and response side by side for statsmodels
titanic_train = pd.concat(objs = [x_train, y_train], axis = 'columns')

<br>

In [None]:
# explanatory variable sets carried over from last session

# candidate models keyed by name; each value lists the model's x-variables
# NOTE(review): 'pclass_3' appears in 'logit_sig_2' but not in 'logit_full' -
# confirm that column exists in the dataset
candidate_dict = dict(

    # full model
    logit_full  = ['age', 'sibsp', 'parch', 'fare',
                   'm_age', 'm_cabin', 'm_home_dest',
                   'potential_youth', 'child', 'number_of_names',
                   'pclass_1', 'pclass_2', 'female'],

    # significant variables only (set 1)
    logit_sig   = ['age', 'sibsp', 'm_cabin',
                   'number_of_names', 'pclass_1', 'female'],

    # significant variables only (set 2)
    logit_sig_2 = ['age', 'm_cabin', 'number_of_names',
                   'pclass_2', 'pclass_3', 'female'],
)

<br>

In [None]:
# INSTANTIATING a classification tree object
# (depth and leaf-size limits constrain how far the tree can grow)
tree_pruned = DecisionTreeClassifier(max_depth        = 4,
                                     min_samples_leaf = 25,
                                     random_state     = 219)


# FITTING the training data
tree_pruned_fit = tree_pruned.fit(x_train, y_train)


# PREDICTING on new data (class labels, not probabilities)
tree_pruned_pred = tree_pruned_fit.predict(x_test)


# SCORING the model once and saving the results for future use
# built-in round() is used instead of the .round() method because .round()
# only exists on NumPy scalars; round() also works on plain Python floats
pruned_tree_train_score = round(tree_pruned_fit.score(x_train, y_train), 4) # accuracy
pruned_tree_test_score  = round(tree_pruned_fit.score(x_test, y_test), 4)   # accuracy


# saving auc score (computed from the predicted class labels above)
pruned_tree_auc_score   = round(roc_auc_score(y_true  = y_test,
                                              y_score = tree_pruned_pred), 4) # auc


# displaying the saved scores
print('Training ACCURACY:', pruned_tree_train_score)
print('Testing  ACCURACY:', pruned_tree_test_score)
print('AUC Score        :', pruned_tree_auc_score)

<br>

In [None]:
# predicted class labels for the test set
tree_pruned.predict(X = x_test)

<br>

In [None]:
# predicted class probabilities for the test set
tree_pruned.predict_proba(X = x_test)

<br>

In [None]:
# storing predicted probabilities with labeled columns
# (first column labeled as the negative class, second as the positive class)
predictions_df = pd.DataFrame(data    = tree_pruned.predict_proba(X = x_test),
                              columns = ['negative', 'positive'])

# probabilities of prediction for 1 (positive class)
predictions_df['positive']

<br>

In [None]:
# dropping y_test's existing index so its rows line up by position
# with the freshly indexed predictions_df
y_test_aligned = pd.Series(y_test.reset_index(drop = True))

# probabilities alongside the actual outcomes
predictions_df.join(y_test_aligned)

<br>

In [None]:
# changing the prediction threshold

# flagging at 0.60 for positive class
# (1 when the positive-class probability is at least 0.60, otherwise 0)
# a single vectorized comparison replaces the row-by-row loop; re-running
# the cell simply rebuilds the column from the probabilities
predictions_df['predictions_at_60'] = (
    predictions_df['positive'] >= 0.60).astype('int64')


# outputting AUC score (p = 0.60 for positive class)
# built-in round() works on both NumPy scalars and plain Python floats
round(roc_auc_score(y_true  = y_test,
                    y_score = predictions_df['predictions_at_60']), 4) # auc

<br>

In [None]:
# original AUC score (p = 0.50 for positive class)
# built-in round() works on both NumPy scalars and plain Python floats,
# whereas the .round() method only exists on NumPy scalars
round(roc_auc_score(y_true  = y_test,
                    y_score = tree_pruned_pred), 4) # auc

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" /><br>

~~~
   ____ _                     _ 
  / ___| | __ _ ___ ___ _   _| |
 | |   | |/ _` / __/ __| | | | |
 | |___| | (_| \__ \__ \ |_| |_|
  \____|_|\__,_|___/___/\__, (_)
                        |___/   
                  
~~~

<hr style="height:.9px;border:none;color:#333;background-color:#333;" />
<hr style="height:.9px;border:none;color:#333;background-color:#333;" />

<br>