In [None]:
#Import your Libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
%matplotlib inline

In [None]:
# %%timeit -n 1
# Load your data -- start with CreditScoring.csv, then Life Expectancy, then a dataset of your choice.
# The path is relative to the notebook's working directory.
csv_path = './CreditScoring.csv'
df = pd.read_csv(csv_path)

In [None]:
# Number of rows in the frame loaded from the CSV.
len(df)

In [None]:
# Summary statistics as produced by DataFrame.describe().
df.describe()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Count of distinct values per column.
df.nunique()

In [None]:
# Pairwise correlation of the numeric columns only.
# This cell runs before any cleaning, so the frame may still hold text columns;
# pandas >= 2.0 raises on those instead of silently dropping them.
df.select_dtypes(include='number').corr()

In [None]:
# Basic Data Cleaning
# A: normalise the column labels -- lower-case, spaces become underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# B: collect the names of every object-dtype column.
is_object_column = (df.dtypes == 'object').values
string_columns = df.columns[is_object_column].tolist()

# C: apply the same normalisation to the values of those columns.
for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

In [None]:
# Preview the first rows after the label/value normalisation above.
df.head()

In [None]:
# Same preview, transposed so each column becomes a row.
df.head().T

In [None]:
# Categorical values will be encoded with the DictVectorizer
# Numerical values: at a minimum, clean the missing values and placeholder codes before training

In [None]:
# In the CreditScoring dataset the value 99999999 is used as a placeholder in several
# numeric columns; swap it for NaN so it is treated as missing.
# Obviously don't run this with your own dataset.
for column in ('income', 'assets', 'debt'):
    df[column] = df[column].replace(99999999, np.nan)

# Also treat the target variable: keep only the rows whose status is not 'unk'.
known_status = df['status'] != 'unk'
df = df[known_status]

In [None]:
# Replace with your target variable --- df.YOUR_TARGET_VARIABLE
# Also replace the x label and the title so they name the plotted column.
plt.figure(figsize=(6, 4))

sns.histplot(df.status, bins=40, color='black', alpha=1)
plt.ylabel('Frequency')
plt.xlabel('status')
# The plotted column is the target `status`, so the title must say so.
plt.title('Distribution of status')

plt.show()

In [None]:
# Missing values per column --- you do NOT want nulls when you train.
df.isna().sum()

In [None]:
# Preview the first rows again after the sentinel replacement.
df.head()

In [None]:
#delete columns --- this may or may NOT be needed.  As before - skip if you don't need it
# You will encounter times where you will want to delete columns.  This is how you do that.
# df = df.drop(['x5_latitude', 'x6_longitude', 'x1_transaction_date'], axis=1)
# df

In [None]:
# Disabled alternative: the 60/20/20 train/validation/test split below is wrapped in a
# string literal, so none of it executes; the cell only evaluates the string itself.
'''
# Split the data into test, train, validation sets... 60/20/20
from sklearn.model_selection import train_test_split
# This gives the 80/20 train test split
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)
# This splits df_train_full again so it is 60/20/20
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=11)
len(df_train), len(df_val), len(df_test)
# Replace nulls with 0's - these are pandas dataframes
df_train = df_train.fillna(0)
df_val = df_val.fillna(0)
df_test = df_test.fillna(0)
len(df_train),len(df_val),len(df_test)
'''

In [None]:
# Split the data into train and test sets... 80/20 (no separate validation set here;
# the model-comparison cell uses k-fold cross-validation on the training part instead).
from sklearn.model_selection import train_test_split

# A fixed random_state keeps the split identical between runs.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)

# Replace nulls with 0's - these are pandas dataframes
df_train_full = df_train_full.fillna(0)
df_test = df_test.fillna(0)

# Only the last expression of a cell is displayed, so the sizes are shown once, here.
len(df_train_full), len(df_test)

In [None]:
# Separate the target from the features for both splits.
# `status` is the target variable here -- replace it with your own!
# pop() removes the column from the frame and hands it back; .values turns it into a numpy ndarray.
y_train = df_train_full.pop('status').values
y_test = df_test.pop('status').values


In [None]:
# Sizes of the two target arrays.
len(y_train),len(y_test)

In [None]:
# Turn each frame into a LIST of DICTIONARIES: one dictionary per record, keyed by column name.
dict_train, dict_test = (
    frame.to_dict(orient='records') for frame in (df_train_full, df_test)
)

In [None]:
# Convert the LIST OF DICTIONARIES into a feature matrix (the vectorizer does all of the encoding).
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

# The vectorizer is fitted on the training records only; the test records are just transformed.
X_train, X_test = dv.fit_transform(dict_train), dv.transform(dict_test)

# Feature names as they exist in the fitted vectorizer (this is an ndarray).
features = dv.get_feature_names_out()

In [None]:
# (rows, encoded features) of the test matrix.
X_test.shape

In [None]:
# Compare Algorithms
# Fits each candidate on the training matrix, scores it with 10-fold cross-validation,
# prints a per-model report, and draws a box plot of the fold scores.
from sklearn.metrics import roc_auc_score
from time import time
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# (label, estimator) pairs; the label is used in the report and on the plot axis.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
# The fold layout does not depend on the model, so build it once, outside the timed region.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
for name, model in models:
    # Time the single fit on the full training matrix.
    start = time()
    model.fit(X_train, y_train)
    train_time = time() - start

    # Time cross-validation on its own clock; previously this figure also included
    # the fit above and was reported as "prediction time".
    cv_start = time()
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    cv_time = time() - cv_start

    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    #y_pred = model.predict_proba(X_train)[:, 1]
    #auc = roc_auc_score(y_train, y_pred)
    print(msg)
    print("Score for each of the 10 K-fold tests: ",cv_results)
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tCross-validation time: %0.3fs" % cv_time)
    #y_pred = model.predict(X_test)
    #print("\tExplained variance:", explained_variance_score(y_test, y_pred))
    print()

# boxplot algorithm comparison: one box per model, built from its fold scores
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

# Once you identify a single model or two - begin to investigate

In [None]:
# %%timeit -n 1
# if you uncomment %%timeit it will not put dt into memory
# Let's assume that the decision tree is the one we want to explore
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the fitted model (and everything derived from it below) is the same
# on every run; 42 matches the seed used for the tree in the grid-search cell.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

In [None]:
# Hyperparameters the tree was constructed with.
dt.get_params()

In [None]:
# A cell only displays its last expression, so bundle the three types into one tuple
# to see all of them: feature matrix, vectorizer feature names, tree importances.
(
    type(X_train),
    type(dv.get_feature_names_out()),
    type(dt.feature_importances_),
)

In [None]:
# These are the model properties.  You can call all of these
def get_properties(model):
    """Return the names in ``model.__dict__`` that end with an underscore.

    The names are returned in the order they appear in the instance dictionary.
    """
    names = []
    for attribute in model.__dict__:
        if attribute.endswith('_'):
            names.append(attribute)
    return names
# List the underscore-suffixed attributes present on the fitted tree.
get_properties(dt)

In [None]:
from sklearn.tree import export_text

# Render the fitted tree as indented text rules, labelled with the vectorizer's feature names.
tree_text = export_text(decision_tree=dt, feature_names=dv.feature_names_)
print(tree_text)

In [None]:
feature_names = dv.feature_names_
# Print each feature's importance next to its name to see what the tree relies on.
for feature_name, importance in zip(feature_names, dt.feature_importances_):
    print('%.3f' % importance, feature_name)

In [None]:
from sklearn.metrics import f1_score

# Score hard class predictions. Casting predict_proba(...)[:, 1] to int truncates the
# probabilities to 0/1, which are not the class labels held in y_test; predict() returns
# labels drawn from the same set the model was trained on.
y_pred = dt.predict(X_test)

# average=None yields one F1 value per class.
f1_score(y_test, y_pred, average=None)

In [None]:
# dtype of the prediction array built in the previous cell.
y_pred.dtype

In [None]:
# https://github.com/sepandhaghighi/pycm
!pip install pycm
from pycm import ConfusionMatrix
cm = ConfusionMatrix(actual_vector=y_test,predict_vector=y_pred)
# cm = ConfusionMatrix(y_actu, y_pred, classes=[1,0,4])
print(cm)

In [None]:
# Container type of the test labels.
type(y_test)

In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix


def _safe_ratio(numerator, denominator):
    """Divide element-wise, returning 0.0 wherever the denominator is 0.

    Replaces the former "+ .01" in every denominator, which avoided division by zero
    but also shifted every rate away from its true value (e.g. 1/1 became 0.990).
    """
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )


cnf_matrix = confusion_matrix(y_test, y_pred)
print(cnf_matrix)
#[[1 1 3]
# [3 2 2]
# [1 3 1]]

# One-vs-rest counts per class, derived from the matrix margins and its diagonal.
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)

FP = FP.astype(float)
# print(FP)
FN = FN.astype(float)
TP = TP.astype(float)
TN = TN.astype(float)

# Sensitivity, hit rate, recall, or true positive rate
TPR = _safe_ratio(TP, TP + FN)
# Specificity or true negative rate
TNR = _safe_ratio(TN, TN + FP)
# Precision or positive predictive value
PPV = _safe_ratio(TP, TP + FP)
# Negative predictive value
NPV = _safe_ratio(TN, TN + FN)
# Fall out or false positive rate
FPR = _safe_ratio(FP, FP + TN)
# False negative rate
FNR = _safe_ratio(FN, TP + FN)
# False discovery rate
FDR = _safe_ratio(FP, TP + FP)
# Overall accuracy
ACC = (TP+TN)/(TP+FP+FN+TN)


The ROC AUC is defined for binary classification problems, but it can be extended to multiclass problems with the one-vs-rest (one-vs-all) technique: compute the ROC curve for each label in turn, treating all of the other labels as a single combined label.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Each report is printed under its heading; sep="\n" puts the heading on its own line.
print("For classification report:", classification_report(y_test, y_pred), sep="\n")
print("For confusion matrix", confusion_matrix(y_test, y_pred), sep="\n")

In [None]:
# Predict the test set, then show the first ten predictions beside the first ten true labels.
pred_y = dt.predict(X_test)
for template, values in (
    ("The first 10 prediction {}", pred_y[:10].round(0)),
    ("The real first 10 labels {}", y_test[:10]),
):
    print(template.format(values))



In [None]:
# Type of the object returned by head(1) on the training frame.
type(df_train_full.head(1))

In [None]:
# Use double brackets around the iloc to force it to return a pandas dataframe and not a series
# Then you can convert any record into a dictionary
# 21 is a position in the frame (iloc is positional), not an index label.
df_train_full.iloc[[21]]

In [None]:
# How to convert any pandas row into a dictionary... needed for predictions
# to_dict('records') returns a list with one dictionary per row; [0] takes the only one.
df_train_full.iloc[[213]].to_dict('records')[0]

In [None]:
# Converts the first 21 rows into a list of dictionaries and keeps element [0],
# i.e. the dictionary for the first row of the training frame.
df_train_full.head(21).to_dict('records')[0]

In [None]:
# Pick one training record by position: `item` is its feature dictionary and `actual`
# is the matching label, kept as a one-element array.
row_position = 213
item = df_train_full.iloc[[row_position]].to_dict('records')[0]
actual = y_train[[row_position]]

In [None]:
# The item to be predicted is passed in.
def model_prediction(item, dv, model):
    """Predict for a single record.

    item:  one record (a feature dictionary).
    dv:    object whose ``transform`` turns a list of records into model input.
    model: object whose ``predict`` accepts that input.

    Returns the first element of the model's prediction.
    """
    # transform() works on a list of records, so wrap the single item.
    encoded = dv.transform([item])
    return model.predict(encoded)[0]

In [None]:
# Predict the record picked above with the fitted decision tree.
model_prediction(item,dv,dt)

In [None]:
# True label of that record, for comparison with the prediction.
actual

In [None]:
# Current hyperparameters of the tree, as a starting point for tuning.
dt.get_params()

# Hyperparameter Tuning

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Every combination of these two hyperparameters is tried with 3-fold cross-validation.
params = dict(
    max_leaf_nodes=list(range(2, 100)),
    min_samples_split=[2, 3, 4],
)
grid_search_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    cv=3,
    verbose=1,
)
grid_search_cv.fit(X_train, y_train)

In [None]:
# The estimator refitted with the best-scoring hyperparameter combination.
grid_search_cv.best_estimator_    # this will output the best values for the hyperparameters

In [None]:
from sklearn.tree import export_graphviz

# Write the best tree found by the grid search to tree.dot.
# Feature and class names are passed so the nodes show real names rather than
# positional placeholders, matching what the export_text cell already does.
best_tree = grid_search_cv.best_estimator_
export_graphviz(
    best_tree,
    out_file='tree.dot',
    feature_names=dv.feature_names_,
    # class_names must be strings, listed in the same order as the fitted classes_.
    class_names=[str(label) for label in best_tree.classes_],
    filled=True,
)

In [None]:
!pip install pydot
import pydot

(graph,) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')

In [None]:
# You can change the params by editing the output of this and repeating the above steps.
# Shows the hyperparameters of `dt`, the tree fitted earlier (not the grid-search winner).
dt.get_params()

In [None]:
# Candidate values per hyperparameter for the search below.
# Keep the grid small: every extra value multiplies the number of fits.
param = dict(
    max_depth=[2, 3, 5, 20, 40],
    max_leaf_nodes=[2, 20, 200],
)

In [None]:
# Names accepted by the `scoring=` argument.
# metrics.SCORERS was removed in scikit-learn 1.3; get_scorer_names() is the supported replacement.
metrics.get_scorer_names()

In [None]:
from sklearn.model_selection import GridSearchCV, RepeatedKFold

# 10 folds repeated 3 times, seeded so the fold assignment is the same on every run.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Search the `param` grid around the existing tree, scored by accuracy, using all cores.
search = GridSearchCV(
    estimator=dt,
    param_grid=param,
    scoring='accuracy',
    n_jobs=-1,
    cv=cv,
)

# fit() returns the fitted search object itself.
result = search.fit(X_train, y_train)

In [None]:
# Report the best cross-validated score and the hyperparameters that achieved it.
for template, value in (
    ('Best Score: %s', result.best_score_),
    ('Best Hyperparameters: %s', result.best_params_),
):
    print(template % value)