In [None]:
#Import your Libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
%matplotlib inline

In [None]:
# %%timeit -n 1
# Load your data  -- start with CreditScoring.csv... then online retail
df = pd.read_csv('https://raw.githubusercontent.com/fenago/classify/main/data/CreditScoring.csv')

## Notes

This session covers data collection and some procedures of data preparation. 

**Commands, functions, and methods:** 

* `!wget` - Linux shell command for downloading data 
* `pd.read_csv()` - read csv files 
* `df.head()` - take a look of the dataframe 
* `df.head().T` - take a look of the transposed dataframe 
* `df.columns` - retrieve column names of a dataframe 
* `df.columns.str.lower()` - lowercase all the letters 
* `df.columns.str.replace(' ', '_')` - replace the space separator 
* `df.dtypes` - retrieve data types of all series 
* `df.index` - retrieve indices of a dataframe
* `pd.to_numeric()` - convert a series' values to numeric values. The `errors='coerce'` argument replaces values that cannot be parsed with `NaN` instead of raising an error. 
* `df.fillna()` - replace NAs with some value 
* `(df.x == "yes").astype(int)` - convert x series of yes-no values to numerical values.
* `df['Weight'] = df['Weight'].astype(int)` - this takes a single column of data and converts the data type

In [None]:
len(df)

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
# Pairwise correlation of the numeric/boolean columns only: text columns cannot
# be correlated, and a bare df.corr() raises on them in recent pandas releases.
df.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Basic Data Cleaning
# A: normalise the column headers (lowercase, spaces -> underscores).
lowered_headers = df.columns.str.lower()
df.columns = lowered_headers.str.replace(' ', '_')

# B: collect the names of the columns stored with the 'object' dtype.
is_object_column = df.dtypes == 'object'
string_columns = df.dtypes[is_object_column].index.tolist()

# C: apply the same normalisation to the values of each of those columns.
for col in string_columns:
    lowered_values = df[col].str.lower()
    df[col] = lowered_values.str.replace(' ', '_')

In [None]:
# MAKE SURE THAT YOU WRANGLE YOUR DATA.  THIS CELL IS AN EXAMPLE OF THE KIND OF CLEANING THAT IS NEEDED.
# It is specific to the CreditScoring dataset, where 99999999 is used as a
# placeholder in several numeric columns and must be turned into NaN.
# Adapt it (or skip it) when you work with a different dataset.
sentinel_columns = ['income', 'assets', 'debt']
for c in sentinel_columns:
    without_sentinel = df[c].replace(to_replace=99999999, value=np.nan)
    df[c] = without_sentinel

# Also make sure to treat the target variable: keep only rows whose status is not 'unk'.
has_known_status = df.status != 'unk'
df = df[has_known_status]

In [None]:
df['status'].unique()

In [None]:
df.head()

In [None]:
df.head().T

In [None]:
# Categorical Values will be encoded with the DictVectorizer
# Numerical Values: At a minimum - clean the missing values and consider scaling

### Create Visuals so you can gain a business understanding of your data

In [None]:
# Plot the distribution of the target variable to look for major class imbalances.
# Replace df.status and the 'status' axis label / title with your own target variable.
fig, ax = plt.subplots(figsize=(6, 4))

sns.histplot(df.status, bins=40, color='black', alpha=1, ax=ax)
ax.set_ylabel('Frequency')
ax.set_xlabel('status')
ax.set_title('status')

plt.show()

## Notes
* (1) Check for NaN under a single DataFrame column:

* `df['your column name'].isnull().values.any()`

* (2) Count the NaN under a single DataFrame column:

`df['your column name'].isnull().sum()`

* (3) Check for NaN under an entire DataFrame:

`df.isnull().values.any()`

* (4) Count the NaN under an entire DataFrame:

`df.isnull().sum().sum()`

In [None]:
# Check for nulls --- you do NOT want nulls when you train
df.isnull().sum()

In [None]:
# Keep only the rows whose status is not equal to 0, then count the rows per status value.
# NOTE(review): an earlier cell filters status against the string 'unk'; confirm
# whether a numeric 0 can actually occur in this column for your dataset.
df = df[df.status != 0]
df.status.value_counts() 

In [None]:
df.head()

In [None]:
#delete columns --- this may or may NOT be needed.  As before - skip if you don't need it
# You will encounter times where you will want to delete columns.  This is how you do that.
# df = df.drop(['x5_latitude', 'x6_longitude', 'x1_transaction_date'], axis=1)
# df

In [None]:
# Split the data into train and test sets (80/20); no separate validation set is created here.
from sklearn.model_selection import train_test_split

df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)

# Replace nulls with 0's - these are pandas dataframes
df_train_full, df_test = df_train_full.fillna(0), df_test.fillna(0)

# Show the size of each partition.
len(df_train_full), len(df_test)

In [None]:
# Separate the target from the features for the train and test sets.
# The targets become numpy ndarrays; 'status' is the target variable here.
# Replace with your target variable!!!
# pop() returns the column and removes it from the frame in a single step.
y_train = df_train_full.pop('status').values
y_test = df_test.pop('status').values


In [None]:
len(y_train),len(y_test)

## Notes
* `np.all(np.isfinite(X_train))` - `True` only when every value in the feature matrix is finite (no NaN or infinity)
* `np.any(np.isnan(X_train))` - are there any NaN values in the feature matrix?
* Using `isna()` to select all rows of a DataFrame that contain at least one NaN: `df[df.isna().any(axis=1)]`
* Using `isnull()` (an alias of `isna()`) to select all rows of a DataFrame that contain at least one NaN: `df[df.isnull().any(axis=1)]`

In [None]:
# Turn each data frame into a LIST of DICTIONARIES: one dictionary per record,
# keyed by column name.
dict_train, dict_test = (
    frame.to_dict(orient='records') for frame in (df_train_full, df_test)
)

In [None]:
# Convert the LIST OF DICTIONARIES into a Feature Matrix (does all of the encoding)
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

# Fit the encoding on the training records only, then reuse it for the test records.
X_train = dv.fit_transform(dict_train)
X_test = dv.transform(dict_test)

# Feature names in the column order of the matrices above (a Python list).
# The feature_names_ attribute is used because DictVectorizer.get_feature_names()
# is no longer available in current scikit-learn releases.
features = dv.feature_names_

In [None]:
features

In [None]:
X_test.shape

In [None]:
# Compare Algorithms with the Algorithm Harness
# https://scikit-learn.org/stable/supervised_learning.html#supervised-learning
from sklearn.metrics import roc_auc_score
from time import time
from sklearn.metrics import explained_variance_score,mean_absolute_error,r2_score
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Candidate classifiers as (short name, unfitted estimator) pairs.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC(gamma='auto')),
]

# evaluate each model in turn
results = []
names = []
scoring = 'roc_auc'
# you can also use other metrics to score your models: https://iqcode.com/code/python/sklearn-cross-val-score-scoring-options

# The fold definition does not depend on the model, so it is built once.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)

for name, model in models:
    # Time one fit on the full training set.
    start = time()
    model.fit(X_train, y_train)
    train_time = time() - start

    # Time the 10-fold cross-validation on its own clock. Previously this figure
    # was measured from the start of the fit and reported as "Prediction time".
    start = time()
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    cv_time = time() - start

    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    print("Score for each of the 10 K-fold tests: ",cv_results)
    print(model)
    print("\tTraining time: %0.3fs" % train_time)
    print("\tCross-validation time: %0.3fs" % cv_time)
    print()

# boxplot algorithm comparison: one box per model, built from its per-fold scores
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

# Once you identify a single model or two - begin to investigate
## MODIFY THIS TO THE ALGORITHM THAT YOU CHOOSE.  THIS IS AN EXAMPLE WITH A TREE

In [None]:
# %%timeit -n 1
# if you uncomment %%timeit it will not put dt into memory
# Let's assume that the decision tree is the one we want to explore
from sklearn.tree import DecisionTreeClassifier
# A fixed random_state makes the fitted tree (and every metric computed from it
# further down) reproducible across runs; 42 matches the tuned trees used later.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

In [None]:
# This will give you the list of Hyperparameters of your model
dt.get_params()

In [None]:
# You have converted the dataframe into a list of dictionaries - validate the headers
# Only the last expression of a cell is displayed, so the types are printed explicitly.
print(type(X_train))
print(type(dv.feature_names_))
print(type(dt.feature_importances_))
# Feature names in matrix column order. feature_names_ replaces the
# get_feature_names() method, which current scikit-learn releases no longer provide.
dv.feature_names_

In [None]:
# These are the model properties.  You can call all of these
def get_properties(model):
    """Return the names of the instance attributes of `model` that end with '_'.

    Args:
        model: any object with a ``__dict__``.

    Returns:
        list of attribute names, in the order they appear in ``model.__dict__``.
    """
    trailing_underscore = []
    for attribute in model.__dict__:
        if attribute.endswith('_'):
            trailing_underscore.append(attribute)
    return trailing_underscore
get_properties(dt)

In [None]:
# Rough view of the tree - hard to read but can be usable for research.  Will show a clean tree later
# from sklearn.tree import export_text 
 
# tree_text = export_text(dt, feature_names=dv.feature_names_) 
# print(tree_text)

In [None]:
# Very important.  Once you have a trained model - interrogate it to see WHAT is important.
# For the decision tree this prints each feature's importance next to its name.
feature_names = dv.feature_names_
for importance, feature in zip(dt.feature_importances_, feature_names):
    print('%.3f' % importance, feature)

# Confusion Matrix represent predictions vs Actuals on Test Data
The prediction results in the diagram below can be read in the following manner, given that 1 represents the positive class. This will be different if the target has more than 2 classes. <br />
![image info](https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse1.mm.bing.net%2Fth%3Fid%3DOIP.g4tLqo_z92Hk7NZ-JXdziwHaD1%26pid%3DApi&f=1)

### True Positive (TP): True positive represents the value of correct predictions of positives out of actual positive cases. 
### False Positive (FP): False positive represents the value of incorrect positive predictions. 
### True Negative (TN): True negative represents the value of correct predictions of negatives out of actual negative cases. 
### False Negative (FN): False negative represents the value of incorrect negative predictions.

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt

# predict_proba predicts the probability and predict just predicts the category
# y_pred = dt.predict_proba(X_test)[:, 1]
y_pred = dt.predict(X_test)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the confusion matrix with Matplotlib: rows are actual classes,
# columns are predicted classes, and each cell is annotated with its count.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Oranges, alpha=0.3)
for i, j in np.ndindex(conf_matrix.shape):
    ax.text(x=j, y=i, s=conf_matrix[i, j], va='center', ha='center', size='xx-large')

ax.set_xlabel('Predictions', fontsize=18)
ax.set_ylabel('Actuals', fontsize=18)
ax.set_title('Confusion Matrix', fontsize=18)
plt.show()

## Precision: 
Model precision score represents the model’s ability to correctly predict the positives out of all the positive predictions it made. The precision score is a useful measure of the success of prediction when the classes are very imbalanced. Mathematically, it represents the ratio of true positive to the sum of true positive and false positive.
#### Precision Score = TP / (FP + TP)

The precision score can be used in scenarios where the positive predictions of the machine learning model must be trustworthy, i.e. where false positives must be avoided. For example, machine learning models are used in medical diagnosis applications where the doctor wants a model that will not assign a label of pneumonia if the patient does not have the disease. Oncologists want models that flag cancerous lesions without producing false-positive results, and hence one would use a precision score in such cases.
The other example where the precision score can be useful is credit card fraud detection. In credit card fraud detection problems, classification models are evaluated using the precision score to determine how many positive samples were correctly classified by the classification model. You would not like to have a high number of false positives or else you might end up blocking many credit cards and hence a lot of frustrations with the end-users.

In [None]:
# Read this as a percentage - What proportion of positive identifications was actually correct?
print('Precision: %.3f' % precision_score(y_test, y_pred))

## Recall: 
Model recall score represents the model’s ability to correctly predict the positives out of actual positives. This is unlike precision, which measures how many of the positive predictions made by the model are actually positive. For example: if your machine learning model is trying to identify positive reviews, the recall score is the percentage of all actual positive reviews that the model correctly predicted as positive. In other words, it measures how good the model is at finding all actual positives that exist within a dataset. The higher the recall score, the better the model is at identifying the actual positive examples; recall says nothing about how well negative examples are identified. Recall score is a useful measure of success of prediction when the classes are very imbalanced.  Mathematically, it represents the ratio of true positive to the sum of true positive and false negative.
#### Recall Score = TP / (FN + TP)

Recall score can be used in the scenario where the labels are not equally divided among classes. For example, if there is a class imbalance ratio of 20:80 (imbalanced data), then the recall score will be more useful than accuracy because it can provide information about how well the machine learning model identified rarer events.

In [None]:
print('Recall: %.3f' % recall_score(y_test, y_pred))

## Model accuracy:
is a machine learning model performance metric that is defined as the ratio of true positives and true negatives to all positive and negative observations. In other words, accuracy tells us how often we can expect our machine learning model to correctly predict an outcome out of the total number of times it made predictions. For example: let’s assume that you were testing your machine learning model with a dataset of 100 records and that the model predicted 90 of those instances correctly. The accuracy metric, in this case, would be: (90/100) = 90%. The accuracy rate is great but it doesn’t tell us anything about the kinds of errors our machine learning model makes on new data it hasn’t seen before.
Mathematically, it represents the ratio of the sum of true positive and true negatives out of all the predictions.  Use this measure with extreme caution.  It can be very misleading.
#### Accuracy Score = (TP + TN)/ (TP + FN + TN + FP)

In [None]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

## Model F1 score:
represents the model score as a function of precision and recall score. F-score is a machine learning model performance metric that gives equal weight to both the Precision and Recall for measuring its performance in terms of accuracy, making it an alternative to Accuracy metrics (it doesn’t require us to know the total number of observations). It’s often used as a single value that provides high-level information about the model’s output quality. This is a useful measure of the model in the scenarios where one tries to optimize either of precision or recall score and as a result, the model performance suffers. The following represents the aspects relating to issues with optimizing either precision or recall score:
Optimizing for recall helps with minimizing the chance of not detecting a malignant cancer. However, this comes at the cost of predicting malignant cancer in patients although the patients are healthy (a high number of FP).
Optimize for precision helps with correctness if the patient has a malignant cancer. However, this comes at the cost of missing malignant cancer more frequently (a high number of FN).
Mathematically, it can be represented as harmonic mean of precision and recall score.  Ranges from 0 to 1 and can be read as a percentage when multiplied by 100
#### F1 Score = 2 * Precision Score * Recall Score / (Precision Score + Recall Score)

In [None]:
# This is just for the TRUE --- Not the False
print('F1 Score: %.3f' % f1_score(y_test, y_pred))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

In [None]:
# Look to see if your target variables are balanced.  If they are NOT then consider looking at Recall and Precision
# If your target variables are balanced - then consider using accuracy as the defining metric to judge the "goodness"
df['status'].value_counts()

In [None]:
# Same info can be found in the score report
# Be very careful about the data types: predict_proba returns float probabilities,
# while predict returns class labels of the same kind as y_test.
# f1_score compares labels with labels, so the predictions must come from predict().
# Probabilities are not labels and cannot be compared with y_test directly.
from sklearn.metrics import f1_score
y_pred = dt.predict(X_test)
# average=None returns one F1 score per class instead of a single number.
f1_score(y_test, y_pred, average=None)

In [None]:
# Evalation Metrics (really more advanced but added here for context)
# https://github.com/sepandhaghighi/pycm
# Make sure the target variables in the y_test and y_pred are exactly the same units (not 0/1 in one and True/False in the other)
!pip install pycm
from pycm import ConfusionMatrix
cm = ConfusionMatrix(actual_vector=y_test,predict_vector=y_pred)
# cm = ConfusionMatrix(y_actu, y_pred, classes=[1,0,4])
print(cm)

In [None]:
# Classification metrics - you can skip or run and print the metric you would like to look at.
import numpy as np
from sklearn.metrics import confusion_matrix

cnf_matrix = confusion_matrix(y_test, y_pred)
print(cnf_matrix)
# Example layout for three classes (rows = actual, columns = predicted):
#[[1 1 3]
# [3 2 2]
# [1 3 1]]

# One-vs-rest counts, one entry per class.
TP = np.diag(cnf_matrix).astype(float)            # correctly predicted as this class
FP = cnf_matrix.sum(axis=0).astype(float) - TP    # predicted as this class, actually another
FN = cnf_matrix.sum(axis=1).astype(float) - TP    # actually this class, predicted as another
TN = cnf_matrix.sum() - (FP + FN + TP)            # everything else


def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, giving 0.0 wherever the denominator is 0.

    This replaces the former "+ .01" added to every denominator, which avoided
    division by zero but also shifted every reported rate away from its true value.
    """
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator, dtype=float),
        where=denominator != 0,
    )


# Sensitivity, hit rate, recall, or true positive rate
TPR = _safe_ratio(TP, TP + FN)
# Specificity or true negative rate
TNR = _safe_ratio(TN, TN + FP)
# Precision or positive predictive value
PPV = _safe_ratio(TP, TP + FP)
# Negative predictive value
NPV = _safe_ratio(TN, TN + FN)
# Fall out or false positive rate
FPR = _safe_ratio(FP, FP + TN)
# False negative rate
FNR = _safe_ratio(FN, TP + FN)
# False discovery rate
FDR = _safe_ratio(FP, TP + FP)
# Overall accuracy
ACC = _safe_ratio(TP + TN, TP + FP + FN + TN)


## Take a peek at your predictions

In [None]:
# Take a look at the first 10 rows and compare the predictions.
# predict() already returns class labels, so they are shown as-is: rounding is
# unnecessary for numeric labels and is not defined for text labels.
pred_y = dt.predict(X_test)
print("The first 10 prediction {}".format(pred_y[:10]))
print("The real first 10 labels {}".format(y_test[:10]))

## How to make a PREDICTION with brand new values

In [None]:
type(df_train_full.head(1))

In [None]:
# Use double brackets around the iloc to force it to return a pandas dataframe and not a series
# Then you can convert ANY record into a dictionary.
# This is important because when you want to make a prediction - it must be in this format
df_train_full.iloc[[21]]

In [None]:
# How to convert any pandas row into a dictionary... needed for predictions
df_train_full.iloc[[213]].to_dict('records')[0]

In [None]:
# How to convert any pandas row into a dictionary... needed for predictions
df_train_full.head(21).to_dict('records')[0]

In [None]:
# Pick one training record by position: `item` is its feature dictionary and
# `actual` is the matching target (kept as a one-element array).
row_position = 213
selected_row = df_train_full.iloc[[row_position]]
item = selected_row.to_dict('records')[0]
actual = y_train[[row_position]]

In [None]:
# Predict the label of a single record.
def model_prediction(item, dv, model):
    """Return the label that `model` predicts for one record.

    Args:
        item: a single record, passed to the vectorizer wrapped in a one-element list.
        dv: object exposing ``transform`` (here, the fitted DictVectorizer).
        model: object exposing ``predict``.

    Returns:
        The first element of ``model.predict``'s output for that record.
    """
    single_row = dv.transform([item])
    return model.predict(single_row)[0]

In [None]:
# Predict a class probability for a single record.
def model_prediction_proba(item, dv, model, class_index=1):
    """Return the probability that `model` assigns to one class for one record.

    Args:
        item: a single record, passed to the vectorizer wrapped in a one-element list.
        dv: object exposing ``transform`` (here, the fitted DictVectorizer).
        model: object exposing ``predict_proba`` that returns a 2-D array with
            one row per record and one column per class.
        class_index: column of the ``predict_proba`` output to return. The
            default of 1 keeps the original behaviour (second column).

    Returns:
        The probability in column `class_index` for the record.
    """
    X = dv.transform([item])
    y_pred = model.predict_proba(X)[:, class_index]
    return y_pred[0]

In [None]:
model_prediction(item,dv,dt)

In [None]:
actual

In [None]:
# How to convert any pandas row into a dictionary... needed for predictions
# This will give you a template that you can manipulate to create predictions with new values
df_train_full.iloc[[2]].to_dict('records')[0]

In [None]:
# The list above can be copied into here.  Take everything after a " : " and put it in a [] so you can pass it in as a pd.DF
# create a DataFrame by transforming scalar values to list   (toggle the weekend)
# Template record: every value is wrapped in a one-element list so that the
# dictionary can be turned into a single-row DataFrame.
# NOTE(review): these keys (administrative, bouncerates, weekend, ...) do not look
# like columns of the CreditScoring data loaded at the top of this notebook -
# confirm that they match the features the vectorizer was fitted on, and replace
# them with a record from your own dataset if they do not.

myItem = {'administrative': [6],
 'administrative_duration': [94.6],
 'informational': [0],
 'informational_duration': [2.0],
 'productrelated': [15],
 'productrelated_duration': [1933.559259],
 'bouncerates': [0.005333333],
 'exitrates': [0.026377261],
 'pagevalues': [167.806338478],
 'specialday': [0.0],
 'month': ['nov'],
 'operatingsystems': [2],
 'browser': [2],
 'region': [4],
 'traffictype': [2],
 'visitortype': ['returning_visitor'],
 'weekend': [False]}

In [None]:
newDF = pd.DataFrame.from_dict(myItem)

In [None]:
item = newDF.to_dict('records')[0]

In [None]:
model_prediction(item,dv,dt)

# Hyperparameter Tuning

In [None]:
# Look at the model parameters.   You can tune these.
dt.get_params()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; every combination is evaluated
# with 3-fold cross-validation.
params = {
    'min_samples_leaf': [20, 3],
    'max_depth': [4, 6, 8],
    'min_samples_split': [2, 4],
}
base_tree = DecisionTreeClassifier(random_state=42)
grid_search_cv = GridSearchCV(base_tree, params, verbose=1, cv=3)
grid_search_cv.fit(X_train, y_train)

In [None]:
grid_search_cv.best_estimator_    # this will output the best values for the hyperparameters

In [None]:
# Refit the decision tree with a chosen set of hyperparameter values.
from sklearn.tree import DecisionTreeClassifier

tuned_settings = {'max_depth': 4, 'min_samples_leaf': 20, 'random_state': 42}
dt = DecisionTreeClassifier(**tuned_settings)
dt.fit(X_train, y_train)

In [None]:
from sklearn.metrics import f1_score
# Score the tuned tree on class labels from predict().
# The previous predict_proba(...).astype('int') truncated every probability
# below 1.0 to 0, so almost every record was scored as class 0.
y_pred = dt.predict(X_test)
# average=None returns one F1 score per class.
f1_score(y_test, y_pred, average=None)

In [None]:
from sklearn.tree import export_graphviz

# Write the best tree found by the grid search to a Graphviz .dot file.
# feature_names_ supplies the column names in matrix order; it replaces the
# get_feature_names() method, which current scikit-learn releases no longer provide.
export_graphviz(
    grid_search_cv.best_estimator_,
    out_file='tree.dot',
    feature_names=dv.feature_names_,
    class_names=None,
    filled=True,
)

In [None]:
!pip install pydot
import pydot

(graph,) = pydot.graph_from_dot_file('tree.dot')
graph.write_png('tree.png')

In [None]:
# You can change the params by editing the output of this and repeating the above steps.
dt.get_params()

In [None]:
#Many parameters will take a very long time to load
param = { 'max_depth': [2,3,5,20,40], 
         'max_leaf_nodes': [2,20,200]}

In [None]:
# List the names accepted by the `scoring=` argument.
# metrics.SCORERS is not available in recent scikit-learn releases, so prefer the
# public get_scorer_names() helper and fall back to SCORERS on older versions.
if hasattr(metrics, 'get_scorer_names'):
    scorer_names = sorted(metrics.get_scorer_names())
else:
    scorer_names = sorted(metrics.SCORERS.keys())
scorer_names

In [None]:
from sklearn.model_selection import GridSearchCV, RepeatedKFold

# 10-fold cross-validation repeated 3 times, with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Search the `param` grid around the current tree, scored by accuracy, on all cores.
search = GridSearchCV(estimator=dt, param_grid=param, scoring='accuracy', n_jobs=-1, cv=cv)

# Run the search; fit() returns the fitted search object.
result = search.fit(X_train, y_train)

In [None]:
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)