<a href="https://colab.research.google.com/github/gvanathip/testing/blob/main/Classification_Case.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
from datetime import datetime
import numpy as np

import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

from sklearn import preprocessing

plt.style.use('ggplot')

In [None]:
from google.colab import files

In [None]:
#download dataset.csv at https://digitalventures-my.sharepoint.com/personal/sarunthorn_t_dv_co_th/_layouts/15/onedrive.aspx?id=%2Fpersonal%2Fsarunthorn%5Ft%5Fdv%5Fco%5Fth%2FDocuments%2FTest%20for%20Data%20scientist%20role&ga=1

In [None]:
# Colab-only step: upload the local dataset.csv into the runtime so the read_csv cell can find it.
uploaded = files.upload() # upload dataset.csv from the local machine

In [None]:
# Load the raw data.
# NOTE(review): on_bad_lines='skip' silently discards malformed rows — consider checking how many were dropped.
df = pd.read_csv('dataset.csv',on_bad_lines='skip')

In [None]:
# Preview the first rows of the raw frame.
df.head()

# EDA and Data Cleansing

In [None]:
# Distinct raw values of the `term` column (a missing value shows up as nan in the set).
set(df.term)

In [None]:
# Rows where `term` is missing; these rows are filtered out a few cells below.
df[df.term.isna()]

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Rows whose `open_acc` cannot be parsed as a number (errors='coerce' turns them into NaN, as it does missing values).
df[pd.to_numeric(df['open_acc'], errors='coerce').isnull()]

In [None]:
# Discard every row that has no `term` value.
df = df.dropna(subset=['term'])

In [None]:
# Keep only rows whose `open_acc` parses as a number; unparsable entries become NaN and are filtered out.
open_acc_numeric = pd.to_numeric(df['open_acc'], errors='coerce')
df = df[open_acc_numeric.notna()]

In [None]:
# Parse the earliest credit line into datetimes.
# `df` is the result of boolean filtering, so writing a column in place triggers
# pandas' SettingWithCopyWarning; `assign` builds a new frame instead.
df = df.assign(earliest_cr_line=pd.to_datetime(df['earliest_cr_line']))

In [None]:
# Credit history length in years, measured from the earliest credit line to the moment this cell runs.
# pandas >= 2.0 rejects np.timedelta64(1, 'Y') as an ambiguous unit, so divide by an explicit
# mean Gregorian year (365.2425 days — the same length numpy's 'Y' unit represents).
# `assign` is used because `df` is a filtered slice (avoids SettingWithCopyWarning).
# NOTE(review): the value depends on the run date, so re-running later shifts this feature.
ONE_YEAR = pd.Timedelta(days=365.2425)
df = df.assign(tenure_year=(pd.to_datetime("now") - df['earliest_cr_line']) / ONE_YEAR)

In [None]:
# Summary statistics for every column (numeric and non-numeric), transposed so each column is a row.
df.describe(include='all').T

In [None]:
# Remove the columns that are not carried forward into the feature set.
columns_to_drop = ['id', 'address', 'earliest_cr_line', 'emp_title', 'title']
df = df.drop(columns=columns_to_drop)

In [None]:
def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Convert every column that is fully numeric; leave the rest untouched.
# This replaces pd.to_numeric(errors='ignore'), which newer pandas releases deprecate.
df1 = df.apply(_to_numeric_or_keep)

In [None]:
# Summary statistics again, now that numeric-looking columns have been converted.
df1.describe(include='all').T

In [None]:
# Dtypes and non-null counts after the numeric conversion.
df1.info()

# Imbalance Checking

In [None]:
# Class balance of the target column.
# The series is passed as `x=`: recent seaborn releases treat the first positional
# argument as `data`, so the positional form no longer counts the values.
g = sns.countplot(x=df['loan_status'])
g.set_title('loan_status class counts')
plt.show()

# Categorical Data Handling (one hot encoding)

In [None]:
# Collect the object-dtype (categorical) predictors, leaving the target out.
predictors = df1.drop(columns=['loan_status'])
obj_df = predictors.select_dtypes(include=['object']).copy()
obj_df.head()

In [None]:
# Summary statistics of the categorical columns.
obj_df.describe()

In [None]:
# Names of the categorical columns that will be one-hot encoded.
list(obj_df.columns)

In [None]:
# One-hot encode each categorical column, prefixing every dummy with its source column name.
dummy_prefixes = obj_df.columns.tolist()
dm = pd.get_dummies(obj_df, prefix=dummy_prefixes)

In [None]:
# Inspect the one-hot encoded frame.
dm

# Numerical Data Processing

In [None]:
# Distinct target labels before they are encoded as numbers.
set(df1.loan_status)

In [None]:
# Encode the target: "Fully Paid" -> 0, "Charged Off" -> 1.
# `replace` alone leans on pandas' deprecated silent downcast to turn the column numeric;
# infer_objects() makes that conversion explicit so the target is still picked up by the
# numeric dtype selection that follows.
loan_status_codes = {"Fully Paid": 0, "Charged Off": 1}
df1 = df1.assign(loan_status=df1['loan_status'].replace(loan_status_codes).infer_objects())

In [None]:
# Numeric part of the data: the integer and float columns (this includes the encoded target).
numeric_kinds = ['int', 'float']
df2 = df1.select_dtypes(include=numeric_kinds).copy()

In [None]:
# Inspect the numeric-only frame.
df2

# Categorical and Numerical Data Integration

In [None]:
# Put numeric columns and one-hot columns side by side, matching rows on the shared index.
df3 = df2.merge(dm, left_index=True, right_index=True)

In [None]:
# Inspect the combined numeric + one-hot frame.
df3

In [None]:
# Summary statistics of the combined frame.
df3.describe()

In [None]:
def clean_dataset(df):
    """Return a float64 copy of `df` without rows that contain NaN or +/-infinity.

    The input frame is left untouched; the surviving rows keep their original
    index labels.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns can all be cast to float64.

    Returns
    -------
    pd.DataFrame
        The rows of `df` that are free of NaN / inf / -inf, cast to float64.

    Raises
    ------
    AssertionError
        If `df` is not a DataFrame.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # dropna() without inplace=True so the caller's frame is not modified as a side effect.
    without_nan = df.dropna()
    # axis must be passed by keyword: pandas >= 2.0 no longer accepts `.any(1)`.
    has_infinite = without_nan.isin([np.inf, -np.inf]).any(axis=1)
    return without_nan.loc[~has_infinite].astype(np.float64)

#https://stackoverflow.com/questions/31323499/sklearn-error-valueerror-input-contains-nan-infinity-or-a-value-too-large-for

In [None]:
# Drop rows with NaN / infinite values and cast everything to float64 before modelling.
df3 = clean_dataset(df3)

# Normalization

In [None]:
# Target vector as a NumPy array.
y = df3['loan_status'].to_numpy()

In [None]:
# Predictor frame: every column except the target.
predictor_columns = df3.columns.drop('loan_status')
x = df3[predictor_columns]

In [None]:
# NOTE(review): preprocessing.normalize defaults to axis=1 with the L2 norm, i.e. it rescales
# each ROW (sample) to unit length instead of scaling each feature column. If per-feature
# scaling was intended, a scaler such as MinMaxScaler/StandardScaler is the usual choice —
# confirm before changing, because it alters every downstream result.
d = preprocessing.normalize(x)
# Wrap the returned array in a DataFrame to restore the column names (the row index restarts at 0).
scaled_df = pd.DataFrame(d, columns=x.columns)
scaled_df.head()

In [None]:
# From here on `x` is the normalized feature matrix as a plain NumPy array.
x = scaled_df.to_numpy()

# Under Sampling to solve imbalance problem

In [None]:
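# Reference on class-imbalance techniques (includes SMOTE, the Synthetic Minority Oversampling Technique); this notebook applies random under-sampling in the cells below.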
#https://www.analyticsvidhya.com/blog/2020/07/10-techniques-to-deal-with-class-imbalance-in-machine-learning/

In [None]:
from collections import Counter

In [None]:
# import library
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample (with replacement, fixed seed) and resample predictors and target together.
rus = RandomUnderSampler(replacement=True, random_state=42)
x_rus, y_rus = rus.fit_resample(x, y)

# Class counts before and after resampling.
counts_before = Counter(y)
counts_after = Counter(y_rus)
print('original dataset shape:', counts_before)
print('Resample dataset shape', counts_after)

In [None]:
# 80/20 hold-out split of the under-sampled data.
# random_state is fixed so the split (and everything trained on it) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(x_rus, y_rus, test_size=0.20, random_state=42)

## Feature Important and selection

In [None]:
# Tree used only to rank features by importance.
# Seeded so the importances, and therefore the selected feature set, are stable across runs.
dtree = DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the tree on the training split; its feature_importances_ are read in the cells below.
dtree.fit(X_train, y_train)

In [None]:
# Predictor column names, in the same order as the columns of the feature matrix.
df3.drop(columns=['loan_status']).columns

In [None]:
# Horizontal bar chart of all importances, ordered from least to most important.
importances = dtree.feature_importances_
predictor_names = df3.drop(columns=['loan_status']).columns
sorted_idx = importances.argsort()
plt.barh(predictor_names[sorted_idx], importances[sorted_idx])
plt.xlabel("Decision Tree Feature Importance")

In [None]:
# Importances in ascending order, stored as a one-column frame.
feature_importance = pd.DataFrame({'importance': dtree.feature_importances_[sorted_idx]})

In [None]:
# Attach the matching column name to every importance value (same ordering as sorted_idx).
sorted_names = df3.drop(columns=['loan_status']).columns[sorted_idx]
feature_importance = feature_importance.assign(name=sorted_names)

In [None]:
# Keep only features whose importance is strictly above 0.02.
passes_threshold = feature_importance['importance'] > 0.02
feature_selected = feature_importance.loc[passes_threshold]

In [None]:
# Order the selected features from most to least important.
feature_selected = feature_selected.sort_values('importance', ascending=False)

In [None]:
# Bar chart limited to the selected features.
selected_names = feature_selected['name']
selected_scores = feature_selected['importance']
plt.barh(selected_names, selected_scores)
plt.xlabel("Decision Tree Feature Importance")

# Rebuild the dataset with only the selected features, then re-split into train and test sets

In [None]:
# Restrict the data to the selected feature columns.
# NOTE(review): the columns are taken from df3 (cleaned but NOT normalized), whereas the
# importances were computed on the normalized values in `scaled_df` — confirm which was intended.
df4 = df3[feature_selected.name]

In [None]:
# Selected-feature matrix as a NumPy array.
x1 = df4.to_numpy()

In [None]:
# Same seeded under-sampling as before, now applied to the selected-feature matrix.
rus = RandomUnderSampler(replacement=True, random_state=42)
x1_rus, y_rus = rus.fit_resample(x1, y)

# Class counts before and after resampling.
counts_before = Counter(y)
counts_after = Counter(y_rus)
print('original dataset shape:', counts_before)
print('Resample dataset shape', counts_after)

In [None]:
# 80/20 hold-out split of the under-sampled, feature-selected data.
# random_state is fixed so model comparison and the final fit are reproducible.
X_train, X_test, y_train, y_test = train_test_split(x1_rus, y_rus, test_size=0.20, random_state=42)

# Train models then select the best (highest roc_auc) with kfold

In [None]:
#https://www.projectpro.io/recipes/compare-sklearn-classification-algorithms-in-python

In [None]:
# Candidate classifiers as (short label, unfitted estimator) pairs.
# The decision tree is seeded so its cross-validation scores do not change between runs.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=42)),
    ('NB', GaussianNB()),
    # ('SVM', SVC()),  # disabled
]

In [None]:
# Score every candidate with 3-fold cross-validation (ROC AUC) on the training split.
scoring = 'roc_auc'
# The splitter uses a fixed seed, so one instance yields the same folds for every model.
kfold = model_selection.KFold(n_splits=3, shuffle=True, random_state=50)

results, names = [], []
for name, model in models:
    cv_results = model_selection.cross_val_score(
        model, X_train, y_train, cv=kfold, scoring=scoring
    )
    results.append(cv_results)
    names.append(name)
    # Label, mean score, and standard deviation across folds.
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold cross-validation scores, one box per model.
# Uses the explicit Axes interface throughout, and a title that names the metric
# actually plotted (the previous title was left over from a tutorial).
fig, ax = plt.subplots(figsize=(10, 10))
fig.suptitle(f'Cross-validated {scoring} by model')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel(scoring)
plt.show()

# Fit Linear Discriminant Analysis Model (the best-scoring model)

In [None]:
# Final model: Linear Discriminant Analysis with default settings.
clf = LinearDiscriminantAnalysis()

In [None]:
# Train the final model on the training split of the selected features.
clf.fit(X_train, y_train)

In [None]:
import pickle

# Save Model

In [None]:
# Persist the fitted model to disk.
# The file is opened in a `with` block so the handle is flushed and closed even if
# pickling fails (the bare open() call previously left it open).
# Note: only unpickle this file from a trusted location — loading a pickle can execute code.
filename = 'finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)