# <span style="color:red" font-size=30>Machine learning project</span>



#### Import libraries.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import isnan
!pip install plotly
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.feature_selection import RFECV,SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR, SVC, LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score



#### load dataset

In [2]:
data = []
with open('chronic_kidney_disease.arff', "r") as f:
    for line in f:
        line = line.replace('\n', '')
        data.append(line.split(','))


names = ['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba',
         'bgr', 'bu',  'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc',
         'rbcc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane',
         'class', 'no_name']

df = pd.DataFrame(data[29:], columns=names)

FileNotFoundError: [Errno 2] No such file or directory: 'chronic_kidney_disease.arff'

In [None]:
df

# Data understanding

In [None]:
df.describe(include='all')

In [None]:
del df['no_name']

In [None]:
df.shape

In [None]:
df = df.drop(400)

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
df.isnull().values.any()

# Data cleaning

In [None]:
df1=df.replace('?',np.NaN)

In [None]:
df1

In [None]:
df1.describe()

In [None]:
df1.isnull().sum()

In [None]:
df1.info()

#### converting necessary columns to numerical type

In [None]:
df1['age'] = pd.to_numeric(df1['age'], errors='coerce')
df1['bp'] = pd.to_numeric(df1['bp'], errors='coerce')
df1['bgr'] = pd.to_numeric(df1['bgr'], errors='coerce')
df1['sc'] = pd.to_numeric(df1['sc'], errors='coerce')
df1['sod'] = pd.to_numeric(df1['sod'], errors='coerce')
df1['pot'] = pd.to_numeric(df1['pot'], errors='coerce')
df1['hemo'] = pd.to_numeric(df1['hemo'], errors='coerce')
df1['pcv'] = pd.to_numeric(df1['pcv'], errors='coerce')
df1['wbcc'] = pd.to_numeric(df1['wbcc'], errors='coerce')
df1['rbcc'] = pd.to_numeric(df1['rbcc'], errors='coerce')
df1['bu'] = pd.to_numeric(df1['bu'], errors='coerce')
df1['sg'] = pd.to_numeric(df1['sg'], errors='coerce')
df1['su'] = pd.to_numeric(df1['su'], errors='coerce')
df1['al'] = pd.to_numeric(df1['al'], errors='coerce')

In [None]:
df1.info()

# Feature engineering and selection

In [None]:
df1[['age','bp', 'bgr','bu','sc','sod','pot','hemo','pcv','wbcc','rbcc']] = df1[['age','bp', 'bgr','bu','sc','sod','pot','hemo','pcv','wbcc','rbcc']].fillna(df1[['age','bp', 'bgr','bu','sc','sod','pot','hemo','pcv','wbcc','rbcc']].median())


In [None]:
df1.isnull().sum()

In [None]:
df1=df1.replace('	?',np.NaN)
df1[['pcv','wbcc','rbcc']] = df1[['pcv','wbcc','rbcc']].fillna(df1[['pcv','wbcc','rbcc']].median())

In [None]:
df1.isnull().sum()

In [None]:
#IMPUTING MISSING CATEGORICAL VALUES
#['sg','al','su','rbc','pc','pcc','ba','htn','dm','cad','appet','pe','ane']
df1[['al']] = df1[['al']].fillna(df1['al'].mode()[0])
df1[['su']] = df1[['su']].fillna(df1['su'].mode()[0])
df1[['rbc']] = df1[['rbc']].fillna(df1['rbc'].mode()[0])
df1[['pc']] = df1[['pc']].fillna(df1['pc'].mode()[0])
df1[['ba']] = df1[['ba']].fillna(df1['ba'].mode()[0])
df1['htn'] = df1['htn'].fillna(df1['htn'].mode()[0])
df1['dm'] = df1['dm'].fillna(df1['dm'].mode()[0])
df1['cad'] = df1['cad'].fillna(df1['cad'].mode()[0])
df1['appet'] = df1['appet'].fillna(df1['appet'].mode()[0])
df1['pe'] = df1['pe'].fillna(df1['pe'].mode()[0])
df1['ane'] = df1['ane'].fillna(df1['ane'].mode()[0])
df1[['pcc']] = df1[['pcc']].fillna(df1['pcc'].mode()[0])
df1[['sg']] = df1[['sg']].fillna(df1['sg'].mode()[0])

In [None]:
df1.isnull().sum()

In [None]:
#replace incorrect values
df1['dm'].replace(to_replace = {'\tno':'no','\tyes':'yes',' yes':'yes'},inplace=True)
df1['cad'] = df1['cad'].replace(to_replace = '\tno', value='no')

In [None]:
df1.isnull().sum()

In [None]:
# looking at unique values in categorical columns
for col in df1:
    print(f"{col} has {df1[col].unique()} values\n")

In [None]:

# replace incorrect values
df1['dm'] = df1['dm'].replace(to_replace = '', value=np.NaN)
df1['appet'] = df1['appet'].replace(to_replace ='no', value=np.NaN)
df1['pe'] = df1['pe'].replace(to_replace = 'good', value=np.NaN)
df1['class'] = df1['class'].replace(to_replace = 'ckd\t', value=np.NaN)
df1['class'] = df1['class'].replace(to_replace = 'no', value=np.NaN)

In [None]:
# looking at unique values in categorical columns
for col in df1:
    print(f"{col} has {df1[col].unique()} values\n")

In [None]:
#IMPUTING MISSING VALUES
df1['dm'] = df1['dm'].fillna(df1['dm'].mode()[0])
df1['appet'] = df1['appet'].fillna(df1['appet'].mode()[0])
df1['pe'] = df1['pe'].fillna(df1['pe'].mode()[0])
df1[['class']] = df1[['class']].fillna(df1['class'].mode()[0])

In [None]:
df1.info()

In [None]:
# looking at unique values in categorical columns
for col in df1:
    print(f"{col} has {df1[col].unique()} values\n")

In [None]:
# Extracting categorical and numerical columns
cat_cols = [col for col in df1.columns if df1[col].dtype == 'object']
num_cols = [col for col in df1.columns if df1[col].dtype != 'object']
num_cols

#### There is some ambugity present in the columns we have to fix that.

In [None]:
#Encoding
df1['ane'] = df1['ane'].map({'no': 0, 'yes': 1})
df1['ane'] = pd.to_numeric(df1['ane'], errors='coerce')
df1['pe'] = df1['pe'].map({'no': 0, 'yes': 1})
df1['pe'] = pd.to_numeric(df1['pe'], errors='coerce')
df1['cad'] = df1['cad'].map({'no': 0, 'yes': 1})
df1['cad'] = pd.to_numeric(df1['cad'], errors='coerce')
df1['dm'] = df1['dm'].map({'no': 0, 'yes': 1})
df1['dm'] = pd.to_numeric(df1['dm'], errors='coerce')
df1['htn'] = df1['htn'].map({'no': 0, 'yes': 1})
df1['htn'] = pd.to_numeric(df1['htn'], errors='coerce')
df1['class'] = df1['class'].map({'ckd': 0, 'notckd': 1})
df1['class'] = pd.to_numeric(df1['class'], errors='coerce')

In [None]:
for col in df1:
    print(f"{col} has {df1[col].unique()} values\n")

In [None]:
cat_cols1 = [col for col in df1.columns if df1[col].dtype == 'object']

In [None]:
cat_cols1

#### As all of the categorical columns have 2 categories we can use label encoder

In [None]:
le = LabelEncoder()

for col in cat_cols1:
    df1[col] = le.fit_transform(df1[col])

In [None]:
df1.describe()

In [None]:
df1

#### Normalisation

In [None]:
# Normalisation
df1= df1.apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [None]:
df1

### Detecting the outliers

In [None]:
g = sns.pairplot(df1, vars =['bp','bgr','bu','sc','sod','pot','hemo','pcv','wbcc','rbcc'],hue = 'class')
g.map_diag(sns.distplot)
g.add_legend()
g.fig.suptitle('outliers plot', fontsize = 20)
g.fig.subplots_adjust(top= 0.9);

In [None]:
sns.boxplot(x=df1["bgr"],fliersize=10,orient="h")

In [None]:
sns.boxplot(x=df1["bp"],fliersize=10,orient="h")

In [None]:
sns.boxplot(x=df1["bu"],fliersize=10,orient="h")

In [None]:
sns.boxplot(x=df1["sc"],fliersize=10,orient="h")

In [None]:
sns.boxplot(x=df1["sod"],fliersize=10,orient="h")

In [None]:
sns.boxplot(x=df1["pot"],fliersize=10,orient="h")

In [None]:
sns.boxplot(x=df1["hemo"],fliersize=10,orient="h")

In [None]:
sns.boxplot(x=df1["pcv"],fliersize=10,orient="h")

In [None]:
sns.boxplot(x=df1["wbcc"],fliersize=10,orient="h")

In [None]:
sns.boxplot(x=df1["rbcc"],fliersize=10,orient="h")

### Fixing and dealing with the outliers

In [None]:
new_df1 = df1.copy()

In [None]:
for x in ['hemo']:
    q75,q25 = np.percentile(new_df1.loc[:,x],[75,25])
    intr_qr = q75-q25

    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    #remplacer les out par nan
    new_df1.loc[new_df1[x]<min,x]=new_df1['hemo'].mean()
    new_df1.loc[new_df1[x]>max,x]=new_df1['hemo'].mean()

In [None]:
sns.boxplot(x=new_df1["hemo"],fliersize=10,orient="h")

In [None]:
for x in ['pot']:
    q75,q25 = np.percentile(new_df1.loc[:,x],[75,25])
    intr_qr = q75-q25

    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    #remplacer les out par nan
    new_df1.loc[new_df1[x]<min,x]=new_df1['pot'].mean()
    new_df1.loc[new_df1[x]>max,x]=new_df1['pot'].mean()

In [None]:
sns.boxplot(x=new_df1["pot"],fliersize=10,orient="h")

In [None]:
for x in ['bp']:
    q75,q25 = np.percentile(new_df1.loc[:,x],[75,25])
    intr_qr = q75-q25

    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    #remplacer les out par nan
    new_df1.loc[new_df1[x]<min,x]=new_df1['bp'].mean()
    new_df1.loc[new_df1[x]>max,x]=new_df1['bp'].mean()

In [None]:
sns.boxplot(x=new_df1["bp"],fliersize=10,orient="h")

In [None]:
for x in ['bgr']:
    q75,q25 = np.percentile(new_df1.loc[:,x],[75,25])
    intr_qr = q75-q25

    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    #remplacer les out par nan
    new_df1.loc[new_df1[x]<min,x]=new_df1['bgr'].mean()
    new_df1.loc[new_df1[x]>max,x]=new_df1['bgr'].mean()

In [None]:
sns.boxplot(x=new_df1["bgr"],fliersize=10,orient="h")

In [None]:
for x in ['bu']:
    q75,q25 = np.percentile(new_df1.loc[:,x],[75,25])
    intr_qr = q75-q25

    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    #remplacer les out par nan
    new_df1.loc[new_df1[x]<min,x]=new_df1['bu'].mean()
    new_df1.loc[new_df1[x]>max,x]=new_df1['bu'].mean()

In [None]:
sns.boxplot(x=new_df1["bu"],fliersize=10,orient="h")

In [None]:
for x in ['sc']:
    q75,q25 = np.percentile(new_df1.loc[:,x],[75,25])
    intr_qr = q75-q25

    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    #remplacer les out par nan
    new_df1.loc[new_df1[x]<min,x]=new_df1['sc'].mean()
    new_df1.loc[new_df1[x]>max,x]=new_df1['sc'].mean()

In [None]:
sns.boxplot(x=new_df1["sc"],fliersize=10,orient="h")

In [None]:
for x in ['sod']:
    q75,q25 = np.percentile(new_df1.loc[:,x],[75,25])
    intr_qr = q75-q25

    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    #remplacer les out par nan
    new_df1.loc[new_df1[x]<min,x]=new_df1['sod'].mean()
    new_df1.loc[new_df1[x]>max,x]=new_df1['sod'].mean()

In [None]:
sns.boxplot(x=new_df1["sod"],fliersize=10,orient="h")

In [None]:
for x in ['pcv']:
    q75,q25 = np.percentile(new_df1.loc[:,x],[75,25])
    intr_qr = q75-q25

    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    #remplacer les out par nan
    new_df1.loc[new_df1[x]<min,x]=new_df1['pcv'].mean()
    new_df1.loc[new_df1[x]>max,x]=new_df1['pcv'].mean()


In [None]:
sns.boxplot(x=new_df1["pcv"],fliersize=10,orient="h")

In [None]:
for x in ['rbcc']:
    q75,q25 = np.percentile(new_df1.loc[:,x],[75,25])
    intr_qr = q75-q25

    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    #remplacer les out par nan
    new_df1.loc[new_df1[x]<min,x]=new_df1['rbcc'].mean()
    new_df1.loc[new_df1[x]>max,x]=new_df1['rbcc'].mean()


In [None]:
sns.boxplot(x=new_df1["rbcc"],fliersize=10,orient="h")

In [None]:
for x in ['wbcc']:
    q75,q25 = np.percentile(new_df1.loc[:,x],[75,25])
    intr_qr = q75-q25

    max = q75+(1.5*intr_qr)
    min = q25-(1.5*intr_qr)
    #remplacer les out par nan
    new_df1.loc[new_df1[x]<min,x]=new_df1['wbcc'].mean()
    new_df1.loc[new_df1[x]>max,x]=new_df1['wbcc'].mean()

In [None]:
sns.boxplot(x=new_df1["wbcc"],fliersize=10,orient="h")

# Visualization

In [3]:
# looking at categorical columns

plt.figure(figsize = (20, 15))
plotnumber = 1

for column in cat_cols:
    if plotnumber <= 11:
        ax = plt.subplot(3, 4, plotnumber)
        sns.countplot(df1[column], palette = 'husl')
        plt.xlabel(column)

    plotnumber += 1

plt.tight_layout()
plt.show()

NameError: name 'cat_cols' is not defined

<Figure size 2000x1500 with 0 Axes>

In [4]:
# checking numerical features distribution
plt.figure(figsize = (20, 15))
plotnumber = 1

for column in num_cols:
    if plotnumber <= 14:
        ax = plt.subplot(3, 5, plotnumber)
        sns.distplot(df1[column])
        plt.xlabel(column)

    plotnumber += 1

plt.tight_layout()
plt.show()

NameError: name 'num_cols' is not defined

<Figure size 2000x1500 with 0 Axes>

In [None]:
my_labels =('chkn','notckd')
ax = df1['class'].value_counts().plot(kind='pie', figsize=(12,12), autopct='%1.1f%%', labels=None)
ax.axes.get_yaxis().set_visible(False)
plt.legend(loc=5, labels=my_labels)

# Feature selection

In [5]:
#matrice de correlation
plt.figure(figsize = (15, 8))
sns.heatmap(df1.corr(), annot = True, linewidths = 2, linecolor = 'white')
plt.show()

NameError: name 'df1' is not defined

<Figure size 1500x800 with 0 Axes>

In [6]:
df1

NameError: name 'df1' is not defined

In [7]:
##target='class'

In [8]:
y = df1['class']
X = df1.loc[:, df.columns != 'class']


#checking for NA values
X.isna().values.any()
y.isna().values.any()

NameError: name 'df1' is not defined

In [9]:
svc = SVC(kernel="linear")

rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(3),
              scoring='accuracy')
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plotting number of features and cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
print(rfecv.ranking_)

NameError: name 'X' is not defined

# Modeling

### split data

In [10]:
ind_col = [col for col in df1.columns if col != 'class']
dep_col = 'class'

X = df1[ind_col]
y = df1[dep_col]

NameError: name 'df1' is not defined

In [11]:
# splitting data intp training and test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0)

NameError: name 'X' is not defined

### Balancing Data

In [12]:
len(y_train[y_train==1]), len(y_train[y_train==0]), y_train.shape

NameError: name 'y_train' is not defined

### Under Sampling

In [13]:
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

In [14]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler()

X_train_down,y_train_down = rus.fit_resample(X_train, y_train)

print(len(y_train_down[y_train_down==0]), len(y_train_down[y_train_down==1]))
print(len(X_train_down))

NameError: name 'X_train' is not defined

### Over Sampling

In [None]:
# RandomOverSampler to handle imbalanced data
from imblearn.over_sampling import RandomOverSampler
os =  RandomOverSampler(sampling_strategy=1)

X_train, y_train = os.fit_resample(X_train, y_train)

print(len(y_train[y_train==0]), len(y_train[y_train==1]))
print(len(X_train))

## Logistic Regression

In [None]:
def lr_grid_search(X, y):
    model = LogisticRegression()

    # Create a dictionary of all values we want to test
    solvers = ['newton-cg', 'lbfgs', 'liblinear']
    penalty = ['l2']
    c_values = [100, 10, 1.0, 0.1, 0.01]

    # define grid search
    param_grid = dict(solver=solvers, penalty=penalty, C=c_values)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='accuracy')
    grid_result = grid_search.fit(X, y)

    return grid_result.best_params_

In [15]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
lr_grid_search(X_train, y_train)

NameError: name 'lr_grid_search' is not defined

### Over sample Logistic

In [16]:
from sklearn import metrics
from sklearn.metrics import classification_report
lr = LogisticRegression(C=100, penalty='l2', solver='newton-cg')
lr.fit(X_train,y_train)

y_pred_lr = lr.predict(X_test)

print(metrics.classification_report(y_test, y_pred_lr))

lr_score = lr.score(X_train,y_train)
print(lr_score)

lr_score = lr.score(X_test,y_test)
print(lr_score)

NameError: name 'X_train' is not defined

In [17]:
lr_tacc = lr.score(X_test,y_test)
lr_train_acc = lr.score(X_train, y_train)

NameError: name 'X_test' is not defined

### Confusion matrix of Logistic Regression Model

In [18]:
cm = metrics.confusion_matrix(y_test, y_pred_lr, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

NameError: name 'y_test' is not defined

### Under Sample Logistic

In [19]:
lr = LogisticRegression(C=100, penalty='l2', solver='newton-cg')
lr.fit(X_train_down,y_train_down)

y_pred_lr = lr.predict(X_test)

print(metrics.classification_report(y_test, y_pred_lr))

lr_score = lr.score(X_train_down,y_train_down)
print(lr_score)

lr_score = lr.score(X_test,y_test)
print(lr_score)

NameError: name 'X_train_down' is not defined

In [20]:
cm = metrics.confusion_matrix(y_test, y_pred_lr, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

NameError: name 'y_test' is not defined

## Decision Tree Classifier

In [21]:
def dtree_grid_search(X, y):
    #create a dictionary of all values we want to test
    param_grid = { 'criterion':['gini','entropy'],'max_depth': np.arange(2, 15)}
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    # decision tree model
    dtree = DecisionTreeClassifier()

    #use gridsearch to test all values
    dtree_gscv = GridSearchCV(dtree, param_grid, cv=cv, n_jobs=-1, scoring='accuracy')
    #fit model to data
    dtree_gscv.fit(X, y)

    return dtree_gscv.best_params_

In [22]:
from sklearn.tree import DecisionTreeClassifier
dtree_grid_search(X_train, y_train)

NameError: name 'X_train' is not defined

### Over Sample Decision Tree

In [23]:
dTree = DecisionTreeClassifier(criterion = 'entropy', max_depth = 7)
dTree.fit(X_train, y_train)

print(dTree.score(X_train,y_train))
print(dTree.score(X_test,y_test))

y_pred_dtree = dTree.predict(X_test)

print(metrics.classification_report(y_test, y_pred_dtree))

NameError: name 'X_train' is not defined

In [24]:
dt_tacc = dTree.score(X_test,y_test)
dt_train_acc = dTree.score(X_train, y_train)

NameError: name 'X_test' is not defined

### Confusion Matrix of Decision Tree Classifier

In [None]:
cm = metrics.confusion_matrix(y_test, y_pred_dtree, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

<AxesSubplot:>

### Under Sample Decision Tree

In [None]:
dtree_grid_search(X_train_down, y_train_down)

{'criterion': 'entropy', 'max_depth': 14}

In [None]:
dTree = DecisionTreeClassifier(criterion = 'entropy', max_depth =14)
dTree.fit(X_train_down, y_train_down)

print(dTree.score(X_train_down,y_train_down))
print(dTree.score(X_test,y_test))

y_pred_dtree = dTree.predict(X_test)

print(metrics.classification_report(y_test, y_pred_dtree))

1.0
0.9833333333333333
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99        72
         1.0       0.96      1.00      0.98        48

    accuracy                           0.98       120
   macro avg       0.98      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120



In [None]:
cm = metrics.confusion_matrix(y_test, y_pred_dtree, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

<AxesSubplot:>

## Ensemble learning - AdaBoost

In [None]:
def ada_grid_search(X, y):
    #create a dictionary of all values we want to test
    param_grid = {'n_estimators':[10, 50, 100, 500], 'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0]}
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    # AdaBoost model
    ada = AdaBoostClassifier()

    # Use gridsearch to test all values
    ada_gscv = GridSearchCV(ada, param_grid, n_jobs=-1, cv=cv, scoring='accuracy')
    #fit model to data
    grid_result = ada_gscv.fit(X, y)

    return ada_gscv.best_params_

In [None]:
from sklearn.ensemble import AdaBoostClassifier
ada_grid_search(X_train, y_train)

{'learning_rate': 0.1, 'n_estimators': 50}

### Over Sample AdaBoost

In [None]:
abcl = AdaBoostClassifier(n_estimators=50, learning_rate = 0.1)
abcl = abcl.fit(X_train, y_train)

y_pred_abcl = abcl.predict(X_test)

print(abcl.score(X_train, y_train))
print(abcl.score(X_test,y_test))

print(metrics.classification_report(y_test, y_pred_abcl))

0.9972067039106145
1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        72
         1.0       1.00      1.00      1.00        48

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120



In [None]:
ada_train_acc = abcl.score(X_train, y_train)
ada_tacc = abcl.score(X_test,y_test)

### Confusion Matrix AdaBoosting model

In [None]:
cm = metrics.confusion_matrix(y_test, y_pred_abcl, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

<AxesSubplot:>

### Under Sample AdaBoost

In [None]:
ada_grid_search(X_train_down, y_train_down)

{'learning_rate': 0.1, 'n_estimators': 50}

In [None]:
abcl = AdaBoostClassifier(n_estimators=50, learning_rate = 0.1)
abcl = abcl.fit(X_train_down, y_train_down)

y_pred_abcl = abcl.predict(X_test)

print(abcl.score(X_train_down, y_train_down))
print(abcl.score(X_test,y_test))

print(metrics.classification_report(y_test, y_pred_abcl))

1.0
1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        72
         1.0       1.00      1.00      1.00        48

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120



In [None]:
cm = metrics.confusion_matrix(y_test, y_pred_abcl, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

<AxesSubplot:>

# Random forest classifier

In [None]:
def rf_grid_search(X, y):
    #create a dictionary of all values we want to test
    param_grid = {
    'n_estimators': [5,10,20,40,50,60,70,80,100],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
    }
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    # Random Forest model
    rf = RandomForestClassifier()

    #use gridsearch to test all values
    rf_gscv = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, scoring='accuracy')
    #fit model to data
    rf_gscv.fit(X, y)

    return rf_gscv.best_params_

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_grid_search(X_train, y_train)

{'criterion': 'gini',
 'max_depth': 6,
 'max_features': 'log2',
 'n_estimators': 60}

### Over Sample Random Forest

In [None]:
rfcl = RandomForestClassifier(n_estimators=60, max_features='log2', max_depth=6, criterion='gini')
rfcl = rfcl.fit(X_train, y_train)

y_pred_rf = rfcl.predict(X_test)

print(rfcl.score(X_train,y_train))
print(rfcl.score(X_test,y_test))

print(metrics.classification_report(y_test, y_pred_rf))

0.9972067039106145
1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        72
         1.0       1.00      1.00      1.00        48

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120



In [None]:
rf_tacc = rfcl.score(X_test,y_test)
rf_train_acc = rfcl.score(X_train, y_train)

### Confusion matrix of Random Forest Classifier Model

In [None]:
cm = metrics.confusion_matrix(y_test, y_pred_rf, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

<AxesSubplot:>

### Under Sample Random Forest

In [None]:
rf_grid_search(X_train_down, y_train_down)

{'criterion': 'gini',
 'max_depth': 6,
 'max_features': 'sqrt',
 'n_estimators': 10}

In [None]:
rfcl = RandomForestClassifier(n_estimators=10, max_features='sqrt', max_depth=6, criterion='gini')
rfcl = rfcl.fit(X_train_down, y_train_down)

y_pred_rf = rfcl.predict(X_test)

print(rfcl.score(X_train_down,y_train_down))
print(rfcl.score(X_test,y_test))

print(metrics.classification_report(y_test, y_pred_rf))

0.995049504950495
0.9833333333333333
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99        72
         1.0       0.98      0.98      0.98        48

    accuracy                           0.98       120
   macro avg       0.98      0.98      0.98       120
weighted avg       0.98      0.98      0.98       120



In [None]:
cm = metrics.confusion_matrix(y_test, y_pred_rf, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

<AxesSubplot:>

# kNN

In [None]:
def knn_grid_search(X, y):
    #create a dictionary of all values we want to test
    k_range = list(range(1, 31))
    param_grid = dict(n_neighbors=k_range)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    knn = KNeighborsClassifier()

    #use gridsearch to test all values
    knn_gscv = GridSearchCV(knn, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    #fit model to data
    knn_gscv.fit(X, y)

    return knn_gscv.best_params_

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_grid_search(X_train, y_train)

{'n_neighbors': 2}

### Over Sample kNN

In [None]:
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

print(knn.score(X_train, y_train))
print(knn.score(X_test, y_test))

print(metrics.classification_report(y_test, y_pred_knn))

0.994413407821229
1.0
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        72
         1.0       1.00      1.00      1.00        48

    accuracy                           1.00       120
   macro avg       1.00      1.00      1.00       120
weighted avg       1.00      1.00      1.00       120



In [None]:
knn_tacc = knn.score(X_test, y_test)
knn_train_acc = knn.score(X_train, y_train)

### Confusion Matrix of kNN

In [None]:
cm = metrics.confusion_matrix(y_test, y_pred_knn, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

<AxesSubplot:>

### Under Sample kNN

In [None]:
knn_grid_search(X_train_down, y_train_down)

{'n_neighbors': 2}

In [None]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_down, y_train_down)

y_pred_knn = knn.predict(X_test)

print(knn.score(X_train_down, y_train_down))
print(knn.score(X_test, y_test))

print(metrics.classification_report(y_test, y_pred_knn))

1.0
0.9916666666666667
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99        72
         1.0       0.98      1.00      0.99        48

    accuracy                           0.99       120
   macro avg       0.99      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120



In [None]:
cm = metrics.confusion_matrix(y_test, y_pred_knn, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

<AxesSubplot:>

# SVM

In [None]:
def svm_grid_search(X, y):
    #create a dictionary of all values we want to test
    param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001, 0.4, 0.2, 0.8],'kernel': ['rbf', 'poly', 'sigmoid']}
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    svm = SVC()

    #use gridsearch to test all values
    svm_gscv = RandomizedSearchCV(estimator = svm,
                           param_distributions = param_grid,
                           scoring = 'accuracy',
                           cv = cv,
                           n_jobs = -1)
    #fit model to data
    svm_gscv.fit(X, y)

    return svm_gscv.best_params_

In [None]:
from sklearn.model_selection import RandomizedSearchCV
svm_grid_search(X_train, y_train)

{'kernel': 'rbf', 'gamma': 1, 'C': 100}

### Over Sample SVM

In [None]:
from sklearn import svm
svm = SVC(gamma=1, C=100, kernel='rbf', probability=True)

svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)

print(svm.score(X_train, y_train))
print(svm.score(X_test, y_test))

print(metrics.classification_report(y_test, y_pred_svm))

1.0
0.9916666666666667
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99        72
         1.0       1.00      0.98      0.99        48

    accuracy                           0.99       120
   macro avg       0.99      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120



In [None]:
svm_tacc = svm.score(X_test, y_test)
svm_train_acc = svm.score(X_train, y_train)

### Confusion Matrix of SVM

In [None]:
cm = metrics.confusion_matrix(y_test, y_pred_svm, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

<AxesSubplot:>

### Under Sample SVM

In [None]:
svm_grid_search(X_train_down, y_train_down)

{'kernel': 'poly', 'gamma': 1, 'C': 0.1}

In [None]:
from sklearn import svm
svm = SVC(gamma=1, C=0.1, kernel='poly', probability=True)

svm.fit(X_train_down, y_train_down)

y_pred_svm = svm.predict(X_test)

print(svm.score(X_train_down, y_train_down))
print(svm.score(X_test, y_test))

print(metrics.classification_report(y_test, y_pred_svm))

0.995049504950495
0.9833333333333333
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.99        72
         1.0       0.96      1.00      0.98        48

    accuracy                           0.98       120
   macro avg       0.98      0.99      0.98       120
weighted avg       0.98      0.98      0.98       120



In [None]:
cm = metrics.confusion_matrix(y_test, y_pred_svm, labels=[1,0])

df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
                         columns = [i for i in ["Predict 1", "Predict 0"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True, fmt='g')

<AxesSubplot:>

# Comparision of all Models

In [None]:
def comp_model(model_list, model_train_acc_list, model_test_acc_list):
    data = {"Model Name": model_list, "Train Accuracy(%)": [i*100 for i in model_train_acc_list], "Test Accuracy(%)": [i*100 for i in model_test_acc_list]}
    Comparision = pd.DataFrame(data)
    return Comparision

In [None]:
model_list = ["Logistic Regression", "Decision Tree Classifier", "AdaBoost", "Random Forest Classifier", "kNN", "SVM"]
model_train_acc_list = [lr_train_acc, dt_train_acc, ada_train_acc, rf_train_acc, knn_train_acc, svm_train_acc]
model_test_acc_list = [lr_tacc, dt_tacc, ada_tacc, rf_tacc, knn_tacc, svm_tacc]
comp_model(model_list, model_train_acc_list, model_test_acc_list)

Unnamed: 0,Model Name,Train Accuracy(%),Test Accuracy(%)
0,Logistic Regression,99.441341,100.0
1,Decision Tree Classifier,100.0,98.333333
2,AdaBoost,99.72067,100.0
3,Random Forest Classifier,99.72067,100.0
4,kNN,99.441341,100.0
5,SVM,100.0,99.166667
