In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split

from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import plot_precision_recall_curve, precision_recall_curve
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from imblearn.over_sampling import SMOTE


In [2]:
stroke_df=pd.read_csv('data/stroke-data.csv')

In [3]:
stroke_df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
X_train, X_test, y_train, y_test=train_test_split(
    stroke_df.drop('stroke', axis=1, inplace=False),
    stroke_df['stroke'], 
    test_size=0.2,
    stratify=stroke_df['stroke'],
    random_state=123
)

In [5]:
train_df=X_train

train_df['stroke']=y_train

In [6]:
test_df=X_test
test_df['stroke']=y_test

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4088 entries, 795 to 2819
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4088 non-null   int64  
 1   gender             4088 non-null   object 
 2   age                4088 non-null   float64
 3   hypertension       4088 non-null   int64  
 4   heart_disease      4088 non-null   int64  
 5   ever_married       4088 non-null   object 
 6   work_type          4088 non-null   object 
 7   Residence_type     4088 non-null   object 
 8   avg_glucose_level  4088 non-null   float64
 9   bmi                3923 non-null   float64
 10  smoking_status     4088 non-null   object 
 11  stroke             4088 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 415.2+ KB


In [8]:
train_df['bmi'].mean()

28.93163395360697

In [9]:
train_df['bmi'].fillna(train_df['bmi'].mean(), inplace=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4088 entries, 795 to 2819
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4088 non-null   int64  
 1   gender             4088 non-null   object 
 2   age                4088 non-null   float64
 3   hypertension       4088 non-null   int64  
 4   heart_disease      4088 non-null   int64  
 5   ever_married       4088 non-null   object 
 6   work_type          4088 non-null   object 
 7   Residence_type     4088 non-null   object 
 8   avg_glucose_level  4088 non-null   float64
 9   bmi                4088 non-null   float64
 10  smoking_status     4088 non-null   object 
 11  stroke             4088 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 415.2+ KB


In [10]:
test_df=test_df.dropna()

In [11]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 986 entries, 2245 to 1173
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 986 non-null    int64  
 1   gender             986 non-null    object 
 2   age                986 non-null    float64
 3   hypertension       986 non-null    int64  
 4   heart_disease      986 non-null    int64  
 5   ever_married       986 non-null    object 
 6   work_type          986 non-null    object 
 7   Residence_type     986 non-null    object 
 8   avg_glucose_level  986 non-null    float64
 9   bmi                986 non-null    float64
 10  smoking_status     986 non-null    object 
 11  stroke             986 non-null    int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 100.1+ KB


In [12]:
fig=px.histogram(train_df, x='age')
fig.update_layout(
title="Age Histogram",
width=400,
height=400)
fig.show()

In [13]:
fig=px.box(train_df, y="age")
fig.show()

In [14]:
fig=px.box(train_df, x='stroke',y="age")
fig.show()

In [15]:
data=pd.crosstab(train_df['smoking_status'], train_df['stroke'])
data.head(5)

stroke,0,1
smoking_status,Unnamed: 1_level_1,Unnamed: 2_level_1
Unknown,1194,33
formerly smoked,663,58
never smoked,1443,70
smokes,589,38


In [16]:
fig=make_subplots(
rows=1,
cols=1
)

trace=go.Bar(x=['unknown', 'formerly smoked', 'never smoked', 'smokes'],y=[1194,663,1443,589],name='no stroke')
fig.append_trace(trace,1,1)
trace=go.Bar(x=['unknown', 'formerly smoked', 'never smoked', 'smokes'],y=[33,58,70,38], name='stroke')
fig.append_trace(trace, 1,1)
fig.show()

In [17]:
trace=go.Bar(x=['unknown', 'formerly smoked', 'never smoked', 'smokes'],y=[1194,663,1443,589],name='no stroke')
fig.append_trace(trace,1,1)
trace=go.Bar(x=['unknown', 'formerly smoked', 'never smoked', 'smokes'],y=[33,58,70,38], name='stroke')
fig.append_trace(trace, 1,1)
fig.show()

In [18]:
data.index

Index(['Unknown', 'formerly smoked', 'never smoked', 'smokes'], dtype='object', name='smoking_status')

In [19]:
data.columns

Int64Index([0, 1], dtype='int64', name='stroke')

In [20]:
data[0]

smoking_status
Unknown            1194
formerly smoked     663
never smoked       1443
smokes              589
Name: 0, dtype: int64

In [21]:
fig=make_subplots(
rows=1,
cols=1
)

trace=go.Bar(x=data.index,y=data[0],name='no stroke')
fig.append_trace(trace,1,1)
trace=go.Bar(x=data.index,y=data[1], name='stroke')
fig.append_trace(trace, 1,1)
fig.show()

In [22]:
fig=px.box(train_df, x='stroke', y='age')
fig.show()

In [23]:
fig=px.box(train_df, y="avg_glucose_level")
fig.show()

In [24]:
px.box(train_df, y='bmi')
fig.show()

In [25]:
train_df.info

<bound method DataFrame.info of          id  gender   age  hypertension  heart_disease ever_married  \
795   60777  Female  31.0             0              0          Yes   
4106  50545    Male  41.0             0              0          Yes   
1318   7195    Male  50.0             0              1           No   
4846  27801  Female  34.0             0              0          Yes   
532   31564  Female  25.0             0              0          Yes   
...     ...     ...   ...           ...            ...          ...   
3803    365  Female  44.0             1              0          Yes   
4764  25149  Female   3.0             0              0           No   
167   43364    Male  79.0             1              0          Yes   
1013  45824  Female  77.0             1              0          Yes   
2819  28183  Female  13.0             0              0           No   

          work_type Residence_type  avg_glucose_level        bmi  \
795        Govt_job          Rural             

In [26]:
train_df.nunique

<bound method DataFrame.nunique of          id  gender   age  hypertension  heart_disease ever_married  \
795   60777  Female  31.0             0              0          Yes   
4106  50545    Male  41.0             0              0          Yes   
1318   7195    Male  50.0             0              1           No   
4846  27801  Female  34.0             0              0          Yes   
532   31564  Female  25.0             0              0          Yes   
...     ...     ...   ...           ...            ...          ...   
3803    365  Female  44.0             1              0          Yes   
4764  25149  Female   3.0             0              0           No   
167   43364    Male  79.0             1              0          Yes   
1013  45824  Female  77.0             1              0          Yes   
2819  28183  Female  13.0             0              0           No   

          work_type Residence_type  avg_glucose_level        bmi  \
795        Govt_job          Rural          

In [27]:
train_df['smoking_status'].value_counts()

never smoked       1513
Unknown            1227
formerly smoked     721
smokes              627
Name: smoking_status, dtype: int64

In [28]:
train_df['gender'].value_counts()

Female    2394
Male      1693
Other        1
Name: gender, dtype: int64

In [29]:
train_df['ever_married'].value_counts()

Yes    2693
No     1395
Name: ever_married, dtype: int64

In [30]:
train_df['ever_married']=train_df['ever_married'].replace({'Yes':1, 'No':0})

In [31]:
train_df['ever_married'].value_counts()

1    2693
0    1395
Name: ever_married, dtype: int64

In [32]:
test_df['ever_married']=test_df['ever_married'].replace({'Yes':1, 'No':0})

In [33]:
train_df['Residence_type'].value_counts()

Urban    2076
Rural    2012
Name: Residence_type, dtype: int64

In [34]:
train_df['Residence_type']=train_df['Residence_type'].replace({'Urban':1,'Rural':0 })

In [35]:
train_df['Residence_type'].value_counts()

1    2076
0    2012
Name: Residence_type, dtype: int64

In [36]:
test_df['Residence_type']=test_df['Residence_type'].replace({'Urban':1,'Rural':0 })
test_df['Residence_type'].value_counts()

1    504
0    482
Name: Residence_type, dtype: int64

In [37]:
train_df=train_df[train_df['gender']!='Other']

In [38]:
train_df['gender'].value_counts()

Female    2394
Male      1693
Name: gender, dtype: int64

In [39]:
train_df['gender']=train_df['gender'].replace({'Female':1, 'Male':0})

In [40]:
train_df['gender'].value_counts()

1    2394
0    1693
Name: gender, dtype: int64

In [41]:
test_df['gender']=test_df['gender'].replace({'Female':1, 'Male':0})

In [42]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4087 entries, 795 to 2819
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4087 non-null   int64  
 1   gender             4087 non-null   int64  
 2   age                4087 non-null   float64
 3   hypertension       4087 non-null   int64  
 4   heart_disease      4087 non-null   int64  
 5   ever_married       4087 non-null   int64  
 6   work_type          4087 non-null   object 
 7   Residence_type     4087 non-null   int64  
 8   avg_glucose_level  4087 non-null   float64
 9   bmi                4087 non-null   float64
 10  smoking_status     4087 non-null   object 
 11  stroke             4087 non-null   int64  
dtypes: float64(3), int64(7), object(2)
memory usage: 415.1+ KB


In [43]:
train_df=pd.get_dummies(train_df)

In [44]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4087 entries, 795 to 2819
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              4087 non-null   int64  
 1   gender                          4087 non-null   int64  
 2   age                             4087 non-null   float64
 3   hypertension                    4087 non-null   int64  
 4   heart_disease                   4087 non-null   int64  
 5   ever_married                    4087 non-null   int64  
 6   Residence_type                  4087 non-null   int64  
 7   avg_glucose_level               4087 non-null   float64
 8   bmi                             4087 non-null   float64
 9   stroke                          4087 non-null   int64  
 10  work_type_Govt_job              4087 non-null   uint8  
 11  work_type_Never_worked          4087 non-null   uint8  
 12  work_type_Private               

In [45]:
train_df.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
795,60777,1,31.0,0,0,1,0,103.55,20.5,0,1,0,0,0,0,0,1,0,0
4106,50545,0,41.0,0,0,1,1,84.1,29.3,0,1,0,0,0,0,0,0,1,0
1318,7195,0,50.0,0,1,0,1,85.82,31.9,0,0,0,1,0,0,0,0,1,0
4846,27801,1,34.0,0,0,1,1,113.26,27.6,0,0,0,1,0,0,0,0,1,0
532,31564,1,25.0,0,0,1,0,90.65,20.9,0,0,0,1,0,0,1,0,0,0


In [46]:
test_df=pd.get_dummies(test_df)

In [47]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 986 entries, 2245 to 1173
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              986 non-null    int64  
 1   gender                          986 non-null    int64  
 2   age                             986 non-null    float64
 3   hypertension                    986 non-null    int64  
 4   heart_disease                   986 non-null    int64  
 5   ever_married                    986 non-null    int64  
 6   Residence_type                  986 non-null    int64  
 7   avg_glucose_level               986 non-null    float64
 8   bmi                             986 non-null    float64
 9   stroke                          986 non-null    int64  
 10  work_type_Govt_job              986 non-null    uint8  
 11  work_type_Never_worked          986 non-null    uint8  
 12  work_type_Private               

In [48]:
train_df=train_df.drop('id', axis=1, inplace=False)

In [49]:
test_df=test_df.drop('id', axis=1, inplace=False)

In [50]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4087 entries, 795 to 2819
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender                          4087 non-null   int64  
 1   age                             4087 non-null   float64
 2   hypertension                    4087 non-null   int64  
 3   heart_disease                   4087 non-null   int64  
 4   ever_married                    4087 non-null   int64  
 5   Residence_type                  4087 non-null   int64  
 6   avg_glucose_level               4087 non-null   float64
 7   bmi                             4087 non-null   float64
 8   stroke                          4087 non-null   int64  
 9   work_type_Govt_job              4087 non-null   uint8  
 10  work_type_Never_worked          4087 non-null   uint8  
 11  work_type_Private               4087 non-null   uint8  
 12  work_type_Self-employed         

In [51]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 986 entries, 2245 to 1173
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   gender                          986 non-null    int64  
 1   age                             986 non-null    float64
 2   hypertension                    986 non-null    int64  
 3   heart_disease                   986 non-null    int64  
 4   ever_married                    986 non-null    int64  
 5   Residence_type                  986 non-null    int64  
 6   avg_glucose_level               986 non-null    float64
 7   bmi                             986 non-null    float64
 8   stroke                          986 non-null    int64  
 9   work_type_Govt_job              986 non-null    uint8  
 10  work_type_Never_worked          986 non-null    uint8  
 11  work_type_Private               986 non-null    uint8  
 12  work_type_Self-employed         

In [52]:
fig=px.imshow(train_df.corr(), text_auto=True)
fig.update_layout(
width=800,
height=800
)
fig.show()

In [53]:
X_train=train_df.drop('stroke', axis=1, inplace=False)
y_train=train_df['stroke']

In [54]:
X_test=test_df.drop('stroke', axis=1, inplace=False)
y_test=test_df['stroke']

In [55]:
scaler=StandardScaler()

In [56]:
X_train=pd.DataFrame(data=scaler.fit_transform(X_train), columns=X_train.columns)

In [57]:
X_test=pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns)

In [58]:
balance=SMOTE()
X_train_res, y_train_res=balance.fit_resample(X_train,y_train.ravel())

In [59]:
X_train.head(3)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0.840943,-0.543706,-0.3312,-0.243633,0.719471,-1.016033,-0.062116,-1.100193,2.59908,-0.070126,-1.150426,-0.439245,-0.394748,-0.654997,2.162496,-0.766682,-0.425692
1,-1.189142,-0.101211,-0.3312,-0.243633,0.719471,0.98422,-0.489323,0.047848,2.59908,-0.070126,-1.150426,-0.439245,-0.394748,-0.654997,-0.462429,1.304322,-0.425692
2,-1.189142,0.297034,-0.3312,4.104529,-1.38991,0.98422,-0.451544,0.387042,-0.384751,-0.070126,0.869243,-0.439245,-0.394748,-0.654997,-0.462429,1.304322,-0.425692


In [60]:
X_test=pd.DataFrame(data=scaler.transform(X_test), columns=X_test.columns)

In [61]:
lr_model=LogisticRegression()

In [62]:
y_train

795     0
4106    0
1318    0
4846    0
532     0
       ..
3803    0
4764    0
167     1
1013    0
2819    0
Name: stroke, Length: 4087, dtype: int64

In [63]:
lr_model.fit(X_train, y_train)

In [64]:
y_pred_lr=lr_model.predict(X_test)

In [65]:
print (classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.94       986



In [66]:
model_lr=LogisticRegression()
grid_params_lr={
    'penalty': ['none','elasticnet', ']1', ']2'],
    'C':[0.001, 0.01,0.1,1,10,100]
}
grid_lr=GridSearchCV(model_lr, grid_params_lr, verbose=1, cv=5, scoring='recall')
grid_lr.fit(X_train_res, y_train_res)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [67]:
y_pred_lr=grid_lr.predict(X_test)

In [68]:
print (classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.93       986



In [69]:
y_pred_lr_res=grid_lr.predict(X_test)

In [70]:
print (classification_report(y_test, y_pred_lr_res))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.93       986



In [71]:
models={
    'Logistic Regression':LogisticRegression(),
    'SGD Classifier':SGDClassifier(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifer': RandomForestClassifier(),
    'K Neighbors': KNeighborsClassifier(),
    'Gradient Boosting Classifier': GradientBoostingClassifier(),
    'Multi-Layer Perceptron': MLPClassifier(),
    "SVC": SVC(),
    "Extra Trees Classifier": ExtraTreesClassifier(),
    'Ada Boost Classifier': AdaBoostClassifier()
}
for key, model in models.items():
    print (key)
    model.fit(X_train_res, y_train_res)
    y_pred=model.predict(X_test)
    
    print (classification_report(y_test, y_pred))
    print()

Logistic Regression
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.94       986


SGD Classifier
              precision    recall  f1-score   support

           0       0.95      0.86      0.90       943
           1       0.00      0.00      0.00        43

    accuracy                           0.82       986
   macro avg       0.47      0.43      0.45       986
weighted avg       0.91      0.82      0.86       986


Decision Tree Classifier
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.94       986


K Neighbors
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       943
           1       0.17      0.14      0.15        43

    accuracy                           0.93       986
   macro avg       0.57      0.55      0.56       986
weighted avg       0.93      0.93      0.93       986


Gradient Boosting Classifier


              precision    recall  f1-score   support

           0       0.96      0.52      0.67       943
           1       0.04      0.49      0.08        43

    accuracy                           0.52       986
   macro avg       0.50      0.50      0.38       986
weighted avg       0.92      0.52      0.65       986


Multi-Layer Perceptron


              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.94       986


SVC


              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.94       986


Extra Trees Classifier


              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.94       986


Ada Boost Classifier


              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.94       986




In [72]:
model_sgd=SGDClassifier()
grid_params_sgd={
    'loss': ['hinge', 'log_loss','log','modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
}
grid_sgd=GridSearchCV(model_sgd, grid_params_sgd, verbose=1, cv=5, scoring='recall')
grid_sgd.fit(X_train_res, y_train_res)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [73]:
print (classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.93       986



In [74]:
y_pred_sgd=grid_sgd.predict(X_test)

In [75]:
print (classification_report(y_test, y_pred_sgd))

              precision    recall  f1-score   support

           0       0.95      0.86      0.90       943
           1       0.00      0.00      0.00        43

    accuracy                           0.82       986
   macro avg       0.47      0.43      0.45       986
weighted avg       0.91      0.82      0.86       986



In [76]:
model_mlp=MLPClassifier()
model_mlp.fit(X_train_res,y_train_res)
print()




In [77]:
y_pred_mlp=model_mlp.predict(X_test)

In [78]:
y_pred_mlp

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [79]:
print(classification_report(y_test,y_pred_mlp))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.94       986



In [80]:
model_svc=SVC()
model_svc.fit(X_train_res,y_train_res)


In [81]:
y_pred_svc=model_svc.predict(X_test)

In [82]:
print (classification_report(y_test,y_pred_svc))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.94       986



In [83]:
model_dtc=DecisionTreeClassifier()
grid_params_dtc={
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'min_samples_split': [2 , 4, 8, 16],
    'max_depth': [None, 5, 10, 15, 20],
}
grid_dtc=GridSearchCV(model_dtc, grid_params_dtc, cv=5, verbose=1, scoring='recall')
grid_dtc.fit(X_train_res, y_train_res)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [84]:
y_pred_dtc=grid_dtc.predict(X_test)

In [85]:
print(classification_report(y_test, y_pred_dtc))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       943
           1       0.00      0.00      0.00        43

    accuracy                           0.96       986
   macro avg       0.48      0.50      0.49       986
weighted avg       0.91      0.96      0.94       986



In [86]:
cf_dtc=confusion_matrix(y_test, y_pred_dtc)

In [87]:
fig=px.imshow(cf_dtc, text_auto=True)
fig.show()

In [88]:
#model_mlp=MLPClassifier()
#grid_params_mlp={
#    'solver': ['lbfgs','sgd', 'adam'],
#    'activation' : ['identity', 'logistic', 'tanh', 'relu'],
#    'alpha': [0.0001, 0.001, 0.01],
#    'learning_rate' : ['constant', 'invscaling', 'adaptive'],
#    'learning_rate_init' : [0.0001, 0.001, 0.01]
#}
#grid_mlp=GridSearchCV(model_mlp, grid_params_mlp, cv=5, verbose=1, scoring='recall')
#grid_mlp.fit(X_train_res, y_train_res)

In [89]:
#solver=3
#activation=4
#alpha=3
#learning_rate=3
#learning_rate_init=3

In [90]:
cf_lr=confusion_matrix(y_test, y_pred_dtc)

In [91]:
fig=px.imshow(cf_lr, text_auto=True)
fig.show()