# Building Predictive Models

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

## Import Data

In [2]:
# set the path of the processed data
processed_data_path = os.path.join(os.path.pardir,'data','processed')
train_file_path = os.path.join(processed_data_path,'train.csv')
test_file_path = os.path.join(processed_data_path,'test.csv')


In [3]:
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)


### diagnosis of heart disease

In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294 entries, 0 to 293
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              294 non-null    int64  
 1   diagnosed               294 non-null    int64  
 2   age                     294 non-null    float64
 3   sex                     294 non-null    float64
 4   trestbps                294 non-null    float64
 5   chol                    294 non-null    int64  
 6   fbs                     294 non-null    int64  
 7   restecg                 294 non-null    int64  
 8   thalach                 294 non-null    float64
 9   exang                   294 non-null    int64  
 10  oldpeak                 294 non-null    float64
 11  slope                   294 non-null    int64  
 12  ca                      294 non-null    float64
 13  thal                    294 non-null    float64
 14  num                     294 non-null    in

In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              303 non-null    int64  
 1   diagnosed               303 non-null    int64  
 2   age                     303 non-null    float64
 3   sex                     303 non-null    float64
 4   trestbps                303 non-null    float64
 5   chol                    303 non-null    int64  
 6   fbs                     303 non-null    int64  
 7   restecg                 303 non-null    int64  
 8   thalach                 303 non-null    float64
 9   exang                   303 non-null    int64  
 10  oldpeak                 303 non-null    float64
 11  slope                   303 non-null    int64  
 12  ca                      303 non-null    float64
 13  thal                    303 non-null    float64
 14  num                     303 non-null    in

### diagnosis of heart disease

In [6]:
test_df.num.value_counts()

0    164
1     55
2     36
3     35
4     13
Name: num, dtype: int64

In [7]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,diagnosed,age,sex,trestbps,chol,fbs,restecg,thalach,exang,...,cp_3.0,cp_4.0,cat_A,cat_B,cat_C,cat_D,cat_E,cat_F,cat_G,cat_Z
0,0,0,28.0,1.0,130.0,132,0,2,185.0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,29.0,1.0,120.0,243,0,0,160.0,0,...,0,0,0,0,0,0,1,0,0,0
2,2,0,29.0,1.0,140.0,242,0,0,170.0,0,...,0,0,0,0,0,0,1,0,0,0
3,3,0,30.0,0.0,170.0,237,0,1,170.0,0,...,0,0,0,0,0,0,1,0,0,0
4,4,0,31.0,0.0,100.0,219,0,1,150.0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,diagnosed,age,sex,trestbps,chol,fbs,restecg,thalach,exang,...,cp_3.0,cp_4.0,cat_A,cat_B,cat_C,cat_D,cat_E,cat_F,cat_G,cat_Z
0,0,-888,63.0,1.0,145.0,233,1,2,150.0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,-888,67.0,1.0,160.0,286,0,2,108.0,1,...,0,1,0,1,0,0,0,0,0,0
2,2,-888,67.0,1.0,120.0,229,0,2,129.0,1,...,0,1,0,0,1,0,0,0,0,0
3,3,-888,37.0,1.0,130.0,250,0,0,187.0,0,...,1,0,0,0,0,0,0,1,0,0
4,4,-888,41.0,0.0,130.0,204,0,2,172.0,0,...,0,0,0,0,0,0,1,0,0,0


### Data Preparation

In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              303 non-null    int64  
 1   diagnosed               303 non-null    int64  
 2   age                     303 non-null    float64
 3   sex                     303 non-null    float64
 4   trestbps                303 non-null    float64
 5   chol                    303 non-null    int64  
 6   fbs                     303 non-null    int64  
 7   restecg                 303 non-null    int64  
 8   thalach                 303 non-null    float64
 9   exang                   303 non-null    int64  
 10  oldpeak                 303 non-null    float64
 11  slope                   303 non-null    int64  
 12  ca                      303 non-null    float64
 13  thal                    303 non-null    float64
 14  num                     303 non-null    in

In [10]:
test_df['diagnosed'] = test_df['num'].apply(lambda x: 0 if x == 0 else 1)

In [11]:
import sklearn

In [12]:
# train test split
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=109)

X_train = train_df
y_train = train_df['diagnosed']

X_test = test_df
y_test = test_df['diagnosed'] #.ravel()


print (X_train.shape,y_train.shape)
print (X_test.shape, y_test.shape)

(294, 33) (294,)
(303, 33) (303,)


## Logistic Regression Model

In [13]:
# import function
from sklearn.linear_model import LogisticRegression

In [14]:
# create model
model_lr_1 = LogisticRegression(random_state=0)

In [15]:
# train model
model_lr_1.fit(X_train, y_train)

LogisticRegression(random_state=0)

In [16]:
# evaluate model
print('score for logistic regression - version 1 :{0:.2f}'.format(model_lr_1.score(X_test, y_test)))

score for logistic regression - version 1 :0.52


In [17]:
# model coefficients
model_lr_1.coef_

array([[ 3.34252441e-01,  4.35828505e-02, -5.83052313e-01,
        -2.96404731e-03, -1.23602987e-01, -1.87936610e-02,
         2.00333609e-03, -1.21635721e-02, -7.87707688e-02,
         4.12116435e-03, -5.94004657e-03,  1.99284967e-03,
         0.00000000e+00, -7.17636924e-03,  4.35828505e-02,
        -7.65908759e-03, -2.98628876e-02, -1.60856270e-03,
         3.17820354e-02, -9.03445673e-03,  1.68595427e-03,
        -3.55788423e-03, -1.44204437e-02,  2.54168087e-04,
         1.03756574e-02, -2.57678541e-03, -7.35037528e-04,
        -2.38737082e-02,  1.68721389e-02,  2.84517154e-03,
         1.08650870e-04,  0.00000000e+00,  1.10674200e-05]])

# Part 2

### Hyperparameter Optimization

In [18]:
# base model
model_lr = LogisticRegression(random_state=0)

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
parameters = {'C':[1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty':['l1','l2']}
clf = GridSearchCV(model_lr, param_grid=parameters, cv=3)

In [21]:
X2_test = test_df


In [22]:
y2_test = test_df['num'].ravel()

In [23]:
clf.fit(X2_test, y2_test)

GridSearchCV(cv=3, estimator=LogisticRegression(random_state=0),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']})

In [24]:
clf.best_params_

{'C': 10.0, 'penalty': 'l2'}

In [25]:
print ('best score:{0:.2f}'.format(clf.best_score_))

best score:0.71


In [26]:
# evaluate model
print('score for logistic regression - version 2 :{0:.2f}'.format(clf.score(X2_test,y2_test)))

score for logistic regression - version 2 :0.76


## Feature Normalization and Standardization

In [27]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

### Feature Normalization

In [28]:
# feature normalization
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [29]:
X_train_scaled[:,0].min(),X_train_scaled[:,0].max()

(0.0, 1.0)

In [30]:
# normalize test data
X_test_scaled = scaler.transform(X_test)

### Feature Standardization

In [31]:
# feature standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Create model after standardization

In [32]:
# base model
model_lr = LogisticRegression()
parameters={'C':[1.0,10.0,50.0, 100.0, 1000.0],'penalty':['l1','l2']}
clf = GridSearchCV(model_lr,param_grid=parameters,cv=3)
clf.fit(X_train_scaled,y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']})

In [33]:
rfc_predict = clf.predict(X_test_scaled)
rfc_predict[:200]

array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1], dtype=int64)

In [34]:
#evaluating Random Forest Classifier
print(classification_report(y_test, rfc_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       164
           1       1.00      1.00      1.00       139

    accuracy                           1.00       303
   macro avg       1.00      1.00      1.00       303
weighted avg       1.00      1.00      1.00       303



In [35]:
clf.best_score_

1.0

In [36]:
# evaluate model
print ('score for logistic regression - version 2: {0:.2f}'.format(clf.score(X_test_scaled,y_test)))

score for logistic regression - version 2: 1.00


## Model Persistence

In [37]:
# import pickle library
import pickle

In [38]:
# create the file paths
model_file_path = os.path.join(os.path.pardir,'models','lr_model.pkl')
scaler_file_path = os.path.join(os.path.pardir,'models','lr_scaler.pkl')


In [39]:
# open the files to write
model_file_pickle = open(model_file_path,'wb')
scaler_file_pickle = open(scaler_file_path,'wb')


In [40]:
# persist the model and scaler
pickle.dump(clf,model_file_pickle)
pickle.dump(scaler,scaler_file_pickle)

In [41]:
#close the file
model_file_pickle.close()
scaler_file_pickle.close()

### load the persisted file

In [42]:
# open files in read mode
model_file_pickle = open(model_file_path,'rb')
scaler_file_pickle = open(scaler_file_path,'rb')
# load files
clf_loaded = pickle.load(model_file_pickle)
scaler_loaded = pickle.load(scaler_file_pickle)
#close files
model_file_pickle.close()
scaler_file_pickle.close()



In [43]:
clf_loaded

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
                         'penalty': ['l1', 'l2']})

In [44]:
scaler_loaded

StandardScaler()

In [45]:
# transform the test data using loaded scaler object
X_test_scaled = scaler_loaded.transform(X_test)
# calculate the score using loaded model object
print ('score for persisted logistic regression :{0:.2f}'.format(clf_loaded.score(X_test_scaled,y_test)))

score for persisted logistic regression :1.00


# Random Forest Classifier

In [46]:
from sklearn.ensemble import RandomForestClassifier
#training Random Forest Classifier and making prediction
rfc = RandomForestClassifier(n_estimators=200)
rfc.fit(X_train, y_train)
rfc_predict = rfc.predict(X_test)
rfc_predict[:2000]

array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [47]:
#evaluating Random Forest Classifier
print(classification_report(y_test, rfc_predict))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00       164
           1       1.00      1.00      1.00       139

    accuracy                           1.00       303
   macro avg       1.00      1.00      1.00       303
weighted avg       1.00      1.00      1.00       303



## Support Vector Machine

In [48]:
from sklearn.svm import SVC
# training Support Vector Machine and making prediction
svmc = SVC()
svmc.fit(X_train,y_train)
svmc_predict = svmc.predict(X_test)
svmc_predict[:2000]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [49]:
print(classification_report(y_test, rfc_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       164
           1       1.00      1.00      1.00       139

    accuracy                           1.00       303
   macro avg       1.00      1.00      1.00       303
weighted avg       1.00      1.00      1.00       303



## Decision Tree

In [50]:
from sklearn.tree import DecisionTreeClassifier
# training Decision Tree Classifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
dt_predict = decision_tree.predict(X_test)
dt_predict[:2000]


array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [51]:
print(classification_report(y_test, dt_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       164
           1       1.00      1.00      1.00       139

    accuracy                           1.00       303
   macro avg       1.00      1.00      1.00       303
weighted avg       1.00      1.00      1.00       303



In [52]:
from sklearn.naive_bayes import GaussianNB
## Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
nb_predict = nb.predict(X_test)
nb_predict[:2000]

array([0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [53]:
print(classification_report(y_test, nb_predict))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       164
           1       1.00      1.00      1.00       139

    accuracy                           1.00       303
   macro avg       1.00      1.00      1.00       303
weighted avg       1.00      1.00      1.00       303



In [54]:
## Logistic Regression
lr = LogisticRegression(max_iter=1000)
result = lr.fit(X_train, y_train)
lr_predict = lr.predict(X_test)
lr_predict[:2000]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [55]:
print(classification_report(y_test, lr_predict))

              precision    recall  f1-score   support

           0       0.54      0.65      0.59       164
           1       0.46      0.36      0.40       139

    accuracy                           0.51       303
   macro avg       0.50      0.50      0.50       303
weighted avg       0.51      0.51      0.51       303



## Ensemble Learning

In [56]:
# import function
from sklearn.linear_model import LogisticRegression
# creating instances of classifiers to be used in the ensemble
lr = LogisticRegression()
svm = SVC()
rf = RandomForestClassifier(n_estimators=100)
nb = GaussianNB()
dt = DecisionTreeClassifier()

### voting classifier that combines four different estimators

In [57]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

evc = VotingClassifier(estimators=[('svm',svm),('nb',nb),('rf',rf),('lr',lr)], voting='hard')

evc.fit(X_train,y_train)
evc.score(X_test, y_test)

0.7953795379537953

### bagging classifier with 100 random forest estimators

In [58]:
from sklearn.ensemble import BaggingClassifier

bg = BaggingClassifier(rf, max_samples=0.6, max_features=1.0, n_estimators=100)

bg.fit(X_train, y_train)
bg.score(X_test, y_test)

1.0

### bagging classifier with 100 support vector machine estimators

In [59]:
bg = BaggingClassifier(nb, max_samples=0.6, max_features=1.0, n_estimators = 100)

bg.fit(X_train, y_train)
bg.score(X_test, y_test)

1.0

### Prepare data for Machine Learning API using Flask

In [60]:
import bz2
import pickle

# Load Model and Scaler Files
#model_path = os.path.join(os.path.pardir,os.path.pardir,'models')
model_path = os.path.join('','','models')

model_filepath = os.path.join(model_path, 'lr_model.pkl')
scaler_filepath = os.path.join(model_path, 'lr_scaler.pkl')



In [61]:
model_filepath

'models\\lr_model.pkl'

In [62]:
scaler_filepath

'models\\lr_scaler.pkl'

In [63]:
sfile = bz2.BZ2File(model_filepath, 'w')
pickle.dump(X_test, sfile)

In [64]:
sfile = bz2.BZ2File(scaler_filepath, 'w')
pickle.dump(y_test, sfile)