In [1]:
import numpy as np
import pandas as pd
import io
import requests
from matplotlib import pyplot as plt
import pickle
import os
from scipy.stats import zscore
from pandas.api.types import CategoricalDtype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import cross_val_score
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
import sweetviz as sv

In [2]:
df=pd.read_csv('cardio_train.csv')
df

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,988,22469,1,155,69.0,130,80,2,2,0,0,1,0
1,989,14648,1,163,71.0,110,70,1,1,0,0,1,1
2,990,21901,1,165,70.0,120,80,1,1,0,0,1,0
3,991,14549,2,165,85.0,120,80,1,1,1,1,1,0
4,992,23393,1,155,62.0,120,80,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69296,99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
69297,99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
69298,99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
69299,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


# Analysing the Data & Applying AutoEDA

In [3]:
advert_report = sv.analyze(df)

:FEATURES DONE:                    |█████████████████████| [100%]   00:26  -> (00:00 left)
:PAIRWISE DONE:                    |█████████████████████| [100%]   00:08  -> (00:00 left)


Creating Associations graph... DONE!


In [4]:
advert_report.show_html('cardio_train.html')

Report cardio_train.html was generated! NOTEBOOK/COLAB USERS: no browser will pop up, the report is saved in your notebook/colab files.


In [5]:
df1 = sv.compare(df[100:],df[:100])
df1.show_html('Compare.html')

:FEATURES DONE:                    |█████████████████████| [100%]   00:10  -> (00:00 left)
:PAIRWISE DONE:                    |█████████████████████| [100%]   00:11  -> (00:00 left)


Creating Associations graph... DONE!
Report Compare.html was generated! NOTEBOOK/COLAB USERS: no browser will pop up, the report is saved in your notebook/colab files.


# Removing the Outliers

In [6]:
z=np.abs(zscore(df))
z

array([[1.73243929, 1.21601824, 0.73302344, ..., 0.23864069, 0.49376463,
        0.99917784],
       [1.73240428, 1.95391548, 0.73302344, ..., 0.23864069, 0.49376463,
        1.00082284],
       [1.73236927, 0.98580185, 0.73302344, ..., 0.23864069, 0.49376463,
        0.99917784],
       ...,
       [1.7338761 , 0.16325352, 1.36421285, ..., 4.19040018, 2.02525646,
        1.00082284],
       [1.73394612, 1.20061644, 0.73302344, ..., 0.23864069, 2.02525646,
        1.00082284],
       [1.73398113, 0.43417421, 0.73302344, ..., 0.23864069, 0.49376463,
        0.99917784]])

In [7]:
threshold=3
print(np.where(z>3))

(array([    3,     3,    13, ..., 69296, 69297, 69298], dtype=int64), array([ 9, 10,  8, ...,  9,  4, 10], dtype=int64))


In [8]:
z[15][9]

0.31072843551154616

In [9]:
z[29][12]

1.0008228374844872

In [10]:
df_new=df[(z<3).all(axis=1)]
df_new

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,988,22469,1,155,69.0,130,80,2,2,0,0,1,0
1,989,14648,1,163,71.0,110,70,1,1,0,0,1,1
2,990,21901,1,165,70.0,120,80,1,1,0,0,1,0
4,992,23393,1,155,62.0,120,80,1,1,0,0,1,0
5,995,21143,1,164,61.0,100,70,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69293,99990,18792,1,161,56.0,170,90,1,1,0,0,1,1
69294,99991,19699,1,172,70.0,130,90,1,1,0,0,1,1
69295,99992,21074,1,165,80.0,150,80,1,1,0,0,1,1
69299,99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [11]:
df.shape

(69301, 13)

In [12]:
df_new.shape

(55140, 13)

In [13]:
feature_cols = ['id','age','gender','height','weight','ap_hi','ap_lo','cholesterol','gluc','smoke','alco','active']
x = df[feature_cols]
x.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,988,22469,1,155,69.0,130,80,2,2,0,0,1
1,989,14648,1,163,71.0,110,70,1,1,0,0,1
2,990,21901,1,165,70.0,120,80,1,1,0,0,1
3,991,14549,2,165,85.0,120,80,1,1,1,1,1
4,992,23393,1,155,62.0,120,80,1,1,0,0,1


In [14]:
feature_cols = ['cardio']
y = df[feature_cols]
y.head()

Unnamed: 0,cardio
0,0
1,1
2,0
3,0
4,0


In [15]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.35,random_state=40)

# Applying the Logistic Regression

In [16]:
lg=LogisticRegression()

In [17]:
lg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

# Applying Coefficient

In [18]:
lg.coef_

array([[-7.67238387e-07,  1.08973665e-04,  2.51070092e-01,
        -4.52623218e-02,  1.93643257e-02,  2.93646906e-02,
         4.55866495e-04,  4.69956236e-01, -1.09130423e-01,
        -1.19366586e-01, -1.71645669e-01, -2.34074937e-01]])

In [19]:
lg.intercept_

array([-0.46292581])

In [20]:
lg.score(x_train,y_train)

0.705072705072705

# Apply Cross-validation

In [21]:
cross_val_model = LogisticRegression(random_state=0)
scores = cross_val_score(cross_val_model, x_train, 
         y_train, cv=5)
print(np.mean(scores))

0.7034073924660119


In [22]:
pred=lg.predict(x_test)
print(pred)

[1 1 1 ... 0 0 1]


# Scaling the Data

In [23]:
feature_scaler = StandardScaler()
x_train=feature_scaler.fit_transform(x_train)
y_test=feature_scaler.transform(x_test)

# Training and Cross Validation

In [24]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=300,random_state=50)

# Applying Grid SearchCV model

In [25]:
grid_param = {
    'n_estimators': [100,300,500,800,1000],
    'criterion':['gini','entropy'],
    'bootstrap':[True,False]
}

In [26]:
gd_sr = GridSearchCV(estimator = classifier,
                    param_grid=grid_param,
                    scoring='accuracy',
                    cv=3,
                    n_jobs=1)

In [27]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [28]:
for train_index , test_index in kf.split([1,2,3,4,5,6,7]):
    print(train_index,test_index)

[3 4 5 6] [0 1 2]
[0 1 2 5 6] [3 4]
[0 1 2 3 4] [5 6]


Predicted output in 3 number of folds

In [29]:
def get_score(model,x_train,x_test,y_train,y_test):
    model.fit(x_train,y_train)
    return model.score(x_test, y_test)

In [30]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

In [31]:
pred=lg.predict(x_test)
print("Predicted Results : ", pred)

Predicted Results :  [1 1 1 ... 0 0 1]


In [32]:
# To save the model
from sklearn.externals import joblib
joblib.dump(pred, 'rf_regressor.pkl')

['rf_regressor.pkl']

# Output:
We have successfully built the predicted model by using Logistic Regression to classify the patients to be healthy or suffering from cardiovascular disease.