In [99]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn import svm
from sklearn.metrics import mean_absolute_error

In [100]:
data = pd.read_csv('D:/Data Science Hacktiv8/data/healthcare-dataset-stroke-data.csv')

In [101]:
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [102]:
data_inf = data.sample(10, random_state=15)
data_train_test = data.drop(data_inf.index)
data_train_test.reset_index(drop=True, inplace=True)
data_inf.reset_index(drop=True, inplace=True)
data_inf

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,46875,Male,35.0,0,0,Yes,Private,Urban,145.23,32.3,never smoked,0
1,28108,Female,62.0,0,0,Yes,Private,Rural,82.57,27.5,Unknown,0
2,6726,Female,31.0,0,0,Yes,Private,Urban,73.31,45.0,never smoked,0
3,56855,Male,46.0,0,0,Yes,Private,Urban,137.77,29.3,never smoked,0
4,40371,Female,47.0,0,0,Yes,Private,Urban,62.47,26.5,never smoked,0
5,46068,Male,58.0,0,0,No,Self-employed,Rural,170.93,30.7,Unknown,0
6,57667,Male,12.0,0,0,No,children,Urban,70.07,24.5,formerly smoked,0
7,58761,Male,52.0,0,0,Yes,Private,Urban,87.51,30.5,formerly smoked,0
8,41942,Female,37.0,0,0,Yes,Private,Urban,247.87,42.6,never smoked,0
9,46130,Female,57.0,0,0,Yes,Self-employed,Urban,142.31,35.2,smokes,0


In [103]:
def find_normal_boundaries(df, variable):
    upper_boundary = df[variable].mean() + 3 * df[variable].std()
    lower_boundary = df[variable].mean() - 3 * df[variable].std()

    return upper_boundary, lower_boundary

def find_skewed_boundaries(df, variable, distance):
    IQR = df[variable].quantile(0.75) - df[variable].quantile(0.25)

    lower_boundary = df[variable].quantile(0.25) - (IQR * distance)
    upper_boundary = df[variable].quantile(0.75) + (IQR * distance)

    return upper_boundary, lower_boundary

In [104]:
data['age'].skew()

upper_boundary1, lower_boundary1 = find_normal_boundaries(data, 'age')
upper_boundary1, lower_boundary1

print('Older than 111 : {}'.format(len(data[data['age'] > upper_boundary1])))
print('% of older than 111 : {}'.format(len(data[data['age'] > upper_boundary1]) / len(data) * 100))

Older than 111 : 0
% of older than 111 : 0.0


In [105]:
data['hypertension'].skew()

upper_boundary2, lower_boundary2 = find_skewed_boundaries(data, 'hypertension', 1.5)
upper_boundary2, lower_boundary2


(0.0, 0.0)

In [106]:
data['heart_disease'].skew()

upper_boundary3, lower_boundary3 = find_skewed_boundaries(data, 'heart_disease', 1.5)
upper_boundary3, lower_boundary3

(0.0, 0.0)

In [107]:
data['avg_glucose_level'].skew()

upper_boundary4, lower_boundary4 = find_skewed_boundaries(data, 'avg_glucose_level', 1.5)
upper_boundary4, lower_boundary4

print('% right end outliers : {}'.format(len(data[data['avg_glucose_level'] > upper_boundary4]) / len(data) * 100))
print('% left end outliers  : {}'.format(len(data[data['avg_glucose_level'] < lower_boundary4]) / len(data) * 100))

data['avg_glucose_level'] = np.where(data['avg_glucose_level']>upper_boundary4,upper_boundary4,np.where(data['avg_glucose_level']<lower_boundary4,lower_boundary4,data['avg_glucose_level']))

% right end outliers : 12.270058708414872
% left end outliers  : 0.0


In [108]:
data['bmi'].skew()

upper_boundary5, lower_boundary5 = find_skewed_boundaries(data, 'bmi', 1.5)
upper_boundary5, lower_boundary5

print('% right end outliers : {}'.format(len(data[data['bmi'] > upper_boundary5]) / len(data) * 100))
print('% left end outliers  : {}'.format(len(data[data['bmi'] < lower_boundary5]) / len(data) * 100))

X_train = data[(data['bmi'] < upper_boundary5) & (data['bmi'] > lower_boundary5)]

% right end outliers : 2.152641878669276
% left end outliers  : 0.0


In [109]:
data.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [110]:
data['bmi'].fillna(data['bmi'].mean(), inplace = True)

In [111]:
data['gender'].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [112]:
data['gender'] = data['gender'].replace('Other','Female')

In [113]:
X = data.drop(['id','stroke'], axis = 1)
y = data['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=15)

In [114]:
num_columns = X_train.select_dtypes(include=np.number).columns.tolist()
cat_columns = X_train.select_dtypes(include=['object']).columns.tolist()

print('Numerical Columns : ', num_columns)
print('Categorical Columns : ', cat_columns)

Numerical Columns :  ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
Categorical Columns :  ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']


In [115]:
X_train_num = X_train[num_columns]
X_train_cat = X_train[cat_columns]

X_test_num = X_test[num_columns]
X_test_cat = X_test[cat_columns]

In [116]:
scaler = MinMaxScaler()
scaler.fit(X_train_num)

X_train_num_scaled = scaler.transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

encoder = OrdinalEncoder()
encoder.fit(X_train_cat)

X_train_cat_enc = encoder.transform(X_train_cat)
X_test_cat_enc = encoder.transform(X_test_cat)

In [117]:
X_train_final = np.concatenate([X_train_num_scaled, X_train_cat_enc], axis=1)
X_test_final = np.concatenate([X_test_num_scaled, X_test_cat_enc], axis=1)

X_train_final_df = pd.DataFrame(X_train_final, columns=[num_columns + cat_columns])
X_train_final_df

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender,ever_married,work_type,Residence_type,smoking_status
0,0.133301,0.0,0.0,0.348134,0.187717,0.0,0.0,4.0,1.0,2.0
1,0.084473,0.0,0.0,0.262873,0.044032,0.0,0.0,4.0,0.0,0.0
2,0.645996,0.0,0.0,0.912748,0.207416,1.0,1.0,0.0,0.0,2.0
3,0.328613,0.0,0.0,0.073619,0.180765,1.0,0.0,3.0,0.0,2.0
4,0.426270,0.0,0.0,0.182777,0.308227,0.0,1.0,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4083,0.438477,0.0,0.0,0.170785,0.235226,1.0,1.0,2.0,0.0,1.0
4084,0.987793,0.0,0.0,0.296750,0.180765,1.0,1.0,0.0,0.0,2.0
4085,0.975586,0.0,1.0,0.419652,0.202781,0.0,1.0,3.0,0.0,2.0
4086,0.938965,0.0,0.0,1.000000,0.250290,0.0,1.0,2.0,1.0,2.0


In [118]:
model = svm.SVC()

In [119]:
model.fit(X_train_final, y_train)

SVC()

In [120]:
y_pred_train = model.predict(X_train_final)
y_pred_test = model.predict(X_test_final)

In [121]:
mean_absolute_error(y_train, y_pred_train)

0.0474559686888454

In [122]:
mean_absolute_error(y_test, y_pred_test)

0.053816046966731895

In [123]:
data_inf_num = data_inf[num_columns]
data_inf_cat = data_inf[cat_columns]

data_inf_num_scaled = scaler.transform(data_inf_num)
data_inf_cat_enc = encoder.transform(data_inf_cat)

data_inf_final = np.concatenate([data_inf_num_scaled, data_inf_cat_enc], axis=1)
data_inf_final_df = pd.DataFrame(data_inf_final, columns=[num_columns + cat_columns])
data_inf_final_df 

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender,ever_married,work_type,Residence_type,smoking_status
0,0.42627,0.0,0.0,0.788795,0.243337,1.0,1.0,2.0,1.0,2.0
1,0.755859,0.0,0.0,0.240289,0.187717,0.0,1.0,2.0,0.0,0.0
2,0.377441,0.0,0.0,0.15923,0.390498,0.0,1.0,2.0,1.0,2.0
3,0.560547,0.0,0.0,0.723493,0.208575,1.0,1.0,2.0,1.0,2.0
4,0.572754,0.0,0.0,0.06434,0.17613,0.0,1.0,2.0,1.0,2.0
5,0.707031,0.0,0.0,1.013765,0.224797,1.0,0.0,3.0,0.0,0.0
6,0.145508,0.0,0.0,0.130868,0.152955,1.0,0.0,4.0,1.0,1.0
7,0.633789,0.0,0.0,0.283532,0.22248,1.0,1.0,2.0,1.0,1.0
8,0.450684,0.0,0.0,1.687274,0.362688,0.0,1.0,2.0,1.0,2.0
9,0.694824,0.0,0.0,0.763234,0.276941,0.0,1.0,3.0,1.0,3.0


In [124]:
y_pred_inf = model.predict(data_inf_final)
y_pred_inf_df = pd.DataFrame(y_pred_inf, columns=['Stroke - Prediction'])
y_pred_inf_df

Unnamed: 0,Stroke - Prediction
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


In [125]:
pd.concat([data_inf, y_pred_inf_df], axis=1)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,Stroke - Prediction
0,46875,Male,35.0,0,0,Yes,Private,Urban,145.23,32.3,never smoked,0,0
1,28108,Female,62.0,0,0,Yes,Private,Rural,82.57,27.5,Unknown,0,0
2,6726,Female,31.0,0,0,Yes,Private,Urban,73.31,45.0,never smoked,0,0
3,56855,Male,46.0,0,0,Yes,Private,Urban,137.77,29.3,never smoked,0,0
4,40371,Female,47.0,0,0,Yes,Private,Urban,62.47,26.5,never smoked,0,0
5,46068,Male,58.0,0,0,No,Self-employed,Rural,170.93,30.7,Unknown,0,0
6,57667,Male,12.0,0,0,No,children,Urban,70.07,24.5,formerly smoked,0,0
7,58761,Male,52.0,0,0,Yes,Private,Urban,87.51,30.5,formerly smoked,0,0
8,41942,Female,37.0,0,0,Yes,Private,Urban,247.87,42.6,never smoked,0,0
9,46130,Female,57.0,0,0,Yes,Self-employed,Urban,142.31,35.2,smokes,0,0


In [126]:
model.score(X_test_final, y_test)

0.9461839530332681

In [127]:
model.score(X_train_final, y_train)

0.9525440313111546