In [301]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn import metrics

In [272]:
# Location of the raw stroke dataset, held in one named constant so the
# path only has to be changed in a single place.
STROKE_CSV_PATH = 'D:/Data Science Hacktiv8/data/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(STROKE_CSV_PATH)

In [273]:
# Preview the loaded frame; pandas elides the middle rows of a long frame
# in the rendered output.
data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [274]:
def find_normal_boundaries(df, variable, distance=3):
    """Return outlier limits for a roughly normal column as mean +/- k * std.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    variable : str
        Name of the column to inspect.
    distance : float, default 3
        Number of standard deviations away from the mean at which the
        limits are placed. The default reproduces the previous fixed
        3-sigma rule, so existing two-argument calls are unaffected.

    Returns
    -------
    tuple
        ``(upper_boundary, lower_boundary)`` -- note the upper limit comes
        first, matching ``find_skewed_boundaries``.
    """
    # Compute each statistic once instead of once per boundary.
    center = df[variable].mean()
    spread = df[variable].std()

    upper_boundary = center + distance * spread
    lower_boundary = center - distance * spread

    return upper_boundary, lower_boundary

def find_skewed_boundaries(df, variable, distance):
    """Return outlier limits for a skewed column using the IQR rule.

    The limits sit ``distance`` interquartile ranges below the first
    quartile and above the third quartile of ``df[variable]``.

    Returns
    -------
    tuple
        ``(upper_boundary, lower_boundary)`` -- upper limit first.
    """
    column = df[variable]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    IQR = third_quartile - first_quartile

    lower_boundary = first_quartile - (IQR * distance)
    upper_boundary = third_quartile + (IQR * distance)

    return upper_boundary, lower_boundary

# --- Outlier inspection and basic cleaning ---------------------------------
# The bare expressions below (`.skew()`, boundary tuples, `.isnull().sum()`,
# `.value_counts()`) are not the last expression of the cell, so Jupyter does
# not render them; only the print() calls produce output.

# age: limits from the mean +/- 3 * std rule
data['age'].skew()

upper_boundary1, lower_boundary1 = find_normal_boundaries(data, 'age')
upper_boundary1, lower_boundary1

# Report against the computed limit rather than a hard-coded age.
n_age_high = int((data['age'] > upper_boundary1).sum())
print('Older than {:.0f} : {}'.format(upper_boundary1, n_age_high))
print('% of older than {:.0f} : {}'.format(upper_boundary1, n_age_high / len(data) * 100))

# hypertension / heart_disease: limits are computed for inspection only
data['hypertension'].skew()

upper_boundary2, lower_boundary2 = find_skewed_boundaries(data, 'hypertension', 1.5)
upper_boundary2, lower_boundary2

data['heart_disease'].skew()

upper_boundary3, lower_boundary3 = find_skewed_boundaries(data, 'heart_disease', 1.5)
upper_boundary3, lower_boundary3

# avg_glucose_level: report the share of outliers, then cap them at the limits
data['avg_glucose_level'].skew()

upper_boundary4, lower_boundary4 = find_skewed_boundaries(data, 'avg_glucose_level', 1.5)
upper_boundary4, lower_boundary4

n_glucose_high = int((data['avg_glucose_level'] > upper_boundary4).sum())
n_glucose_low = int((data['avg_glucose_level'] < lower_boundary4).sum())
print('% right end outliers : {}'.format(n_glucose_high / len(data) * 100))
print('% left end outliers  : {}'.format(n_glucose_low / len(data) * 100))

# Series.clip caps values outside [lower, upper] and leaves the rest
# (including missing values) untouched -- same result as the nested np.where.
data['avg_glucose_level'] = data['avg_glucose_level'].clip(lower=lower_boundary4, upper=upper_boundary4)

# bmi: outliers are reported but NOT removed. The earlier version assigned a
# bmi-filtered frame to X_train here, but that name is rebound by
# train_test_split further down before it is ever read, so the filter never
# took effect. The dead assignment is dropped to make that explicit.
data['bmi'].skew()

upper_boundary5, lower_boundary5 = find_skewed_boundaries(data, 'bmi', 1.5)
upper_boundary5, lower_boundary5

n_bmi_high = int((data['bmi'] > upper_boundary5).sum())
n_bmi_low = int((data['bmi'] < lower_boundary5).sum())
print('% right end outliers : {}'.format(n_bmi_high / len(data) * 100))
print('% left end outliers  : {}'.format(n_bmi_low / len(data) * 100))

data.isnull().sum()

# Assign the filled column back instead of calling fillna(inplace=True) on
# data['bmi']: the chained in-place form is deprecated in recent pandas and
# does not update `data` when Copy-on-Write is enabled.
# NOTE(review): the mean is taken over the full dataset, i.e. before the
# train/test split performed later in this cell -- confirm that is intended.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

data['gender'].value_counts()

# Fold the 'Other' gender label into 'Female'.
data['gender'] = data['gender'].replace(['Other'], 'Female')

# --- Train/test split and feature preprocessing -----------------------------
X = data.drop('stroke', axis = 1)
y = data['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=15)

# Rebind to the result of drop() instead of using inplace=True: the frames
# returned by train_test_split are derived from X, and mutating them in place
# is what raised pandas' "set on a copy of a slice" warning.
X_train = X_train.drop(columns='id')
X_test = X_test.drop(columns='id')

num_columns = X_train.select_dtypes(include=np.number).columns.tolist()
cat_columns = X_train.select_dtypes(include=['object']).columns.tolist()

print('Numerical Columns : ', num_columns)
print('Categorical Columns : ', cat_columns)

X_train_num = X_train[num_columns]
X_train_cat = X_train[cat_columns]

X_test_num = X_test[num_columns]
X_test_cat = X_test[cat_columns]

# Fit the scaler on the training partition only, then apply it to both.
scaler = MinMaxScaler()
scaler.fit(X_train_num)

X_train_num_scaled = scaler.transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

# Same pattern for the categorical encoder: fit on train, transform both.
encoder = OrdinalEncoder()
encoder.fit(X_train_cat)

X_train_cat_enc = encoder.transform(X_train_cat)
X_test_cat_enc = encoder.transform(X_test_cat)

# Numeric columns first, then encoded categoricals -- the column labels
# below rely on this order.
X_train_final = np.concatenate([X_train_num_scaled, X_train_cat_enc], axis=1)
X_test_final = np.concatenate([X_test_num_scaled, X_test_cat_enc], axis=1)

# Pass a flat list of labels. Wrapping it in another list (as before) makes
# pandas build a one-level MultiIndex, so X_train_final_df['age'] no longer
# returns a plain column.
X_train_final_df = pd.DataFrame(X_train_final, columns=num_columns + cat_columns)
X_train_final_df

Older than 111 : 0
% of older than 111 : 0.0
% right end outliers : 12.270058708414872
% left end outliers  : 0.0
% right end outliers : 2.152641878669276
% left end outliers  : 0.0
Numerical Columns :  ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
Categorical Columns :  ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender,ever_married,work_type,Residence_type,smoking_status
0,0.133301,0.0,0.0,0.348134,0.187717,0.0,0.0,4.0,1.0,2.0
1,0.084473,0.0,0.0,0.262873,0.044032,0.0,0.0,4.0,0.0,0.0
2,0.645996,0.0,0.0,0.912748,0.207416,1.0,1.0,0.0,0.0,2.0
3,0.328613,0.0,0.0,0.073619,0.180765,1.0,0.0,3.0,0.0,2.0
4,0.426270,0.0,0.0,0.182777,0.308227,0.0,1.0,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
4083,0.438477,0.0,0.0,0.170785,0.235226,1.0,1.0,2.0,0.0,1.0
4084,0.987793,0.0,0.0,0.296750,0.180765,1.0,1.0,0.0,0.0,2.0
4085,0.975586,0.0,1.0,0.419652,0.202781,0.0,1.0,3.0,0.0,2.0
4086,0.938965,0.0,0.0,1.000000,0.250290,0.0,1.0,2.0,1.0,2.0


In [290]:
# Support-vector classifier with a polynomial kernel; every other
# hyperparameter is left at the library default.
clf = svm.SVC(kernel='poly')

In [291]:
# Train on the preprocessed training matrix; the fitted estimator's repr is
# the cell output.
clf.fit(X_train_final, y_train)

SVC(kernel='poly')

In [292]:
# Predict on both partitions. Only y_pred_test is consumed by the cells
# visible here; y_pred_train is available for a train-vs-test comparison.
y_pred_train = clf.predict(X_train_final)
y_pred_test = clf.predict(X_test_final)

In [293]:
# Share of test rows whose predicted label matches the true label.
test_accuracy = metrics.accuracy_score(y_test, y_pred_test)
print("Accuracy:", test_accuracy)

Accuracy: 0.9461839530332681
