In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

In [2]:
# Import the data
stroke_info_df = pd.read_csv("data/stroke_data.csv")
stroke_info_df

Unnamed: 0.1,Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Ever Married,Work Type,Residence Type,Average Glucose Level,BMI,Smoking Status,Stroke
0,0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.9,never smoked,0
5106,5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [3]:
# Drop the Unnamed column
stroke_info_df = stroke_info_df.drop('Unnamed: 0', axis=1)
stroke_info_df

Unnamed: 0,Gender,Age,Hypertension,Heart Disease,Ever Married,Work Type,Residence Type,Average Glucose Level,BMI,Smoking Status,Stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.9,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.9,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
# Convert categorical data to numeric with `pd.get_dummies`
stroke_info_df = pd.get_dummies(stroke_info_df, dtype=float)
stroke_info_df.head()

Unnamed: 0,Age,Hypertension,Heart Disease,Average Glucose Level,BMI,Stroke,Gender_Female,Gender_Male,Gender_Other,Ever Married_No,...,Work Type_Never_worked,Work Type_Private,Work Type_Self-employed,Work Type_children,Residence Type_Rural,Residence Type_Urban,Smoking Status_formerly smoked,Smoking Status_never smoked,Smoking Status_smokes,Smoking Status_unknown
0,67.0,0,1,228.69,36.6,1,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,61.0,0,0,202.21,28.9,1,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,80.0,0,1,105.92,32.5,1,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,49.0,0,0,171.23,34.4,1,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,79.0,1,0,174.12,24.0,1,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [5]:
stroke_info_df.dtypes

Age                               float64
Hypertension                        int64
Heart Disease                       int64
Average Glucose Level             float64
BMI                               float64
Stroke                              int64
Gender_Female                     float64
Gender_Male                       float64
Gender_Other                      float64
Ever Married_No                   float64
Ever Married_Yes                  float64
Work Type_Govt_job                float64
Work Type_Never_worked            float64
Work Type_Private                 float64
Work Type_Self-employed           float64
Work Type_children                float64
Residence Type_Rural              float64
Residence Type_Urban              float64
Smoking Status_formerly smoked    float64
Smoking Status_never smoked       float64
Smoking Status_smokes             float64
Smoking Status_unknown            float64
dtype: object

In [6]:
# Get the target variables. 
y = stroke_info_df["Stroke"]

In [7]:
# Get the features. 
X = stroke_info_df.drop('Stroke', axis=1)

In [8]:
X.columns

Index(['Age', 'Hypertension', 'Heart Disease', 'Average Glucose Level', 'BMI',
       'Gender_Female', 'Gender_Male', 'Gender_Other', 'Ever Married_No',
       'Ever Married_Yes', 'Work Type_Govt_job', 'Work Type_Never_worked',
       'Work Type_Private', 'Work Type_Self-employed', 'Work Type_children',
       'Residence Type_Rural', 'Residence Type_Urban',
       'Smoking Status_formerly smoked', 'Smoking Status_never smoked',
       'Smoking Status_smokes', 'Smoking Status_unknown'],
      dtype='object')

In [9]:
# Split data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [10]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [11]:
X_test

Unnamed: 0,Age,Hypertension,Heart Disease,Average Glucose Level,BMI,Gender_Female,Gender_Male,Gender_Other,Ever Married_No,Ever Married_Yes,...,Work Type_Never_worked,Work Type_Private,Work Type_Self-employed,Work Type_children,Residence Type_Rural,Residence Type_Urban,Smoking Status_formerly smoked,Smoking Status_never smoked,Smoking Status_smokes,Smoking Status_unknown
4688,31.0,0,0,64.85,23.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4478,40.0,0,0,65.29,28.3,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3849,8.0,0,0,74.42,22.5,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
4355,79.0,1,0,76.64,19.5,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3826,75.0,0,0,94.77,27.2,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1533,54.0,0,0,207.79,38.6,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2437,8.0,0,0,105.63,19.2,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
3164,68.0,0,0,82.85,28.9,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
92,57.0,0,0,68.02,37.5,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [12]:
stroke_info_df.iloc[4355]

Age                               79.00
Hypertension                       1.00
Heart Disease                      0.00
Average Glucose Level             76.64
BMI                               19.50
Stroke                             0.00
Gender_Female                      1.00
Gender_Male                        0.00
Gender_Other                       0.00
Ever Married_No                    0.00
Ever Married_Yes                   1.00
Work Type_Govt_job                 0.00
Work Type_Never_worked             0.00
Work Type_Private                  0.00
Work Type_Self-employed            1.00
Work Type_children                 0.00
Residence Type_Rural               1.00
Residence Type_Urban               0.00
Smoking Status_formerly smoked     0.00
Smoking Status_never smoked        1.00
Smoking Status_smokes              0.00
Smoking Status_unknown             0.00
Name: 4355, dtype: float64

In [None]:
# Create a support vector machine linear classifer, and fit it to the training data
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

In [None]:
# Print the model score by using the test data
print(svm_model.score(X_test,y_test))

In [None]:
# Calculate the classification report
testing_predictions = svm_model.predict(X_test)

testing_report = classification_report(y_test, testing_predictions)
print(testing_report)

In [None]:
from sklearn.metrics import confusion_matrix
test_matrix = confusion_matrix(y_test,testing_predictions)

# Print the confusion matrix for the training data
test_matrix

In [None]:
# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Create a support vector machine linear classifer, and fit it to the training data
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_resampled, y_train_resampled)

In [None]:
# Save the model
import pickle

filename = 'models/svm_model.sav'
pickle.dump(svm_model, open(filename, 'wb'))

filename = 'models/svm__scalar_model.sav'
pickle.dump(X_scaler, open(filename, 'wb'))