In [59]:
import pandas as pd
print(pd.__version__)
pd.set_option("display.max_rows", 12)

1.1.5


In [60]:
stroke = pd.read_csv('stroke.csv')

In [61]:
# View the first 5 rows of the dataset
stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# See what the different types of variables are
stroke.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [25]:
# Additional data exploration
# More men than women in the study had strokes, which perhaps explains some of the standard deviation in strokes
stroke.groupby('gender').mean()['stroke']

gender
Female    0.047094
Male      0.051064
Other     0.000000
Name: stroke, dtype: float64

In [108]:
# Want to do logistic regression and make the data more manageable
# We begin by removing missing data and the one data with gender "other".
# Also drop 'id' because is does not make sense to use it for fitting (hopefully we will not need it later)
stroke = stroke.dropna()
stroke = stroke[stroke.gender != 'Other']
stroke = stroke.drop('id', axis = 1)

stroke['gender'].value_counts()

Female    2897
Male      2011
Name: gender, dtype: int64

In [118]:
# INSTALL SCIKIT-LEARN: %pip install -U scikit-learn
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split

# Need to encode 'object' variables
obj_stroke = stroke.select_dtypes(include=['object']).copy()

# Dummy encoding
dummy_stroke = pd.get_dummies(obj_stroke).copy()

# Concatenate back into one dataframe that now contains dummy variable and the original non-object data
dummy_stroke = dummy_stroke.join(stroke.select_dtypes(exclude=['object']))

# Split into data and response
x = dummy_stroke.drop('stroke',axis = 1)
y = dummy_stroke.stroke

# Split into training testing set. Here we do a 75 % training, 25 % test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=0)

# Fit logistic regression, need to increase max_iter for the algorithm to converge
# Add Ridge regression with parameter chosen by CV with 10 folds for fun (does not seem to change accuracy))
logregfit = LogisticRegressionCV(random_state=0, max_iter = 1000, penalty='l2', cv = 10).fit(x_train, y_train)

# Link coefficient names with coefficients, stolen from StackOverflow (https://stackoverflow.com/questions/34649969/how-to-find-the-features-names-of-the-coefficients-using-scikit-linear-regressio)
coef_table = pd.DataFrame(list(x_train.columns)).copy()
coef_table.insert(len(coef_table.columns),"Coefs",logregfit.coef_.transpose())
print(coef_table)
# Do inference here (?)...

# 'score' here refers to accuracy
print("Train accuracy: ", logregfit.score(x_train,y_train))
print("Test accuracy: ", logregfit.score(x_test,y_test))
# Better test accuracy than train accuracy in many cases omegalul

                     0     Coefs
0        gender_Female  0.004204
1          gender_Male -0.010861
2      ever_married_No  0.002538
3     ever_married_Yes -0.009194
4   work_type_Govt_job -0.113168
..                 ...       ...
15                 age  0.070485
16        hypertension  0.478603
17       heart_disease  0.206914
18   avg_glucose_level  0.004931
19                 bmi  0.005621

[20 rows x 2 columns]
Train accuracy:  0.9568052159739201
Test accuracy:  0.9600651996740016
