In [1]:
import pandas as pd

In [2]:
# Directory (relative to the notebook) that raw_data.csv is read from;
# the export cell further down also writes train_data.csv / test_data.csv here.
INPUT_DATA_DIR = "./../raw_data/"

In [3]:
# Read the raw dataset from the input directory into a DataFrame.
raw_csv_path = INPUT_DATA_DIR + "raw_data.csv"
df = pd.read_csv(raw_csv_path)

# Bare last expression: the notebook renders the frame (truncated by pandas).
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [4]:
# Tally the missing entries per column (isna is the canonical spelling of isnull).
missing_mask = df.isna()
null_counts = missing_mask.sum()

# Show the per-column totals as plain text.
print(null_counts)

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


In [5]:
# Keep only complete rows: any row with at least one missing value is discarded.
# (axis=0 / how="any" are the dropna defaults, spelled out for the reader.)
df_cleaned = df.dropna(axis=0, how="any")

In [6]:
# Remove the 'id' column from the cleaned frame.
# Note: this cell reassigns df_cleaned, so re-running it on its own raises
# KeyError ('id' is already gone) -- re-run from the dropna cell instead.
df_cleaned = df_cleaned.drop(columns=['id'])

In [7]:
# One-hot encode the categorical columns.
# The indicator dtype is pinned explicitly: pandas < 2.0 defaulted to uint8
# (the 0/1 values shown in this notebook's output), while pandas >= 2.0
# defaults to bool, which would turn the exported CSV values into True/False.
# Note: this cell reassigns df_cleaned, so re-running it on its own fails
# because the source columns have already been replaced by their dummies.
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df_cleaned = pd.get_dummies(df_cleaned, columns=categorical_columns, dtype="uint8")

In [8]:
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import train_test_split 

In [9]:
# Separate the predictors (X) from the target (y).
# Index.difference returns the remaining labels sorted, so the feature
# columns come out in alphabetical order rather than in their original order.
feature_columns = df_cleaned.columns.difference(['stroke'])
X = df_cleaned.loc[:, feature_columns]
y = df_cleaned.loc[:, 'stroke']

In [10]:
# Hold out 20% of the rows for testing; stratifying on y keeps the
# stroke / non-stroke proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=340,
)

# Class ratio (label 0 count divided by label 1 count) in each split.
train_ratio = (y_train == 0).sum() / (y_train == 1).sum()
test_ratio = (y_test == 0).sum() / (y_test == 1).sum()
print('Ratio of nonstrokes/strokes in training data: {}'.format(train_ratio))
print('Ratio of nonstrokes/strokes in testing data: {} \n'.format(test_ratio))

Ratio of nonstrokes/strokes in training data: 22.51497005988024
Ratio of nonstrokes/strokes in testing data: 22.38095238095238 



In [11]:
# Class balance of the training split before resampling.
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0)))

# Use SMOTE to create stroke samples from the training data.
# Only the training split is resampled; the test split is left untouched.
# NOTE(review): SMOTE builds synthetic rows by interpolating between
# neighbours, so the one-hot indicator columns are treated as numeric here.
# imblearn's SMOTENC targets mixed categorical/continuous features -- confirm
# whether plain SMOTE is intended.
sm = SMOTE(random_state=340)

# Series.ravel() is deprecated since pandas 2.2; to_numpy() yields the same
# 1-D ndarray, so y_train_res keeps the type it had before.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Class balance after resampling.
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))


Before OverSampling, counts of label '1': 167
Before OverSampling, counts of label '0': 3760 

After OverSampling, the shape of train_X: (7520, 21)
After OverSampling, the shape of train_y: (7520,) 

After OverSampling, counts of label '1': 3760
After OverSampling, counts of label '0': 3760


In [12]:
# Assemble the resampled training set (features + 'stroke' label) for export.
# assign() returns a new DataFrame; the previous `train_data = X_train_res`
# was only an alias, so adding the label column also mutated X_train_res and
# left the target inside the feature matrix.
train_data = X_train_res.assign(stroke=y_train_res)

In [13]:
# Assemble the held-out test set (features + 'stroke' label) for export.
# assign() returns a new DataFrame; the previous `test_data = X_test` was only
# an alias, so adding the label column also mutated X_test (a frame produced
# by train_test_split) and left the target inside the feature matrix.
# y_test shares X_test's index, so the labels line up row by row.
test_data = X_test.assign(stroke=y_test)

In [14]:
# Write both splits as CSV into INPUT_DATA_DIR; the row index is not saved.
train_csv_path = INPUT_DATA_DIR + 'train_data.csv'
test_csv_path = INPUT_DATA_DIR + 'test_data.csv'
train_data.to_csv(train_csv_path, index=False)
test_data.to_csv(test_csv_path, index=False)

# Bare last expression: render the exported test set in the notebook.
test_data

Unnamed: 0,Residence_type_Rural,Residence_type_Urban,age,avg_glucose_level,bmi,ever_married_No,ever_married_Yes,gender_Female,gender_Male,gender_Other,...,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,stroke
4184,1,0,35.0,119.40,22.9,0,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0
4454,1,0,19.0,57.40,22.9,1,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
2745,1,0,19.0,79.82,26.1,1,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
3554,1,0,45.0,87.47,21.5,0,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0
2022,1,0,57.0,77.57,21.0,1,0,1,0,0,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0,1,60.0,65.38,41.2,0,1,1,0,0,...,0,1,0,0,0,0,1,0,0,0
1054,0,1,76.0,77.52,40.9,1,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
4597,1,0,29.0,108.14,25.1,0,1,1,0,0,...,0,1,0,0,0,0,1,0,0,0
311,0,1,79.0,97.93,31.2,0,1,1,0,0,...,1,0,0,0,0,0,1,0,0,0
