In [1]:
import pandas as pd

In [2]:
# Directory prefix used below for both reading raw_data.csv and writing the
# cleaned/train/test CSVs. It is joined to file names by plain string
# concatenation, so the trailing slash is required.
INPUT_DATA_DIR = "./../raw_data/"

In [3]:
# Load the raw dataset from the data directory into a DataFrame
raw_csv_path = INPUT_DATA_DIR + "raw_data.csv"
df = pd.read_csv(raw_csv_path)

In [4]:
# Count the missing entries in every column of the raw frame
# (isna is the pandas alias of isnull)
null_counts = df.isna().sum()

# Show the per-column totals
print(null_counts)

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


In [5]:
# Keep only the rows that have no missing value in any column.
# axis=0 / how="any" are the dropna defaults, written out here for clarity.
df_cleaned = df.dropna(axis=0, how="any")

In [6]:
# Drop the `id` column from the cleaned frame.
# This cell rebinds df_cleaned to its own output, so a plain drop raises
# KeyError on a second run (the column is already gone). errors="ignore"
# makes the cell safe to re-run.
df_cleaned = df_cleaned.drop(columns=["id"], errors="ignore")

In [7]:
# Encoding categorical variables
# Label encoding: each listed column is replaced by integer codes.
from sklearn.preprocessing import LabelEncoder

categorical_cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Keep every fitted encoder so the integer codes can be mapped back to the
# original labels later (label_encoders[col].classes_ /
# label_encoders[col].inverse_transform) instead of being lost.
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df_cleaned[col] = encoder.fit_transform(df_cleaned[col])
    label_encoders[col] = encoder

# NOTE(review): the integer codes impose an arbitrary order on nominal columns
# such as work_type / smoking_status - confirm this is acceptable downstream.

# Persist the cleaned, encoded table next to the raw data
df_cleaned.to_csv(INPUT_DATA_DIR + 'cleaned_data.csv', index=False)

In [8]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [9]:
# Separate the predictors (every column except `stroke`) from the target.
# Index.difference tries to sort its result by default, so the columns of X
# may come out in sorted order rather than in file order.
feature_cols = df_cleaned.columns.difference(['stroke'])
X = df_cleaned[feature_cols]
y = df_cleaned['stroke']
df_cleaned.shape

(4909, 11)

In [10]:
# Hold out 20% of the rows for testing; the split is stratified on the label
# and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=340,
)

# Report the class imbalance (non-stroke : stroke) within each split
train_ratio = int((y_train == 0).sum()) / int((y_train == 1).sum())
test_ratio = int((y_test == 0).sum()) / int((y_test == 1).sum())
print('Ratio of nonstrokes/strokes in training data: {}'.format(train_ratio))
print('Ratio of nonstrokes/strokes in testing data: {} \n'.format(test_ratio))

Ratio of nonstrokes/strokes in training data: 22.51497005988024
Ratio of nonstrokes/strokes in testing data: 22.38095238095238 



In [11]:
# Class counts in the training split before any resampling
n_stroke_before = sum(y_train == 1)
n_nonstroke_before = sum(y_train == 0)
print("Before OverSampling, counts of label '1': {}".format(n_stroke_before))
print("Before OverSampling, counts of label '0': {} \n".format(n_nonstroke_before))

Before OverSampling, counts of label '1': 167
Before OverSampling, counts of label '0': 3760 



In [12]:
# Use SMOTE to create synthetic stroke samples from the training split only;
# X_test / y_test are not passed to the sampler.
# random_state pins the sampler's randomness so that Restart & Run All yields
# the same synthetic rows (and therefore the same train_data.csv) every time.
# 340 is the same seed value used for the train/test split.
# NOTE(review): several features are integer category codes or 0/1 flags, and
# plain SMOTE interpolates them like continuous values - consider SMOTENC and
# confirm with whoever owns the modelling step.
sm = SMOTE(k_neighbors=5, random_state=340)

X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Class counts after resampling
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0)))

After OverSampling, counts of label '1': 3760
After OverSampling, counts of label '0': 3760


In [13]:
# Build the resampled training table: feature columns plus the `stroke` label.
# assign() returns a new DataFrame, so X_train_res keeps only feature columns.
# Assigning through the alias `train_data = X_train_res` would add the label
# column to X_train_res itself. The CSV file is written in a later cell.
train_data = X_train_res.assign(stroke=y_train_res)

In [14]:
# Build the held-out test table: feature columns plus the `stroke` label.
# assign() returns a new DataFrame and aligns y_test on the row index, so
# X_test is left without the label column and no column is set on a frame
# that was produced by slicing. The CSV file is written in a later cell.
test_data = X_test.assign(stroke=y_test)

In [15]:
# Write the training and testing tables to CSV (without the index column),
# training first, then testing.
for frame, filename in ((train_data, 'train_data.csv'), (test_data, 'test_data.csv')):
    frame.to_csv(INPUT_DATA_DIR + filename, index=False)