In [591]:
# Import necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

In [592]:
# Load the training dataset
train_data = pd.read_csv('train.csv')

In [593]:
train_data

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,28932,Female,36.0,0,0,Yes,Private,Rural,67.29,36.7,formerly smoked,0
1,37150,Female,34.0,0,0,Yes,Private,Rural,83.53,48.5,formerly smoked,0
2,71669,Male,60.0,0,0,Yes,Private,Rural,65.16,30.8,never smoked,0
3,27153,Female,75.0,0,0,Yes,Self-employed,Rural,78.80,29.3,formerly smoked,1
4,58235,Male,76.0,0,0,Yes,Private,Urban,58.65,25.6,smokes,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4083,30457,Female,53.0,1,0,Yes,Govt_job,Rural,98.61,38.8,smokes,0
4084,24836,Female,61.0,0,0,Yes,Private,Rural,72.01,26.0,formerly smoked,0
4085,17079,Male,44.0,0,0,Yes,Private,Rural,94.71,28.4,smokes,0
4086,72340,Male,21.0,0,0,No,Private,Urban,120.94,29.7,formerly smoked,0


In [594]:
# Remove the ID column as it is not useful for modeling
train_data = train_data.drop('id', axis=1)

In [595]:
train_data['gender'].value_counts()

Female    2405
Male      1682
Other        1
Name: gender, dtype: int64

In [596]:
train_data = train_data[train_data['gender'] != 'Other']

In [597]:
train_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Female,36.0,0,0,Yes,Private,Rural,67.29,36.7,formerly smoked,0
1,Female,34.0,0,0,Yes,Private,Rural,83.53,48.5,formerly smoked,0
2,Male,60.0,0,0,Yes,Private,Rural,65.16,30.8,never smoked,0
3,Female,75.0,0,0,Yes,Self-employed,Rural,78.80,29.3,formerly smoked,1
4,Male,76.0,0,0,Yes,Private,Urban,58.65,25.6,smokes,0
...,...,...,...,...,...,...,...,...,...,...,...
4083,Female,53.0,1,0,Yes,Govt_job,Rural,98.61,38.8,smokes,0
4084,Female,61.0,0,0,Yes,Private,Rural,72.01,26.0,formerly smoked,0
4085,Male,44.0,0,0,Yes,Private,Rural,94.71,28.4,smokes,0
4086,Male,21.0,0,0,No,Private,Urban,120.94,29.7,formerly smoked,0


In [598]:
# Create new features from the existing data (ratio of glucose level to BMI.) to train data
train_data['glucose_bmi_ratio'] = train_data['avg_glucose_level'] / (train_data['bmi'] ** 2)

In [599]:
train_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,glucose_bmi_ratio
0,Female,36.0,0,0,Yes,Private,Rural,67.29,36.7,formerly smoked,0,0.049960
1,Female,34.0,0,0,Yes,Private,Rural,83.53,48.5,formerly smoked,0,0.035511
2,Male,60.0,0,0,Yes,Private,Rural,65.16,30.8,never smoked,0,0.068688
3,Female,75.0,0,0,Yes,Self-employed,Rural,78.80,29.3,formerly smoked,1,0.091789
4,Male,76.0,0,0,Yes,Private,Urban,58.65,25.6,smokes,0,0.089493
...,...,...,...,...,...,...,...,...,...,...,...,...
4083,Female,53.0,1,0,Yes,Govt_job,Rural,98.61,38.8,smokes,0,0.065502
4084,Female,61.0,0,0,Yes,Private,Rural,72.01,26.0,formerly smoked,0,0.106524
4085,Male,44.0,0,0,Yes,Private,Rural,94.71,28.4,smokes,0,0.117425
4086,Male,21.0,0,0,No,Private,Urban,120.94,29.7,formerly smoked,0,0.137106


In [600]:
# Perform one-hot encoding for smoking_status, Residence_type, and work_type
work_ty = pd.get_dummies(train_data['work_type'], drop_first=True)
Residence_ty = pd.get_dummies(train_data['Residence_type'])
Smoking_stat = pd.get_dummies(train_data['smoking_status'])

In [601]:
# Drop the original columns and concatenate the encoded columns
train_data.drop(['smoking_status', 'Residence_type', 'work_type'], axis=1, inplace=True)
train_data = pd.concat([train_data, Smoking_stat, Residence_ty, work_ty], axis=1)

In [602]:
train_data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,avg_glucose_level,bmi,stroke,glucose_bmi_ratio,Unknown,formerly smoked,never smoked,smokes,Rural,Urban,Never_worked,Private,Self-employed,children
0,Female,36.0,0,0,Yes,67.29,36.7,0,0.049960,0,1,0,0,1,0,0,1,0,0
1,Female,34.0,0,0,Yes,83.53,48.5,0,0.035511,0,1,0,0,1,0,0,1,0,0
2,Male,60.0,0,0,Yes,65.16,30.8,0,0.068688,0,0,1,0,1,0,0,1,0,0
3,Female,75.0,0,0,Yes,78.80,29.3,1,0.091789,0,1,0,0,1,0,0,0,1,0
4,Male,76.0,0,0,Yes,58.65,25.6,0,0.089493,0,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4083,Female,53.0,1,0,Yes,98.61,38.8,0,0.065502,0,0,0,1,1,0,0,0,0,0
4084,Female,61.0,0,0,Yes,72.01,26.0,0,0.106524,0,1,0,0,1,0,0,1,0,0
4085,Male,44.0,0,0,Yes,94.71,28.4,0,0.117425,0,0,0,1,1,0,0,1,0,0
4086,Male,21.0,0,0,No,120.94,29.7,0,0.137106,0,1,0,0,0,1,0,1,0,0


In [603]:
# Split the dataset into features (X) and target (y)
X = train_data.drop('stroke', axis=1)
y = train_data['stroke']

In [604]:
#number of null data in each columns
train_data.isna().sum()   

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
avg_glucose_level      0
bmi                  160
stroke                 0
glucose_bmi_ratio    160
Unknown                0
formerly smoked        0
never smoked           0
smokes                 0
Rural                  0
Urban                  0
Never_worked           0
Private                0
Self-employed          0
children               0
dtype: int64

In [605]:
### Handle missing values by imputing the mean
train_data = train_data.fillna(X.mean())

  train_data = train_data.fillna(X.mean())


In [606]:
#number of null values after drop null values
train_data.isna().sum() 

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
avg_glucose_level    0
bmi                  0
stroke               0
glucose_bmi_ratio    0
Unknown              0
formerly smoked      0
never smoked         0
smokes               0
Rural                0
Urban                0
Never_worked         0
Private              0
Self-employed        0
children             0
dtype: int64

In [607]:
# Encode categorical variables using one-hot encoding
X = pd.get_dummies(X)

In [608]:
# Handle missing values by imputing the mean
X = X.fillna(X.mean())

In [609]:
work_ty

Unnamed: 0,Never_worked,Private,Self-employed,children
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,0,1,0
4,0,1,0,0
...,...,...,...,...
4083,0,0,0,0
4084,0,1,0,0
4085,0,1,0,0
4086,0,1,0,0


In [610]:
# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [611]:
print("Before SMOTE:")
print(y.value_counts())

Before SMOTE:
0    3892
1     195
Name: stroke, dtype: int64


In [612]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [613]:
print("After SMOTE:")
print(y_train_resampled.value_counts())

After SMOTE:
0    3111
1    3111
Name: stroke, dtype: int64


In [614]:
# Create a Decision Tree model and fit it on the resampled training data
model = RandomForestClassifier(max_depth=5, random_state=35)
model.fit(X_train_resampled, y_train_resampled)

In [615]:
# Make predictions on the validation set and calculate accuracy
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

In [616]:
print('Validation accuracy:', accuracy)

Validation accuracy: 0.8300733496332519


In [617]:
# Load the testing dataset and preprocess it in the same way as the training data
test_data = pd.read_csv('test.csv')
test_ids = test_data['id']
test_data = test_data.drop('id', axis=1)

# Create new features from the existing data (ratio of glucose level to BMI.) to test data
test_data['glucose_bmi_ratio'] = test_data['avg_glucose_level'] / (test_data['bmi'] ** 2)

test_data = pd.get_dummies(test_data)

In [618]:
test_data

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,glucose_bmi_ratio,gender_Female,gender_Male,ever_married_No,ever_married_Yes,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,82.0,0,1,144.90,26.4,0.207903,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
1,4.0,0,0,106.22,16.7,0.380867,0,1,1,0,...,0,0,0,1,1,0,1,0,0,0
2,58.0,0,0,79.95,25.9,0.119184,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0
3,20.0,0,0,96.57,34.1,0.083049,1,0,1,0,...,0,1,0,0,1,0,0,0,1,0
4,10.0,0,0,69.84,13.7,0.372103,1,0,1,0,...,0,0,0,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,43.0,0,0,75.05,22.9,0.143113,1,0,0,1,...,0,1,0,0,1,0,0,0,0,1
1018,42.0,0,0,191.94,27.9,0.246580,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
1019,52.0,0,0,69.37,36.2,0.052936,0,1,1,0,...,0,1,0,0,1,0,1,0,0,0
1020,78.0,0,0,237.75,,,0,1,0,1,...,0,1,0,0,0,1,0,1,0,0


In [619]:
# Align the training data with the test data columns
X_train, test_data = X_train.align(test_data, join='left', axis=1)

In [620]:
### Handle missing values by imputing the mean
test_data = test_data.fillna(X.mean())

In [621]:
# Make predictions on the testing data
test_preds = model.predict(test_data)

In [622]:
# Save the predictions in the required format
submission_df = pd.DataFrame({'id': test_ids, 'stroke': test_preds})
submission_df.to_csv('submissiontop29.csv', index=False)


In [623]:
submission_df

Unnamed: 0,id,stroke
0,61960,1
1,31741,0
2,59451,1
3,40670,0
4,25391,0
...,...,...
1017,2953,0
1018,47799,0
1019,61013,0
1020,66400,1
