In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.express as px
import os
from scipy import stats

In [None]:
# Read the raw stroke dataset from the mounted Drive and preview the first rows.
raw_csv_path = '/content/drive/MyDrive/SEMESTER 4 COURSES/Artificial Intelligence/Prototype Project/Datasets/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(raw_csv_path)
df.head()

In [None]:
def tidy_columns(col_list):
  """Return display-friendly versions of the given column names.

  Each name is title-cased and its underscores are replaced with spaces,
  e.g. 'avg_glucose_level' -> 'Avg Glucose Level'.
  """
  tidied = []
  for raw_name in col_list:
    tidied.append(raw_name.title().replace('_', ' '))
  return tidied

# Replace the raw CSV headers with their tidied versions, then display the result.
df.columns = tidy_columns(df.columns)
df.columns

In [None]:
# Summary of the frame: column dtypes and non-null counts.
df.info()

# Exploratory Data Analysis

- **Missing Values Observation**

In [None]:
# Number of missing values per column, shown as a one-column table.
df.isnull().sum().to_frame()

In [None]:
# Visualise the per-column missing-value counts as an annotated heatmap.
missing_counts = df.isnull().sum().to_frame()
sns.heatmap(missing_counts, annot=True, cmap='RdYlGn_r', fmt='.0f')
plt.show()

- **Target Variable Analysis**

In [None]:
# Row count for each value of the target column.
df['Stroke'].value_counts()

In [None]:
def compare_stroke_outcomes(df):
    """Plot the stroke / no-stroke class balance as a pie chart and a bar chart.

    The class sizes are counted from ``df['Stroke']`` so the figure always
    reflects the frame that is passed in, rather than fixed numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Stroke'`` column.
        NOTE(review): assumes the column is coded 0 = no stroke, 1 = stroke;
        confirm against the dataset.
    """
    sns.set_style('darkgrid')
    labels = ['Normal', 'Stroke']
    colors = ['green', 'red']

    # reindex pins the order to match `labels` and yields 0 for an absent class.
    outcome_counts = df['Stroke'].value_counts().reindex([0, 1], fill_value = 0)
    numbers = outcome_counts.tolist()

    fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (12, 6))
    fig.suptitle('Stroke Disease Outcome Comparison', fontsize = 'xx-large', fontweight = 'bold')

    # Left panel: share of each outcome, with the first slice pulled out.
    ax[0].pie(numbers, labels = labels, colors = colors, autopct = '%.1f%%',
              explode = [0.2, 0], shadow = True)
    ax[0].set_title('Pie Chart', fontsize = 12, fontweight = 'bold', color = 'blue')

    # Right panel: absolute number of patients per outcome.
    ax[1].bar(x = labels, height = numbers, color = colors)
    ax[1].set_title('Bar Chart', fontsize = 12, fontweight = 'bold', color = 'blue')
    ax[1].set_xlabel('Stroke Outcome')
    plt.show()

In [None]:
# Render the stroke outcome comparison figure for the loaded dataset.
compare_stroke_outcomes(df)

- **Distribution of Numeric Variables**

In [None]:
# Mean patient age, rounded to two decimals (reused by the age histogram below).
avg_age = round(df['Age'].mean(), 2)
print(f'Average Patient Age: {avg_age}')

In [None]:
# Age histogram with a KDE overlay; the red vertical line marks the mean age.
plt.figure(figsize=(8, 6))
sns.set_style('whitegrid')
age_values = df['Age']
sns.histplot(age_values, kde=True)
plt.title('Patient Age Distribution', fontsize=16, fontweight='bold')
plt.axvline(avg_age, color='red')
mean_label = f'Mean: {avg_age}'
plt.text(avg_age + 2, 360, mean_label)
plt.show()

In [None]:
# Mean patient BMI, rounded to two decimals (reused by the BMI histogram below).
avg_bmi = round(df['Bmi'].mean(), 2)
print(f'Average Patient Body Mass Index: {avg_bmi}')

In [None]:
# BMI histogram with a KDE overlay; the red vertical line marks the mean BMI.
plt.figure(figsize=(8, 6))
sns.set_style('whitegrid')
bmi_values = df['Bmi']
sns.histplot(bmi_values, kde=True)
plt.title('Patient BMI Distribution', fontsize=16, fontweight='bold')
plt.xlabel('Body Mass Index')
plt.axvline(avg_bmi, color='red')
mean_label = f'Mean: {avg_bmi}'
plt.text(avg_bmi + 5, 275, mean_label)
plt.show()

In [None]:
# Mean average-glucose level, rounded to two decimals (reused by the histogram below).
avg_glucose_level = round(np.mean(df['Avg Glucose Level']), 2)
# The label previously read "Body Mass Index" (copy-paste from the BMI cell).
print('Average Patient Glucose Level: {}'.format(avg_glucose_level))

In [None]:
# Glucose histogram with a KDE overlay; the red vertical line marks the mean.
plt.figure(figsize=(8, 6))
sns.set_style('whitegrid')
glucose_values = df['Avg Glucose Level']
sns.histplot(glucose_values, kde=True)
plt.title('Patient Average Glucose Level Distribution', fontsize=16, fontweight='bold')
plt.xlabel('Average Glucose Level')
plt.axvline(avg_glucose_level, color='red')
mean_label = f'Mean: {avg_glucose_level}'
plt.text(avg_glucose_level + 5, 275, mean_label)
plt.show()

- **Relationship Between All Variables**

In [None]:
# Pairwise correlation heatmap. Only numeric columns are correlated: the frame
# still holds string-valued categorical columns at this point, and
# DataFrame.corr raises on those in pandas >= 2.0.
sns.set_style('whitegrid')
numeric_df = df.select_dtypes(include = 'number')
sns.heatmap(numeric_df.corr(), cmap = 'RdYlGn_r', annot = True)
plt.show()

# Data Preprocessing

- **Feature Selection**

In [None]:
# Drop the 'Id' column (presumably a row identifier with no predictive value).
# Reassigning with errors='ignore' instead of dropping in place keeps this cell
# safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns = 'Id', errors = 'ignore')

In [None]:
# Preview the frame after the 'Id' column has been dropped.
df.head(5)

- **Missing Values Handling - Numerical Data**

In [None]:
# Impute missing BMI values with the column median. Assigning the result back
# (instead of calling fillna(inplace=True) on the selected column) guarantees
# the DataFrame itself is updated, including under pandas copy-on-write.
df['Bmi'] = df['Bmi'].fillna(df['Bmi'].median())

- **Missing Values Handling - Categorical Data**

In [None]:
# Category counts for Gender.
df['Gender'].value_counts()

In [None]:
# Category counts for Ever Married.
df['Ever Married'].value_counts()

In [None]:
# Category counts for Work Type.
df['Work Type'].value_counts()

In [None]:
# Category counts for Residence Type.
df['Residence Type'].value_counts()

In [None]:
# Category counts for Smoking Status.
df['Smoking Status'].value_counts()

In [None]:
# Inspect the rows whose Gender is recorded as 'Other'.
df[df['Gender'] == 'Other']

In [None]:
# Recode every 'Other' gender entry as 'Female', leaving only two categories.
# NOTE(review): this is a modelling choice that lets Gender be encoded as 0/1
# later in the notebook; confirm it is intended rather than dropping these rows.
df.loc[df['Gender'] == 'Other', 'Gender'] = 'Female'

- **Categorical Columns Encoding**

In [None]:
from sklearn.preprocessing import OneHotEncoder

def onehot_encode(df, column):
    """Return a copy of ``df`` with ``column`` replaced by 0/1 indicator columns.

    One ``int64`` column named ``'<column>_<category>'`` is appended for each
    distinct value of ``column``, and the original column is removed. The
    input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``column``, followed by the indicator columns.
    """
    # get_dummies derives each label from the category it encodes and keeps the
    # row index of `df`. The previous version labelled the encoder output with
    # df[column].unique() (appearance order) although the encoder emits sorted
    # categories, so indicators could end up under the wrong name; it also
    # joined on a fresh RangeIndex, misaligning rows for a non-default index.
    indicator_df = pd.get_dummies(df[column], prefix = column, dtype = 'int64')

    remaining_df = df.drop(column, axis = 1)
    return remaining_df.join(indicator_df)

In [None]:
# Names of all object-dtype (string-valued) columns, in frame order.
categorical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'O']
categorical_columns

In [None]:
# Object-dtype columns with more than two distinct values.
[col for col in df.columns if df[col].dtype == 'O' and df[col].nunique() > 2]

In [None]:
# Names of all float-dtype columns, in frame order (used later for scaling).
numerical_columns = [name for name, dtype in df.dtypes.items() if dtype == 'float']
numerical_columns

One Hot Encoding

In [None]:
# Replace each of these categorical columns with its one-hot indicator columns.
onehot_targets = ['Work Type', 'Residence Type', 'Smoking Status']
for col in onehot_targets:
    df = onehot_encode(df, col)

In [None]:
# Preview the frame after one-hot encoding.
df.head()

Ordinal Encoding

In [None]:
# Encode the two binary categorical columns as 0/1 integers. The result is
# assigned back (rather than using replace(inplace=True) on a selected column)
# so the DataFrame is reliably updated, including under pandas copy-on-write.
# The explicit astype makes the integer dtype independent of replace()'s
# downcasting behaviour and fails loudly if an unmapped label is still present.
df['Ever Married'] = df['Ever Married'].replace({'Yes': 1, 'No': 0}).astype('int64')
df['Gender'] = df['Gender'].replace({'Male': 1, 'Female': 0}).astype('int64')

In [None]:
# Preview the frame after encoding Ever Married and Gender as 0/1.
df.head()

- **Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the float-valued columns and write the scaled values back.
standard_scaler = StandardScaler()
scaled_values = standard_scaler.fit_transform(df[numerical_columns])
df[numerical_columns] = scaled_values

In [None]:
# Preview the frame after feature scaling.
df.head()

In [None]:
# Re-plot the glucose distribution after the scaling step to inspect its shape.
plt.figure(figsize=(8, 6))
sns.set_style('whitegrid')
scaled_glucose = df['Avg Glucose Level']
sns.histplot(scaled_glucose, kde=True)
plt.title('Patient Average Glucose Level Distribution', fontsize=16, fontweight='bold')
plt.xlabel('Average Glucose Level')
plt.show()

In [None]:
# Largest Avg Glucose Level value after the scaling step above.
df['Avg Glucose Level'].max()

In [None]:
# Smallest Avg Glucose Level value after the scaling step above.
df['Avg Glucose Level'].min()

In [None]:
# Smallest Age value after the scaling step above.
df['Age'].min()

# Save File to Google Drive

In [None]:
# Persist the preprocessed frame to the same Drive folder as the raw dataset.
filename = 'Scaled + Ordinal & OneHot df.csv'
directory = '/content/drive/MyDrive/SEMESTER 4 COURSES/Artificial Intelligence/Prototype Project/Datasets/'

# Hand the path to pandas instead of a text-mode file handle: pandas then opens
# the file itself with UTF-8 encoding and CSV-appropriate newline handling,
# rather than depending on the platform defaults of open().
df.to_csv(os.path.join(directory, filename), index = False)