In [395]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [396]:
df = pd.read_csv('..\Data\stroke-data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


## Basic Pre-Processing

In [397]:
# clearing null values
df = df.dropna()

# Making data unique
df['id'].nunique()

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4909 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 4909 non-null   int64  
 1   gender             4909 non-null   object 
 2   age                4909 non-null   float64
 3   hypertension       4909 non-null   int64  
 4   heart_disease      4909 non-null   int64  
 5   ever_married       4909 non-null   object 
 6   work_type          4909 non-null   object 
 7   Residence_type     4909 non-null   object 
 8   avg_glucose_level  4909 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     4909 non-null   object 
 11  stroke             4909 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 498.6+ KB


In [398]:
df = df.drop(columns='id')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4909 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4909 non-null   object 
 1   age                4909 non-null   float64
 2   hypertension       4909 non-null   int64  
 3   heart_disease      4909 non-null   int64  
 4   ever_married       4909 non-null   object 
 5   work_type          4909 non-null   object 
 6   Residence_type     4909 non-null   object 
 7   avg_glucose_level  4909 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     4909 non-null   object 
 10  stroke             4909 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 460.2+ KB


In [399]:
# describing dataframe
df.describe()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0
mean,42.865374,0.091872,0.049501,105.30515,28.893237,0.042575
std,22.555115,0.288875,0.216934,44.424341,7.854067,0.201917
min,0.08,0.0,0.0,55.12,10.3,0.0
25%,25.0,0.0,0.0,77.07,23.5,0.0
50%,44.0,0.0,0.0,91.68,28.1,0.0
75%,60.0,0.0,0.0,113.57,33.1,0.0
max,82.0,1.0,1.0,271.74,97.6,1.0


## Advanced Preprocessing

In [400]:
df['gender'].value_counts()

Female    2897
Male      2011
Other        1
Name: gender, dtype: int64

In [401]:
# removing Other from Gender column because it's only 1
df.drop(df.loc[df['gender']=='Other'].index, inplace=True)
df = df.dropna()
df['gender'].value_counts()

Female    2897
Male      2011
Name: gender, dtype: int64

In [402]:
df.nunique()

gender                  2
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3851
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

**Object Columns: -**
<ul>
	<li>gender: female, male</li>
	<li>ever_married: yes, no</li>
	<li>work_type: private, self-employed, children, Govet_job, never_worked</li>
	<li>Residence_type: urban, rural</li>
	<li>smoking_status: formerly smoked, never smoked, smokes, Unknown</li>
</ul>


In [403]:

# converting Object values in coulmns to int64
def convertToInt64(columnName):
    uniqueObjects = df[columnName].unique()
    for idx, obj in enumerate(uniqueObjects):
        df.at[df[columnName] == obj, columnName] = idx

In [404]:
# female: 1, male: 0
convertToInt64('gender') 
# yes : 0, no: 1
convertToInt64('ever_married')
# private: 0, self-employed: 1, children: 2, Govet_job: 3, never_worked: 4
convertToInt64('work_type')
# Urban: 0, Rural: 1
convertToInt64('Residence_type')
# formerly smoked: 0, never smoked: 1, smokes: 2, Unknown: 3
convertToInt64('smoking_status')

In [405]:
# change cloumn type by inference
df = df.infer_objects()
df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4908 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4908 non-null   int64  
 1   age                4908 non-null   float64
 2   hypertension       4908 non-null   int64  
 3   heart_disease      4908 non-null   int64  
 4   ever_married       4908 non-null   int64  
 5   work_type          4908 non-null   int64  
 6   Residence_type     4908 non-null   int64  
 7   avg_glucose_level  4908 non-null   float64
 8   bmi                4908 non-null   float64
 9   smoking_status     4908 non-null   int64  
 10  stroke             4908 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 589.2 KB


**Bucketing Data: -**
<ul>
	<li>age: into 10 buckets of 10 years each</li>
	<li>avg_glucose_level:</li>
	<li>bmi:</li>
</ul>

In [406]:
print(max(df['age']))
print(min(df['age']))

82.0
0.08


In [411]:
# bucket the values of columns
def makeBuckets(columnName, numBuckets, maxVal, minVal):
    df[columnName] = df[columnName].apply(np.int64)
    bucketSize = (maxVal - minVal)//numBuckets
    for idx, val in enumerate(df[columnName]):
        df.loc[idx, columnName] = int(val // bucketSize)
    print(bucketSize)

In [408]:
# df['age'] = df['age'].apply(np.int64)
# df.info()

In [412]:
makeBuckets('age', 9, 90, 0)
df['age'].nunique()

10


17