In [1]:
# Import Libraries
import numpy as np
import pandas as pd

In [2]:
# Load the stroke dataset from CSV into a dataframe
df = pd.read_csv(
    'healthcare-dataset-stroke-data.csv',
    sep=',',
    header='infer',
)

# Preview the first five records
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
# Show the column labels of the dataframe (returned as a pandas Index)
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns; non-numeric columns are left out by default
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [5]:
# Dimensions of the dataframe as (number of rows, number of columns)
df.shape

(5110, 12)

In [6]:
# Count the missing entries in each column
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [7]:
# Replace missing bmi values with the placeholder string "missing".
# The filled column is assigned back to the dataframe instead of calling
# fillna(inplace=True) on the df["bmi"] selection: that chained in-place
# form is deprecated and leaves df unchanged under pandas Copy-on-Write.
df["bmi"] = df["bmi"].fillna("missing")

# Confirm that no column has missing values left
df.isnull().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [8]:
# Check whether any row is a full duplicate of an earlier row
# (evaluates to a single boolean)
df.duplicated().any()

False

In [9]:
# Display the rows flagged as duplicates (empty when there are none)
df.loc[df.duplicated()]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke


In [10]:
# Print the name of every column followed by its distinct values
for column_name, column_values in df.items():
    print(column_name)
    print(column_values.unique())
    print('\n')

id
[ 9046 51676 31112 ... 19723 37544 44679]


gender
['Male' 'Female' 'Other']


age
[6.70e+01 6.10e+01 8.00e+01 4.90e+01 7.90e+01 8.10e+01 7.40e+01 6.90e+01
 5.90e+01 7.80e+01 5.40e+01 5.00e+01 6.40e+01 7.50e+01 6.00e+01 5.70e+01
 7.10e+01 5.20e+01 8.20e+01 6.50e+01 5.80e+01 4.20e+01 4.80e+01 7.20e+01
 6.30e+01 7.60e+01 3.90e+01 7.70e+01 7.30e+01 5.60e+01 4.50e+01 7.00e+01
 6.60e+01 5.10e+01 4.30e+01 6.80e+01 4.70e+01 5.30e+01 3.80e+01 5.50e+01
 1.32e+00 4.60e+01 3.20e+01 1.40e+01 3.00e+00 8.00e+00 3.70e+01 4.00e+01
 3.50e+01 2.00e+01 4.40e+01 2.50e+01 2.70e+01 2.30e+01 1.70e+01 1.30e+01
 4.00e+00 1.60e+01 2.20e+01 3.00e+01 2.90e+01 1.10e+01 2.10e+01 1.80e+01
 3.30e+01 2.40e+01 3.40e+01 3.60e+01 6.40e-01 4.10e+01 8.80e-01 5.00e+00
 2.60e+01 3.10e+01 7.00e+00 1.20e+01 6.20e+01 2.00e+00 9.00e+00 1.50e+01
 2.80e+01 1.00e+01 1.80e+00 3.20e-01 1.08e+00 1.90e+01 6.00e+00 1.16e+00
 1.00e+00 1.40e+00 1.72e+00 2.40e-01 1.64e+00 1.56e+00 7.20e-01 1.88e+00
 1.24e+00 8.00e-01 4.00e-01 8.00e-02 1

In [11]:
# Convert age to a categorical column with five equal-sized bins.
# Ranking with method='first' breaks ties so the quantile edges are unique.
age_labels = ['very low', 'low', 'medium', 'high', 'very high']
age_rank = df['age'].rank(method='first')
df['age'] = pd.qcut(age_rank, q=5, labels=age_labels)

In [12]:
# Convert the average glucose level to a categorical column with five
# equal-sized bins. Ranking with method='first' breaks ties so the
# quantile edges are unique.
glucose_labels = ['very low', 'low', 'medium', 'high', 'very high']
glucose_rank = df['avg_glucose_level'].rank(method='first')
df['avg_glucose_level'] = pd.qcut(glucose_rank, q=5, labels=glucose_labels)

In [13]:
# Convert the bmi column to categorical, keeping the "missing" placeholder
# as its own value.
#
# Rows with a known bmi are binned into five equal-sized categories; rows
# whose bmi is the "missing" placeholder are left untouched and appended
# after the binned rows, so the resulting dataframe lists the known-bmi
# rows first and is re-indexed from 0.

# Take an explicit copy so the assignment below modifies an independent
# dataframe instead of a slice of df (avoids SettingWithCopyWarning).
non_missing_rows = df[df["bmi"] != "missing"].copy()

# Ranking with method='first' breaks ties so the quantile edges are unique.
non_missing_rows['bmi'] = pd.qcut(
    non_missing_rows['bmi'].astype(float).rank(method='first'),
    q=5,
    labels=['very low', 'low', 'medium', 'high', 'very high'],
)

missing_rows = df[df["bmi"] == "missing"]

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
non_missing_rows = pd.concat([non_missing_rows, missing_rows], ignore_index=True)
df = non_missing_rows
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,very high,0,1,Yes,Private,Urban,very high,very high,formerly smoked,1
1,31112,Male,very high,0,1,Yes,Private,Rural,high,high,never smoked,1
2,60182,Female,medium,0,0,Yes,Private,Urban,very high,high,smokes,1
3,1665,Female,very high,1,0,Yes,Self-employed,Rural,very high,low,never smoked,1
4,56669,Male,very high,0,0,Yes,Private,Urban,very high,medium,formerly smoked,1


In [14]:
# Describe the transformed data; include='all' adds the non-numeric
# columns (unique / top / freq) alongside the numeric statistics
df.describe(include='all')

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,5110.0,5110,5110,5110.0,5110.0,5110,5110,5110,5110,5110,5110,5110.0
unique,,3,5,,,2,5,2,5,6,4,
top,,Female,very high,,,Yes,Private,Urban,very high,low,never smoked,
freq,,2994,1022,,,3353,2925,2596,1022,982,1892,
mean,36517.829354,,,0.097456,0.054012,,,,,,,0.048728
std,21161.721625,,,0.296607,0.226063,,,,,,,0.21532
min,67.0,,,0.0,0.0,,,,,,,0.0
25%,17741.25,,,0.0,0.0,,,,,,,0.0
50%,36932.0,,,0.0,0.0,,,,,,,0.0
75%,54682.0,,,0.0,0.0,,,,,,,0.0


In [15]:
# Remove the id column from the dataframe
df = df.drop(labels='id', axis='columns')

# Preview the result
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,very high,0,1,Yes,Private,Urban,very high,very high,formerly smoked,1
1,Male,very high,0,1,Yes,Private,Rural,high,high,never smoked,1
2,Female,medium,0,0,Yes,Private,Urban,very high,high,smokes,1
3,Female,very high,1,0,Yes,Self-employed,Rural,very high,low,never smoked,1
4,Male,very high,0,0,Yes,Private,Urban,very high,medium,formerly smoked,1


In [None]:
# Describe the final dataset; include='all' covers both numeric and
# non-numeric columns
df.describe(include='all')

In [16]:
# Write the transformed dataset to disk, leaving out the row index
df.to_csv(path_or_buf='trainingData.csv', index=False)
print('The csv was successfully saved as trainingData.csv')

The csv was successfully saved as trainingData.csv
