# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

# load the raw diabetes survey data from the project's data folder
raw_csv_path = "./data/diabetes_data.csv"
df = pd.read_csv(raw_csv_path)

# sanity check: (rows, columns) of the freshly loaded frame
print(df.shape)

(520, 17)


In [2]:
# flag rows that pandas reports as repeats of an earlier row and keep the
# flag as a helper column so the following cells can filter on it
duplicate_flags = df.duplicated()
df["is_duplicate"] = duplicate_flags

# tally flagged vs. unflagged rows
flag_counts = df["is_duplicate"].value_counts()
print(flag_counts)

is_duplicate
True     269
False    251
Name: count, dtype: int64


In [3]:
# keep only the rows whose 'is_duplicate' flag is not set
keep_row = ~df["is_duplicate"]
df = df.loc[keep_row]

# sanity check: row count should now equal the number of unflagged rows
print(df.shape)

(251, 18)


In [4]:
# the 'is_duplicate' helper column has served its purpose; remove it
df = df.drop(columns="is_duplicate")

# sanity check: one column fewer than before
print(df.shape)

(251, 17)


In [5]:
# per-column flag: True when the column holds at least one missing value
has_missing = df.isna().any()
has_missing

Age                   False
Gender                False
Polyuria              False
Polydipsia            False
sudden weight loss    False
weakness              False
Polyphagia            False
Genital thrush        False
visual blurring       False
Itching               False
Irritability          False
delayed healing       False
partial paresis       False
muscle stiffness      False
Alopecia              False
Obesity               False
class                 False
dtype: bool

In [6]:
# Detect outliers in 'Age' with the 1.5 * IQR rule and remove those rows.

# quartiles and interquartile range of the Age column
age = df['Age']
Q1 = age.quantile(0.25)
Q3 = age.quantile(0.75)
IQR = Q3 - Q1

# values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] count as outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# rows whose Age falls strictly below or above the bounds
outliers = df[(age < lower_bound) | (age > upper_bound)]

# sanity check: outlier row labels, then the shape before and after removal
print(outliers.index)
print(df.shape)
# Rebind `df` to the filtered frame instead of calling drop(..., inplace=True):
# the result is the same, but the frame is no longer mutated in place, which
# keeps the cell consistent with the `df = df.drop(...)` style used earlier.
df = df.drop(index=outliers.index)
print(df.shape)


Index([102], dtype='int64')
(251, 17)
(250, 17)


In [7]:
# names of every column stored with the 'object' dtype; these are the
# columns that get label-encoded further down
categories = [name for name in df.columns if df[name].dtype == 'object']

print(categories)

['Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss', 'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring', 'Itching', 'Irritability', 'delayed healing', 'partial paresis', 'muscle stiffness', 'Alopecia', 'Obesity', 'class']


In [8]:
from sklearn.preprocessing import LabelEncoder

# Build one "<name>_encoded" column of integer codes per categorical column.
# A single encoder instance is re-fitted for each column in turn, so after
# this cell it only holds the fit for the last column processed.
label_encoder = LabelEncoder()
encoded_columns = {
    f"{category}_encoded": label_encoder.fit_transform(df[category])
    for category in categories
}

# append the encoded columns (same order as `categories`), then remove the
# original text columns they were derived from
df = df.assign(**encoded_columns).drop(columns=categories)

In [9]:
# sanity check: column names still carry the "_encoded" suffix here
print(df.columns)

# Give every encoded column its original name back. One mapping passed to a
# single rename() call replaces the former loop of in-place renames; the
# resulting columns are the same, without mutating the frame once per column.
restored_names = {f"{category}_encoded": category for category in categories}
df = df.rename(columns=restored_names)

# sanity check: suffixes are gone
print(df.columns)

Index(['Age', 'Gender_encoded', 'Polyuria_encoded', 'Polydipsia_encoded',
       'sudden weight loss_encoded', 'weakness_encoded', 'Polyphagia_encoded',
       'Genital thrush_encoded', 'visual blurring_encoded', 'Itching_encoded',
       'Irritability_encoded', 'delayed healing_encoded',
       'partial paresis_encoded', 'muscle stiffness_encoded',
       'Alopecia_encoded', 'Obesity_encoded', 'class_encoded'],
      dtype='object')
Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')


In [10]:
# Final visual check: print the plain-text representation of the frame.
# pandas abbreviates long frames in this view (note the ".." row in the output).
print(df)

     Age  Gender  Polyuria  Polydipsia  sudden weight loss  weakness  \
0     40       1         0           1                   0         1   
1     58       1         0           0                   0         1   
2     41       1         1           0                   0         1   
3     45       1         0           0                   1         1   
4     60       1         1           1                   1         1   
..   ...     ...       ...         ...                 ...       ...   
515   39       0         1           1                   1         0   
516   48       0         1           1                   1         1   
517   58       0         1           1                   1         1   
518   32       0         0           0                   0         1   
519   42       1         0           0                   0         0   

     Polyphagia  Genital thrush  visual blurring  Itching  Irritability  \
0             0               0                0        1   

In [11]:
# export the cleaned frame as a new CSV; index=False leaves the row index
# out of the file
tableau_csv_path = './data/diabetes_data_for_tableau.csv'
df.to_csv(tableau_csv_path, index=False)