In [1]:
import numpy as np # mathematical operations and algebra
import pandas as pd # data processing, CSV file I/O
import seaborn as sns # Fancier visualizations
# Activate seaborn's default plot theme for every figure drawn below;
# color_codes=True asks seaborn to remap matplotlib's single-letter colour
# shorthands ("b", "g", "r", ...) to the seaborn palette.
sns.set(color_codes=True)
import matplotlib.pyplot as plt # visualization library
import os
import scipy.stats as stats

In [2]:
# Load the raw survey export and build the working frame.
# `labels` fixes both which columns are kept and the order they appear in.
labels = [
    'Age',
    'Gender',
    'Polyuria',
    'Polydipsia',
    'sudden weight loss',
    'weakness',
    'Polyphagia',
    'Genital thrush',
    'visual blurring',
    'Itching',
    'Irritability',
    'delayed healing',
    'partial paresis',
    'muscle stiffness',
    'Alopecia',
    'Obesity',
    'class',
]
data = pd.read_csv(r'diabetes_data_upload.csv')
# Re-wrap the raw frame so that its columns follow `labels`.
df = pd.DataFrame(data, columns=labels)

In [3]:
# Data cleaning: look for null cells anywhere in the frame.
has_missing_values = df.isnull().values.any()
print(f'No missing values in data set: {not has_missing_values}')

# The check reports no missing values, so no imputation or row dropping is needed before the next step.

No missing values in data set: True


In [4]:
# Duplicate rows: flag every row that exactly repeats an earlier one
# (with the default keep='first', the first occurrence itself is not flagged).
is_exact_repeat = df.duplicated()
print(f'The total number of dumplicated instances are: {is_exact_repeat.sum()}')
# At first glance, 269 flagged rows out of 520 looks like a lot of duplication.

# Age is the obvious tie breaker between otherwise identical rows, so repeat
# the count with Age removed.
df_without_age = df.drop(columns=['Age'])
repeats_with_originals = df_without_age.duplicated(keep=False)  # originals flagged too
repeats_only = df_without_age.duplicated()                      # originals not flagged
print(f'Number of duplicate instances counting the original is {repeats_with_originals.sum()}')
print(f'Number of duplicate instances NOT counting the original is {repeats_only.sum()}')

# Most flagged rows are copies of a handful of "common" cases; show a few of
# the exact repeats, ordered by age, to illustrate.
duplicated_data = df.loc[is_exact_repeat].sort_values(by='Age')
print(duplicated_data.iloc[1:5])

The total number of dumplicated instances are: 269
Number of duplicate instances counting the original is 407
Number of duplicate instances NOT counting the original is 305
     Age Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
374   27   Male       No         No                 No       No         No   
286   27   Male       No         No                 No       No         No   
465   27   Male       No         No                 No       No         No   
474   27   Male       No         No                 No       No         No   

    Genital thrush visual blurring Itching Irritability delayed healing  \
374             No              No      No           No              No   
286             No              No      No           No              No   
465             No              No      No           No              No   
474             No              No      No           No              No   

    partial paresis muscle stiffness Alopecia Obesity     cl

In [5]:
# Contingency table of counts: one row per Gender value, one column per
# weakness value.
gender_column = df['Gender']
weakness_column = df['weakness']
contingency_tab = pd.crosstab(index=gender_column, columns=weakness_column)
print(contingency_tab)

weakness   No  Yes
Gender            
Female     64  128
Male      151  177


In [10]:
# Pairwise chi-square test of independence between every pair of columns
# other than Age. The result is a square boolean table: True means the test
# rejects independence for that pair at significance level `alpha`.
df_without_age = df.drop(columns=['Age'])

# Significance level of each individual test. The decision uses the p-value
# returned by scipy, which already accounts for each table's degrees of
# freedom, instead of comparing the statistic with a constant that is only
# valid for 2x2 tables.
alpha = 0.10
# Tabulated chi-square critical value for dof = 1 at alpha = 0.10 (2x2 table).
# No longer used for the decision below; retained so that any later cell that
# reads `chi2` keeps working.
chi2 = 2.706

# Take the labels from the frame that is actually iterated, so the row/column
# names of the result can never drift from the data they describe.
labels_without_age = list(df_without_age.columns)

chi2_matrix = []
for label_i in labels_without_age:
    chi2_row = []
    for label_j in labels_without_age:
        # Observed counts for this pair of columns.
        contingency_tab = pd.crosstab(df_without_age[label_i], df_without_age[label_j])
        # NOTE: chi2_contingency applies Yates' continuity correction by
        # default when the table has one degree of freedom.
        chi, p, dof, expected = stats.chi2_contingency(contingency_tab)
        chi2_row.append(p < alpha)
    chi2_matrix.append(chi2_row)

contingency_df = pd.DataFrame(data=chi2_matrix, index=labels_without_age, columns=labels_without_age)
print(contingency_df)

                    Gender  Polyuria  Polydipsia  sudden weight loss  \
Gender                True      True        True                True   
Polyuria              True      True        True                True   
Polydipsia            True      True        True                True   
sudden weight loss    True      True        True                True   
weakness             False      True        True                True   
Polyphagia            True      True        True                True   
Genital thrush        True     False       False               False   
visual blurring       True      True        True               False   
Itching              False     False       False               False   
Irritability         False      True        True               False   
delayed healing      False      True       False               False   
partial paresis       True      True        True                True   
muscle stiffness     False      True        True               F