In [9]:
import numpy as np # mathematical operations and algebra
import pandas as pd # data processing, CSV file I/O
import seaborn as sns # Fancier visualizations
sns.set(color_codes=True)
import matplotlib.pyplot as plt # visualization library
import os
import scipy.stats as stats

In [10]:
# import data
data = pd.read_csv (r'diabetes_data_upload.csv')
df = pd.DataFrame(data, columns= ['Age','Gender','Polyuria','Polydipsia','sudden weight loss','weakness','Polyphagia','Genital thrush','visual blurring','Itching','Irritability','delayed healing','partial paresis','muscle stiffness','Alopecia','Obesity','class'])

In [11]:
# data cleaning
print(f'No missing values in data set: {not df.isnull().values.any()}')
#since we have no missing values, we can proceed with the next step

No missing values in data set: True


In [12]:
# duplicate data
print(f'The total number of dumplicated instances are: {sum(df.duplicated())}')
# at first it seems like 269 out of 520 is a lot of duplicated values

# lets drop age as it is an obvious tie breaker between two rows
df_without_age = df.drop(['Age'], axis=1)
print(f'Excluding age, the number of duplicate instances counting the original is {sum(df_without_age.duplicated(keep=False))}')
print(f'Excluding age, the number of duplicate instances NOT counting the original is {sum(df_without_age.duplicated())}')

# so most of these instances are simply copies of a few 'common' cases. This can be illustrated if we take a look at some of these instances
duplicated_data = df[df.duplicated(keep=False)].sort_values(by='Age')
print(duplicated_data[1:6])
# the first 5 data points seems to be some 27-year-olds with no health condition and no diabetes that's a common case 
# so it's safe to assume that the duplication isn't noise or mistake

The total number of dumplicated instances are: 269
Excluding age, the number of duplicate instances counting the original is 407
Excluding age, the number of duplicate instances NOT counting the original is 305
     Age Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
474   27   Male       No         No                 No       No         No   
277   27   Male       No         No                 No       No         No   
286   27   Male       No         No                 No       No         No   
465   27   Male       No         No                 No       No         No   
374   27   Male       No         No                 No       No         No   

    Genital thrush visual blurring Itching Irritability delayed healing  \
474             No              No      No           No              No   
277             No              No      No           No              No   
286             No              No      No           No              No   
465             No  

In [16]:
def similarity (obj1, obj2):
    sim = 0
    for i in range (2,16):
        if (obj1.iat[i] == obj2.iat[i] and obj1.iat[i] == 'Yes'):
            sim += 1
    if obj1.at['Gender'] == obj2.at['Gender']:
        sim += 1
    sim += obj1.at['Age'] - obj1.at['Age']
    return sim

normalized_age = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min())
normalized_df = df.copy()
normalized_df.Age = normalized_age
similarity_matrix = []
for row in range (df.shape[0]):
    similarity_row = []
    for col in range (df.shape[0]):
        similarity_row.append(similarity(normalized_df.loc[row], normalized_df.loc[col]))
    similarity_matrix.append(similarity_row)

print(similarity_matrix[0])


[8.0, 3.0, 6.0, 4.0, 8.0, 8.0, 4.0, 5.0, 6.0, 5.0, 6.0, 5.0, 3.0, 6.0, 5.0, 5.0, 5.0, 7.0, 4.0, 3.0, 4.0, 7.0, 3.0, 5.0, 6.0, 3.0, 3.0, 6.0, 3.0, 4.0, 3.0, 5.0, 4.0, 6.0, 4.0, 4.0, 2.0, 6.0, 2.0, 2.0, 4.0, 2.0, 5.0, 5.0, 4.0, 3.0, 6.0, 4.0, 6.0, 3.0, 3.0, 2.0, 4.0, 3.0, 1.0, 1.0, 5.0, 0.0, 0.0, 5.0, 4.0, 5.0, 5.0, 5.0, 1.0, 0.0, 3.0, 3.0, 2.0, 3.0, 1.0, 5.0, 0.0, 2.0, 5.0, 3.0, 2.0, 3.0, 4.0, 4.0, 2.0, 2.0, 3.0, 4.0, 5.0, 2.0, 5.0, 1.0, 4.0, 0.0, 1.0, 5.0, 4.0, 3.0, 4.0, 3.0, 5.0, 4.0, 2.0, 3.0, 5.0, 6.0, 4.0, 5.0, 5.0, 4.0, 6.0, 4.0, 5.0, 3.0, 3.0, 4.0, 4.0, 4.0, 2.0, 4.0, 2.0, 5.0, 4.0, 3.0, 6.0, 6.0, 7.0, 5.0, 2.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 2.0, 4.0, 1.0, 3.0, 5.0, 5.0, 8.0, 5.0, 4.0, 3.0, 4.0, 3.0, 2.0, 3.0, 4.0, 5.0, 4.0, 4.0, 5.0, 6.0, 4.0, 5.0, 6.0, 6.0, 2.0, 3.0, 3.0, 4.0, 5.0, 1.0, 4.0, 0.0, 1.0, 5.0, 4.0, 3.0, 5.0, 4.0, 6.0, 4.0, 4.0, 2.0, 6.0, 2.0, 2.0, 4.0, 2.0, 5.0, 5.0, 4.0, 4.0, 2.0, 3.0, 5.0, 6.0, 4.0, 5.0, 5.0, 4.0, 6.0, 2.0, 3.0, 4.0, 4.0, 2.0, 2.0, 1.0, 2.0, 5.0,