In [25]:
import numpy as np # mathematical operations and algebra
import pandas as pd # data processing, CSV file I/O
import seaborn as sns # Fancier visualizations
sns.set(color_codes=True)
import matplotlib.pyplot as plt # visualization library
from tree import DesicionTreeNode, buildDecisionTree, buildGraph, evaluateTree
import scipy.stats as stats

IMPORTING DATA

Objective:
-> Import data from CSV file into a padas DataFrame.

In [26]:
data = pd.read_csv (r'diabetes_data_upload.csv')
labels = ['Age','Gender','Polyuria','Polydipsia','sudden weight loss','weakness','Polyphagia',
'Genital thrush','visual blurring','Itching','Irritability','delayed healing','partial paresis',
'muscle stiffness','Alopecia','Obesity','class']
df = pd.DataFrame(data, columns= labels)

DATA CLEANING

Objective:
-> Check if the data contains any null, missing, duplicate.
-> If yes take appopriate action.

In [27]:
print(f'No missing values in data set: {not df.isnull().values.any()}')

No missing values in data set: True


Since we have no missing values, no action is required. Our data is clean and we can proceed with the next step.

In [28]:
# duplicate data

print(f'The total number of dumplicated instances (counting the original) is {sum(df.duplicated(keep=False))}')
print(f'The total number of dumplicated instances  (not counting the original) is: {sum(df.duplicated())}')

The total number of dumplicated instances (counting the original) is 376
The total number of dumplicated instances  (not counting the original) is: 269


At first it seems like 269 out of 520 is a lot of duplicated values.
Lets drop age as it is an obvious tie breaker between two rows. Then recalculate the duplicate values.

In [29]:
df_without_age = df.drop(['Age'], axis=1)
print(f'Number of duplicate instances counting the original is {sum(df_without_age.duplicated(keep=False))}')
print(f'Number of duplicate instances NOT counting the original is {sum(df_without_age.duplicated())}')

Number of duplicate instances counting the original is 407
Number of duplicate instances NOT counting the original is 305


It's a lotmore now (as expected) so let's take a look at some of these duplicate values:

In [30]:
duplicated_data = df[df.duplicated()].sort_values(by='Age')
print(duplicated_data[1:5])

     Age Gender Polyuria Polydipsia sudden weight loss weakness Polyphagia  \
374   27   Male       No         No                 No       No         No   
286   27   Male       No         No                 No       No         No   
465   27   Male       No         No                 No       No         No   
474   27   Male       No         No                 No       No         No   

    Genital thrush visual blurring Itching Irritability delayed healing  \
374             No              No      No           No              No   
286             No              No      No           No              No   
465             No              No      No           No              No   
474             No              No      No           No              No   

    partial paresis muscle stiffness Alopecia Obesity     class  
374              No               No       No      No  Negative  
286              No               No       No      No  Negative  
465              No               N

So most of these instances are simply copies of a few 'common' cases. This can be illustrated if we take a look at some of these instances.

same something about why we used chi-2 etc.

In [31]:
df_without_age = df.drop(['Age'], axis=1)
chi2 = 10.828 #chi2 at df 1 ( for 2x2 table) 0.999 confidence
chi2_matrix = []
for label_i in df_without_age.keys():
    chi2_row = []
    for label_j in df_without_age.keys():
        # first generate a contingency table
        contingency_tab = pd.crosstab(df_without_age[label_i], df_without_age[label_j])
        # calculate the chi square value
        chi, p, dof, expected = stats.chi2_contingency(contingency_tab)
        chi2_row.append(chi>chi2)
    chi2_matrix.append(chi2_row)
labels_without_age = labels[1:]
contingency_df = pd.DataFrame(data=chi2_matrix,index=labels_without_age, columns=labels_without_age)
print(contingency_df)

                    Gender  Polyuria  Polydipsia  sudden weight loss  \
Gender                True      True        True                True   
Polyuria              True      True        True                True   
Polydipsia            True      True        True                True   
sudden weight loss    True      True        True                True   
weakness             False      True        True                True   
Polyphagia            True      True        True                True   
Genital thrush        True     False       False               False   
visual blurring       True      True        True               False   
Itching              False     False       False               False   
Irritability         False      True        True               False   
delayed healing      False      True       False               False   
partial paresis       True      True        True                True   
muscle stiffness     False      True        True               F

DECISION TREE METHOD

In [32]:
# discretize the age into a few categories
df_discretize = df.copy(deep=True)
minAge = df_discretize.Age.min()
maxAge = df_discretize.Age.max()
range = maxAge - minAge
df_discretize.Age = pd.cut(df["Age"],
       bins=[minAge, minAge + range/3, minAge + 2*range/3, maxAge], 
       labels=["Young", "Adult", "Old"])

In [33]:
# divide the discretized data into training and test 
df_discretize_test = df_discretize.sample(n = 50, replace = False)
df_discretize_training = df_discretize.copy(deep=True)
df_discretize_training = df_discretize_training.drop(df_discretize_test.index)

In [34]:
# decision Tree
root = buildDecisionTree(data=df_discretize_training, classAttribute='class')
# draw it as a graph
buildGraph(root).view()
# calculate its accuracy
print("Accuracy: ",evaluateTree(root,df_discretize_test,'class'))

Accuracy:  0.96


KNN METHOD

In [35]:
# normalize age (max min)
normalized_age = (df.Age - df.Age.min()) / (df.Age.max() - df.Age.min())
df_normalized = df.copy(deep=True)
df_normalized.Age = normalized_age 

In [36]:
# divide the normalized data into training and test 
df_normalized_test = df_normalized.sample(n = 50, replace = False) # change test num from here
df_normalized_training = df_normalized # no copy is needed because df_normalized is never used again
df_normalized_training = df_normalized_training.drop(df_normalized_test.index)

In [37]:
symmetrical_attr_labels = ['Gender', 'Polyuria','Polydipsia','sudden weight loss','weakness','Polyphagia',
'Genital thrush','visual blurring','Itching','Irritability','delayed healing','partial paresis',
'muscle stiffness','Alopecia','Obesity'] # the assumption of symmetry might not be right idk!
success = 0
for index, data in df_normalized_test.iterrows():
    distance = 0
    for label in symmetrical_attr_labels:
        distance += df_normalized_training[label] != data[label]
    distance = distance / len(symmetrical_attr_labels) 
    distance += ((df_normalized_training.Age - data.Age) ** 2)**(1/2)
    distance = distance / 2
    df_normalized_training['distance'] = distance
    knn = df_normalized_training.sort_values(by=['distance']).head(10) # change k value here
    # print(knn) #uncomment to see the k nearest neighbours
    if knn['class'].value_counts().idxmax() == data['class']:
        success += 1

print("Accuracy: ", success / df_normalized_test.shape[0])

Accuracy:  0.9


BAYES METHOD

In [38]:
def prob(data, value, attr, givenValue = None, givenAttr = None):
    if (givenValue is not None and givenAttr is not None):
        data = data[data[givenAttr] == givenValue]
    value_count = data[attr].value_counts()
    return value_count[value]/sum(value_count)

labels = ['Age', 'Gender', 'Polyuria','Polydipsia','sudden weight loss','weakness','Polyphagia',
'Genital thrush','visual blurring','Itching','Irritability','delayed healing','partial paresis',
'muscle stiffness','Alopecia','Obesity']
success = 0
p_positive = prob(df_discretize_training, 'Positive', 'class') 
p_negative = 1 - p_positive
for index, data in df_discretize_test.iterrows():
    for label in labels:
        value = data[label]
        p_positive *= prob(df_discretize_training, value, label, 'Positive', 'class')
        p_negative *= prob(df_discretize_training, value, label, 'Negative', 'class')
    classification = "Positive" if p_positive > p_negative else "Negative"
    if data['class'] == classification:
        success += 1

print("Accuracy: ", success / df_discretize_test.shape[0])
 

Accuracy:  0.72
