In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# Load the table stored at the relative path "parkinsons.data" into a DataFrame.
data_csv = "parkinsons.data"
df = pd.read_csv(filepath_or_buffer=data_csv)

# Quick sanity report: overall dimensions first, then the dtype of every column.
print('Dataset shape: ', df.shape)
print(df.dtypes)

# Final expression of the cell, so the first rows are rendered as a table.
df.head()

Dataset shape:  (195, 24)
name                 object
MDVP:Fo(Hz)         float64
MDVP:Fhi(Hz)        float64
MDVP:Flo(Hz)        float64
MDVP:Jitter(%)      float64
MDVP:Jitter(Abs)    float64
MDVP:RAP            float64
MDVP:PPQ            float64
Jitter:DDP          float64
MDVP:Shimmer        float64
MDVP:Shimmer(dB)    float64
Shimmer:APQ3        float64
Shimmer:APQ5        float64
MDVP:APQ            float64
Shimmer:DDA         float64
NHR                 float64
HNR                 float64
status                int64
RPDE                float64
DFA                 float64
spread1             float64
spread2             float64
D2                  float64
PPE                 float64
dtype: object


Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [5]:
# Report any rows with missing values, then remove them together with the
# 'name' column, which is not used for training.

# Columns that contain at least one null, and a row mask for the affected rows.
null_columns = df.columns[df.isnull().any()]
null_rows = df.isnull().any(axis=1)
# Single .loc lookup instead of chained indexing; prints an empty frame when
# nothing is missing.
print(df.loc[null_rows, null_columns].head())

# Drop incomplete rows and the 'name' column in one non-mutating expression.
# drop(..., errors='ignore') replaces the in-place df.pop('name') so that the
# cell can be re-run without raising KeyError once 'name' is already gone.
df = df.dropna().drop(columns='name', errors='ignore')
print('Dataset shape: ', df.shape)

Empty DataFrame
Columns: []
Index: []
Dataset shape:  (195, 23)


In [6]:
# Print the minimum and maximum of every column to see each feature's range.
# DataFrame.iteritems() was deprecated in pandas 1.5 and removed in 2.0;
# DataFrame.items() yields the same (column name, Series) pairs.
for name, values in df.items():
    print (name, '\nMin Value:  ', np.min(values), '\nMax Value: ', np.max(values), '\n\n')

MDVP:Fo(Hz) 
Min Value:   88.333 
Max Value:  260.105 


MDVP:Fhi(Hz) 
Min Value:   102.145 
Max Value:  592.03 


MDVP:Flo(Hz) 
Min Value:   65.476 
Max Value:  239.17 


MDVP:Jitter(%) 
Min Value:   0.00168 
Max Value:  0.03316 


MDVP:Jitter(Abs) 
Min Value:   7e-06 
Max Value:  0.00026 


MDVP:RAP 
Min Value:   0.00068 
Max Value:  0.02144 


MDVP:PPQ 
Min Value:   0.00092 
Max Value:  0.01958 


Jitter:DDP 
Min Value:   0.00204 
Max Value:  0.06433 


MDVP:Shimmer 
Min Value:   0.00954 
Max Value:  0.11908 


MDVP:Shimmer(dB) 
Min Value:   0.085 
Max Value:  1.302 


Shimmer:APQ3 
Min Value:   0.00455 
Max Value:  0.05647 


Shimmer:APQ5 
Min Value:   0.0057 
Max Value:  0.0794 


MDVP:APQ 
Min Value:   0.00719 
Max Value:  0.13778 


Shimmer:DDA 
Min Value:   0.01364 
Max Value:  0.16942 


NHR 
Min Value:   0.00065 
Max Value:  0.31482 


HNR 
Min Value:   8.441 
Max Value:  33.047 


status 
Min Value:   0 
Max Value:  1 


RPDE 
Min Value:   0.25657 
Max Value:  0.685151 


DF

In [7]:
# Class balance: count the rows per 'status' value once, then report each count
# with its share of the whole table (0 is printed as Healthy, 1 as Parkinson).
total_rows = len(df)
healthy_rows = len(df[df.status == 0])
parkinson_rows = len(df[df.status == 1])

print(f"Total     = {total_rows} -> 100%")
print(f"Healthy    = {healthy_rows} -> {healthy_rows/total_rows *100}%")
print(f"Parkinson = {parkinson_rows} -> {parkinson_rows/total_rows *100}%")

Total     = 195 -> 100%
Healthy    = 48 -> 24.615384615384617%
Parkinson = 147 -> 75.38461538461539%


In [10]:
# Split the frame into a feature matrix (every column except 'status') and a
# label vector (the 'status' column).

# Whole frame as a NumPy array. The np.int alias was deprecated in NumPy 1.20
# and removed in 1.24 (AttributeError); the builtin int is what that alias
# pointed to, so the resulting array is unchanged.
# NOTE(review): an integer dtype truncates every float column, so `dataset`
# keeps almost no feature information -- confirm this is intended before use.
dataset = df.to_numpy(dtype=int)

# Features as float64, labels as int64.
entries = df.loc[:, df.columns != 'status'].to_numpy(dtype=np.float64)
outputs = df['status'].to_numpy(dtype=np.int64)

# Sanity check: one row per sample in both arrays.
print(entries.shape)
print(outputs.shape)

(195, 22)
(195,)
