In [1]:
#importamos todo lo que vamos a usar
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import confusion_matrix


#Datas Set Parkinsons

Fuente: https://archive.ics.uci.edu/ml/datasets/Parkinsons

Este conjunto de datos está compuesto por una serie de medidas biomédicas de la voz de 31 personas, 23 de ellas con la enfermedad de Parkinson (EP). Cada columna de la tabla es una medida de voz concreta, y cada fila corresponde a una de las 195 grabaciones de voz de estos individuos (columna "nombre"). El objetivo principal de los datos es discriminar a las personas sanas de las que padecen EP, según la columna "estado", que se establece en 0 para los sanos y en 1 para los que padecen EP.

Los datos están en formato ASCII CSV. Las filas del archivo CSV contienen una instancia correspondiente a una grabación de voz. Hay unas seis grabaciones por paciente, el nombre del paciente se identifica en la primera columna.Para más información o para transmitir comentarios, póngase en contacto con Max Little (littlem '@' robots.ox.ac.uk).

Si utiliza este conjunto de datos, le rogamos que lo cite:
Max A. Little, Patrick E. McSharry, Eric J. Hunter, Lorraine O. Ramig (2008), 'Suitability of dysphonia measurements for telemonitoring of Parkinson's disease', IEEE Transactions on Biomedical Engineering (pendiente de publicación).

Información de atributos:

Entradas de columna de la matriz (atributos):
* nombre - Nombre del sujeto en ASCII y número de grabación
* MDVP:Fo(Hz) - Frecuencia fundamental vocal media
* MDVP:Fhi(Hz) - Frecuencia fundamental vocal máxima
* MDVP:Flo(Hz) - Frecuencia fundamental vocal mínima
* MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Varias 
medidas de variación de la frecuencia fundamental
* MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Varias medidas de variación de la amplitud
* NHR,HNR - Dos medidas de la relación entre el ruido y los componentes tonales de la voz
* Estado - Estado de salud del sujeto (uno) - Parkinson, (cero) - sano
* RPDE,D2 - Dos medidas de complejidad dinámica no lineal
* DFA - Exponente de escala fractal de la señal
* spread1,spread2,PPE - Tres medidas no lineales de variación de la frecuencia fundamental 





In [2]:
#descargamos el dataset 
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/parkinsons.data')
df.head()


Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,0.02182,0.0313,0.02971,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,0.03134,0.04518,0.04368,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,0.03858,0.0359,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,0.02924,0.04005,0.03772,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,0.0349,0.04825,0.04465,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


## Hacemos un poco de limpieza de nuestros datos

In [3]:
#renombramos los nombres de las columnas para poder trabajar más fácil con ellas
df=df.rename(columns={
    'name':'id', 'MDVP:Fo(Hz)':'fo', 'MDVP:Fhi(Hz)':'fhi', 'MDVP:Flo(Hz)':'flo', 'MDVP:Jitter(%)':'jitter',
       'MDVP:Jitter(Abs)':'jitter_abs', 'MDVP:RAP':'rap', 'MDVP:PPQ':'ppq', 'Jitter:DDP':'ddp',
       'MDVP:Shimmer':'shi', 'MDVP:Shimmer(dB)':'shi_db', 'Shimmer:APQ3':'shi_apq3', 'Shimmer:APQ5':'shi_apq5',
       'MDVP:APQ':'apq', 'Shimmer:DDA':'shi_dda', 'NHR':'nhr', 'HNR':'hnr', 'status':'status', 'RPDE':'rpde', 'DFA':'dfa',
       'spread1':'s1', 'spread2':'s2', 'D2':'d2', 'PPE':'ppe'})
df=df.drop(columns='id')

df.head()

Unnamed: 0,fo,fhi,flo,jitter,jitter_abs,rap,ppq,ddp,shi,shi_db,shi_apq3,shi_apq5,apq,shi_dda,nhr,hnr,status,rpde,dfa,s1,s2,d2,ppe
0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,0.02182,0.0313,0.02971,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,0.03134,0.04518,0.04368,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,0.03858,0.0359,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,0.02924,0.04005,0.03772,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,0.0349,0.04825,0.04465,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


In [4]:
#como parte del proceso de limpieza revisamos si hay datos de tipo null
df.isnull().sum()

fo            0
fhi           0
flo           0
jitter        0
jitter_abs    0
rap           0
ppq           0
ddp           0
shi           0
shi_db        0
shi_apq3      0
shi_apq5      0
apq           0
shi_dda       0
nhr           0
hnr           0
status        0
rpde          0
dfa           0
s1            0
s2            0
d2            0
ppe           0
dtype: int64

In [5]:
 df.shape

(195, 23)

## Comenzamos separando los datos en los datos de entrenamiento y datos para test 

La columna de "status" es la que nos importa para clasificar, será la variable independiente y el resto las dependientes

In [6]:
# Separamos el data set en los features independientes y dependientes 
X = df.drop(columns=['status'])
y = df.status


In [7]:
print(X.shape,y.shape)

(195, 22) (195,)


Separamos los datos en train y test usando k-fold cross-validation

Documentación de la función [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    shuffle=True,
                                                    random_state=42
                                                   )
#vemos la relación entre gente con parkinson y gente que no en los sets de datos de test y entrenamiento
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

1    0.746575
0    0.253425
Name: status, dtype: float64
1    0.77551
0    0.22449
Name: status, dtype: float64


## Analizamos su accuracy con la libreria de Sklearn

In [9]:
#hacemos un Dummy Classifier para tener el baseline de la predicción

#.mode() regresa el valor con mayor frecuencia de una serie
majority_class = y_train.mode()[0]
#np.full: Devuelve un nuevo array de forma y tipo dados, rellenado con fill_value.
prediction = np.full(shape=y_train.shape, 
                     fill_value=majority_class)

accuracy_score(y_train, prediction)

0.7465753424657534

In [10]:
#Distribution of y test
print('y actual : \n' +  str(y_train.value_counts()))

#Distribution of y predicted
print('y predicted : \n' + str(pd.Series(prediction).value_counts()))

y actual : 
1    109
0     37
Name: status, dtype: int64
y predicted : 
1    146
dtype: int64


In [11]:
print('Accuracy Score : ' + str(accuracy_score(y_train,prediction)))
print('Precision Score : ' + str(precision_score(y_train,prediction)))
print('Recall Score : ' + str(recall_score(y_train,prediction)))
print('F1 Score : ' + str(f1_score(y_train,prediction)))

Accuracy Score : 0.7465753424657534
Precision Score : 0.7465753424657534
Recall Score : 1.0
F1 Score : 0.8549019607843137


In [12]:
print('Confusion Matrix : \n' + str(confusion_matrix(y_train,prediction)))

Confusion Matrix : 
[[  0  37]
 [  0 109]]


In [13]:
clf = LogisticRegression().fit(X_train,y_train)
y_pred = clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [14]:
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred)))
print('Precision Score : ' + str(precision_score(y_test,y_pred)))
print('Recall Score : ' + str(recall_score(y_test,y_pred)))
print('F1 Score : ' + str(f1_score(y_test,y_pred)))

#Logistic Regression Classifier Confusion matrix
from sklearn.metrics import confusion_matrix
print('Confusion Matrix : \n' + str(confusion_matrix(y_test,y_pred)))

Accuracy Score : 0.8775510204081632
Precision Score : 0.8809523809523809
Recall Score : 0.9736842105263158
F1 Score : 0.925
Confusion Matrix : 
[[ 6  5]
 [ 1 37]]


#Usamos pipeline de Sklearn

https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html

In [15]:
pipeline = make_pipeline(\
                         RobustScaler(),
                         SelectKBest(f_classif),
                         LogisticRegression(solver='lbfgs'))

## Usamos GridSearchCV de Sklearn

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [16]:
param_grid = {
    'selectkbest__k': [1, 2, 3, 4],
    'logisticregression__class_weight': [None,'balanced'],
    'logisticregression__C': [.0001, .001, .01, .1, 1.0, 10.0, 100.00, 1000.0, 10000.0]
}

gridsearch = GridSearchCV(pipeline, param_grid=param_grid, cv=5,
                         scoring='accuracy', verbose=1)

gridsearch.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('robustscaler', RobustScaler()),
                                       ('selectkbest', SelectKBest()),
                                       ('logisticregression',
                                        LogisticRegression())]),
             param_grid={'logisticregression__C': [0.0001, 0.001, 0.01, 0.1,
                                                   1.0, 10.0, 100.0, 1000.0,
                                                   10000.0],
                         'logisticregression__class_weight': [None, 'balanced'],
                         'selectkbest__k': [1, 2, 3, 4]},
             scoring='accuracy', verbose=1)

Ahora intentamos entender qué pasó arriba

In [17]:
print('Cross Validation Score:', gridsearch.best_score_)

Cross Validation Score: 0.8491954022988505


In [18]:
print('Best Parameters:', gridsearch.best_params_)

Best Parameters: {'logisticregression__C': 1000.0, 'logisticregression__class_weight': None, 'selectkbest__k': 2}


In [19]:
# Cuáles fueron los features seleccionados?
selector = gridsearch.best_estimator_.named_steps['selectkbest']
all_names = X_train.columns
selected_mask = selector.get_support()
selected_names = all_names[selected_mask]
unselected_names = all_names[~selected_mask]

print('Features selected:')
for name in selected_names:
    print(name)

print()
print('Features not selected:')
for name in unselected_names:
    print(name)

Features selected:
s1
ppe

Features not selected:
fo
fhi
flo
jitter
jitter_abs
rap
ppq
ddp
shi
shi_db
shi_apq3
shi_apq5
apq
shi_dda
nhr
hnr
rpde
dfa
s2
d2


s1 y ppe corresponden a medidas no lineales de variación de la frecuencia fundamental

In [20]:
#Get the best model and check it against test data set.

# Predict with X_test features
y_pred = gridsearch.predict(X_test)

# Compare predictions to y_test labels
test_score = accuracy_score(y_test, y_pred)
print('Accuracy Score on test data set:', test_score)

Accuracy Score on test data set: 0.8979591836734694


Fuentes de consulta para este notebook

https://towardsdatascience.com/grid-search-for-model-tuning-3319b259367e
https://towardsdatascience.com/supervised-machine-learning-model-validation-a-step-by-step-approach-771109ae0253
https://www.guavus.com/technical-blog/unsupervised-machine-learning-validation-techniques/