In [7]:
import numpy as np
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score      
from sklearn.metrics import precision_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from pandas import read_csv
from statsmodels.sandbox.tools.tools_pca import pcasvd
import pandas as pd


## Clasificación

Se usarán las siguientes 3 técnicas de clasificación para los 3 datasets.

1. Clasificación lineal
2. Clasificación cuadrática
3. K vecinos mas próximos (K Neighbors Classifier)

Necesitamos la accuracy, precision y medida F de cada dataset. Estas serán guardadas en la siguiente estructura.

In [34]:
datasets = ["hepatitis", "diabetes", "water"]
stats = {"Lineal":{}, "Cuadratic":{}, "Neighbors":{}}
for i in stats:
    for dataset in datasets:
        stats[i][dataset] = dict()

## Hepatitis

In [59]:
vector = ["Class", "AGE", "SEX", "STEROID", "ANTIVIRALS", 
          "FATIGUE", "MALAISE", "ANOREXIA", "LIVER BIG", "LIVER FIRM", "SPLEEN PALPABLE",
          "SPIDERS", "ASCITES", "VARICES", "BILIRUBIN", "ALK PHOSPHATE", "SGOT", "ALBUMIN",
          "PROTIME", "HISTOLOGY"]
hepatitis = read_csv('data/hepatitis.data', sep=',', names=vector, header=None)

    
floats = ['BILIRUBIN', 'ALBUMIN']
integers = ['ALK PHOSPHATE', 'SGOT', 'PROTIME']

for column in floats:
    hepatitis[column] = pd.to_numeric(hepatitis[column], errors='coerce')
    
for column in integers:
    hepatitis[column] = pd.to_numeric(hepatitis[column], errors='coerce')

# Columnas booleanas
non_boolean = ['AGE', 'BILIRUBIN', 'ALK PHOSPHATE', 'SGOT', 'ALBUMIN']

# Sacar las columnas booleanas
hepatitis = hepatitis.dropna()
X = hepatitis.loc[:, non_boolean]
y = hepatitis["Class"]


#Datos de entrenamiento y de prueba
X= StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3) #entrenamiento al 30% de los datos

# Clasificador Lineal
clasificador_li = LinearDiscriminantAnalysis()
clasificador1.fit(X_train,y_train)
y_pred= clasificador1.predict(X_test)

# Accuracy
stats["Lineal"]["hepatitis"]["accuracy"] = accuracy_score(y_test, y_pred)
# Precision
stats["Lineal"]["hepatitis"]["precision"] = precision_score(y_test, y_pred, average='binary')
# Medida F
stats["Lineal"]["hepatitis"]["medida_f"]=f1_score(y_test, y_pred)

# Clasificador cuadratico

clasificador2 =  QuadraticDiscriminantAnalysis()
clasificador2.fit(X_train, y_train)
y_pred = clasificador2.predict(X_test)

# Accuracy
stats["Cuadratic"]["hepatitis"]["accuracy"] = accuracy_score(y_test, y_pred)
# Precision
stats["Cuadratic"]["hepatitis"]["precision"] = precision_score(y_test, y_pred, average='binary')
# Medida F
stats["Cuadratic"]["hepatitis"]["medida_f"]=f1_score(y_test, y_pred)

# Clasificador K vecinos mas proximos. Con 3 vecinos mas proximos

clasificador5=KNeighborsClassifier(3)
clasificador5.fit(X_train, y_train)
y_pred= clasificador5.predict(X_test)

# Accuracy
stats["Neighbors"]["hepatitis"]["accuracy"] = accuracy_score(y_test, y_pred)
# Precision
stats["Neighbors"]["hepatitis"]["precision"] = precision_score(y_test, y_pred, average='binary')
# Medida F
stats["Neighbors"]["hepatitis"]["medida_f"]=f1_score(y_test, y_pred)



In [60]:
# Inprimir estadisticas solo para hepatitis

for algorithm in stats:
    print("Metodo:",algorithm)
    print()
    for k,v in stats[algorithm]["hepatitis"].items():
        print(k,":",v)
    print()

    
        

Metodo: Lineal

accuracy : 0.72
precision : 0.333333333333
medida_f : 0.461538461538

Metodo: Cuadratic

accuracy : 0.76
precision : 0.375
medida_f : 0.5

Metodo: Neighbors

accuracy : 0.72
precision : 0.333333333333
medida_f : 0.461538461538



## Diabetes

In [61]:
# Diabetes Dataset

vector=['TP','glucose','DBP','TSFT','insulin','mass','DPF','age','status']
diabetes = read_csv('data/pima-indians-diabetes.data',sep=',',names=vector,header=None)

X = diabetes.loc[:,"TP":"age"]
y = diabetes["status"]

In [62]:
#Datos de entrenamiento y de prueba
X= StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3) #entrenamiento al 30% de los datos

# Clasificador Lineal
clasificador_li = LinearDiscriminantAnalysis()
clasificador1.fit(X_train,y_train)
y_pred= clasificador1.predict(X_test)

# Accuracy
stats["Lineal"]["diabetes"]["accuracy"] = accuracy_score(y_test, y_pred)
# Precision
stats["Lineal"]["diabetes"]["precision"] = precision_score(y_test, y_pred, average='binary')
# Medida F
stats["Lineal"]["diabetes"]["medida_f"]=f1_score(y_test, y_pred)

# Clasificador cuadratico

clasificador2 =  QuadraticDiscriminantAnalysis()
clasificador2.fit(X_train, y_train)
y_pred = clasificador2.predict(X_test)

# Accuracy
stats["Cuadratic"]["diabetes"]["accuracy"] = accuracy_score(y_test, y_pred)
# Precision
stats["Cuadratic"]["diabetes"]["precision"] = precision_score(y_test, y_pred, average='binary')
# Medida F
stats["Cuadratic"]["diabetes"]["medida_f"]=f1_score(y_test, y_pred)

# Clasificador K vecinos mas proximos. Con 3 vecinos mas proximos

clasificador5=KNeighborsClassifier(3)
clasificador5.fit(X_train, y_train)
y_pred= clasificador5.predict(X_test)

# Accuracy
stats["Neighbors"]["diabetes"]["accuracy"] = accuracy_score(y_test, y_pred)
# Precision
stats["Neighbors"]["diabetes"]["precision"] = precision_score(y_test, y_pred, average='binary')
# Medida F
stats["Neighbors"]["diabetes"]["medida_f"]=f1_score(y_test, y_pred)


In [63]:
for algorithm in stats:
    print("Metodo:",algorithm)
    print()
    for k,v in stats[algorithm]["diabetes"].items():
        print(k,":",v)
    print()

Metodo: Lineal

accuracy : 0.78354978355
precision : 0.709677419355
medida_f : 0.63768115942

Metodo: Cuadratic

accuracy : 0.766233766234
precision : 0.644736842105
medida_f : 0.644736842105

Metodo: Neighbors

accuracy : 0.748917748918
precision : 0.640625
medida_f : 0.585714285714

