In [1]:
import numpy as np
import pandas as pd
import scipy.stats as ss
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import cluster
from sklearn import metrics
from scipy import stats, integrate
%matplotlib inline

# Hepatitis

 1. Class: DIE, LIVE
 2. AGE: 10, 20, 30, 40, 50, 60, 70, 80
 3. SEX: male, female
 4. STEROID: no, yes
 5. ANTIVIRALS: no, yes
 6. FATIGUE: no, yes
 7. MALAISE: no, yes
 8. ANOREXIA: no, yes
 9. LIVER BIG: no, yes
 10. LIVER FIRM: no, yes
 11. SPLEEN PALPABLE: no, yes
 12. SPIDERS: no, yes
 13. ASCITES: no, yes
 14. VARICES: no, yes
 15. BILIRUBIN: 0.39, 0.80, 1.20, 2.00, 3.00, 4.00
 16. ALK PHOSPHATE: 33, 80, 120, 160, 200, 250
 17. SGOT: 13, 100, 200, 300, 400, 500
 18. ALBUMIN: 2.1, 3.0, 3.8, 4.5, 5.0, 6.0
 19. PROTIME: 10, 20, 30, 40, 50, 60, 70, 80, 90
 20. HISTOLOGY: no, yes

In [2]:
# Load the hepatitis dataset from the project's data directory.
df1 = pd.read_csv(filepath_or_buffer='data/hepatitis.data')

In [3]:
# Drop every row that contains a missing value, which this dataset encodes as
# the string '?'.
# A vectorised boolean mask replaces the row-by-row loop. Filtering with the
# mask selects rows directly, so the result no longer relies on positional
# indices happening to equal the index labels (the original passed positions
# to the label-based DataFrame.drop()).
has_missing = df1.isin(['?']).any(axis=1)
df1 = df1[~has_missing]

In [4]:
# Remove the 'Class' column: it is the value we are trying to predict, so it
# must not be part of the feature set.
df1 = df1.drop(labels=['Class'], axis='columns')

In [5]:
# Feature matrix handed to the clustering algorithms below.
# DataFrame.as_matrix() was deprecated and later removed from pandas; .values
# is the equivalent accessor available on both old and new versions.
# The explicit float cast makes the numeric requirement visible here instead
# of relying on scikit-learn to coerce the array (columns that held '?' were
# presumably read as strings -- TODO confirm against the raw file).
mat = df1.values.astype(float)

## Clustering

Algoritmos de Clustering sobre la data 'Hepatitis' para separar a las personas en dos clasificaciones (clases): DIE o LIVE

### K-Means

In [6]:
# Partition the samples into two clusters with K-Means.
# random_state is pinned so that Restart & Run All reproduces the same labels
# (and therefore the same silhouette score). n_init=10 is spelled out because
# it is the value this run used and newer scikit-learn releases change the
# default.
kmeans = cluster.KMeans(n_clusters=2, n_init=10, random_state=0)
kmeans.fit(mat)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [7]:
# Score the K-Means partition with the silhouette coefficient and report it.
kmeans_silhouette = metrics.silhouette_score(mat, kmeans.labels_)
print("Silhouette Coefficient: %0.6f" % kmeans_silhouette)

Silhouette Coefficient: 0.482762


### Average Linkage

In [8]:
# Agglomerative clustering into two groups, using average linkage and
# cityblock (Manhattan) distance.
# Linkage options: complete, average, ward
# Distance options: "euclidean", "l1", "l2", "manhattan", "cityblock", "cosine" or "precomputed"
# Newer scikit-learn releases name the distance argument `metric` (the old
# `affinity` keyword was deprecated and later removed). Older releases only
# know `affinity` and reject `metric` with a TypeError, so fall back to it.
try:
    algoritmo = cluster.AgglomerativeClustering(linkage="average",
                                                metric="cityblock", n_clusters=2)
except TypeError:
    algoritmo = cluster.AgglomerativeClustering(linkage="average",
                                                affinity="cityblock", n_clusters=2)
algoritmo.fit(mat)

AgglomerativeClustering(affinity='cityblock', compute_full_tree='auto',
            connectivity=None, linkage='average', memory=None,
            n_clusters=2, pooling_func=<function mean at 0x7efff80f9400>)

In [9]:
# Score the average-linkage partition with the silhouette coefficient and report it.
agglomerative_silhouette = metrics.silhouette_score(mat, algoritmo.labels_)
print("Silhouette Coefficient: %0.6f" % agglomerative_silhouette)

Silhouette Coefficient: 0.671057


### Mean Shift

In [10]:
# Mean Shift: estimate the bandwidth from the data, then build and fit the model.
ancho_banda = cluster.estimate_bandwidth(mat, quantile=0.15)
mean_shift_alg = cluster.MeanShift(
    bin_seeding=True,
    bandwidth=ancho_banda,
).fit(mat)
# Bare expression so the notebook still displays the fitted estimator.
mean_shift_alg

MeanShift(bandwidth=59.752845675029263, bin_seeding=True, cluster_all=True,
     min_bin_freq=1, n_jobs=1, seeds=None)

In [11]:
# Score the Mean Shift partition with the silhouette coefficient and report it.
mean_shift_silhouette = metrics.silhouette_score(mat, mean_shift_alg.labels_)
print("Silhouette Coefficient: %0.6f" % mean_shift_silhouette)

Silhouette Coefficient: 0.368808


## Clasificadores

## PCA

# Pima Indian Diabetes

1. Number of times pregnant
2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
3. Diastolic blood pressure (mm Hg)
4. Triceps skin fold thickness (mm)
5. 2-Hour serum insulin (mu U/ml)
6. Body mass index (weight in kg/(height in m)^2)
7. Diabetes pedigree function
8. Age (years)
9. Class variable (0 or 1)

In [None]:
# Load the Pima Indian Diabetes dataset from the project's data directory.
df2 = pd.read_csv(filepath_or_buffer='data/diabetes.data')

# Water Treatment Plant

|N. | Identificador | Descripción |
| -- |:---:|:---|
| 1 | Q-E       | (input flow to plant)   |
| 2 | ZN-E      | (input Zinc to plant) |
| 3 | PH-E      | (input pH to plant)  |
| 4 | DBO-E     | (input Biological demand of oxygen to plant)  |
| 5 | DQO-E     | (input chemical demand of oxygen to plant) |
| 6 | SS-E      | (input suspended solids to plant)   |
| 7 | SSV-E     | (input volatile suspended solids to plant) |
| 8 | SED-E     | (input sediments to plant)  |
| 9 | COND-E    | (input conductivity to plant)  |
|10 | PH-P      | (input pH to primary settler) |
|11 | DBO-P     | (input Biological demand of oxygen to primary settler) |
|12 | SS-P      | (input suspended solids to primary settler) |
|13 | SSV-P     | (input volatile suspended solids to primary settler) |
|14 | SED-P     | (input sediments to primary settler)  |
|15 | COND-P    | (input conductivity to primary settler) |
|16 | PH-D      | (input pH to secondary settler)  |
|17 | DBO-D     | (input Biological demand of oxygen to secondary settler) |
|18 | DQO-D     | (input chemical demand of oxygen to secondary settler) |
|19 | SS-D      | (input suspended solids to secondary settler) |
|20 | SSV-D     | (input volatile suspended solids to secondary settler) |
|21 | SED-D     | (input sediments to secondary settler)   |
|22 | COND-D    | (input conductivity to secondary settler)  |
|23 | PH-S      | (output pH)    |
|24 | DBO-S     | (output Biological demand of oxygen) |
|25 | DQO-S     | (output chemical demand of oxygen) |
|26 | SS-S      | (output suspended solids) |
|27 | SSV-S     | (output volatile suspended solids)  |
|28 | SED-S     | (output sediments)  |
|29 | COND-S    | (output conductivity) |
|30 | RD-DBO-P  | (performance input Biological demand of oxygen in primary settler) |
|31 | RD-SS-P   | (performance input suspended solids to primary settler) |
|32 | RD-SED-P  | (performance input sediments to primary settler) |
|33 | RD-DBO-S  | (performance input Biological demand of oxygen to secondary settler) |
|34 | RD-DQO-S  | (performance input chemical demand of oxygen to secondary settler) |
|35 | RD-DBO-G  | (global performance input Biological demand of oxygen) |
|36 | RD-DQO-G  | (global performance input chemical demand of oxygen) |
|37 | RD-SS-G   | (global performance input suspended solids)  |
|38 | RD-SED-G  | (global performance input sediments) |

In [2]:
# Load the Water Treatment Plant dataset from the project's data directory.
df3 = pd.read_csv(filepath_or_buffer='data/water-treatment.data')