# Caso de Estudio - Análisis de Hongos

Aprendizaje Supervisado

* Datos de entrenamiento, datasets clasificados
* Aprender y evalualar el aprendizaje

Aprendizaje No Supervisado

* Datos no clasificados, se buscan las categorías automáticamente
* Reconocer las categorías que guían los datos

## Adquisción

In [4]:
import pandas as pd

data = pd.read_csv("datasets/agaricus-lepiota.data", header=None)

data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


## Limpieza

1. cap-shape: bell=b,conical=c,convex=x,flat=f, knobbed=k,sunken=s
2. cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s
3. cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r, pink=p,purple=u,red=e,white=w,yellow=y

In [10]:
poisonous_map = {
    "p": 1,
    "e": 0
}

poisonous_logic_map = {
    "p": True,
    "e": False
}

cap_shape_map = {
    "b": 1,
    "c": 2,
    "f": 3,
    "x": 4,
    "k": 5,
    "s": 6
}

cap_shape_name_map = {
    "b": "bell",
    "c": "conical",
    "f": "flat",
    "x": "convex",
    "k": "knobbed",
    "s": "sunken"
}

cap_surface_map = {
    "f": 1,
    "g": 2,
    "y": 3,
    "s": 4
}

cap_surface_name_map = {
    "f": "fibrous",
    "g": "grooves",
    "y": "scaly",
    "s": "smooth"
}

cap_color_map = {
    "n": 1,
    "b": 2,
    "c": 3,
    "g": 4,
    "r": 5, 
    "p": 6,
    "u": 7,
    "e": 8,
    "w": 9,
    "y": 10
}

cap_color_name_map = {
    "n": "brown",
    "b": "buff",
    "c": "cinnamon",
    "g": "gray",
    "r": "green", 
    "p": "pink",
    "u": "purple",
    "e": "red",
    "w": "white",
    "y": "yellow"
}

data_study = pd.DataFrame({
    "CAP_SHAPE": data[1].map(cap_shape_map),
    "CAP_SHAPE_NAME": data[1].map(cap_shape_name_map),
    "CAP_SURFACE": data[2].map(cap_surface_map),
    "CAP_SURFACE_NAME": data[2].map(cap_surface_name_map),
    "CAP_COLOR": data[3].map(cap_color_map),
    "CAP_COLOR_NAME": data[3].map(cap_color_name_map),
    "POISONOUS": data[0].map(poisonous_map),
    "POISONOUS_LOGIC": data[0].map(poisonous_logic_map),
}) 

data_study

Unnamed: 0,CAP_SHAPE,CAP_SHAPE_NAME,CAP_SURFACE,CAP_SURFACE_NAME,CAP_COLOR,CAP_COLOR_NAME,POISONOUS,POISONOUS_LOGIC
0,4,convex,4,smooth,1,brown,1,True
1,4,convex,4,smooth,10,yellow,0,False
2,1,bell,4,smooth,9,white,0,False
3,4,convex,3,scaly,9,white,1,True
4,4,convex,4,smooth,4,gray,0,False
...,...,...,...,...,...,...,...,...
8119,5,knobbed,4,smooth,1,brown,0,False
8120,4,convex,4,smooth,1,brown,0,False
8121,3,flat,4,smooth,1,brown,0,False
8122,5,knobbed,3,scaly,1,brown,1,True


## Procesamiento

In [16]:
X = pd.DataFrame({
    "CAP_SHAPE": data_study["CAP_SHAPE"],
    "CAP_SURFACE": data_study["CAP_SURFACE"],
    "CAP_COLOR": data_study["CAP_COLOR"],
}).to_numpy()

X

array([[ 4,  4,  1],
       [ 4,  4, 10],
       [ 1,  4,  9],
       ...,
       [ 3,  4,  1],
       [ 5,  3,  1],
       [ 4,  4,  1]], dtype=int64)

In [26]:
Y = pd.DataFrame({
    "POISONOUS": data_study["POISONOUS"],
}).to_numpy()

Y.reshape(-1)

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [21]:
size = len(X)
train_size = int(size * 0.8) # 80% del size (0.8 * size)
test_size = size - train_size

X_train = X[:train_size]
X_test = X[-test_size:]

Y_train = Y[:train_size]
Y_test = Y[-test_size:]

In [23]:
! pip install scikit-learn



You should consider upgrading via the 'c:\python39\python.exe -m pip install --upgrade pip' command.


In [25]:
from sklearn import svm

clf = svm.SVC()

clf.fit(X_train, Y_train.reshape(-1))

SVC()

In [35]:
fails = 0

for i in range(test_size):
    x = X_test[i]
    y = Y_test[i]
    
    yp = clf.predict([x])
    
    fails += abs(y[0] - yp[0])
    
    # print(x, yp, y)
    
fails, test_size, fails / test_size * 100

(927, 1625, 57.04615384615385)