# Identificación de hongos venenosos

# 1. Exploración y preprocesamiento de los datos

In [1]:
# Importar librerías

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
# Load the training and test splits from their CSV files

train_df, test_df = map(pd.read_csv, ('train_dataset.csv', 'test_dataset.csv'))

### Exploración

In [3]:
# Peek at the first five rows of the training data

train_df.head(n=5)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,y,n,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,s,u
1,e,f,y,g,t,n,f,c,b,p,...,s,g,g,p,w,o,p,k,y,d
2,e,x,y,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,m
3,e,x,s,w,f,n,f,w,b,k,...,s,w,w,p,w,o,e,k,a,g
4,e,x,f,n,t,n,f,c,b,p,...,s,p,w,p,w,o,p,n,v,d


In [4]:
# Show the dtype of every column in the training data
train_df.dtypes

type                        object
cap_shape                   object
cap_surface                 object
cap_color                   object
bruises                     object
odor                        object
gill_attachment             object
gill_spacing                object
gill_size                   object
gill_color                  object
stalk_shape                 object
stalk_root                  object
stalk_surface_above_ring    object
stalk_surface_below_ring    object
stalk_color_above_ring      object
stalk_color_below_ring      object
veil_type                   object
veil_color                  object
ring_number                 object
ring_type                   object
spore_print_color           object
population                  object
habitat                     object
dtype: object

In [5]:
# Count NaN entries per column in the training data
train_df.isna().sum(axis=0)

type                        0
cap_shape                   0
cap_surface                 0
cap_color                   0
bruises                     0
odor                        0
gill_attachment             0
gill_spacing                0
gill_size                   0
gill_color                  0
stalk_shape                 0
stalk_root                  0
stalk_surface_above_ring    0
stalk_surface_below_ring    0
stalk_color_above_ring      0
stalk_color_below_ring      0
veil_type                   0
veil_color                  0
ring_number                 0
ring_type                   0
spore_print_color           0
population                  0
habitat                     0
dtype: int64

In [6]:
# Peek at the first five rows of the test data

test_df.head(n=5)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,b,y,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,s,m
2,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,n,s,u
3,p,x,y,n,t,p,f,c,n,n,...,s,w,w,p,w,o,p,n,v,g
4,e,b,s,w,t,l,f,c,b,g,...,s,w,w,p,w,o,p,k,s,m


In [7]:
# Show the dtype of every column in the test data
test_df.dtypes

type                        object
cap_shape                   object
cap_surface                 object
cap_color                   object
bruises                     object
odor                        object
gill_attachment             object
gill_spacing                object
gill_size                   object
gill_color                  object
stalk_shape                 object
stalk_root                  object
stalk_surface_above_ring    object
stalk_surface_below_ring    object
stalk_color_above_ring      object
stalk_color_below_ring      object
veil_type                   object
veil_color                  object
ring_number                 object
ring_type                   object
spore_print_color           object
population                  object
habitat                     object
dtype: object

In [8]:
# Count NaN entries per column in the test data
test_df.isna().sum(axis=0)

type                        0
cap_shape                   0
cap_surface                 0
cap_color                   0
bruises                     0
odor                        0
gill_attachment             0
gill_spacing                0
gill_size                   0
gill_color                  0
stalk_shape                 0
stalk_root                  0
stalk_surface_above_ring    0
stalk_surface_below_ring    0
stalk_color_above_ring      0
stalk_color_below_ring      0
veil_type                   0
veil_color                  0
ring_number                 0
ring_type                   0
spore_print_color           0
population                  0
habitat                     0
dtype: int64

### Preprocesamiento

En el enunciado se indica que la variable stalk_root contiene valores faltantes codificados con el carácter '?'. Tras comprobar cuántos hay (1856 en entrenamiento y 624 en prueba), se decide imputarlos con la moda de la columna, ya que eliminar más de 1800 registros supondría descartar aproximadamente el 30 % de cada conjunto (alrededor del 22.8 % y del 7.7 % del total de los datos, respectivamente).

In [9]:
# Count how often each 'stalk_root' value (including the '?' placeholder) occurs in the training data

train_df['stalk_root'].value_counts()

stalk_root
b    2824
?    1856
e     834
c     438
r     141
Name: count, dtype: int64

In [10]:
# Same count of 'stalk_root' values (including '?') for the test data
test_df['stalk_root'].value_counts()

stalk_root
b    952
?    624
e    286
c    118
r     51
Name: count, dtype: int64

In [11]:
# Turn the '?' placeholder into NaN so it is treated as a missing value
train_df['stalk_root'] = train_df['stalk_root'].replace({'?': np.nan})
test_df['stalk_root'] = test_df['stalk_root'].replace({'?': np.nan})

# Imputer that fills missing entries with the most frequent value
imputer = SimpleImputer(strategy='most_frequent')

# Training data: fit the imputer and fill the gaps in one step
train_df['stalk_root'] = pd.Series(
    imputer.fit_transform(train_df[['stalk_root']]).ravel(),
    index=train_df.index,
)

# Test data: only transform, reusing the value learned from the training data
test_df['stalk_root'] = pd.Series(
    imputer.transform(test_df[['stalk_root']]).ravel(),
    index=test_df.index,
)


In [12]:
# Show the 'stalk_root' category counts after imputation: training data first, then test data
print(
    train_df['stalk_root'].value_counts(),
    test_df['stalk_root'].value_counts(),
    sep='\n',
)

stalk_root
b    4680
e     834
c     438
r     141
Name: count, dtype: int64
stalk_root
b    1576
e     286
c     118
r      51
Name: count, dtype: int64


Al ser categóricas todas las variables, es necesario convertir sus valores a numéricos para poder entrenar el modelo. Esto se realizará con la clase LabelEncoder de scikit-learn.

In [13]:
# Encode every categorical column (the features and the 'type' target) as integers.
#
# Each column gets its own LabelEncoder, fitted on the training data only and
# then reused on the test data, so a given category maps to the same integer in
# both frames. Fitting a fresh encoding on each split independently only agrees
# by accident, when both splits happen to contain exactly the same categories.
encoders = {}

for column in train_df.columns:
    encoder = LabelEncoder()
    train_df[column] = encoder.fit_transform(train_df[column])
    # transform() raises ValueError if the test data holds a category that was
    # never seen during fitting, instead of silently assigning a wrong code.
    test_df[column] = encoder.transform(test_df[column])
    # Keep the fitted encoder so codes can be mapped back (inverse_transform)
    # or applied to new raw data later.
    encoders[column] = encoder

print(train_df.dtypes)
print(test_df.dtypes)

type                        int64
cap_shape                   int64
cap_surface                 int64
cap_color                   int64
bruises                     int64
odor                        int64
gill_attachment             int64
gill_spacing                int64
gill_size                   int64
gill_color                  int64
stalk_shape                 int64
stalk_root                  int64
stalk_surface_above_ring    int64
stalk_surface_below_ring    int64
stalk_color_above_ring      int64
stalk_color_below_ring      int64
veil_type                   int64
veil_color                  int64
ring_number                 int64
ring_type                   int64
spore_print_color           int64
population                  int64
habitat                     int64
dtype: object
type                        int64
cap_shape                   int64
cap_surface                 int64
cap_color                   int64
bruises                     int64
odor                        int64


### Particionamiento de datos de entrenamiento y prueba

In [14]:
# Split the training data into features (everything but 'type') and the 'type' target
X_train = train_df.drop(columns=['type'])
y_train = train_df['type']

In [15]:
# Split the test data into features (everything but 'type') and the 'type' target
X_test = test_df.drop(columns=['type'])
y_test = test_df['type']

# 2. Entrenamiento del modelo

In [16]:
# Fit a random forest classifier to the training data.
# A fixed seed makes the forest's randomised construction repeatable, so a
# fresh Restart & Run All produces the same model (and the same pickle).
RANDOM_STATE = 42

model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# 3. Evaluación del modelo

In [17]:
# Predict class labels for the training data

y_train_pred = model.predict(X_train)

In [18]:
# Accuracy of the predictions on the training data.
# The classifier's predict() already returns class labels, so they are
# compared to the true labels directly; no rounding is needed.

train_accuracy = accuracy_score(y_train, y_train_pred)
train_accuracy

1.0

In [19]:
# Predict class labels for the test data

y_test_pred = model.predict(X_test)

In [20]:
# Accuracy of the predictions on the test data.
# The classifier's predict() already returns class labels, so they are
# compared to the true labels directly; no rounding is needed.

test_accuracy = accuracy_score(y_test, y_test_pred)
test_accuracy

1.0

In [21]:
# Persist the fitted model to disk with pickle
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(model, file=model_file)