# Clasificación de hongos venenosos

El dataset a trabajar contiene información sobre hongos provenientes de 23 especies de la familia Agaricus y Lepiota, los cuales han sido clasificados como comestibles, venenosos o de comestibilidad indeterminada.

## Importar librerías

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

## Lectura de los dataset de entrenamiento y de prueba

In [None]:
df_train = pd.read_csv('train_dataset.csv')
df_test = pd.read_csv('test_dataset.csv')
df_train.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,y,n,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,s,u
1,e,f,y,g,t,n,f,c,b,p,...,s,g,g,p,w,o,p,k,y,d
2,e,x,y,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,m
3,e,x,s,w,f,n,f,w,b,k,...,s,w,w,p,w,o,e,k,a,g
4,e,x,f,n,t,n,f,c,b,p,...,s,p,w,p,w,o,p,n,v,d


In [None]:
df_train.columns

Index(['type', 'cap_shape', 'cap_surface', 'cap_color', 'bruises', 'odor',
       'gill_attachment', 'gill_spacing', 'gill_size', 'gill_color',
       'stalk_shape', 'stalk_root', 'stalk_surface_above_ring',
       'stalk_surface_below_ring', 'stalk_color_above_ring',
       'stalk_color_below_ring', 'veil_type', 'veil_color', 'ring_number',
       'ring_type', 'spore_print_color', 'population', 'habitat'],
      dtype='object')

In [None]:
df_train['type'].unique()

array(['p', 'e'], dtype=object)

In [None]:
stalk_columns = [col for col in df_train.columns if 'stalk' in col]
unique_values = {col: df_train[col].unique() for col in stalk_columns}
unique_values

{'stalk_shape': array(['e', 't'], dtype=object),
 'stalk_root': array(['e', 'b', 'c', '?', 'r'], dtype=object),
 'stalk_surface_above_ring': array(['s', 'k', 'f', 'y'], dtype=object),
 'stalk_surface_below_ring': array(['s', 'k', 'y', 'f'], dtype=object),
 'stalk_color_above_ring': array(['w', 'g', 'p', 'e', 'n', 'b', 'o', 'c', 'y'], dtype=object),
 'stalk_color_below_ring': array(['w', 'g', 'p', 'b', 'o', 'n', 'c', 'e', 'y'], dtype=object)}

In [None]:
df_train['stalk_root'].replace('?', np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['stalk_root'].replace('?', np.nan, inplace=True)


Con los valores únicos se observa que no tenemos valores distinto a p y e, por lo tanto no hay valores nulos y no es necesario tratar con esto.

## Codificar variables

### Variable objetivo codificada
Se le asigna el valor de 1 a venenoso "p" y el valor de 0 a no venenoso "e".

In [None]:
df_train['type_encoded'] = df_train['type'].map({'p': 1, 'e': 0})
df_test['type_encoded'] = df_test['type'].map({'p': 1, 'e': 0})

### Variables para el entrenamiento del modelo codificadas

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder object
label_encoder = LabelEncoder()

# Iterate over the stalk_columns list
for col in stalk_columns:
    # Encode the column using LabelEncoder
    df_train[col] = label_encoder.fit_transform(df_train[col])
    df_test[col] = label_encoder.transform(df_test[col])

Agrego las variables codificadas en un dataframe

In [None]:
if 'type_encoded' in df_train.columns:
    hongo_encoded = df_train[stalk_columns + ['type_encoded']]
else:
    hongo_encoded = df_train[stalk_columns]


## Correlación entre las variables categóricas
## Matriz de Cramer V

In [None]:
def cramers_V(var1, var2):
  crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))
  stat = chi2_contingency(crosstab)[0]
  obs = np.sum(crosstab)
  mini = min(crosstab.shape) - 1
  return (stat/(obs*mini))

rows= []

for var1 in encode_df:
  col = []
  for var2 in encode_df :
    cramers =cramers_V(encode_df[var1], encode_df[var2])
    col.append(round(cramers,2))
  rows.append(col)
  
cramers_results = np.array(rows)
df = pd.DataFrame(cramers_results, columns = encode_df.columns, index =encode_df.columns)


df

NameError: name 'encode_df' is not defined

In [None]:
with sns.axes_style("white"):
  ax = sns.heatmap(df,
                  vmin=0.,
                  vmax=1,
                  cmap='viridis',
                  annot=True,
                  linewidths=0.1,
                  square=True)

plt.show()

## Exploración relación variables con la variable objetivo

## Definir variable objetivo

Aquí nuestra variable objetivo será type, esta nos dirá si un hongo es venenoso o no.

In [None]:
X_train = df_train.drop(['type'], axis=1)
y_train = df_train['type']