In [6]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [7]:
# Read the pre-cleaned mushroom table from the working directory and preview
# its first rows.
# NOTE(review): the preview shows 'Unnamed: 0', 'Unnamed: 0.1' and
# 'Unnamed: 0.2' columns, which look like index columns written out by earlier
# to_csv calls - confirm and strip them where the file is produced.
mushroom_csv = Path("cleaned_mushroom_df.csv")
df = pd.read_csv(mushroom_csv)

df.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,...,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class,label
0,0,0,1,x,s,n,t,Certain Poisonous,f,c,...,w,p,w,o,p,k,s,u,p,0
1,1,1,2,x,s,y,t,Certain Edible,f,c,...,w,p,w,o,p,n,Certain Edible,g,e,1
2,2,2,3,b,s,w,t,Certain Edible,f,c,...,w,p,w,o,p,n,Certain Edible,m,e,1
3,3,3,4,x,y,w,t,Certain Poisonous,f,c,...,w,p,w,o,p,k,s,u,p,0
4,4,4,5,x,s,g,f,n,f,w,...,w,p,w,o,e,n,Certain Edible,g,e,1


In [8]:
# Target vector: the numeric `label` column.
y = df.loc[:, "label"]

# Feature matrix: drop `class` and `label` (both appear to encode the target)
# together with `odor`, then one-hot encode whatever remains.
# NOTE(review): the row-identifier columns ('Unnamed: 0.2', 'Unnamed: 0.1',
# 'Unnamed: 0', 'id') are still part of X - confirm they should not be
# dropped as well before X is used for training.
non_feature_columns = ["class", "label", "odor"]
X = pd.get_dummies(df.drop(columns=non_feature_columns))

In [9]:
# Sanity check: first five target values.
y.head(n=5)

0    0
1    1
2    1
3    0
4    1
Name: label, dtype: int64

In [10]:
# Sanity check: first five rows of the one-hot encoded feature matrix.
X.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,cap-shape_Certain Edible,cap-shape_Certain Poisonous,cap-shape_b,cap-shape_f,cap-shape_k,cap-shape_x,cap-surface_Certain Poisonous,...,population_s,population_v,population_y,habitat_Certain Edible,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u
0,0,0,1,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,True
1,1,1,2,False,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
2,2,2,3,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,3,3,4,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,True
4,4,4,5,False,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False


In [11]:
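# Disabled: split on the full one-hot matrix X. The active split further down
# uses the dfX[x2] column subset instead.
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)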

In [12]:
# One-hot encode the entire table (the target columns included) and compute
# the pairwise Pearson correlation between all encoded columns. Because dfX
# still holds `label` and the `class` indicators, it is meant for correlation
# inspection; feature columns are selected from it explicitly later on.
dfX = pd.get_dummies(data=df)
df_corr = dfX.corr(method="pearson")

In [13]:
# Correlation of every encoded column with `label`, ordered from most negative
# to most positive, with its absolute value alongside.
corr_ = (
    df_corr[["label"]]
    .sort_values(by="label")
    .assign(label_abs=lambda frame: frame["label"].abs())
)

# Keep the columns whose |correlation| with `label` is at most 0.1.
# NOTE(review): this selects the *weakest* linear correlates of the target.
# Confirm that is the intent of the experiment rather than an inverted
# comparison; a plain flip to >= would also pull in `label` itself and the
# `class` indicators.
is_weak = corr_["label_abs"].le(0.1)
x2 = corr_.loc[is_weak].index.to_list()
x2

['spore-print-color_Certain Poisonous',
 'cap-color_e',
 'cap-surface_s',
 'cap-surface_y',
 'ring-number_Certain Poisonous',
 'cap-color_Other',
 'veil-color_Certain Poisonous',
 'cap-shape_Certain Poisonous',
 'cap-surface_Certain Poisonous',
 'cap-shape_f',
 'stalk-root_b',
 'stalk-surface-above-ring_Other',
 'cap-shape_x',
 'cap-color_n',
 'cap-color_g',
 'gill-color_Other',
 'gill-color_p',
 'cap-color_Certain Edible',
 'cap-shape_Certain Edible',
 'ring-type_Certain Edible',
 'stalk-surface-below-ring_y']

In [14]:
# Hold out a quarter of the rows (scikit-learn's default share, spelled out
# here) using only the weakly correlated columns; the seed keeps the split
# stable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    dfX.loc[:, x2],
    y,
    test_size=0.25,
    random_state=1,
)

In [15]:
# Baseline classifier: logistic regression with default settings and a fixed
# seed, fitted on the training split.
model = LogisticRegression(random_state=1)
model.fit(X=X_train, y=y_train)

In [16]:
# Predicted labels of the logistic regression on the held-out rows.
# NOTE(review): `y_prediction` is reassigned later for the random forest, so
# the evaluation cells depend on which prediction cell ran last.
y_prediction = model.predict(X=X_test)

In [17]:
# Confusion matrix for the current `y_prediction` (logistic regression when the
# cells are run in order): rows are true labels, columns are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_prediction)

array([[717, 294],
       [542, 478]])

In [18]:
# Per-class precision / recall / F1 on the test split. The report is a
# multi-line string, so it has to go through print() to render as a table.
lr_test_report = classification_report(y_true=y_test, y_pred=y_prediction)
print(lr_test_report)

              precision    recall  f1-score   support

           0       0.57      0.71      0.63      1011
           1       0.62      0.47      0.53      1020

    accuracy                           0.59      2031
   macro avg       0.59      0.59      0.58      2031
weighted avg       0.59      0.59      0.58      2031



In [19]:
# NOTE(review): this import would sit better with the other imports in the
# first cell; it is kept here so the cell still runs on its own.
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest (100 trees, depth capped at 5) fitted on the same
# training split as the logistic regression. The forest is built from random
# bootstrap samples and feature subsets, so random_state is pinned (to the
# seed used by the split and the logistic regression) to make the metrics and
# feature importances reproducible from one run to the next.
r_model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
r_model.fit(X_train, y_train)

In [20]:
# Predicted labels of the random forest on the held-out rows.
# NOTE(review): this overwrites the logistic-regression predictions stored
# under the same name.
y_prediction = r_model.predict(X=X_test)

In [21]:
# Confusion matrix for the current `y_prediction` (random forest when the cells
# are run in order): rows are true labels, columns are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_prediction)

array([[647, 364],
       [ 96, 924]])

In [22]:
# Per-class precision / recall / F1 of the random forest on the test split.
# The report is a multi-line string, so print() is needed to render it.
rf_test_report = classification_report(y_true=y_test, y_pred=y_prediction)
print(rf_test_report)

              precision    recall  f1-score   support

           0       0.87      0.64      0.74      1011
           1       0.72      0.91      0.80      1020

    accuracy                           0.77      2031
   macro avg       0.79      0.77      0.77      2031
weighted avg       0.79      0.77      0.77      2031



In [23]:
# Importance of each training column according to the random forest, in the
# column order of X_train (i.e. the order of the `x2` list).
rf_importances = r_model.feature_importances_
rf_importances

array([5.78997432e-02, 1.04074224e-01, 9.81261808e-02, 1.03273615e-01,
       1.84572207e-02, 3.48383523e-02, 4.00738935e-03, 4.19280630e-04,
       1.76621792e-04, 1.86366835e-02, 3.06762087e-01, 8.87108920e-04,
       1.36827156e-02, 7.07352719e-02, 1.47526259e-02, 2.91325250e-02,
       4.75817983e-02, 7.19186676e-03, 5.44704636e-03, 2.28457529e-02,
       4.10718895e-02])

In [24]:
# Score the random forest on its own training rows so the result can be set
# against the test-set report as an overfitting check.
y_prediction_train = r_model.predict(X=X_train)
rf_train_report = classification_report(y_true=y_train, y_pred=y_prediction_train)
print(rf_train_report)

              precision    recall  f1-score   support

           0       0.85      0.66      0.74      2905
           1       0.74      0.89      0.81      3188

    accuracy                           0.78      6093
   macro avg       0.80      0.78      0.78      6093
weighted avg       0.79      0.78      0.78      6093



In [25]:
# Show the target column. As the last expression of the cell the Series is
# rendered by the notebook itself, so wrapping it in print() is unnecessary.
df["label"]

0       0
1       1
2       1
3       0
4       1
       ..
8119    1
8120    1
8121    1
8122    0
8123    1
Name: label, Length: 8124, dtype: int64


In [36]:
# Coefficients of the logistic regression, one per training column in X_train
# order; `coef_` is two-dimensional, so the first (only) row is taken.
# NOTE(review): this cell's execution count (36) is out of sequence with the
# cells above it (25) - re-run top to bottom to confirm `model` is still the
# logistic regression fitted earlier.
lr_coefficients = model.coef_[0]
lr_coefficients

array([-2.54375998e+00, -3.57843643e-01, -1.04147701e+00, -9.61982814e-01,
       -4.18620856e+00, -1.42256253e-01, -2.61331209e+00, -5.00113777e-01,
       -5.21564555e-01, -9.26234582e-02, -2.38113943e-01, -9.81688122e-04,
       -2.21979127e-03,  8.53057652e-02,  3.24218220e-02,  1.75288840e+00,
        9.16675988e-02,  9.75423091e-01,  1.47243253e+00,  2.24800882e+00,
        1.51438929e+00])