In [58]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [59]:
df = pd.read_csv("cleaned_mushroom_df.csv")

df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,...,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class,label
0,0,0,1,x,s,n,t,Certain Poisonous,f,c,...,w,p,w,o,p,k,s,u,p,0
1,1,1,2,x,s,y,t,Certain Edible,f,c,...,w,p,w,o,p,n,Certain Edible,g,e,1
2,2,2,3,b,s,w,t,Certain Edible,f,c,...,w,p,w,o,p,n,Certain Edible,m,e,1
3,3,3,4,x,y,w,t,Certain Poisonous,f,c,...,w,p,w,o,p,k,s,u,p,0
4,4,4,5,x,s,g,f,n,f,w,...,w,p,w,o,e,n,Certain Edible,g,e,1


In [60]:
y = df["label"]

X = df.drop(columns=["class", "label", "odor"])
X = pd.get_dummies(X)

In [61]:
y.head()

0    0
1    1
2    1
3    0
4    1
Name: label, dtype: int64

In [62]:
X.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,cap-shape_Certain Edible,cap-shape_Certain Poisonous,cap-shape_b,cap-shape_f,cap-shape_k,cap-shape_x,cap-surface_Certain Poisonous,...,population_s,population_v,population_y,habitat_Certain Edible,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u
0,0,0,1,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,True
1,1,1,2,False,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False
2,2,2,3,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,3,3,4,False,False,False,False,False,True,False,...,True,False,False,False,False,False,False,False,False,True
4,4,4,5,False,False,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,False


In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [64]:
model = LogisticRegression(random_state=1)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [65]:
y_prediction = model.predict(X_test)

In [66]:
confusion_matrix(y_test, y_prediction)

array([[ 990,   21],
       [  20, 1000]])

In [67]:
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1011
           1       0.98      0.98      0.98      1020

    accuracy                           0.98      2031
   macro avg       0.98      0.98      0.98      2031
weighted avg       0.98      0.98      0.98      2031



In [68]:
from sklearn.ensemble import RandomForestClassifier
r_model = RandomForestClassifier(n_estimators = 100, max_depth = 5)
r_model.fit(X_train, y_train)

In [69]:
y_prediction = r_model.predict(X_test)

In [70]:
confusion_matrix(y_test, y_prediction)

array([[1009,    2],
       [   0, 1020]])

In [71]:
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1011
           1       1.00      1.00      1.00      1020

    accuracy                           1.00      2031
   macro avg       1.00      1.00      1.00      2031
weighted avg       1.00      1.00      1.00      2031



In [72]:
r_model.feature_importances_

array([6.71204476e-02, 6.81151712e-02, 7.27573361e-02, 1.18633715e-04,
       5.09716934e-05, 5.89195150e-04, 2.10920847e-04, 2.12245953e-05,
       9.37956842e-04, 7.93956531e-06, 3.76317070e-03, 2.52769608e-03,
       1.54391076e-03, 1.18773151e-03, 4.20665457e-03, 2.85361588e-04,
       5.27782068e-04, 2.98206579e-04, 9.57271404e-04, 1.27415804e-03,
       2.93409323e-02, 2.13752747e-02, 4.83073118e-04, 1.12002352e-03,
       2.22331613e-02, 2.94812060e-02, 6.83399151e-02, 7.15817497e-02,
       0.00000000e+00, 3.62899402e-02, 2.20444936e-04, 5.88680688e-04,
       1.86611560e-03, 6.13871546e-04, 1.40535032e-03, 9.70842586e-06,
       6.12579441e-04, 3.02670117e-03, 1.57084780e-02, 1.51997321e-02,
       1.02666634e-02, 3.51363136e-05, 1.91102473e-02, 1.90314103e-03,
       1.53359149e-02, 1.54681889e-04, 4.20554519e-03, 2.50558383e-02,
       3.54001753e-02, 4.14520922e-03, 4.50203691e-02, 1.84583928e-02,
       2.84634139e-04, 7.63332907e-03, 1.59899901e-03, 2.13735437e-03,
      

In [73]:
y_prediction_train = r_model.predict(X_train)
print(classification_report(y_train, y_prediction_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2905
           1       1.00      1.00      1.00      3188

    accuracy                           1.00      6093
   macro avg       1.00      1.00      1.00      6093
weighted avg       1.00      1.00      1.00      6093



In [74]:
print(df["label"])

0       0
1       1
2       1
3       0
4       1
       ..
8119    1
8120    1
8121    1
8122    0
8123    1
Name: label, Length: 8124, dtype: int64


In [75]:
model.coef_[0]

array([-0.1275809 , -0.1275809 ,  0.25465605,  0.12535674, -0.0104245 ,
        0.29768297, -0.05337475,  0.38475615, -0.36175967, -0.00970015,
        1.29513353, -1.07687839,  0.17368195,  0.27872845, -0.54393983,
        0.28327427,  0.3006877 ,  0.69656626, -0.40548086, -0.22759904,
        0.40292447, -0.02068753,  0.48220171, -0.09996476, -1.33054166,
        1.71277861,  2.28659537, -1.90435842,  0.33954086, -1.23298099,
        0.15572426, -0.2612576 , -0.42297274,  0.03135598,  0.52233883,
        0.08187086,  0.40596982,  0.76264768, -0.41428118,  0.79651813,
        1.05629724,  0.18460915, -0.84101079,  0.29039556, -0.3080542 ,
        0.09518874,  0.22310983, -1.30210236,  1.36604074,  0.61173511,
       -1.23661036,  0.78065441,  0.22645779,  1.2094116 , -0.46192031,
       -0.33359649, -0.09393526,  0.0622774 ,  1.22425157, -0.47703872,
        0.14027922, -0.14174394, -0.36351117,  0.38223695,  0.49776006,
       -0.00601532, -0.10950779, -0.02874466, -0.49707613,  0.90

In [76]:
df2 = pd.DataFrame({"columns":X.columns, "ftr_imp":model.coef_[0]})
print(df2)

                        columns   ftr_imp
0                  Unnamed: 0.1 -0.127581
1                    Unnamed: 0 -0.127581
2                            id  0.254656
3      cap-shape_Certain Edible  0.125357
4   cap-shape_Certain Poisonous -0.010424
..                          ...       ...
87                    habitat_g -0.332742
88                    habitat_l  0.583679
89                    habitat_m -0.075955
90                    habitat_p -0.479500
91                    habitat_u -0.653226

[92 rows x 2 columns]


In [77]:
df2.sort_values("ftr_imp")

Unnamed: 0,columns,ftr_imp
76,spore-print-color_h,-1.939767
27,gill-size_n,-1.904358
24,gill-spacing_c,-1.330542
71,ring-type_Certain Poisonous,-1.305625
47,stalk-surface-above-ring_k,-1.302102
...,...,...
58,stalk-color-below-ring_Certain Edible,1.224252
10,cap-surface_f,1.295134
48,stalk-surface-above-ring_s,1.366041
25,gill-spacing_w,1.712779


In [78]:
dfX = pd.get_dummies(df)
df_corr = dfX.corr()

In [79]:
corr_ = df_corr.sort_values("label")[["label"]]
corr_["label_abs"] = np.abs(corr_["label"])
x2 = corr_[corr_["label_abs"] <= 0.1].index.to_list()
x2

['spore-print-color_Certain Poisonous',
 'cap-color_e',
 'cap-surface_s',
 'cap-surface_y',
 'ring-number_Certain Poisonous',
 'cap-color_Other',
 'veil-color_Certain Poisonous',
 'cap-shape_Certain Poisonous',
 'cap-surface_Certain Poisonous',
 'cap-shape_f',
 'stalk-root_b',
 'stalk-surface-above-ring_Other',
 'cap-shape_x',
 'cap-color_n',
 'cap-color_g',
 'gill-color_Other',
 'gill-color_p',
 'cap-color_Certain Edible',
 'cap-shape_Certain Edible',
 'ring-type_Certain Edible',
 'stalk-surface-below-ring_y']

In [80]:
# Second Model System
X_train, X_test, y_train, y_test = train_test_split(dfX[x2], y, random_state=1)

In [81]:
model = LogisticRegression(random_state=1)
model.fit(X_train, y_train)

In [82]:
y_prediction = model.predict(X_test)

In [83]:
confusion_matrix(y_test, y_prediction)

array([[717, 294],
       [542, 478]])

In [84]:
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       0.57      0.71      0.63      1011
           1       0.62      0.47      0.53      1020

    accuracy                           0.59      2031
   macro avg       0.59      0.59      0.58      2031
weighted avg       0.59      0.59      0.58      2031



In [85]:
from sklearn.ensemble import RandomForestClassifier
r_model = RandomForestClassifier(n_estimators = 100, max_depth = 5)
r_model.fit(X_train, y_train)

In [86]:
y_prediction = r_model.predict(X_test)

In [87]:
confusion_matrix(y_test, y_prediction)

array([[645, 366],
       [124, 896]])

In [88]:
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       0.84      0.64      0.72      1011
           1       0.71      0.88      0.79      1020

    accuracy                           0.76      2031
   macro avg       0.77      0.76      0.75      2031
weighted avg       0.77      0.76      0.76      2031



In [89]:
r_model.feature_importances_

array([4.80848174e-02, 1.37693203e-01, 1.10769545e-01, 9.85320144e-02,
       2.44894492e-02, 2.64820895e-02, 4.14725517e-03, 5.16826193e-04,
       4.08896462e-05, 1.64282795e-02, 2.44323078e-01, 1.63921039e-03,
       1.03096591e-02, 8.24242989e-02, 2.75738244e-02, 3.05005431e-02,
       3.24162356e-02, 7.43233634e-03, 7.76798925e-03, 3.22069445e-02,
       5.62215121e-02])

In [90]:
y_prediction_train = r_model.predict(X_train)
print(classification_report(y_train, y_prediction_train))

              precision    recall  f1-score   support

           0       0.82      0.66      0.73      2905
           1       0.74      0.86      0.80      3188

    accuracy                           0.77      6093
   macro avg       0.78      0.76      0.76      6093
weighted avg       0.78      0.77      0.77      6093



In [99]:
df2 = pd.DataFrame({"columns":dfX[x2].columns, "ftr_imp":r_model.feature_importances_})
df2.sort_values('ftr_imp',ascending=False)

Unnamed: 0,columns,ftr_imp
10,stalk-root_b,0.244323
1,cap-color_e,0.137693
2,cap-surface_s,0.11077
3,cap-surface_y,0.098532
13,cap-color_n,0.082424
20,stalk-surface-below-ring_y,0.056222
0,spore-print-color_Certain Poisonous,0.048085
16,gill-color_p,0.032416
19,ring-type_Certain Edible,0.032207
15,gill-color_Other,0.030501
