In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import to_graphviz, plot_importance

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import _hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingRegressor
%matplotlib inline

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error, mean_squared_error,r2_score
from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve

import keras
from keras.models import save_model, Sequential
from keras.layers import Activation, BatchNormalization, Dense
from keras.optimizers import Adam

pd.options.display.max_columns= None
#pd.options.display.max_rows = None

Using TensorFlow backend.


### Data Exploration

In [2]:
colnames = ['classes','cap-shape','cap-surface','cap-color','bruises','odor','gill-attachment',
           'gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring',
           'stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-type',
           'veil-color','ring-number','ring-type','spore-print-color','population','habitat']

In [3]:
df = pd.read_csv("agaricus-lepiota.data",low_memory=False,header=None,names=colnames)

In [4]:
df

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,e,e,s,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,e,c,s,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,e,c,s,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,e,e,s,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,t,e,s,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,e,?,s,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,e,?,s,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,t,?,s,k,w,w,p,w,o,e,w,v,l


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   classes                   8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [6]:
df.describe()

Unnamed: 0,classes,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,2,5,4,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,t,b,s,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,4608,3776,5176,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [7]:
df.shape

(8124, 23)

In [8]:
df2 = df[['cap-shape','cap-surface','cap-color','bruises','odor', 'classes']]

In [9]:
df2

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,classes
0,x,s,n,t,p,p
1,x,s,y,t,a,e
2,b,s,w,t,l,e
3,x,y,w,t,p,p
4,x,s,g,f,n,e
...,...,...,...,...,...,...
8119,k,s,n,f,n,e
8120,x,s,n,f,n,e
8121,f,s,n,f,n,e
8122,k,y,n,f,y,p


In [10]:
df2.isnull().sum()

cap-shape      0
cap-surface    0
cap-color      0
bruises        0
odor           0
classes        0
dtype: int64

In [11]:
df2['classes'] = df2['classes'].map({'p':0,'e':1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [12]:
df2

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,classes
0,x,s,n,t,p,0
1,x,s,y,t,a,1
2,b,s,w,t,l,1
3,x,y,w,t,p,0
4,x,s,g,f,n,1
...,...,...,...,...,...,...
8119,k,s,n,f,n,1
8120,x,s,n,f,n,1
8121,f,s,n,f,n,1
8122,k,y,n,f,y,0


In [13]:
df3 = pd.get_dummies(df2,drop_first=True)

In [14]:
df3

Unnamed: 0,classes,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_g,cap-surface_s,cap-surface_y,cap-color_c,cap-color_e,cap-color_g,cap-color_n,cap-color_p,cap-color_r,cap-color_u,cap-color_w,cap-color_y,bruises_t,odor_c,odor_f,odor_l,odor_m,odor_n,odor_p,odor_s,odor_y
0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0
4,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
8120,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
8121,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
8122,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [15]:
X = df3.iloc[:,1:27]
y = df3.iloc[:,1]

### Model Training

In [16]:
X.values, y.values

(array([[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 1, 0, ..., 0, 0, 0],
        [0, 0, 1, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
 array([0, 0, 0, ..., 0, 0, 0], dtype=uint8))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [18]:
X_train.shape, X_test.shape

((5686, 26), (2438, 26))

In [19]:
xgb = XGBClassifier(random_state=0, n_estimators=200, objective='binary:logistic')

In [20]:
xgb.fit(X_train,y_train,eval_set=[(X_test,y_test)],eval_metric='logloss',early_stopping_rounds=20)

[0]	validation_0-logloss:0.598359
Will train until validation_0-logloss hasn't improved in 20 rounds.
[1]	validation_0-logloss:0.520749
[2]	validation_0-logloss:0.456032
[3]	validation_0-logloss:0.401294
[4]	validation_0-logloss:0.354553
[5]	validation_0-logloss:0.314298
[6]	validation_0-logloss:0.279378
[7]	validation_0-logloss:0.248931
[8]	validation_0-logloss:0.222246
[9]	validation_0-logloss:0.198773
[10]	validation_0-logloss:0.178062
[11]	validation_0-logloss:0.159718
[12]	validation_0-logloss:0.143449
[13]	validation_0-logloss:0.128987
[14]	validation_0-logloss:0.116096
[15]	validation_0-logloss:0.104592
[16]	validation_0-logloss:0.094318
[17]	validation_0-logloss:0.08512
[18]	validation_0-logloss:0.076883
[19]	validation_0-logloss:0.069502
[20]	validation_0-logloss:0.062877
[21]	validation_0-logloss:0.056928
[22]	validation_0-logloss:0.051583
[23]	validation_0-logloss:0.046776
[24]	validation_0-logloss:0.042454
[25]	validation_0-logloss:0.038566
[26]	validation_0-logloss:0.03506

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [21]:
y_pred = xgb.predict(X_test)

In [22]:
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

### Model Evaluation

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
accuracy_score(y_test,y_pred)

0.9995898277276456