In [1]:
import pandas as pd

# Read the pre-split training and test sets from the notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ("train_dataset.csv", "test_dataset.csv")
)

# Dataframe inspection

In [2]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,y,n,t,p,f,c,n,p,...,s,w,w,p,w,o,p,k,s,u
1,e,f,y,g,t,n,f,c,b,p,...,s,g,g,p,w,o,p,k,y,d
2,e,x,y,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,m
3,e,x,s,w,f,n,f,w,b,k,...,s,w,w,p,w,o,e,k,a,g
4,e,x,f,n,t,n,f,c,b,p,...,s,p,w,p,w,o,p,n,v,d


In [3]:
# Compare the (rows, columns) of the test and training frames.
test_df.shape, train_df.shape

((2031, 23), (6093, 23))

In [4]:
# List column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6093 entries, 0 to 6092
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   type                      6093 non-null   object
 1   cap_shape                 6093 non-null   object
 2   cap_surface               6093 non-null   object
 3   cap_color                 6093 non-null   object
 4   bruises                   6093 non-null   object
 5   odor                      6093 non-null   object
 6   gill_attachment           6093 non-null   object
 7   gill_spacing              6093 non-null   object
 8   gill_size                 6093 non-null   object
 9   gill_color                6093 non-null   object
 10  stalk_shape               6093 non-null   object
 11  stalk_root                6093 non-null   object
 12  stalk_surface_above_ring  6093 non-null   object
 13  stalk_surface_below_ring  6093 non-null   object
 14  stalk_color_above_ring  

In [5]:
# List column dtypes and non-null counts for the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2031 entries, 0 to 2030
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   type                      2031 non-null   object
 1   cap_shape                 2031 non-null   object
 2   cap_surface               2031 non-null   object
 3   cap_color                 2031 non-null   object
 4   bruises                   2031 non-null   object
 5   odor                      2031 non-null   object
 6   gill_attachment           2031 non-null   object
 7   gill_spacing              2031 non-null   object
 8   gill_size                 2031 non-null   object
 9   gill_color                2031 non-null   object
 10  stalk_shape               2031 non-null   object
 11  stalk_root                2031 non-null   object
 12  stalk_surface_above_ring  2031 non-null   object
 13  stalk_surface_below_ring  2031 non-null   object
 14  stalk_color_above_ring  

There appear to be no null values in either dataset.

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Separate the label column ('type') from the feature columns for each split.
X_train, y_train = train_df.drop(columns=['type']), train_df['type']
X_test, y_test = test_df.drop(columns=['type']), test_df['type']

In [7]:
# Learn the one-hot categories from the training features only,
# then apply that same fitted encoding to both splits.
hot_encoder = OneHotEncoder()
hot_encoder.fit(X_train)

X_train_encoded = hot_encoder.transform(X_train)
X_test_encoded = hot_encoder.transform(X_test)

# Display the encoded training matrix summary.
X_train_encoded

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 134046 stored elements and shape (6093, 117)>

### The labels only need to be encoded for the XGBClassifier

In [8]:
# Map the string class labels to integers for the classifiers below.
# The mapping is learned from the training labels only and then reused for the
# test labels, so both splits are guaranteed to share one label -> integer mapping.
# (Refitting on y_test could silently yield a different mapping if the test split
# did not contain exactly the same set of classes.)
label_encoder = LabelEncoder()
Y_train_encoded = label_encoder.fit_transform(y_train)
Y_test_encoded = label_encoder.transform(y_test)
Y_train_encoded

array([1, 0, 0, ..., 1, 1, 0])

## XGBClassifier (used here much like a random forest classifier)

In [9]:
from xgboost import XGBClassifier

# Hyperparameters for the boosted-tree classifier, gathered in one place.
xgb_params = dict(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=14,
    min_child_weight=7,
    gamma=1e-6,
    alpha=0.1,
    subsample=0.8,
    colsample_bytree=0.6,
    # random_state=42,
)
model = XGBClassifier(**xgb_params)

# The test split is supplied as eval_set, so its log-loss is reported after
# every boosting round while the model trains on the training split.
XGB = model.fit(X_train_encoded, Y_train_encoded, eval_set=[(X_test_encoded, Y_test_encoded)])

[0]	validation_0-logloss:0.45237
[1]	validation_0-logloss:0.31514
[2]	validation_0-logloss:0.22489
[3]	validation_0-logloss:0.16315
[4]	validation_0-logloss:0.12079
[5]	validation_0-logloss:0.09306
[6]	validation_0-logloss:0.07106
[7]	validation_0-logloss:0.05578
[8]	validation_0-logloss:0.04540
[9]	validation_0-logloss:0.03641
[10]	validation_0-logloss:0.03007
[11]	validation_0-logloss:0.02595
[12]	validation_0-logloss:0.02293
[13]	validation_0-logloss:0.01974
[14]	validation_0-logloss:0.01715
[15]	validation_0-logloss:0.01559
[16]	validation_0-logloss:0.01467
[17]	validation_0-logloss:0.01400
[18]	validation_0-logloss:0.01292
[19]	validation_0-logloss:0.01250
[20]	validation_0-logloss:0.01207
[21]	validation_0-logloss:0.01144
[22]	validation_0-logloss:0.01076
[23]	validation_0-logloss:0.01056
[24]	validation_0-logloss:0.01003
[25]	validation_0-logloss:0.00992
[26]	validation_0-logloss:0.00971
[27]	validation_0-logloss:0.00941
[28]	validation_0-logloss:0.00901
[29]	validation_0-loglos

In [10]:
# Predictions of the fitted XGBoost model on the encoded training features.
y_pred_train = XGB.predict(X_train_encoded)

In [11]:
# Predictions of the fitted XGBoost model on the encoded test features.
y_pred_test = XGB.predict(X_test_encoded)

In [12]:
# Score the XGBoost predictions against the integer-encoded labels of each split.
train_accuracy = accuracy_score(Y_train_encoded, y_pred_train)
test_accuracy = accuracy_score(Y_test_encoded, y_pred_test)

print(f'Accuracy of training data: {train_accuracy:.2f}')
print(f'Accuracy of testing data: {test_accuracy:.2f}')

Accuracy of training data: 1.00
Accuracy of testing data: 1.00


This custom class was created because scikit-learn's standard RandomForestClassifier does not support un-encoded (raw categorical) data.

## Normal random forest

In [19]:
# Baseline: a scikit-learn random forest with 100 trees and a fixed seed,
# trained on the same one-hot features and integer-encoded labels as the XGBoost model.
random_forest_classif = RandomForestClassifier(n_estimators=100, random_state=53)
random_forest_classif.fit(X_train_encoded, Y_train_encoded)

In [20]:
# Random-forest predictions for the training and test splits, in that order.
y_train_pred, y_test_pred = (
    random_forest_classif.predict(features)
    for features in (X_train_encoded, X_test_encoded)
)


In [21]:
# Score the random forest on both splits against the integer-encoded labels.
# Both y_true and y_pred must use the same label type (integers here) for accuracy_score.
train_accuracy = accuracy_score(Y_train_encoded, y_train_pred)
print(f'Accuracy of training data: {train_accuracy:.2f}')
test_accuracy = accuracy_score(Y_test_encoded, y_test_pred)
# This value is the test-split score, so it is labelled as testing data.
print(f'Accuracy of testing data: {test_accuracy:.2f}')

TypeError: Labels in y_true and y_pred should be of the same type. Got y_true=['e' 'p'] and y_pred=[0 1]. Make sure that the predictions provided by the classifier coincides with the true labels.

In [18]:
import pickle

# Persist exactly one fitted model to model.pkl; the alternatives are kept
# commented out so the choice can be switched by hand.
# NOTE: only the estimator is written. It was trained on one-hot encoded features
# and integer-encoded labels, and the fitted hot_encoder / label_encoder are not
# saved here.

# Option: the XGBoost model
# with open('model.pkl', 'wb') as file:
#     pickle.dump(XGB, file)

# Option (active): the scikit-learn random forest
with open('model.pkl', 'wb') as model_file:
    pickle.dump(random_forest_classif, model_file)

# Option: the custom random forest classifier
# with open('model.pkl', 'wb') as file:
#     pickle.dump(custom_RFC, file)