In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [20]:
# Load the cleaned dataset from a CSV file located relative to the working directory
mushroom = pd.read_csv('mr_cleaned.csv')
# Print the (rows, columns) shape, then show the first five rows as the cell output
print(mushroom.shape)
mushroom.head()

(5631, 19)


Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,c,n,k,e,e,s,s,w,w,o,p,k,s,u
1,e,x,s,y,t,c,b,k,e,c,s,s,w,w,o,p,n,n,g
2,e,b,s,w,t,c,b,n,e,c,s,s,w,w,o,p,n,n,m
3,p,x,y,w,t,c,n,n,e,e,s,s,w,w,o,p,k,s,u
4,e,x,s,g,f,w,b,k,t,e,s,s,w,w,o,e,n,a,g


In [23]:
# Build the model inputs from the loaded frame.
# y is the target (column 0); x holds every remaining column as a predictor feature.
from sklearn.preprocessing import LabelEncoder

y = mushroom.iloc[:, 0].values
x = mushroom.iloc[:, 1:].values

# Replace the class labels in y with integer codes.
LE = LabelEncoder()
LE.fit(y)
y = LE.transform(y)

# Display the (still categorical) feature matrix as the cell output.
x

array([['x', 's', 'n', ..., 'k', 's', 'u'],
       ['x', 's', 'y', ..., 'n', 'n', 'g'],
       ['b', 's', 'w', ..., 'n', 'n', 'm'],
       ...,
       ['x', 'y', 'g', ..., 'w', 'y', 'p'],
       ['x', 'y', 'c', ..., 'w', 'c', 'd'],
       ['f', 'y', 'c', ..., 'w', 'c', 'd']], dtype=object)

In [24]:
# One-hot encode the categorical feature matrix x:
# every distinct value of every feature column becomes its own 0/1 column.
# NOTE(review): ColumnTransformer is imported but not used in the visible cells.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): the encoder is fitted on all rows, before the train/test split.
# NOTE(review): x is overwritten in place, so re-running this cell alone would
# encode the already-encoded matrix; re-run the cell that builds x first.
OHE = OneHotEncoder()
x = OHE.fit(x).transform(x).toarray()  # densify the encoder's sparse output

In [25]:
x[0]

array([0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.])

In [26]:
# Split the data into a training set and a test set.
# 20% of the rows are held out for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [27]:
# Applying PCA
# Reduce the feature matrix to 3 principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components = 3)
# Fit the projection on the training split only, then apply that same
# projection to the test split.
# NOTE(review): x_train / x_test are overwritten in place, so re-running this
# cell without re-running the split cell applies PCA to already-projected data.
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

In [28]:
# Fit a random forest on the training set:
# 100 trees, entropy split criterion, seed 0.
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
)
classifier.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', random_state=0)

In [29]:
# Predict labels for the test set with the most recently fitted classifier
y_pred = classifier.predict(x_test)

In [31]:
# Evaluate the predictions against the held-out labels.
from sklearn.metrics import confusion_matrix, accuracy_score

ac = accuracy_score(y_test, y_pred)    # fraction of test rows predicted correctly
cm = confusion_matrix(y_test, y_pred)  # rows: true labels, columns: predicted labels
# Print the confusion matrix first, then the accuracy on its own line
print(cm, ac, sep='\n')

[[699   2]
 [  7 419]]
0.9920141969831411


In [32]:
# Refit with the default split criterion (gini); 100 trees, seed 0
classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
)
classifier.fit(x_train, y_train)

RandomForestClassifier(random_state=0)

In [33]:
# Predict labels for the test set with the most recently fitted classifier
y_pred = classifier.predict(x_test)

In [34]:
# Evaluate the current predictions against the held-out labels
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
# Print the confusion matrix first, then the accuracy on its own line
print(cm, ac, sep='\n')

[[699   2]
 [  5 421]]
0.9937888198757764


In [35]:
# Increase the number of trees to 200 (default gini criterion, seed 0)
classifier = RandomForestClassifier(
    n_estimators=200,
    random_state=0,
)
classifier.fit(x_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

In [36]:
# Predict labels for the test set with the most recently fitted classifier
y_pred = classifier.predict(x_test)

In [37]:
# Evaluate the current predictions against the held-out labels
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
# Print the confusion matrix first, then the accuracy on its own line
print(cm, ac, sep='\n')

[[699   2]
 [  4 422]]
0.9946761313220941


In [38]:
# Increase the number of trees to 300 (default gini criterion, seed 0)
classifier = RandomForestClassifier(
    n_estimators=300,
    random_state=0,
)
classifier.fit(x_train, y_train)

RandomForestClassifier(n_estimators=300, random_state=0)

In [39]:
# Predict labels for the test set with the most recently fitted classifier
y_pred = classifier.predict(x_test)

In [40]:
# Evaluate the current predictions against the held-out labels
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
# Print the confusion matrix first, then the accuracy on its own line
print(cm, ac, sep='\n')

[[699   2]
 [  5 421]]
0.9937888198757764


In [41]:
# Set random_state to 1. The criterion is 'entropy' again with 100 trees, so
# this model differs from the first entropy model only by its seed.
# NOTE(review): a score difference between these two models therefore reflects
# seed-to-seed variation rather than a change in model configuration.
classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=1,
)
classifier.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', random_state=1)

In [42]:
# Predict labels for the test set with the most recently fitted classifier
y_pred = classifier.predict(x_test)

In [43]:
# Evaluate the current predictions against the held-out labels
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
# Print the confusion matrix first, then the accuracy on its own line
print(cm, ac, sep='\n')

[[699   2]
 [  2 424]]
0.9964507542147294
