In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [2]:
# Keep the data location in a single constant so the notebook can be pointed
# at another folder with one edit instead of three.
DATA_DIR = Path(r"C:\Users\Fernando\Desktop\mushrooms")

mushrooms_df = pd.read_csv(DATA_DIR / "dataset_mushrooms.csv")    # training rows (includes the "class" label)
testing_df = pd.read_csv(DATA_DIR / "submission_mushrooms.csv")   # submission rows to be predicted
column_def = pd.read_csv(DATA_DIR / "columns_definitions.csv")    # column definitions

In [3]:
# List the distinct feature names recorded in the column-definition file.
column_def.column_name.unique()

array(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises',
       'odor', 'gill-attachment', 'gill-spacing', 'gill-size',
       'gill-color', 'stalk-shape', 'stalk-root',
       'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
       'veil-color', 'ring-number', 'ring-type', 'spore-print-color',
       'population', 'habitat'], dtype=object)

**Data Cleaning**

In [4]:
# Number of distinct values per column in the submission data;
# a count of 1 marks a constant column that cannot help a classifier.
print(testing_df.nunique())

Unnamed: 0                  203
cap-shape                     5
cap-surface                   4
cap-color                     9
bruises                       2
odor                          8
gill-attachment               2
gill-spacing                  2
gill-size                     2
gill-color                   12
stalk-shape                   2
stalk-root                    5
stalk-surface-above-ring      4
stalk-surface-below-ring      4
stalk-color-above-ring        8
stalk-color-below-ring        8
veil-type                     1
veil-color                    4
ring-number                   2
ring-type                     4
spore-print-color             8
population                    6
habitat                       7
dtype: int64


In [6]:
# veil-type has a single value in the submission data, so it carries no information.
# Re-assigning (instead of inplace=True) with errors="ignore" keeps this cell
# safe to re-run after the column has already been removed.
testing_df = testing_df.drop(columns=["veil-type"], errors="ignore")
mushrooms_df = mushrooms_df.drop(columns=["veil-type"], errors="ignore")

In [7]:
# stalk-root marks missing values with "?"; count how often each value
# occurs in the submission data before deciding how to handle it.
print(testing_df['stalk-root'].value_counts())

b    89
?    61
e    28
c    21
r     4
Name: stalk-root, dtype: int64


In [11]:
# Remove the auto-generated index column that was read in from the CSV.
# errors="ignore" makes the cell re-runnable, and the same drop is applied to
# the training frame in case it carries the column too (a no-op otherwise),
# so both frames keep the same feature columns.
testing_df = testing_df.drop(columns=["Unnamed: 0"], errors="ignore")
mushrooms_df = mushrooms_df.drop(columns=["Unnamed: 0"], errors="ignore")
testing_df

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,y,y,f,f,f,c,b,h,e,...,k,k,n,b,w,o,l,h,v,d
1,b,s,y,t,l,f,c,b,k,e,...,s,s,w,w,w,o,p,n,n,g
2,x,f,n,t,n,f,c,b,u,t,...,s,s,p,w,w,o,p,n,y,d
3,k,s,n,f,y,f,c,n,b,t,...,s,s,w,p,w,o,e,w,v,p
4,f,s,e,f,f,f,c,n,b,t,...,k,k,w,p,w,o,e,w,v,d
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,f,s,n,f,n,a,c,b,y,e,...,s,s,o,o,n,o,p,b,v,l
199,f,y,g,f,f,f,c,b,p,e,...,k,k,p,b,w,o,l,h,y,p
200,x,f,y,f,f,f,c,b,h,e,...,k,k,b,p,w,o,l,h,v,p
201,x,f,n,t,n,f,c,b,p,t,...,s,s,g,w,w,o,p,k,y,d


In [None]:
#testing_df.drop(["veil-color"],axis=1,inplace=True)
#mushrooms_df.drop(["veil-color"],axis=1,inplace=True)

In [None]:
# stalk-root is removed from both frames: it contains "?" placeholders for
# missing values and was judged to have low significance.
# errors="ignore" keeps the cell safe to re-run once the column is gone.
mushrooms_df = mushrooms_df.drop(columns=["stalk-root"], errors="ignore")
testing_df = testing_df.drop(columns=["stalk-root"], errors="ignore")

In [None]:
# "class" is a Python keyword, so the label column is renamed to
# "class_outcome" to avoid errors when referring to it.
mushrooms_df = mushrooms_df.rename(columns={'class': 'class_outcome'})

# Plot the label counts to check class balance before choosing performance metrics.
# The column must be passed as x=...: recent seaborn releases accept only `data`
# positionally, so the old positional form raises a TypeError.
sns.countplot(x='class_outcome', data=mushrooms_df)

In [None]:
# Inspect the remaining column names after the drops and the rename.
mushrooms_df.columns

In [None]:
# Labels: 'p' (poisonous) or 'e' (edible), as a NumPy array.
y = mushrooms_df["class_outcome"].values

# Features: every column except the label. drop() returns a new frame, so
# mushrooms_df is left intact and this cell can be re-run without a KeyError.
x = mushrooms_df.drop(columns=["class_outcome"])

In [None]:
# Stack the training features on top of the submission rows so that
# get_dummies creates the same dummy columns for both sets.
merged_data = pd.concat([x, testing_df])

In [None]:
# Display the stacked frame before encoding.
merged_data

In [None]:
# One-hot encode the features: get_dummies creates one indicator variable
# per category of each feature.
merged_data = pd.get_dummies(merged_data)
print(merged_data.shape)

In [None]:
# merged_data was built as [training rows, submission rows], so the first
# len(y) rows are the training features and the remainder is the submission
# set. Deriving the boundary from y avoids a hard-coded row count that would
# silently mis-split the data if the input files change.
n_train = len(y)
x = merged_data.iloc[:n_train]
testing_data = merged_data.iloc[n_train:]
assert len(testing_data) == len(testing_df), "train/submission split is misaligned"
testing_data

In [None]:
# Raw labels, still the strings 'p' / 'e' at this point; the encoding loop
# below maps 'p' -> 1 (poisonous) and everything else -> 0 (edible).
y

In [None]:
# Encode the labels as integers: 'p' becomes 1, any other value becomes 0.
numerical_values = [1 if label == 'p' else 0 for label in y]

# Every 1 is a poisonous row; the remainder were counted as edible.
p_value = sum(numerical_values)
e_value = len(numerical_values) - p_value

print(f"There are {p_value} poisonous mushrooms in the original dataset.")
print(f"There are {e_value} edible mushrooms in the original dataset.")

In [None]:
# Replace the string labels with their 0/1 encoding.
# NOTE: re-running the encoding cell after this one would turn every label
# into 0, because no value in y equals 'p' any more.
y = numerical_values

In [None]:
from sklearn.model_selection import train_test_split

**Logistic Regression**

In [None]:
# Hold out 20% of the labelled rows for evaluating logistic regression;
# the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test_lr = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with scikit-learn's default settings, then
# predict the held-out rows for the evaluation cells that follow.
log_reg = LogisticRegression().fit(x_train, y_train)
y_pred_log = log_reg.predict(x_test)

In [None]:
# Accuracy of the fitted logistic regression on the held-out rows.
print("Test accuracy: ", log_reg.score(x_test,y_test_lr))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for logistic regression: rows are the true labels,
# columns the predicted labels (0 = edible, 1 = poisonous).
cm = confusion_matrix(y_test_lr,y_pred_log)
print(cm)

**Applying Logistic Regression Model to Submission Data**

In [None]:
# Predict the submission rows with the fitted logistic regression (1 = poisonous, 0 = edible).
log_output = log_reg.predict(testing_data)
print(log_output)

In [None]:
# Tally the logistic-regression predictions: a 1 counts as poisonous,
# anything else as edible.
p_total = ["p" for label in log_output if label == 1]
e_total = ["e" for label in log_output if label != 1]

print(f"Logistic Regression Predicts: {len(p_total)} poisonous mushrooms in the submission dataset.")
print(f"Logistic Regression Predicts: {len(e_total)} edible mushrooms in the submission dataset.")

**K Nearest Neighbors**

In [None]:
# Fresh 80/20 split for KNN; this overwrites x_train / x_test / y_train.
# NOTE(review): random_state=4 differs from the 42 used for the logistic-regression
# and decision-tree splits, so KNN is evaluated on a different hold-out set.
x_train, x_test, y_train, y_test_knn = train_test_split(x, y, test_size=0.2, random_state=4)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Fit a 5-nearest-neighbours classifier and predict the held-out rows.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
y_pred_knn = knn.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score

# Compute the hold-out accuracy once and report it both as a percentage
# and as a raw fraction.
accuracy_fraction = accuracy_score(y_test_knn, y_pred_knn)
accuracy = accuracy_fraction * 100

print('Accuracy of our model is equal ' + str(round(accuracy, 2)) + ' %.')
print('Accuracy:', accuracy_fraction)
# F1 (disabled): print('F1-score:', f1_score(y_test_knn, y_pred_knn))

In [None]:
# Confusion matrix for KNN (rows = true labels, columns = predicted labels).
# This reuses the name `cm`, replacing the logistic-regression matrix.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test_knn,y_pred_knn)
cm

**Applying KNN Model to Submission Data**

In [None]:
# Predict the submission rows with the fitted KNN model (1 = poisonous, 0 = edible).
# Author's recorded result: 85 of the 203 mushrooms predicted poisonous —
# re-check this figure after any re-run.
knn_output = knn.predict(testing_data)
print(knn_output)

In [None]:
# Tally the KNN predictions: a 1 counts as poisonous, anything else as edible.
p_total_knn = ["p" for label in knn_output if label == 1]
e_total_knn = ["e" for label in knn_output if label != 1]

print(f"There are {len(p_total_knn)} poisonous mushrooms.")
print(f"There are {len(e_total_knn)} edible mushrooms.")

**Support Vector Machine**

In [None]:
# Fresh 80/20 split for the SVM. The seed is fixed so the SVM score, confusion
# matrix and ROC curve are reproducible across kernel restarts; 42 matches the
# split used for logistic regression and the decision tree.
x_train, x_test, y_train, y_test_svm = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Default SVC used only as the base estimator of the (commented-out) grid
# search below; the final model is built separately as `svm_model`.
svm = SVC()

**GridSearchCV Parameters**

In [None]:
#parameters = [{'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [1e-3, 1e-4, 1e-5]},
              #{'kernel': ['poly'], 'C': [1, 10, 100], 'degree': (2, 3, 4)}
             #]
#clf = GridSearchCV(svm, parameters, cv=5, scoring="accuracy")
#clf.fit(x_train, y_train)
#print(clf.best_params_)

In [None]:
# Degree-2 polynomial-kernel SVM with C=100.
# Presumably these settings came from the commented-out grid search — TODO confirm.
svm_model = SVC(kernel='poly', C=100, degree=2).fit(x_train, y_train)
svm_model.score(x_test, y_test_svm)

In [None]:
# Hold-out predictions of the SVM, used by the confusion matrix and ROC cells.
y_pred_svm = svm_model.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the SVM (rows = true labels, columns = predicted labels).
confusion_m = confusion_matrix(y_test_svm,y_pred_svm)
confusion_m

**Applying SVM to Submission Data**

In [None]:
# Predict the submission rows with the fitted SVM (1 = poisonous, 0 = edible).
svm_output = svm_model.predict(testing_data)
print(svm_output)

In [None]:
# Tally the SVM predictions: a 1 counts as poisonous, anything else as edible.
p_total_svm = ["p" for label in svm_output if label == 1]
e_total_svm = ["e" for label in svm_output if label != 1]

print(f"There are {len(p_total_svm)} poisonous mushrooms.")
print(f"There are {len(e_total_svm)} edible mushrooms.")

**Decision Tree**

In [None]:
# Fresh 80/20 split for the decision tree; same seed (42) as the
# logistic-regression split. This overwrites x_train / x_test / y_train again.
x_train, x_test, y_train, y_test_dt = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn import tree

In [None]:
# Seed the tree so repeated runs build the same model.
dec_tree = tree.DecisionTreeClassifier(random_state=42)
dec_tree.fit(x_train, y_train)
y_pred_dt = dec_tree.predict(x_test)

# The score used to be computed mid-cell and discarded; print it so the
# hold-out accuracy is actually shown.
print("Test accuracy: ", dec_tree.score(x_test, y_test_dt))

In [None]:
# Confusion matrix for the decision tree (rows = true labels, columns = predicted labels).
confusion_dt = confusion_matrix(y_test_dt,y_pred_dt)
confusion_dt

**Applying Decision Tree Model to Submission Data**

In [None]:
# Predict the submission rows with the fitted decision tree (1 = poisonous, 0 = edible).
dt_output = dec_tree.predict(testing_data)
print(dt_output)

In [None]:
# Tally the decision-tree predictions: a 1 counts as poisonous, anything else as edible.
p_total_dt = ["p" for label in dt_output if label == 1]
e_total_dt = ["e" for label in dt_output if label != 1]

print(f"There are {len(p_total_dt)} poisonous mushrooms.")
print(f"There are {len(e_total_dt)} edible mushrooms.")

**ROC Comparing Models**

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC points for each model, computed from its own hold-out labels and predictions.
# NOTE(review): roc_curve is given hard 0/1 predictions rather than scores or
# probabilities, so each curve has only one interior operating point. Not all
# models share the same hold-out split either, so the curves are only roughly
# comparable.
log_fpr, log_tpr, threshold = roc_curve(y_test_lr, y_pred_log)
knn_fpr, knn_tpr, threshold = roc_curve(y_test_knn, y_pred_knn)
svm_fpr, svm_tpr, threshold = roc_curve(y_test_svm, y_pred_svm)
dt_fpr, dt_tpr, threshold = roc_curve(y_test_dt, y_pred_dt)

# Area under each curve, shown in the legend.
auc_log = auc(log_fpr, log_tpr)
auc_knn = auc(knn_fpr, knn_tpr)
auc_svm = auc(svm_fpr, svm_tpr)
auc_dt = auc(dt_fpr, dt_tpr)

fig, ax = plt.subplots(figsize=(5, 5), dpi=100)
ax.plot(log_fpr, log_tpr, linestyle='-', label='LOG (auc = %0.3f)' % auc_log)
ax.plot(svm_fpr, svm_tpr, linestyle='-', label='SVM (auc = %0.3f)' % auc_svm)
ax.plot(knn_fpr, knn_tpr, marker='.', label='KNN (auc = %0.3f)' % auc_knn)
ax.plot(dt_fpr, dt_tpr, marker='.', label='Decision Tree (auc = %0.3f)' % auc_dt)

# Diagonal reference line: the performance of a random classifier.
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()

plt.show()


In [None]:
# Final submission: logistic-regression predictions for the submission rows.
# Predict on the DataFrame itself rather than a bare ndarray, so scikit-learn
# can check the columns against those seen during fit and does not emit its
# "X does not have valid feature names" warning.
y_pred_final = log_reg.predict(testing_data)

# Single-column frame with a 0..n-1 row index, written out with that index.
submission = pd.DataFrame({"Y_predicted": y_pred_final})
print("Shape:", submission.shape)
print("Unique Values:", np.unique(y_pred_final))
submission.to_csv("Group4_Mushrooms_Log-Reg_Y-predicted.csv", index=True)


In [None]:
# Element-wise comparison: True where logistic regression and the decision
# tree predict the same class for a submission row.
log_output == dt_output

In [None]:
# Element-wise comparison: True where the decision tree and the SVM predict
# the same class for a submission row.
dt_output == svm_output