In [130]:
import pandas as pd
import numpy as np
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [3]:
 data = pd.read_csv("agaricus-lepiota.data")

In [12]:
cols = ["target", "cap-shape", "cap-surface", "cap-color", "bruises", "odor", "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color", "population", "habitat"]

In [36]:
data.columns = cols

In [47]:
le = LabelEncoder()

for label in cols:
    data[label] = le.fit_transform(data[label])

In [52]:
X = np.array(data.drop(["target"], axis=1))
Y = np.array(data["target"])

In [90]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, Y, test_size=0.2, random_state=20000)

## USING LOGISTIC REGRESSION

In [112]:
log_model = LogisticRegression(max_iter=5000)

In [113]:
log_model.fit(x_train, y_train)

In [114]:
log_predictions = log_model.predict(x_test)

In [115]:
log_accuracy = accuracy_score(y_test, log_predictions) * 100
print(f'Accuracy using Logistic Regression is {log_accuracy}%')

Accuracy using Logistic Regression is 94.46153846153847%


## USING K-NEAREST-NEIGHBORS CLASSIFIER

In [116]:
k_model = KNeighborsClassifier(n_neighbors=5)

In [117]:
k_model.fit(x_train, y_train)

In [118]:
k_predictions = k_model.predict(x_test)

In [123]:
k_accuracy = accuracy_score(y_test, k_predictions) * 100
print(f'Accuracy using KNN is {k_accuracy}%')

Accuracy using KNN is 100.0%


## USING DECISION TREES 

In [125]:
d_model = DecisionTreeClassifier()

In [126]:
d_model.fit(x_train, y_train)

In [127]:
d_predictions = d_model.predict(x_test)

In [128]:
d_accuracy = accuracy_score(y_test, d_predictions) * 100
print(f'Accuracy using DECISION TREE is {d_accuracy}%')

Accuracy using DECISION TREE is 100.0%


### USING GRID SEARCH TO SEE IF WE CAN GET A BETTER ACCURACY WITH LOGISTIC REGRESSION

In [131]:
log_params = {
    'C': [0.1, 1.0, 10.0],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

In [132]:
_log_model = LogisticRegression(max_iter=5000)

In [134]:
grid_search = GridSearchCV(_log_model, log_params, cv=5)

In [135]:
grid_search.fit(x_train, y_train)

In [136]:
grid_search.best_params_

{'C': 10.0, 'class_weight': 'balanced', 'penalty': 'l1', 'solver': 'liblinear'}

In [137]:
grid_search.best_estimator_

In [138]:
_log_predictions = grid_search.best_estimator_.predict(x_test)

In [141]:
_accuracy = accuracy_score(y_test, _log_predictions) * 100
print(f'The improved accuracy of the Logistic Regression Model is {_accuracy}%')

The improved accuracy of the Logistic Regression Model is 96.3076923076923%
