Download the data and create a well-structured notebook with Markdown headings and notes that describes your process as you:

Load the data
Perform some lightweight exploratory data analysis
Create train/test splits from the data
Fit at least 5 (but hopefully more) different classification model forms to the train split
Score all fitted models on your test split (using at least an F1 score)
Record the best performing regression model in your model selection
Models to try:

kNN
Naive Bayes
SVMs
Logistic Regression
DecisionTree
RandomForrest
GradientBoosting
AdaBoost
Multilayer Perceptron
Voting Ensemble
Stacking Ensemble
Ridge Classifier
LASSO Classifier
Make sure you do at least a little bit of hyper parameter tuning! 

*Prereq download https://archive.ics.uci.edu/static/public/73/mushroom.zip
and rename agaricus-lepiota.data to agaricus-lepiota.data.txt


In [1]:
import os
import csv
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import RidgeClassifier, Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler



In [2]:
#Load the data after renaming mushroom/agaricus-lepiota.data to mushroom/agaricus-lepiota.data.txt export to new df

raw_df = pd.read_csv (r'mushroom/agaricus-lepiota.data.txt')
raw_df.to_csv (r'rawmushroom.csv', index=None)

data = raw_df

In [3]:
#Perform some lightweight exploratory data analysis
data.shape

(8123, 23)

In [4]:
data.head

<bound method NDFrame.head of       p  x  s  n  t p.1  f  c n.1  k  ... s.2  w w.1 p.2 w.2  o p.3 k.1 s.3  u
0     e  x  s  y  t   a  f  c   b  k  ...   s  w   w   p   w  o   p   n   n  g
1     e  b  s  w  t   l  f  c   b  n  ...   s  w   w   p   w  o   p   n   n  m
2     p  x  y  w  t   p  f  c   n  n  ...   s  w   w   p   w  o   p   k   s  u
3     e  x  s  g  f   n  f  w   b  k  ...   s  w   w   p   w  o   e   n   a  g
4     e  x  y  y  t   a  f  c   b  n  ...   s  w   w   p   w  o   p   k   n  g
...  .. .. .. .. ..  .. .. ..  .. ..  ...  .. ..  ..  ..  .. ..  ..  ..  .. ..
8118  e  k  s  n  f   n  a  c   b  y  ...   s  o   o   p   o  o   p   b   c  l
8119  e  x  s  n  f   n  a  c   b  y  ...   s  o   o   p   n  o   p   b   v  l
8120  e  f  s  n  f   n  a  c   b  n  ...   s  o   o   p   o  o   p   b   c  l
8121  p  k  y  n  f   y  f  c   n  b  ...   k  w   w   p   w  o   e   w   v  l
8122  e  x  s  n  f   n  a  c   b  y  ...   s  o   o   p   o  o   p   o   c  l

[8123 rows x 23 colum

In [5]:
data.describe()

Unnamed: 0,p,x,s,n,t,p.1,f,c,n.1,k,...,s.2,w,w.1,p.2,w.2,o,p.3,k.1,s.3,u
count,8123,8123,8123,8123,8123,8123,8123,8123,8123,8123,...,8123,8123,8123,8123,8123,8123,8123,8123,8123,8123
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3655,3244,2283,4748,3528,7913,6811,5612,1728,...,4935,4463,4383,8123,7923,7487,3967,2388,4040,3148


In [6]:
#Display basic information about the dataset

print("\nInfo")
print(data.info())

# Check for missing values
print("\nCheck null sum")
print(data.isnull().sum())

# Distribution of the target variable
print("\nDistribution")
print(data.iloc[:, 0].value_counts())


Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8123 entries, 0 to 8122
Data columns (total 23 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   p       8123 non-null   object
 1   x       8123 non-null   object
 2   s       8123 non-null   object
 3   n       8123 non-null   object
 4   t       8123 non-null   object
 5   p.1     8123 non-null   object
 6   f       8123 non-null   object
 7   c       8123 non-null   object
 8   n.1     8123 non-null   object
 9   k       8123 non-null   object
 10  e       8123 non-null   object
 11  e.1     8123 non-null   object
 12  s.1     8123 non-null   object
 13  s.2     8123 non-null   object
 14  w       8123 non-null   object
 15  w.1     8123 non-null   object
 16  p.2     8123 non-null   object
 17  w.2     8123 non-null   object
 18  o       8123 non-null   object
 19  p.3     8123 non-null   object
 20  k.1     8123 non-null   object
 21  s.3     8123 non-null   object
 22  u       8123 non-n

In [7]:
# Label Encoding
label_encoder = LabelEncoder()
for col in data.columns:
    data[col] = label_encoder.fit_transform(data[col])

# Splitting the data
X = data.drop('p', axis=1)
y = data['p']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Checking the shape of the training and test sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6498, 22), (1625, 22), (6498,), (1625,))

In [8]:
# Initialize models
models = {
    'LogisticRegression': LogisticRegression(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier()
}

# Fit models and calculate F1 score
scores = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    scores[name] = f1_score(y_test, y_pred, average='weighted')

# Print the F1 scores
scores

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'LogisticRegression': 0.9538475555137003,
 'DecisionTree': 1.0,
 'RandomForest': 1.0,
 'SVM': 0.9883002885717984,
 'KNN': 1.0}

In [None]:
"""
Observations:
The Decision Tree, Random Forest, and KNN models achieved a perfect F1 score of 1.0, indicating extremely high performance on this dataset.
Logistic Regression and SVM also performed well but with slightly lower scores. Even with a vetted dataset such as this, I am skeptical of these results.

Additional Considerations:
While perfect scores may indicate excellent model performance, it's also important to consider the possibility of overfitting, especially with models like Decision Trees.
In practice, you might want to perform additional validation, like cross-validation, or inspect other metrics depending on the specific requirements of your task.
This concludes the task with the selected dataset. You now have an overview of the process of loading data, performing EDA, preprocessing, training multiple models, and evaluating them based on the F1 score
"""

In [10]:
# Adjusting hyperparameters for each model

# Updated models with new hyperparameters
updated_models = {
    'LogisticRegression': LogisticRegression(max_iter=1000, C=0.5, solver='liblinear'),
    'DecisionTree': DecisionTreeClassifier(max_depth=5, min_samples_split=4),
    'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=5),
    'SVM': SVC(C=1, kernel='rbf'),
    'KNN': KNeighborsClassifier(n_neighbors=5, weights='uniform')
}

# Fit models and evaluate
updated_model_scores = {}
for name, model in updated_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    updated_model_scores[name] = f1_score(y_test, y_pred, average='weighted')

updated_model_scores

{'LogisticRegression': 0.94891112674963,
 'DecisionTree': 0.9760090127373192,
 'RandomForest': 0.9876828561596037,
 'SVM': 0.9883002885717984,
 'KNN': 1.0}

In [None]:
"""
Observations:
The KNN model maintains a perfect F1 score of 1.0.
Random Forest and SVM show very high F1 scores, indicating strong performance.
Decision Tree and Logistic Regression have slightly lower scores compared to the others, but they still perform well.
Still skpetical of these perfect or near perfect scores. Maybe this dataset lends itself to easy classificaion.
"""

In [11]:
"""
I couldn't get this to work after playing around with it for a while. This is the Error I got > ValueError: Classification metrics can't handle a mix of binary and continuous targets

"""
# Initialize individual models
logistic_model = LogisticRegression(max_iter=10000)
random_forest_model = RandomForestClassifier()

# Initialize Voting and Stacking Ensembles using the individual models
voting_ensemble = VotingClassifier(
    estimators=[
        ('logistic', logistic_model), 
        ('random_forest', random_forest_model)
    ], voting='hard')

stacking_ensemble = StackingClassifier(
    estimators=[
        ('logistic', logistic_model), 
        ('random_forest', random_forest_model)
    ], final_estimator=LogisticRegression())

# Initialize MLP, Ridge, and LASSO models
mlp = MLPClassifier(max_iter=1000)
ridge = RidgeClassifier()
lasso = make_pipeline(StandardScaler(), Lasso())

# Dictionary of all models
all_models = {
    'LogisticRegression': logistic_model,
    'RandomForest': random_forest_model,
    'MLP': mlp,
    'VotingEnsemble': voting_ensemble,
    'StackingEnsemble': stacking_ensemble,
    'RidgeClassifier': ridge,
    'LASSO': lasso
}

# Fit and score models
final_model_scores = {}
for name, model in all_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    final_model_scores[name] = f1_score(y_test, y_pred, average='weighted')

final_model_scores

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [None]:

# Initialize individual models
logistic_model = LogisticRegression(max_iter=10000)
random_forest_model = RandomForestClassifier()

# Initialize Voting and Stacking Ensembles using the individual models
voting_ensemble = VotingClassifier(
    estimators=[
        ('logistic', logistic_model), 
        ('random_forest', random_forest_model)
    ], voting='hard')

stacking_ensemble = StackingClassifier(
    estimators=[
        ('logistic', logistic_model), 
        ('random_forest', random_forest_model)
    ], final_estimator=LogisticRegression())

# Initialize MLP, Ridge, and LASSO models
mlp = MLPClassifier(max_iter=1000)
ridge = RidgeClassifier()
lasso = make_pipeline(StandardScaler(), Lasso())

# Dictionary of all models
all_models = {
    'LogisticRegression': logistic_model,
    'RandomForest': random_forest_model,
    'MLP': mlp,
    'VotingEnsemble': voting_ensemble,
    'StackingEnsemble': stacking_ensemble,
    'RidgeClassifier': ridge,
    'LASSO': lasso
}

# Fit and score models
model_scores = {}
for name, model in all_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    model_scores[name] = f1_score(y_test, y_pred, average='weighted')

model_scores