In [1]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pickle
import dill

class PreProcessing(BaseEstimator, TransformerMixin):
    """Custom Pre-Processing estimator for our use-case
    """

    def __init__(self):
        pass

    def transform(self, df):
        label_encoders = {}
        columns = ['has-ring','does-bruise-or-bleed','cap-shape','cap-color','gill-attachment','gill-color','stem-color','ring-type','habitat','season']
        for column in columns:
            label_encoders[column] = LabelEncoder()
            df[column] = label_encoders[column].fit_transform(df[column]) 
        return df
    def fit(self, X, y=None):
        return self  
data = pd.read_csv('Mushroom_data.csv',sep=';')
pred_var = ['cap-diameter','cap-color','cap-shape','does-bruise-or-bleed','gill-attachment','gill-color','stem-height','stem-width','stem-color','has-ring','ring-type','habitat','season','class']

data = data[pred_var]
data = data.dropna()
label_encoder = LabelEncoder()
data['class'] = label_encoder.fit_transform(data['class'])
mask = data['gill-color'] != 'f'
data = data[mask]
mask = data['stem-color'] != 'f'
data = data[mask]

X_train, X_test, y_train, y_test = train_test_split(data.drop('class', axis=1), data['class'], test_size=0.2, random_state=42)

preprocess = PreProcessing()
scaler = StandardScaler()
pipe = make_pipeline(PreProcessing(),
                     StandardScaler(),
                    RandomForestClassifier())
pipe.fit(X_train,y_train)

predictions_ensemble = pipe.predict(X_test)
print(classification_report(y_test, predictions_ensemble))

with open("model.pkl", "wb") as dill_file:
    dill.dump(pipe, dill_file)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4128
           1       1.00      1.00      1.00      4980

    accuracy                           1.00      9108
   macro avg       1.00      1.00      1.00      9108
weighted avg       1.00      1.00      1.00      9108



In [2]:
X_train.head()


Unnamed: 0,cap-diameter,cap-color,cap-shape,does-bruise-or-bleed,gill-attachment,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
23840,3.9,0,6,0,5,9,5.8,6.96,10,0,1,0,2
13146,0.83,6,6,0,1,5,5.44,1.11,11,0,1,1,0
9450,3.84,10,0,0,5,9,4.75,3.1,11,0,1,1,0
27451,17.19,10,2,1,1,4,7.65,27.18,10,0,1,0,0
16328,5.51,5,6,0,4,10,5.64,8.95,10,0,1,0,2
