In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
# Method for label encoding
def label_encoder(column):
    """Encode the values of ``column`` as integer class labels.

    A fresh ``LabelEncoder`` is fitted on ``column`` and applied to it in the
    same call. The object passed in is not modified; the encoded values are
    returned as a new object.

    Parameters
    ----------
    column : array-like
        Values to encode; handed unchanged to ``LabelEncoder.fit_transform``.

    Returns
    -------
    The result of ``LabelEncoder().fit_transform(column)``: one integer code
    per input value.
    """
    # The previous version also built an index -> class dict from
    # ``le.classes_`` but never used or returned it, so it is dropped here.
    return LabelEncoder().fit_transform(column)

In [7]:
# scikit-learn Pipeline that loops through a list of classifiers
# Pipeline uses StandardScaler
# This Pipeline was created for finding the best performing model by iterating through the models and saving the metrics
# Returns a dataframe that contains model names and their accuracy scores

def pipe_report(features, target, models, test_size=0.25, random_state=42):
    """Fit each classifier behind a ``StandardScaler`` and rank by accuracy.

    The data is split once with ``train_test_split``; every model is then
    wrapped in a ``Pipeline`` of ``StandardScaler`` -> model, fitted on the
    training part and scored with ``accuracy_score`` on the test part.

    Parameters
    ----------
    features : array-like
        Feature matrix passed to ``train_test_split``.
    target : array-like
        Target labels passed to ``train_test_split``.
    models : iterable of estimators
        Classifier instances to compare. Each instance is used directly as the
        final pipeline step (no copy is made here). Any iterable works,
        including one-shot generators, because it is traversed only once.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Model'`` (the estimator's class name) and ``'Accuracy'``,
        sorted by ``'Accuracy'`` in descending order.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model_names = []
    score_list = []

    for model in models:
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('model', model),
        ])

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        # Record name and score together so both lists always stay the same
        # length, even when ``models`` can only be iterated once.
        model_names.append(model.__class__.__name__)
        score_list.append(accuracy_score(y_test, y_pred))

    df_pipe = pd.DataFrame({'Model': model_names, 'Accuracy': score_list})
    return df_pipe.sort_values(by='Accuracy', ascending=False)

In [6]:
# scikit-learn Pipeline that loops through a list of classifiers
# Pipeline uses PolynomialFeatures for transformation and StandardScaler for scaling
# This Pipeline was created for finding the best performing model by iterating through the models and saving the metrics
# Returns a dataframe that contains model names and their accuracy scores

def pipe_poly_report(features, target, models, test_size=0.25, random_state=42, degree=2):
    """Rank classifiers by accuracy using scaled polynomial features.

    The data is split once with ``train_test_split``; every model is then
    wrapped in a ``Pipeline`` of ``StandardScaler`` -> ``PolynomialFeatures``
    -> model, fitted on the training part and scored with ``accuracy_score``
    on the test part.

    Parameters
    ----------
    features : array-like
        Feature matrix passed to ``train_test_split``.
    target : array-like
        Target labels passed to ``train_test_split``.
    models : iterable of estimators
        Classifier instances to compare. Each instance is used directly as the
        final pipeline step (no copy is made here). Any iterable works,
        including one-shot generators, because it is traversed only once.
    test_size : float or int, default 0.25
        Forwarded to ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to ``train_test_split``.
    degree : int, default 2
        Forwarded to ``PolynomialFeatures``; the bias column is excluded and
        pure power terms are kept (``interaction_only=False``).

    Returns
    -------
    pandas.DataFrame
        Columns ``'Model'`` (the estimator's class name) and ``'Accuracy'``,
        sorted by ``'Accuracy'`` in descending order.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model_names = []
    score_list = []

    for model in models:
        pipe = Pipeline([
            ('scaler', StandardScaler()),
            ('polynomial transformation',
             PolynomialFeatures(degree=degree, include_bias=False, interaction_only=False)),
            ('model', model),
        ])

        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)

        # Record name and score together so both lists always stay the same
        # length, even when ``models`` can only be iterated once.
        model_names.append(model.__class__.__name__)
        score_list.append(accuracy_score(y_test, y_pred))

    df_pipe_poly = pd.DataFrame({'Model': model_names, 'Accuracy': score_list})
    return df_pipe_poly.sort_values(by='Accuracy', ascending=False)

In [None]:
# Method for creating the final pipeline after choosing the best model
def pipe_poly(X_train, y_train, model, degree=2):
    """Build and fit a scaler -> polynomial features -> model pipeline.

    Parameters
    ----------
    X_train : array-like
        Training features passed to ``Pipeline.fit``.
    y_train : array-like
        Training targets passed to ``Pipeline.fit``.
    model : estimator
        The estimator used as the final pipeline step. The instance is placed
        in the pipeline as-is (no copy is made here).
    degree : int, default 2
        Forwarded to ``PolynomialFeatures``; the bias column is excluded and
        pure power terms are kept (``interaction_only=False``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline after ``fit(X_train, y_train)`` has been called, with
        steps named ``'scaler'``, ``'polynomial transformation'`` and
        ``'model'``.
    """
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('polynomial transformation',
         PolynomialFeatures(degree=degree, include_bias=False, interaction_only=False)),
        ('model', model),
    ])

    pipe.fit(X_train, y_train)
    return pipe