Based on the template I uploaded yesterday, please update your current template as follows:

1) include all possible wrangling functions, including t-tests, chi-squared, ANOVA, correlation heatmaps, normality tests (one of them), missing value solutions, transformations, etc.

2) include all classification and regression algorithms

3) FS: feature selection algorithms (I included RFFS, RFE, MI and PCA, but you can add only RFFS; for me, it works well).

4) CV: cross validation approaches (include stratified K-fold definitely)

5) REG: regularization (lasso, ridge regression) methods (include Lasso definitely).

6) Include methods for addressing class imbalance

7) Include any other helper/logistic function you want to add

8) Include function for executing ML (classification) without FS, REG and CV. Output: Accuracy, precision (+ve), recall (+ve), AUC, ROC curve

9) Include function for executing ML (classification) with FS and without REG and CV. Output: Accuracy, precision (+ve), recall (+ve), AUC, ROC curve

10) Include function for executing ML (classification) with REG and without FS and CV. Output: Accuracy, precision (+ve), recall (+ve), AUC, ROC curve

11) Include function for executing ML (classification) with CV and without FS and REG. Output: Accuracy, precision (+ve), recall (+ve), AUC, ROC curve

12) Include function for executing ML (classification) with CV and with FS and with REG. Output: Accuracy, precision (+ve), recall (+ve), AUC, ROC curve

NB: 8-12 above can be merged as a single function as well

Output Required:

1) Completed template 2) Completed thorough wrangling for all three datasets and include interpretations. 3) Filled up Results.Template excel attached. 4) A detailed commentary on the results regarding performance comparison of algorithms and the effect of REG, CV, FS etc. You can also use different methods of CV, REG and FS to add breadth to your assignment and for bonus marks. Record all results in the template.

In [22]:
#!/usr/bin/env python
# coding: utf-8

#import basic modules
import pandas as pd 
import numpy as np
import seaborn as sb
import seaborn as sns
import math
import warnings
import matplotlib.pyplot as plt        
get_ipython().run_line_magic('matplotlib', 'inline')

from sklearn import preprocessing

#import feature selection modules
from sklearn.feature_selection import mutual_info_classif,RFE,RFECV
from sklearn.feature_selection import mutual_info_regression

#import classification modules
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# import regression modules
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import VotingRegressor
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

#import split methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

#import performance scores
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.decomposition import PCA

# import scaling
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import RepeatedStratifiedKFold


from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier

from matplotlib import pyplot

from sklearn.metrics import r2_score
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
import itertools



from matplotlib import pyplot


# example of grid searching key hyperparametres for logistic regression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC   
from sklearn.ensemble import GradientBoostingClassifier







# Suppress every warning for the rest of the session (applies process-wide).
warnings.filterwarnings("ignore")
# Apply seaborn's theme with colour codes enabled and fonts scaled by 1.2.
sb.set(color_codes=True, font_scale=1.2)

# xgboost and lightgbm are optional dependencies:
#   pip install xgboost
#   conda install -c conda-forge lightgbm
# When a package cannot be loaded its name is bound to None, which is what
# get_supported_algorithms() checks before offering the matching algorithm.
# `except Exception` (rather than a bare `except`) keeps the best-effort
# behaviour for any load failure while letting KeyboardInterrupt and
# SystemExit propagate.
try:
    from xgboost import XGBClassifier
except Exception:
    print("Failed to import xgboost, make sure you have xgboost installed")
    print("Use following command to install it: pip install xgboost")
    XGBClassifier = None

try:
    import lightgbm as lgb
except Exception:
    print("Failed to import lightgbm, make sure that you have lightgbm installed")
    print("Use following command to install it: conda install -c conda-forge lightgbm")
    lgb = None

# Load Data
def load_data(file_name):
    """Load a tabular file into a pandas DataFrame, choosing the reader by extension.

    Supported extensions (matched case-insensitively): ``.csv`` and ``.txt``
    (``pd.read_csv``), ``.xls`` and ``.xlsx`` (``pd.read_excel``) and ``.json``
    (``pd.read_json``). Any other or missing extension falls back to
    ``pd.read_csv``.

    Args:
        file_name: Path of the file to read (str or os.PathLike).

    Returns:
        The DataFrame produced by the selected pandas reader.
    """
    import os  # local import so the function does not rely on file-level imports

    readers = {
        ".csv": pd.read_csv,
        ".txt": pd.read_csv,
        ".xls": pd.read_excel,
        ".xlsx": pd.read_excel,
        ".json": pd.read_json,
    }

    # Compare the real extension, lower-cased, so "DATA.XLSX" is not silently
    # parsed as CSV.
    extension = os.path.splitext(os.fspath(file_name))[1].lower()
    reader = readers.get(extension, pd.read_csv)
    return reader(file_name)


#data cleaning function
def cleaningup(df, to_date=[], to_numeric=[], cols_to_delete=[], fill_na_map={}, cols_to_drop_na_rows=[], cols_to_interpolate=[]):
    """
    Perform the generic cleanup steps; dataset-specific handling belongs in the
    driver program.

    Steps, in order:
      1. convert ``to_date`` columns with ``pd.to_datetime(errors='coerce')``
      2. convert ``to_numeric`` columns with ``pd.to_numeric(errors='coerce')``
      3. drop ``cols_to_delete``
      4. drop rows having fewer than ``ceil(0.4 * n_columns)`` non-missing values
      5. drop rows with a missing value in any of ``cols_to_drop_na_rows``
      6. fill missing values according to ``fill_na_map``
      7. linearly interpolate ``cols_to_interpolate`` (forward direction)

    Mandatory Parameter:
    df : Dataframe to be cleaned. Steps 1-2 assign into the frame that was
         passed in; later steps work on new frames.

    Optional Parameters (the default containers are only read, never mutated):
    to_date:  List of columns to convert to date
    to_numeric:  List of columns to convert to numeric
    cols_to_delete: Columns to remove from the dataset
    fill_na_map:  Dictionary mapping a column to the value used for its missing entries,
                    e.g. {'age': df['age'].median(), 'city': 'Karachi'}
    cols_to_drop_na_rows: Columns in which a missing value is not tolerable; such rows are dropped
    cols_to_interpolate: Columns whose missing values are replaced by forward linear interpolation

    Returns:
    The cleaned dataframe. If a step raises an ``Exception`` the failure is
    printed and the dataframe is returned in the state reached so far
    (best-effort behaviour).
    """
    try:
        # Type conversions; unparseable values become NaT / NaN.
        for col in to_date:
            df[col] = pd.to_datetime(df[col], errors='coerce')
        for col in to_numeric:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Drop useless columns before dealing with missing values.
        for col in cols_to_delete:
            df = df.drop(col, axis=1)

        # Keep only rows with at least 40% (rounded up) non-missing cells.
        min_non_missing = math.ceil(len(df.columns) * 0.4)
        df = df.dropna(thresh=min_non_missing)

        # Filter by value rather than by index label so that duplicated index
        # labels cannot remove unrelated rows.
        for col in cols_to_drop_na_rows:
            df = df.dropna(subset=[col])

        # Assign the result back instead of chained in-place fillna, which does
        # not update the frame when pandas copy-on-write is active.
        for col, val in fill_na_map.items():
            df[col] = df[col].fillna(val)

        for col in cols_to_interpolate:
            df[col] = df[col].interpolate(method='linear', limit_direction='forward')
    except Exception as e:
        print("Failed to perform cleanup, exception=%s" % str(e))
    else:
        print("df is all cleaned up..")
    # A plain return (not `finally: return`) so KeyboardInterrupt/SystemExit are
    # not swallowed.
    return df

    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    
    

#basic analysis
def basicanalysis(df):
    """Print the shape, column labels, dtypes and ``describe()`` summary of *df*."""
    # Each section is computed lazily, right before it is printed.
    sections = (
        ("Shape is:\n", lambda: df.shape),
        ("\n Columns are:\n", lambda: df.columns),
        ("\n Types are:\n", lambda: df.dtypes),
        ("\n Statistical Analysis of Numerical Columns:\n", lambda: df.describe()),
    )
    for heading, compute in sections:
        print(heading, compute())

# string (non-numeric, non-datetime) column analysis
def stringcolanalysis(df):
    """Draw a bar chart of value counts for every non-numeric, non-datetime column.

    All charts are placed on one figure (two per row) which is saved to
    ``Categorical.png`` once every chart has been drawn. Nothing is drawn or
    saved when *df* has no such column.

    Args:
        df: DataFrame to analyse.
    """
    stringcols = df.select_dtypes(exclude=[np.number, "datetime64"])
    names = list(stringcols.columns)
    if not names:
        return

    n_grid_cols = 2
    # Keep the original 4x2 layout for up to 8 columns and grow the grid beyond
    # that instead of failing in add_subplot.
    n_grid_rows = max(4, math.ceil(len(names) / n_grid_cols))
    fig = plt.figure(figsize=(8, 10 * n_grid_rows / 4))

    for position, col in enumerate(names, start=1):
        ax = fig.add_subplot(n_grid_rows, n_grid_cols, position)
        df[col].value_counts().plot(kind='bar', color='black', fontsize=10, ax=ax)
        ax.set_title(col)

    fig.tight_layout()
    # Save after the loop so the file contains every chart, including the last.
    fig.savefig('Categorical.png')

#numerical analysis
def numcolanalysis(df):
    """Plot a box plot per numeric column, then histograms of all numeric columns.

    NOTE: every box plot is written to the same file name, so after the call
    ``Numerical.png`` holds only the plot of the last numeric column.
    """
    numeric_part = df.select_dtypes(include=np.number)

    # One 5x5 figure per numeric column.
    for name in numeric_part.columns:
        plt.figure(figsize=(5, 5))
        sb.boxplot(df[name], color='grey', linewidth=1)
        plt.tight_layout()
        plt.title(name)
        plt.savefig("Numerical.png")

    # Histograms (25 bins) for the same columns.
    df.hist(
        column=numeric_part.columns.tolist(),
        bins=25,
        grid=False,
        figsize=(15, 12),
        color='#86bf91',
        zorder=2,
        rwidth=0.9,
    )

# Perform correlation analysis over numerical columns
def correlation_anlysis(df):
    """Draw a heatmap of the pairwise correlations between the numeric columns of *df*.

    NOTE: a non-numeric label column must be encoded before calling this
    function, otherwise it is excluded from the heatmap.
    """
    corr_matrix = df.select_dtypes(include=np.number).corr()

    # Diverging palette centred on 0, with the colour scale fixed to [-1, 1].
    heatmap_axes = sb.heatmap(
        corr_matrix,
        vmin=-1,
        vmax=1,
        center=0,
        cmap=sb.diverging_palette(20, 220, n=200),
        square=True,
    )

    # Tilt the x tick labels so that long column names stay readable.
    tick_labels = heatmap_axes.get_xticklabels()
    heatmap_axes.set_xticklabels(tick_labels, rotation=45, horizontalalignment='right')


# Apply label encoding on specified columns
def apply_label_encoding(df, cols=[]):
    """Replace each column listed in *cols* by its integer label encoding.

    The columns are overwritten in *df* itself, and the same frame is returned.
    """
    encoder = preprocessing.LabelEncoder()
    for column in cols:
        # The shared encoder is re-fitted for every column.
        df[column] = encoder.fit_transform(df[column])
    return df


# One-Hot/dummy encoding of the whole dataframe (pd.get_dummies picks the columns to encode)
def onehotencoding(df):
    """Return ``pd.get_dummies(df)``: *df* with its categorical columns dummy-encoded."""
    return pd.get_dummies(df)

# One Hot encoding with Pandas categorical dtype
def onehotencoding_v2(df, cols=[]):
    """One-hot encode the columns listed in *cols* via the pandas categorical dtype.

    Each listed column is removed and its dummy columns (prefixed with the
    column name) are appended on the right. The input frame is left untouched.

    Args:
        df: Source DataFrame.
        cols: Names of the columns to encode (only read, never mutated).

    Returns:
        A DataFrame with the listed columns replaced by their dummy columns;
        *df* itself when *cols* is empty.
    """
    for col in cols:
        # Convert a copy of the column: assigning the categorical back into
        # `df` would silently change the caller's frame.
        as_category = df[col].astype("category")
        dummies = pd.get_dummies(as_category, prefix=col)
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    return df


#Train Test Split: splitting manually
def traintestsplit(df,split,random=None, label_col=''):
    """Split *df* into features/label and then into train and test partitions.

    Args:
        df: DataFrame containing the features and the label column.
        split: Value forwarded as ``test_size`` to ``train_test_split``.
        random: Value forwarded as ``random_state``.
        label_col: Name of the label column.

    Returns:
        ``(X, trainX, testX, trainY, testY)`` where ``X`` is *df* without the
        label column.
    """
    features = df.drop(label_col, axis=1)
    target = df[label_col].copy()

    partitions = train_test_split(features, target, test_size=split, random_state=random)
    trainX, testX, trainY, testY = partitions
    return features, trainX, testX, trainY, testY

#helper function which only splits into X and y
def XYsplit(df, label_col):
    """Return ``(X, y)``: *df* without *label_col*, and a copy of that column."""
    target = df[label_col].copy()
    features = df.drop(label_col, axis=1)
    return features, target


# #### For Cross Validation, lets create generator functions for different cross validation techniques, this will help us run an iterator over all folds
def cross_valid_kfold(X, y, split=10, random=None, shuffle=False):
    """
    Generator function for KFold cross validation.

    Args:
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Label Series aligned with *X*.
        split: Number of folds.
        random: Seed for shuffling; only used when *shuffle* is True.
        shuffle: Whether to shuffle the rows before building the folds.

    Yields:
        ``(trainX, trainY, testX, testY)`` for every fold.
    """
    # Recent scikit-learn releases reject a random_state combined with
    # shuffle=False (the seed has no effect there), so forward it only when
    # shuffling.
    seed = random if shuffle else None
    kf = KFold(n_splits=split, random_state=seed, shuffle=shuffle)
    for train_index, test_index in kf.split(X):
        trainX, testX = X.iloc[train_index], X.iloc[test_index]
        trainY, testY = y.iloc[train_index], y.iloc[test_index]
        yield trainX, trainY, testX, testY
    
def cross_valid_repeated_kf(X, y, split=10, random=None, repeat=10):
    """
    Generator function for Repeated KFold cross validation.

    Yields ``(trainX, trainY, testX, testY)`` for each of the
    ``split * repeat`` train/test partitions.
    """
    splitter = RepeatedKFold(n_splits=split, random_state=random, n_repeats=repeat)
    for fit_rows, eval_rows in splitter.split(X):
        yield X.iloc[fit_rows], y.iloc[fit_rows], X.iloc[eval_rows], y.iloc[eval_rows]
        

def cross_valid_stratified_kf(X, y, split=10, random=None, shuffle=False):
    """
    Generator function for Stratified KFold cross validation.

    Args:
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Label Series aligned with *X*; also passed to the splitter for
           stratification.
        split: Number of folds.
        random: Seed for shuffling; only used when *shuffle* is True.
        shuffle: Whether to shuffle the rows before building the folds.

    Yields:
        ``(trainX, trainY, testX, testY)`` for every fold.
    """
    # Recent scikit-learn releases reject a random_state combined with
    # shuffle=False (the seed has no effect there), so forward it only when
    # shuffling.
    seed = random if shuffle else None
    skf = StratifiedKFold(n_splits=split, random_state=seed, shuffle=shuffle)
    for train_index, test_index in skf.split(X, y):
        trainX, testX = X.iloc[train_index], X.iloc[test_index]
        trainY, testY = y.iloc[train_index], y.iloc[test_index]
        yield trainX, trainY, testX, testY


def cross_valid_strat_shuffle_kf(X, y, split=10, random=None):
    """
    Generator function for StratifiedShuffle cross validation.

    Yields ``(trainX, trainY, testX, testY)`` for each of the *split*
    stratified random partitions.
    """
    splitter = StratifiedShuffleSplit(n_splits=split, random_state=random)
    for fit_rows, eval_rows in splitter.split(X, y):
        yield X.iloc[fit_rows], y.iloc[fit_rows], X.iloc[eval_rows], y.iloc[eval_rows]


# Validation metrics for classification
def validationmetrics(model, testX, testY, verbose=True):
    """Score a fitted binary classifier on a held-out set.

    Accuracy, precision and recall (both for positive label 1) are reported as
    percentages; F-score and AUC are on a 0-1 scale. The ROC curve / AUC are
    computed from continuous scores (``predict_proba`` or
    ``decision_function``) when the model provides them, because hard 0/1
    predictions collapse the curve to a single operating point.

    Args:
        model: Fitted model exposing ``predict``.
        testX: Held-out features.
        testY: Held-out labels; the metrics assume labels 0 and 1.
        verbose: When True, print the predictions, metrics and confusion matrix.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``auc_val``,
        ``f_score`` and ``model_obj`` (the model that was passed in).
    """
    raw_predictions = model.predict(testX)
    scores = _positive_class_scores(model, testX, raw_predictions)

    if model.__class__.__module__.startswith('lightgbm'):
        # Outputs of lightgbm models are cut at 0.5 to obtain class labels.
        predictions = (np.asarray(raw_predictions) >= 0.5).astype(int)
    else:
        predictions = raw_predictions

    #Accuracy
    accuracy = accuracy_score(testY, predictions)*100

    #Precision
    precision = precision_score(testY, predictions, pos_label=1, labels=[0,1])*100

    #Recall
    recall = recall_score(testY, predictions,pos_label=1,labels=[0,1])*100

    #ROC curve: false positive rate and true positive rate (sensitivity)
    fpr , tpr, _ = roc_curve(testY, scores)

    #AUC
    auc_val = auc(fpr, tpr)

    #F-Score
    f_score = f1_score(testY, predictions)

    if verbose:
        print("Prediction Vector: \n", predictions)
        print("\n Accuracy: \n", accuracy)
        print("\n Precision of event Happening: \n", precision)
        print("\n Recall of event Happening: \n", recall)
        print("\n AUC: \n",auc_val)
        print("\n F-Score:\n", f_score)
        #confusion Matrix
        print("\n Confusion Matrix: \n", confusion_matrix(testY, predictions,labels=[0,1]))

    res_map = {
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "auc_val": auc_val,
                "f_score": f_score,
                "model_obj": model
              }
    return res_map


def _positive_class_scores(model, testX, raw_predictions):
    """Return continuous positive-class scores for ROC analysis.

    Tries ``predict_proba`` and then ``decision_function``; falls back to the
    raw output of ``predict`` when neither yields a usable result.
    """
    for method_name in ("predict_proba", "decision_function"):
        method = getattr(model, method_name, None)
        if not callable(method):
            continue
        try:
            scores = np.asarray(method(testX))
        except Exception:
            # Scoring is best effort: a model that cannot produce scores must
            # not break metric computation, so try the next option.
            continue
        if scores.ndim == 2 and scores.shape[1] == 2:
            # Assumes the second column belongs to label 1 (classes ordered 0, 1).
            return scores[:, 1]
        if scores.ndim == 1:
            return scores
    # np.array copies, so the caller's prediction vector is never aliased.
    return np.array(raw_predictions, dtype=float)


#Validation metrics for Regression algorithms
def validationmetrics_reg(model,testX,testY, verbose=True):
    """Score a fitted regression model on a held-out set.

    Args:
        model: Fitted model exposing ``predict``.
        testX: Held-out features; must be 2-D (``shape[1]`` is the number of
            predictors used for the adjusted R-squared).
        testY: Held-out target values.
        verbose: When True, print the metrics.

    Returns:
        dict with keys ``r2``, ``r2_adjusted``, ``rmse`` and ``model_obj``.
        ``r2_adjusted`` is NaN when the test set has too few rows for the
        adjustment to be defined (``n_rows - n_features - 1 <= 0``).
    """
    predictions = model.predict(testX)

    # R-squared
    r2 = r2_score(testY,predictions)

    # Adjusted R-squared: 1 - (1 - R2) * (n - 1) / (n - p - 1)
    n_rows, n_features = testX.shape[0], testX.shape[1]
    degrees_of_freedom = n_rows - n_features - 1
    if degrees_of_freedom > 0:
        r2_adjusted = 1-(1-r2)*(n_rows-1)/degrees_of_freedom
    else:
        # Avoid ZeroDivisionError / a sign-flipped value on tiny test sets.
        r2_adjusted = float("nan")

    #RMSE
    rmse = math.sqrt(mean_squared_error(testY,predictions))

    if verbose:
        print("R-Squared Value: ", r2)
        print("Adjusted R-Squared: ", r2_adjusted)
        print("RMSE: ", rmse)

    res_map = {
                "r2": r2,
                "r2_adjusted": r2_adjusted,
                "rmse": rmse,
                "model_obj": model
              }
    return res_map


# Classification Algorithms

def LogReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a logistic regression classifier and score it on the test split.

    Args:
        trainX, trainY: Training features and labels.
        testX, testY: Held-out features and labels passed to validationmetrics.
        verbose: Forwarded to validationmetrics.
        clf: Optional estimator to fit instead of ``LogisticRegression()``.

    Returns:
        The result of ``validationmetrics`` for the fitted estimator.
    """
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = LogisticRegression()
    clf.fit(trainX, trainY)
    return validationmetrics(clf, testX, testY, verbose=verbose)

def KNN(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a k-nearest-neighbours classifier and score it on the test split.

    Args:
        trainX, trainY: Training features and labels.
        testX, testY: Held-out features and labels passed to validationmetrics.
        verbose: Forwarded to validationmetrics.
        clf: Optional estimator to fit instead of ``KNeighborsClassifier()``.

    Returns:
        The result of ``validationmetrics`` for the fitted estimator.
    """
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = KNeighborsClassifier()
    clf.fit(trainX, trainY)
    return validationmetrics(clf, testX, testY, verbose=verbose)

def GadientBoosting(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a gradient boosting classifier and score it on the test split.

    Args:
        trainX, trainY: Training features and labels.
        testX, testY: Held-out features and labels passed to validationmetrics.
        verbose: Forwarded to validationmetrics.
        clf: Optional estimator to fit instead of ``GradientBoostingClassifier()``.

    Returns:
        The result of ``validationmetrics`` for the fitted estimator.
    """
    # `not clf` would call the ensemble's __len__, which reads the fitted
    # estimators and fails for an unfitted estimator; compare with None instead.
    if clf is None:
        clf = GradientBoostingClassifier()
    clf.fit(trainX, trainY)
    return validationmetrics(clf, testX, testY, verbose=verbose)

def AdaBoost(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit an AdaBoost classifier and score it on the test split.

    Args:
        trainX, trainY: Training features and labels.
        testX, testY: Held-out features and labels passed to validationmetrics.
        verbose: Forwarded to validationmetrics.
        clf: Optional estimator to fit instead of
            ``AdaBoostClassifier(n_estimators=100, random_state=0)``.

    Returns:
        The result of ``validationmetrics`` for the fitted estimator.
    """
    # `not clf` would call the ensemble's __len__, which reads the fitted
    # estimators and fails for an unfitted estimator; compare with None instead.
    if clf is None:
        clf = AdaBoostClassifier(n_estimators=100, random_state=0)
    clf.fit(trainX, trainY)
    return validationmetrics(clf, testX, testY, verbose=verbose)

def SVM(trainX, testX, trainY, testY, svmtype="SVC", verbose=True, clf=None):
    """Fit a support vector classifier and score it on the test split.

    Args:
        trainX, trainY: Training features and labels.
        testX, testY: Held-out features and labels passed to validationmetrics.
        svmtype: ``"Linear"`` selects ``svm.LinearSVC()``; any other value
            selects ``svm.SVC()``. Ignored when *clf* is given.
        verbose: Forwarded to validationmetrics.
        clf: Optional estimator to fit instead of the default one.

    Returns:
        The result of ``validationmetrics`` for the fitted estimator.
    """
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = svm.LinearSVC() if svmtype == "Linear" else svm.SVC()
    clf.fit(trainX, trainY)
    return validationmetrics(clf, testX, testY, verbose=verbose)

def DecisionTree(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a decision tree classifier and score it on the test split.

    Args:
        trainX, trainY: Training features and labels.
        testX, testY: Held-out features and labels passed to validationmetrics.
        verbose: Forwarded to validationmetrics.
        clf: Optional estimator to fit instead of ``DecisionTreeClassifier()``.

    Returns:
        The result of ``validationmetrics`` for the fitted estimator.
    """
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = DecisionTreeClassifier()
    clf.fit(trainX, trainY)
    return validationmetrics(clf, testX, testY, verbose=verbose)

def RandomForest(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a random forest classifier and score it on the test split.

    Args:
        trainX, trainY: Training features and labels.
        testX, testY: Held-out features and labels passed to validationmetrics.
        verbose: Forwarded to validationmetrics.
        clf: Optional estimator to fit instead of ``RandomForestClassifier()``.

    Returns:
        The result of ``validationmetrics`` for the fitted estimator.
    """
    # `not clf` would call the ensemble's __len__, which reads the fitted
    # estimators and fails for an unfitted estimator; compare with None instead.
    if clf is None:
        clf = RandomForestClassifier()
    clf.fit(trainX, trainY)
    return validationmetrics(clf, testX, testY, verbose=verbose)

def NaiveBayes(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a Gaussian naive Bayes classifier and score it on the test split.

    Args:
        trainX, trainY: Training features and labels.
        testX, testY: Held-out features and labels passed to validationmetrics.
        verbose: Forwarded to validationmetrics.
        clf: Optional estimator to fit instead of ``GaussianNB()``.

    Returns:
        The result of ``validationmetrics`` for the fitted estimator.
    """
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = GaussianNB()
    clf.fit(trainX, trainY)
    return validationmetrics(clf, testX, testY, verbose=verbose)

def MultiLayerPerceptron(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a multi-layer perceptron classifier and score it on the test split.

    Args:
        trainX, trainY: Training features and labels.
        testX, testY: Held-out features and labels passed to validationmetrics.
        verbose: Forwarded to validationmetrics.
        clf: Optional estimator to fit instead of
            ``MLPClassifier(hidden_layer_sizes=5)``.

    Returns:
        The result of ``validationmetrics`` for the fitted estimator.
    """
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = MLPClassifier(hidden_layer_sizes=5)
    clf.fit(trainX, trainY)
    return validationmetrics(clf, testX, testY, verbose=verbose)

def XgBoost(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit an XGBoost classifier and score it on the test split.

    Requires ``XGBClassifier`` to be importable unless *clf* is supplied.

    Args:
        trainX, trainY: Training features and labels.
        testX, testY: Held-out features and labels passed to validationmetrics.
        verbose: Forwarded to validationmetrics.
        clf: Optional estimator to fit instead of
            ``XGBClassifier(random_state=1, learning_rate=0.01)``.

    Returns:
        The result of ``validationmetrics`` for the fitted estimator.
    """
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = XGBClassifier(random_state=1, learning_rate=0.01)
    clf.fit(trainX, trainY)
    return validationmetrics(clf, testX, testY, verbose=verbose)

def LightGbm(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Train a LightGBM binary model for 100 rounds and score it on the test split.

    The *clf* argument is accepted for signature compatibility with the other
    algorithm wrappers but is not used: a new booster is trained on every call.

    Returns:
        The result of ``validationmetrics`` for the trained booster.
    """
    params = {
        'learning_rate': 0.003,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'sub_feature': 0.5,
        'num_leaves': 10,
        'min_data': 50,
        'max_depth': 10,
    }
    training_set = lgb.Dataset(trainX, label=trainY)
    booster = lgb.train(params, training_set, 100)
    return validationmetrics(booster, testX, testY, verbose=verbose)


# Regression Algorithms
def LinearReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a linear regression model and score it on the test split.

    Args:
        trainX, trainY: Training features and target.
        testX, testY: Held-out features and target passed to validationmetrics_reg.
        verbose: Forwarded to validationmetrics_reg.
        clf: Optional estimator to fit instead of ``LinearRegression()``.

    Returns:
        The result of ``validationmetrics_reg`` for the fitted estimator.
    """
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = LinearRegression()
    clf.fit(trainX, trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)

def RandomForestReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a random forest regressor and score it on the test split.

    Args:
        trainX, trainY: Training features and target.
        testX, testY: Held-out features and target passed to validationmetrics_reg.
        verbose: Forwarded to validationmetrics_reg.
        clf: Optional estimator to fit instead of
            ``RandomForestRegressor(n_estimators=100)``.

    Returns:
        The result of ``validationmetrics_reg`` for the fitted estimator.
    """
    # `not clf` would call the ensemble's __len__, which reads the fitted
    # estimators and fails for an unfitted estimator; compare with None instead.
    if clf is None:
        clf = RandomForestRegressor(n_estimators=100)
    clf.fit(trainX, trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)

def PolynomialReg(trainX, testX, trainY, testY, degree=3, verbose=True, clf=None):
    """Fit a regression on polynomial features and score it on the test split.

    Args:
        trainX, trainY: Training features and target.
        testX, testY: Held-out features and target.
        degree: Degree passed to ``PolynomialFeatures``.
        verbose: Forwarded to validationmetrics_reg.
        clf: Optional estimator to fit on the expanded features instead of
            ``LinearRegression()``.

    Returns:
        The result of ``validationmetrics_reg`` for the fitted estimator,
        evaluated on the polynomial expansion of *testX*.
    """
    poly = PolynomialFeatures(degree=degree)
    X_poly = poly.fit_transform(trainX)
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = LinearRegression()
    clf.fit(X_poly, trainY)
    # Reuse the transformer fitted on the training data; test data is only
    # transformed, never used for fitting.
    return validationmetrics_reg(clf, poly.transform(testX), testY, verbose=verbose)

def SupportVectorRegression(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a support vector regressor and score it on the test split.

    Args:
        trainX, trainY: Training features and target.
        testX, testY: Held-out features and target passed to validationmetrics_reg.
        verbose: Forwarded to validationmetrics_reg.
        clf: Optional estimator to fit instead of ``SVR(kernel="rbf")``.

    Returns:
        The result of ``validationmetrics_reg`` for the fitted estimator.
    """
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = SVR(kernel="rbf")
    clf.fit(trainX, trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)

def DecisionTreeReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a decision tree regressor and score it on the test split.

    Args:
        trainX, trainY: Training features and target.
        testX, testY: Held-out features and target passed to validationmetrics_reg.
        verbose: Forwarded to validationmetrics_reg.
        clf: Optional estimator to fit instead of ``DecisionTreeRegressor()``.

    Returns:
        The result of ``validationmetrics_reg`` for the fitted estimator.
    """
    # Explicit None check: object truthiness is not a reliable
    # "was an estimator supplied" test.
    if clf is None:
        clf = DecisionTreeRegressor()
    clf.fit(trainX, trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)

def GradientBoostingReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a gradient boosting regressor and score it on the test split.

    Args:
        trainX, trainY: Training features and target.
        testX, testY: Held-out features and target passed to validationmetrics_reg.
        verbose: Forwarded to validationmetrics_reg.
        clf: Optional estimator to fit instead of ``GradientBoostingRegressor()``.

    Returns:
        The result of ``validationmetrics_reg`` for the fitted estimator.
    """
    # `not clf` would call the ensemble's __len__, which reads the fitted
    # estimators and fails for an unfitted estimator; compare with None instead.
    if clf is None:
        clf = GradientBoostingRegressor()
    clf.fit(trainX, trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)

def AdaBooostReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit an AdaBoost regressor and score it on the test split.

    Args:
        trainX, trainY: Training features and target.
        testX, testY: Held-out features and target passed to validationmetrics_reg.
        verbose: Forwarded to validationmetrics_reg.
        clf: Optional estimator to fit instead of
            ``AdaBoostRegressor(random_state=0, n_estimators=100)``.

    Returns:
        The result of ``validationmetrics_reg`` for the fitted estimator.
    """
    # `not clf` would call the ensemble's __len__, which reads the fitted
    # estimators and fails for an unfitted estimator; compare with None instead.
    if clf is None:
        clf = AdaBoostRegressor(random_state=0, n_estimators=100)
    clf.fit(trainX, trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)

def VotingReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    """Fit a voting regressor and score it on the test split.

    The default ensemble combines a random forest, a decision tree, gradient
    boosting and AdaBoost regressors.

    Args:
        trainX, trainY: Training features and target.
        testX, testY: Held-out features and target passed to validationmetrics_reg.
        verbose: Forwarded to validationmetrics_reg.
        clf: Optional estimator to fit instead of the default ensemble.

    Returns:
        The result of ``validationmetrics_reg`` for the fitted estimator.
    """
    # Explicit None check (object truthiness is not a reliable "was an
    # estimator supplied" test); the members are only built when needed.
    if clf is None:
        clf = VotingRegressor([
            ('rf', RandomForestRegressor(n_estimators=100)),
            ('dt', DecisionTreeRegressor()),
            ('gb', GradientBoostingRegressor()),
            ('ab', AdaBoostRegressor(random_state=0, n_estimators=100)),
        ])
    clf.fit(trainX, trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)


# Helper function to provide list of supported algorithms for Classification
def get_supported_algorithms():
    """Return the list of classification wrappers that can be run.

    XgBoost and LightGbm are included only when their backing package was
    imported successfully (i.e. ``XGBClassifier`` / ``lgb`` are truthy).
    """
    algorithms = [
        LogReg, KNN, GadientBoosting, AdaBoost,
        SVM, DecisionTree, RandomForest, NaiveBayes,
        MultiLayerPerceptron,
    ]
    optional = ((XGBClassifier, XgBoost), (lgb, LightGbm))
    algorithms.extend(wrapper for dependency, wrapper in optional if dependency)
    return algorithms

# Helper function to provide list of supported algorithms for Regression
def get_supported_algorithms_reg():
    """Return a new list with every regression wrapper defined in this module."""
    return [
        LinearReg,
        RandomForestReg,
        PolynomialReg,
        SupportVectorRegression,
        DecisionTreeReg,
        GradientBoostingReg,
        AdaBooostReg,
        VotingReg,
    ]


# Helper function to run all algorithms provided in algo_list over given dataframe, without cross validation
# By default it will run all supported algorithms 
def run_algorithms(df, label_col, algo_list=get_supported_algorithms(), feature_list=[]):
    """
    Run algorithms with a manual 80/20 train/test split (random_state=91).

    Args:
        df: DataFrame holding the features and the label column; it is copied,
            never modified.
        label_col: Name of the label column.
        algo_list: Algorithm wrappers to run; each is called as
            ``algo(trainX, testX, trainY, testY)``.
        feature_list: Optional list of feature names to restrict the data to.
            The list is only read; it is never modified.

    Returns:
        ``(algo_model_map, score_map)``: fitted model and full result
        dictionary per algorithm name.
    """
    # Lets make a copy of dataframe and work on that to be on safe side
    _df = df.copy()

    if feature_list:
        # Build a new column list instead of appending to the caller's list,
        # and keep the label column exactly once.
        selected_columns = [col for col in feature_list if col != label_col]
        selected_columns.append(label_col)
        _df = _df[selected_columns]

    _df, trainX, testX, trainY, testY = traintestsplit(_df, 0.2, 91, label_col=label_col)
    algo_model_map = {}
    score_map = {}
    for algo in algo_list:
        print("============ " + algo.__name__ + " ===========")

        res = algo(trainX, testX, trainY, testY)

        score_map[algo.__name__] = res
        algo_model_map[algo.__name__] = res.get("model_obj", None)
        print ("============================== \n")

    return algo_model_map, score_map
        

# With stratified kfold validation support
def run_algorithms_cv(df, label_col, algo_list=get_supported_algorithms(), feature_list=[], cross_valid_method=cross_valid_stratified_kf):
    """
    Run algorithms with cross validation (10 splits) and average the fold metrics.

    Returns a dict mapping each algorithm name to its mean metrics plus
    ``model_obj``, the model returned by the last fold.
    """
    X, y = XYsplit(df.copy(), label_col)

    # Select features if specified by driver program
    if feature_list:
        X = X[feature_list]

    fold_scores = {}
    last_models = {}
    for algo in algo_list:
        name = algo.__name__
        per_metric = fold_scores[name] = {}
        model = None
        for trainX, trainY, testX, testY in cross_valid_method(X, y, split=10):
            # The model returned by one fold is handed to the next fold.
            outcome = algo(trainX, testX, trainY, testY, verbose=False, clf=model)
            model = outcome.get("model_obj", None)
            for metric, value in outcome.items():
                if metric != "model_obj":
                    per_metric.setdefault(metric, []).append(value)
        last_models[name] = model

    # Average every metric over the folds
    score_map = {}
    for name, metrics in fold_scores.items():
        print("============ " + name + " ===========")
        averaged = {metric: np.mean(values) for metric, values in metrics.items()}
        score_map[name] = averaged
        print(averaged)
        print ("============================== \n")
        # Added after printing so the model object is not part of the output.
        averaged["model_obj"] = last_models[name]

    return score_map


# Helper function to get feature importance metrics via Random Forest Feature Selection (RFFS)
def RFfeatureimportance(df, trainX, testX, trainY, testY, trees=35, random=None, regression=False):
    """Fit a random forest and return its feature importances as percentages.

    The importances are indexed by ``df.columns`` (so *df* must have exactly
    the columns of *trainX*), sorted in descending order, printed and returned.
    *testX* and *testY* are accepted but not used.
    """
    forest_cls = RandomForestRegressor if regression else RandomForestClassifier
    forest = forest_cls(n_estimators=trees, random_state=random)
    forest.fit(trainX, trainY)

    importances = pd.Series(forest.feature_importances_, index=df.columns.values)
    res = importances.sort_values(ascending=False) * 100
    print(res)
    return res


# Helper function to select important features via RFFS, run supported ML algorithms over the dataset with a manual split and measure accuracy without cross validation - selects features with importance > threshold
def MachineLearningwithRFFS(df, label_col, threshold=5, algo_list=get_supported_algorithms(), regression=False):
    """Select features by random-forest importance, then run the algorithms
    with a manual train/test split (no cross validation).

    Args:
        df: DataFrame holding the features and the label column.
        label_col: Name of the label column.
        threshold: Features whose importance (in percent) is strictly greater
            than this value are kept.
        algo_list: Algorithm wrappers forwarded to run_algorithms.
        regression: Use a random forest regressor for the importance ranking.

    Returns:
        ``({"selected_features": [...]}, score_map)`` where ``score_map`` is
        the second element returned by run_algorithms.
    """
    # lets create a copy of this dataframe and perform feature selection analysis over that
    df_cpy = df.copy()
    df_cpy, trainX, testX, trainY, testY = traintestsplit(df_cpy, 0.2, 91, label_col=label_col)
    res = RFfeatureimportance(df_cpy, trainX, testX, trainY, testY, trees=10, regression=regression)

    impftrs = list(res[res > threshold].keys())

    print ("Selected Features =" + str(impftrs))
    print(df.shape)
    # Hand over a copy so the reported feature list cannot be altered by the
    # callee (e.g. by having the label column appended to it).
    results, score_map = run_algorithms(df, label_col, algo_list=algo_list, feature_list=list(impftrs))
    return {"selected_features": impftrs}, score_map


# Helper function to select important features via RFFS, run supported ML algorithms over the dataset with cross validation and measure accuracy - selects features with importance > threshold
def MachineLearningwithRFFS_CV(df, label_col, threshold=5, algo_list=get_supported_algorithms(), regression=False):
    """Select features by random-forest importance, then run the algorithms
    with cross validation.

    Features whose importance (in percent) exceeds *threshold* are kept.
    Plain k-fold is used for regression, stratified k-fold otherwise.

    Returns:
        ``{"selected_features": [...], "results": <run_algorithms_cv output>}``
    """
    # The importance ranking works on a copy of the dataframe.
    features, trainX, testX, trainY, testY = traintestsplit(df.copy(), 0.2, 91, label_col=label_col)
    importance = RFfeatureimportance(features, trainX, testX, trainY, testY,
                                     trees=10, regression=regression)

    impftrs = list(importance[importance > threshold].keys())

    print ("Selected Features =" + str(impftrs))
    print(df.shape)

    cross_valid_method = cross_valid_kfold if regression else cross_valid_stratified_kf
    results = run_algorithms_cv(df, label_col, algo_list=algo_list, feature_list=impftrs, cross_valid_method=cross_valid_method)
    return {"selected_features": impftrs, "results": results}
    


# ## Mutual Information Feature Selection (MIFS)


# 
# MachineLearningwithMIFS() => Helper function to select important features and run supported ML algorithms over dataset

# In[39]:


# mutualinformation()  => Helper function to get feature importance metrics via mutual information (classification or regression).
def mutualinformation(df, label_col, regression=False):
    """Return the mutual information between every feature and the label, times 100.

    The scores are computed with ``random_state=35``, indexed by feature name,
    sorted in descending order, printed and returned.
    """
    working = df.copy()
    X = working.drop(label_col, axis=1)
    y = working[label_col].copy()

    scorer = mutual_info_regression if regression else mutual_info_classif
    mutual_info = scorer(X, y, random_state=35)

    results = pd.Series(mutual_info, index=X.columns).sort_values(ascending=False) * 100
    print(results)
    return results



# Helper function to select important features via MIFS, run supported ML algorithms over the dataset with a manual split and measure accuracy, without CV ... selects features with importance > threshold
def MachineLearningwithMIFS(df, label_col, threshold=5, algo_list=get_supported_algorithms(), regression=False):
    """Select features by mutual information, then run the algorithms with a
    manual train/test split (no cross validation).

    Args:
        df: DataFrame holding the features and the label column.
        label_col: Name of the label column.
        threshold: Features whose score (mutual information x 100) is strictly
            greater than this value are kept.
        algo_list: Algorithm wrappers forwarded to run_algorithms.
        regression: Use the regression variant of mutual information.

    Returns:
        ``{"selected_features": [...], "results": <run_algorithms output>}``
    """
    # lets create a copy of this dataframe and perform feature selection analysis over that
    df_cpy = df.copy()
    res = mutualinformation(df_cpy, label_col=label_col, regression=regression)

    #include all selected features in impftrs
    impftrs = list(res[res > threshold].keys())

    print ("Selected Features =" + str(impftrs))

    # Hand over a copy so the reported feature list cannot be altered by the
    # callee (e.g. by having the label column appended to it).
    results = run_algorithms(df, label_col, algo_list=algo_list, feature_list=list(impftrs))
    return {"selected_features": impftrs, "results": results}


# Helper function to select important features via MIFS, run supported ML algorithms over the dataset with cross validation and measure accuracy ... selects features with importance > threshold
def MachineLearningwithMIFS_CV(df, label_col, threshold=5, algo_list=get_supported_algorithms(), regression=False):
    """Select features by mutual information, then run the algorithms with
    cross validation.

    Features whose score (mutual information x 100) exceeds *threshold* are
    kept. Plain k-fold is used for regression, stratified k-fold otherwise.

    Returns:
        ``{"selected_features": [...], "results": <run_algorithms_cv output>}``
    """
    # The feature ranking works on a copy of the dataframe.
    scores = mutualinformation(df.copy(), label_col=label_col, regression=regression)

    impftrs = list(scores[scores > threshold].keys())

    print ("Selected Features =" + str(impftrs))

    cross_valid_method = cross_valid_kfold if regression else cross_valid_stratified_kf
    results = run_algorithms_cv(df, label_col, algo_list=algo_list, feature_list=impftrs, cross_valid_method=cross_valid_method)
    return {"selected_features": impftrs, "results": results}


# Helper function to select features via Recursive Feature Elimination (RFE), then run supported ML algorithms over the dataset with a manual split and measure accuracy, without CV
# Flexible enough to use any estimator for the recursive feature elimination and any algorithm to run on the selected features
def GenericREFS(df, label_col,
                algo_list=get_supported_algorithms(),
                re_algo=RandomForestClassifier,
                **kwargs):
    """Select features with recursive feature elimination (RFE), then run the
    algorithms with a manual train/test split (no cross validation).

    Args:
        df: DataFrame holding the features and the label column.
        label_col: Name of the label column.
        algo_list: Algorithm wrappers forwarded to run_algorithms.
        re_algo: Estimator class used inside RFE; instantiated with ``**kwargs``.
        **kwargs: Constructor arguments for *re_algo*.

    Returns:
        ``{"selected_features": [...], "results": <run_algorithms output>}``
    """
    X, y = XYsplit(df, label_col)
    clf = re_algo(**kwargs)
    selector = RFE(estimator=clf, step=1)
    selector = selector.fit(X, y)
    feature_list = X.columns[selector.support_].tolist()

    # Hand over a copy so the reported feature list cannot be altered by the
    # callee (e.g. by having the label column appended to it).
    results = run_algorithms(df, label_col, algo_list=algo_list, feature_list=list(feature_list))
    return {"selected_features": feature_list, "results": results}


# Helper function to select features via cross-validated Recursive Feature Elimination (RFECV), then run supported ML algorithms over the dataset with cross validation and measure accuracy
# Flexible enough to use any estimator for the recursive feature elimination and any algorithm to run on the selected features
def GenericREFS_CV(df, label_col,
                algo_list=get_supported_algorithms(),
                regression=False,
                re_algo=RandomForestClassifier,
                **kwargs):
    """Select features with RFECV, then evaluate the algorithms with CV.

    Args:
        df: input frame containing features and the label column.
        label_col: name of the label column.
        algo_list: algorithm runners forwarded to ``run_algorithms_cv``.
        regression: when true ``cross_valid_kfold`` is used for the final
            evaluation, otherwise ``cross_valid_stratified_kf``.
        re_algo: estimator class used inside RFECV; instantiated as
            ``re_algo(**kwargs)``.
        **kwargs: constructor arguments for ``re_algo``.

    Returns:
        dict with keys ``"selected_features"`` and ``"results"``.
    """
    features, target = XYsplit(df, label_col)

    # 10-fold cross-validated elimination, removing one feature per step.
    eliminator = RFECV(estimator=re_algo(**kwargs), step=1, cv=10).fit(features, target)
    feature_list = features.columns[eliminator.support_].tolist()

    cross_valid_method = cross_valid_kfold if regression else cross_valid_stratified_kf
    results = run_algorithms_cv(
        df,
        label_col,
        algo_list=algo_list,
        feature_list=feature_list,
        cross_valid_method=cross_valid_method,
    )
    return {"selected_features": feature_list, "results": results}

# Helper function to provide list of classification algorithms to be used for recursive elimination feature selection
def get_supported_algorithms_refs():
    """Return the classifier classes offered for recursive feature elimination.

    A new list is built on every call, so callers may modify it freely.
    """
    return [
        LogisticRegression,
        GradientBoostingClassifier,
        AdaBoostClassifier,
        DecisionTreeClassifier,
        RandomForestClassifier,
    ]

# Helper function to provide list of regression algorithms to be used for recursive elimination feature selection
def get_supported_reg_algorithms_refs():
    """Return the regressor classes offered for recursive feature elimination.

    A new list is built on every call, so callers may modify it freely.
    """
    return [
        LinearRegression,
        RandomForestRegressor,
        DecisionTreeRegressor,
        GradientBoostingRegressor,
        AdaBoostRegressor,
    ]


# Helper function to perform feature reduction using PCA. It runs the supported algorithms over the specified number of components and measures performance stats, without Cross Validation
from sklearn.decomposition import PCA
def PCA_FS(df, label_col, n_components, algo_list=get_supported_algorithms()):
    """Standardise, project onto PCA components and run each algorithm once.

    The data is split 80/20 (``random_state=0``). The scaler and the PCA are
    fitted on the training part only and then applied to the test part.

    Args:
        df: input frame containing features and the label column.
        label_col: name of the label column.
        n_components: forwarded to ``PCA(n_components=...)``.
        algo_list: callables invoked as
            ``algo(X_train, X_test, y_train, y_test)``.

    Returns:
        dict with ``"n_components"`` and ``"results"``; the latter maps each
        algorithm's ``__name__`` to whatever that algorithm returned.
    """
    X, y = XYsplit(df.copy(), label_col)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # PCA is scale sensitive, so the features are standardised first.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    reducer = PCA(n_components=n_components)
    X_train = reducer.fit_transform(X_train)
    X_test = reducer.transform(X_test)

    score_map = {}
    for algo in algo_list:
        name = algo.__name__
        print("============ " + name + " ===========")
        score_map[name] = algo(X_train, X_test, y_train, y_test)
        print("============================== \n")
    return {"n_components": n_components, "results": score_map}


# Helper function to perform feature reduction using PCA. It runs the supported algorithms over the specified number of components and measures performance stats, with Cross Validation

def PCA_FS_CV(df, label_col, n_components, algo_list=get_supported_algorithms(), regression=False):
    """Run each algorithm on PCA-projected data over 10 CV folds and average.

    Inside every fold the scaler and the PCA are fitted on the training part
    only. The model object returned by one fold is passed back to the
    algorithm (``clf=``) on the next fold.

    Args:
        df: input frame containing features and the label column.
        label_col: name of the label column.
        n_components: forwarded to ``PCA(n_components=...)``.
        algo_list: callables invoked as
            ``algo(X_train, X_test, y_train, y_test, verbose=False, clf=clf)``
            and returning a dict of metrics plus an optional ``"model_obj"``.
        regression: when true ``cross_valid_kfold`` is used, otherwise
            ``cross_valid_stratified_kf``.

    Returns:
        dict with ``"n_components"`` and ``"results"``; the latter maps each
        algorithm name to its fold-averaged metrics plus ``"model_obj"``
        (the model from the last fold).
    """
    X, y = XYsplit(df.copy(), label_col)

    if regression:
        cross_valid_method = cross_valid_kfold
    else:
        cross_valid_method = cross_valid_stratified_kf

    fold_scores = {}
    algo_model_map = {}
    for algo in algo_list:
        algo_name = algo.__name__
        per_metric = fold_scores[algo_name] = dict()
        clf = None
        for X_train, y_train, X_test, y_test in cross_valid_method(X, y, split=10):
            # Standardise using statistics of this fold's training part.
            sc = StandardScaler()
            X_train = sc.fit_transform(X_train)
            X_test = sc.transform(X_test)

            # Project both parts onto the components learnt from training data.
            pca = PCA(n_components=n_components)
            X_train = pca.fit_transform(X_train)
            X_test = pca.transform(X_test)

            res_algo = algo(X_train, X_test, y_train, y_test, verbose=False, clf=clf)
            # Carry the model over to the next fold.
            clf = res_algo.get("model_obj", None)

            for metric, value in res_algo.items():
                if metric != "model_obj":
                    per_metric.setdefault(metric, []).append(value)

        algo_model_map[algo_name] = clf

    score_map = dict()
    # Average every metric across the folds.
    for algo_name, metrics in fold_scores.items():
        print("============ " + algo_name + " ===========")
        averaged = {metric: np.mean(values) for metric, values in metrics.items()}
        score_map[algo_name] = averaged
        print(averaged)
        print ("============================== \n")
        averaged["model_obj"] = algo_model_map[algo_name]
    return {"n_components": n_components, "results": score_map}

In [19]:
def num_cat(df,type_cols):
    """Return the numeric or the non-numeric column names of ``df``.

    Args:
        df: frame to inspect.
        type_cols: ``'Numeric'`` returns the int/float columns in frame
            order; any other value returns the remaining columns (built from
            a set difference, so their order is not guaranteed).
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_cols = df.select_dtypes(include=numeric_dtypes).columns.tolist()
    if type_cols == 'Numeric':
        return numeric_cols
    return list(set(df.columns.tolist()) - set(numeric_cols))
        
        
def num_plots (df,col_name):
    """Draw a box plot, histogram (with KDE) and Q-Q plot for one column.

    The three plots share one 18x5 figure; the column's ``describe()``
    summary is printed afterwards.
    """
    fig, (box_ax, hist_ax, qq_ax) = plt.subplots(1, 3, figsize=(18,5))
    fig.suptitle('Numerical Analysis'+" "+col_name)

    sns.boxplot(data=df, x=col_name, ax=box_ax)
    sns.histplot(data=df, x=col_name, kde=True, ax=hist_ax)
    sm.qqplot(data=df[col_name], line ='45', ax=qq_ax)

    print(df[col_name].describe())
    
    #print(normality_test(df,col_name,'shapiro'))
def target_num(df,col1,col2):
    """Bar-plot mean/std/min/max of ``col2`` for rows where ``col1`` is 1 vs 0.

    Args:
        df: source frame.
        col1: binary column (compared against 1 and 0) used to split rows.
        col2: numeric column whose statistics are compared.
    """
    summary = pd.DataFrame()
    for suffix, flag in (('_yes', 1), ('_no', 0)):
        subset = df[df[col1] == flag][[col1,col2]]
        summary[col2+suffix] = subset.describe()[col2]

    # Count and quartiles are dropped so only mean, std, min and max are plotted.
    trimmed = summary.drop(['count', '25%', '50%', '75%'])
    trimmed.plot.bar(title = col2+' and ' +col1+ ' statistics')

def cat_tar(df,col1,col2):
    """Bar-plot the value counts of ``col2`` for rows where ``col1`` is 1 vs 0.

    Args:
        df: source frame.
        col1: binary column (compared against 1 and 0) used to split rows.
        col2: categorical column whose categories are counted.
    """
    counts = pd.DataFrame()
    for label, flag in (('yes', 1), ('no', 0)):
        counts[label] = df[df[col1] == flag][col2].value_counts()
    counts.plot.bar(title = col2+' and ' +col1)
            
def scatter_plot(df,col1,col2):
    """Draw a seaborn scatter plot of ``col2`` (y axis) against ``col1`` (x axis)."""
    sns.scatterplot(x=col1, y=col2, data=df)

#### Outlier Removal

In [None]:
from scipy import stats
def rem_outliers(df,col):
    """Replace outliers of ``df[col]`` with the column mean.

    A value counts as an outlier when the absolute value of its z-score is
    greater than 3. The previous condition (``zscore <= 3``) was inverted: it
    overwrote the ordinary values and kept the outliers.

    Args:
        df: frame to clean; the column is modified in place.
        col: name of a numeric column.

    Returns:
        The same ``df`` object, with outliers in ``col`` set to the mean that
        the column had before replacement.
    """
    values = df[col]
    mean = values.mean()
    # nan_policy='omit' keeps missing values from turning every z-score into
    # NaN; the missing entries themselves compare False and stay untouched.
    z_scores = np.abs(stats.zscore(values.to_numpy(), nan_policy='omit'))
    df[col] = np.where(z_scores > 3, mean, values)
    return df

In [15]:
def plot(df,cat_column):
    """Show a bar chart with the frequency of each value in ``cat_column``."""
    canvas = plt.figure()
    counts = df[cat_column].value_counts()
    positions = np.arange(0, len(counts))

    # Axes spanning the whole figure; bars are labelled with the category values.
    axes = canvas.add_axes([0,0,1,1])
    axes.bar(positions, counts.values, tick_label = counts.index)
    axes.set_title(cat_column)
    plt.show()

#### Missing Value Analysis

In [None]:
def missingValues(data):
    """Summarise the missing values of every column that has at least one.

    Returns:
        DataFrame indexed by column name with ``'Number of NA'`` and
        ``'Percent NA'`` (percentage of rows, rounded to 2 decimals), sorted
        by descending count. Columns without missing values are left out.
    """
    null_sum = data.isnull().sum()
    n_rows = len(data.index)

    total = null_sum.sort_values(ascending=False)
    percent = ((null_sum / n_rows) * 100).round(2).sort_values(ascending=False)

    df_NA = pd.concat([total, percent], axis=1, keys=['Number of NA', 'Percent NA'])

    # Keep only the rows (i.e. source columns) where something is non-zero.
    has_missing = (df_NA != 0).any(axis=1)
    return df_NA[has_missing]

def mano(df,cols,all_cols=False):
    """Draw the missingno matrix, heatmap and bar chart for selected columns.

    Args:
        df: frame to visualise.
        cols: a single column label or a list of labels. A list used to fail
            because it was wrapped in a second list before indexing.
        all_cols: when it equals False (the default) only ``cols`` is
            plotted; otherwise the whole frame is plotted and ``cols`` is
            ignored.
    """
    if all_cols == False:  # noqa: E712 - keep the original equality semantics
        selected = cols if isinstance(cols, list) else [cols]
        df = df[selected]
    msno.matrix(df)
    msno.heatmap(df)
    msno.bar(df)

def cater_missing(df,column,method,value=0):
    """Handle the missing values of one column.

    Args:
        df: frame to process.
        column: name of the column to treat.
        method: one of
            ``'drop row'``    - drop the rows where ``column`` is missing;
            ``'drop column'`` - drop ``column`` itself (this used to drop
                                rows, exactly like ``'drop row'``);
            ``'mean'``        - fill with the column mean;
            ``'mode'``        - fill with the most frequent value;
            ``'value'``       - fill with ``value``.
            Any other string leaves the frame unchanged.
        value: constant used by ``method='value'``.

    Returns:
        The processed frame. The two drop methods return a new frame; the
        fill methods assign the filled column back on ``df`` and return it.
        Always use the return value.
    """
    if method == 'drop row':
        df = df[df[column].notna()]
    elif method == 'drop column':
        df = df.drop(columns=[column])
    elif method == 'mean':
        # Assign the result back: chained fillna(inplace=True) does not
        # reliably update the parent frame on current pandas.
        df[column] = df[column].fillna(df[column].mean())
    elif method == 'mode':
        modes = df[column].mode()
        # An all-missing column has no mode; leave it untouched.
        if not modes.empty:
            df[column] = df[column].fillna(modes[0])
    elif method == 'value':
        df[column] = df[column].fillna(value)
    return df

In [13]:
def dis_rep (df,col1,value,strip=False):
    """Replace values of ``df[col1]`` in place according to a mapping.

    Args:
        df: frame to modify; nothing is returned.
        col1: column to edit.
        value: mapping ``{old: new}``; replacements are applied one after
            another in the mapping's iteration order.
        strip: when it equals True, cells that are exactly ``' '`` are
            replaced by the empty string afterwards.
    """
    for old, new in value.items():
        df[col1] = df[col1].replace(old, new)
    if strip == True:  # noqa: E712 - keep the original equality semantics
        df[col1] = df[col1].replace(' ','')
def col_strip(df):
    """Return a copy of ``df`` whose column names have surrounding whitespace removed."""
    cleaned_names = {name: name.strip() for name in df.columns}
    return df.rename(columns=cleaned_names)
    

#### T - Test 

In [None]:
def ttest(a,b,alpha=0.05):
    """Run an independent two-sample t-test and print the decision.

    The decision used to compare the t statistic with the p-value
    (``t2 > p2``), which is meaningless. The null hypothesis of equal means
    is now rejected when the p-value is below ``alpha``.

    Args:
        a: first sample (any array-like).
        b: second sample (any array-like).
        alpha: significance level used for the decision.
    """
    a = np.asarray(a)
    b = np.asarray(b)

    t2, p2 = stats.ttest_ind(a, b)
    print("t = " + str(t2))
    print("p = " + str(p2))
    if p2 < alpha:
        print("We reject the null hypothesis and we can say that the means of both groups are different")
    else:
        print("We accept the null hypothesis and we can say that the means of both groups is significantly same")


#### ANOVA

In [None]:
import scipy.stats as stats

def ANOVA(data,col1,col2,alpha=0.05):
    """One-way ANOVA of ``col2`` across the groups defined by ``col1``.

    Prints the ``f_oneway`` result and a decision based on the critical F
    value. The denominator degrees of freedom are ``N - k`` (observations
    minus groups); they used to be derived from the length of the group
    *labels*, which gave a wrong or NaN critical value. Group samples are
    also no longer written into ``globals()``.

    Args:
        data: source frame.
        col1: grouping (categorical) column.
        col2: numeric column compared across the groups.
        alpha: significance level used for the critical value.
    """
    samples = [group.values for _, group in data.groupby(col1)[col2]]

    result = stats.f_oneway(*samples)
    print(result)
    stat_val = result[0]

    n_groups = len(samples)
    n_obs = sum(len(sample) for sample in samples)
    crit_val = stats.f.ppf(q=1-alpha, dfn=n_groups-1, dfd=n_obs-n_groups)
    if stat_val >= crit_val :
        print('Reject null hypothesies and conclude that atleast one group is different and the feature is releavant to the class.')
    else:
         print('Accept null hypothesies and conclude that atleast one group is same and the feature is not releavant to the class.')

#### Chi squared 

In [None]:
def chi_squared(data,f1,f2,alpha=0.05):
    """Chi-squared test of independence between two categorical columns.

    Prints the test quantities followed by three decisions: statistic vs.
    critical value, p-value vs. ``alpha``, and absolute statistic vs.
    critical value.

    Args:
        data: source frame.
        f1: first categorical column.
        f2: second categorical column.
        alpha: significance level.
    """
    contingency = pd.crosstab(data[f1], data[f2])
    chi2_statistic, p_value, dof = stats.chi2_contingency(contingency)[:3]
    critical_value = stats.chi2.ppf(q=1-alpha, df=dof)

    print('Significance level: ',alpha)
    print('Degree of Freedom: ',dof)
    print('chi-square statistic:',chi2_statistic)
    print('critical_value:',critical_value)
    print('p-value:',p_value)

    reject_msg = "Reject H0,There is a relationship between 2 categorical variables"
    retain_msg = "Retain H0,There is no relationship between 2 categorical variables"

    # Decision from the test statistic.
    print(reject_msg if chi2_statistic >= critical_value else retain_msg)
    # Decision from the p-value.
    print(reject_msg if p_value <= alpha else retain_msg)
    # Decision from the absolute statistic, phrased as dependence.
    if abs(chi2_statistic) >= critical_value:
        print('Dependent (reject H0)')
    else:
        print('Independent (fail to reject H0)')


#### Correlation

In [None]:
def corr_heat (data):
    """Draw an annotated Pearson-correlation heatmap of the numeric columns.

    Columns that contain at least one string value are excluded; the
    remaining columns are cast to float before the correlation is computed.

    ``DataFrame.iteritems`` was removed in pandas 2.0, so ``items`` is used.
    String columns were previously detected by looking only at the value
    with index label 1, which failed on frames without that label.
    """
    # Columns holding strings cannot be cast to float, so they are set aside.
    str_list = [
        colname
        for colname, colvalue in data.items()
        # Plain numeric dtypes cannot hold strings; skip scanning them.
        if colvalue.dtype.kind not in 'biufc'
        and any(isinstance(cell, str) for cell in colvalue)
    ]
    # Get to the numeric columns by inversion
    num_list = data.columns.difference(str_list)
    # Create Dataframe containing only numerical features
    data_num = data[num_list]
    f, ax = plt.subplots(figsize=(16, 12))
    plt.title('Pearson Correlation of features')
    # Draw the heatmap using seaborn
    sns.heatmap(data_num.astype(float).corr(),linewidths=0.25,vmax=1.0, square=True, cmap="cubehelix", linecolor='k', annot=True)

#### Cleaning Functions

In [None]:
from numpy import array
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
def encoder (df,col_name,method):
    """Encode one categorical column.

    Args:
        df: frame to encode. It is modified in place: the one-hot path drops
            ``col_name`` from it, the other paths overwrite the column.
        col_name: column to encode.
        method: ``'onehot'`` (dummy columns prefixed with ``col_name``; the
            new shape is printed), ``'label_encoder'`` (sklearn
            ``LabelEncoder``) or ``'ordinal'`` (sklearn ``OrdinalEncoder``).
            Any other value leaves the frame unchanged.

    Returns:
        The encoded frame. For ``'onehot'`` this is a new object, so always
        use the return value.
    """
    if method=='onehot':
        df[col_name] = pd.Categorical(df[col_name])
        dfDummies = pd.get_dummies(df[col_name], prefix = col_name)
        df.drop([col_name],axis=1,inplace=True)
        df = pd.concat([df, dfDummies], axis=1)
        print(df.shape)
    elif method == 'label_encoder':
        label_encoder = LabelEncoder()
        df[col_name] = label_encoder.fit_transform(df[col_name])
    elif method == 'ordinal':
        ordinal_encoder = OrdinalEncoder()
        # OrdinalEncoder works on 2-D input and returns an (n, 1) array, so a
        # one-column frame goes in and the result is flattened. The previous
        # code also referenced a misspelled name and raised NameError.
        df[col_name] = ordinal_encoder.fit_transform(df[[col_name]]).ravel()
    return df

def convert_target_boolean(data,target,tar_var_neg):
    """Recode ``data[target]`` in place: ``tar_var_neg`` becomes 0, everything else 1.

    Returns the same ``data`` object.
    """
    def _to_flag(label):
        if label == tar_var_neg:
            return 0
        return 1

    data[target] = data[target].apply(_to_flag)
    return data
def copy (df):
    """Return ``df.copy()`` so the caller can work on an independent object."""
    return df.copy()
def change_type(df,colname,type_col):
    """Return a copy of ``df`` with column ``colname`` cast to ``type_col``.

    Only the named column is converted; the input frame is not modified.
    """
    return df.astype({colname: type_col})
def plot(df,cat_column):
    """Show a bar chart with the frequency of each value in ``cat_column``.

    NOTE(review): a function with the same name and body is defined earlier
    in this file; this later definition is the one that stays bound.
    """
    canvas = plt.figure()
    counts = df[cat_column].value_counts()
    positions = np.arange(0, len(counts))

    # Axes spanning the whole figure; bars are labelled with the category values.
    axes = canvas.add_axes([0,0,1,1])
    axes.bar(positions, counts.values, tick_label = counts.index)
    axes.set_title(cat_column)
    plt.show()
def rem_rows(df,dict_col):
    """Drop the rows whose column value equals the given value.

    Args:
        df: frame to filter; rows are removed in place.
        dict_col: mapping ``{column: value}``; for every entry, rows where
            ``df[column] == value`` are removed.

    Returns:
        The same ``df`` object. The result of ``drop(inplace=True)`` (always
        None) used to be assigned back, so None was returned and a second
        mapping entry crashed.
    """
    for key, unwanted in dict_col.items():
        indexNames = df[ df[key] == unwanted ].index
        df.drop(indexNames, inplace=True)
    return df

def rem_cols(df,lis_col):
    """Drop the listed columns from ``df`` in place, print and return it.

    Args:
        df: frame to modify.
        lis_col: column label or list of labels to remove.

    Returns:
        The same ``df`` object. The result of ``drop(inplace=True)`` (always
        None) used to be assigned back, so None was printed and returned.
    """
    df.drop(lis_col, axis='columns', inplace=True)
    print(df)
    return df

#### Class Imbalance

In [7]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
def sm_sample(data,target):
    """Min-max scale the features and balance the classes with SMOTE.

    Args:
        data: frame containing the features and the ``target`` column.
        target: name of the label column.

    Returns:
        A new frame holding the scaled, over-sampled features plus the
        resampled ``target`` column.

    The "before SMOTE" shape now reports the feature matrix before
    resampling. It used to report a 70% train split taken *after* SMOTE;
    that split was never used for anything else and has been removed.

    NOTE(review): the scaler and SMOTE are fitted on all rows passed in, so
    a later train/test split of the result would share information between
    the two parts - confirm this is intended by the callers.
    """
    sm = SMOTE(random_state=42)
    X = data.drop(columns = target)
    # Scaling all the variables to a range of 0 to 1
    features = X.columns.values
    scaler = MinMaxScaler(feature_range = (0,1))
    scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X))
    X.columns = features
    y =data[[target]]
    X_sm, y_sm = sm.fit_resample(X, y)
    print(f'''Shape of X before SMOTE: {X.shape}
    Shape of X after SMOTE: {X_sm.shape}''')

    print('\nBalance of positive and negative classes (%):')
    print(y_sm.value_counts(normalize=True) * 100)
    data=X_sm
    data[target]=y_sm
    return data


def check_imbalance(df,target):
    """Print the class counts of ``df[target]`` and draw them as a bar chart."""
    class_counts = df[target].value_counts()
    print(class_counts)
    class_counts.plot(kind='bar', title='Count (target)')







In [None]:
def test_train(data,target):
    """Min-max scale the features and split 70/30 into train and test parts.

    Args:
        data: frame containing the features and the ``target`` column.
        target: name of the label column.

    Returns:
        ``(X_train, X_test, y_train, y_test)``; the label parts are
        one-column DataFrames. The split uses ``random_state=42``.
    """
    feature_frame = data.drop(columns = target)
    feature_names = feature_frame.columns.values

    # Scale every feature into the [0, 1] range.
    scaler = MinMaxScaler(feature_range = (0,1))
    scaler.fit(feature_frame)
    scaled = pd.DataFrame(scaler.transform(feature_frame))
    scaled.columns = feature_names

    labels = data[[target]]
    return tuple(train_test_split(scaled, labels, test_size = 0.3, random_state=42))

def batch_classify(X_train,X_test,y_test,Y_train, verbose=True):
    """Fit a fixed set of classifiers and tabulate their test-set metrics.

    Mind the argument order: ``y_test`` comes before ``Y_train``.

    Changes compared with the previous version:
      * ``roc_auc`` in the returned table is computed from the predicted
        probability of class 1 (the same value that is printed next to the
        ROC curve) instead of from hard 0/1 predictions;
      * the result table is built from row dicts instead of writing strings
        into a zero-filled float frame;
      * ``verbose=False`` now suppresses printing and plotting (the
        parameter used to be ignored);
      * each model predicts once instead of twice.

    Args:
        X_train: training features.
        X_test: test features.
        y_test: test labels; flattened to 1-D before scoring.
        Y_train: training labels; flattened to 1-D before fitting.
        verbose: print the reports and show the ROC plots when true.

    Returns:
        DataFrame with one row per classifier and the columns
        ``classifier, train_score, test_score, precision, recall, f1_score,
        cohens_kappa, roc_auc``.

    NOTE(review): the metrics assume a binary problem whose positive class
    is labelled 1 (column 1 of ``predict_proba``) - confirm for each dataset.
    NOTE(review): the "Linear SVM" entry uses ``SVC`` with its default
    kernel; confirm whether a linear kernel was intended.
    """
    dict_classifiers = {
    "Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB() ,
    "Logistic Regression" : LogisticRegression(solver='lbfgs'),
    "Neural Net": MLPClassifier(alpha=1),
    "Linear SVM": SVC(probability=True),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "Decision Tree": tree.DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=18),
    "XGBoost":XGBClassifier(),
    'AdaBoost' : AdaBoostClassifier(n_estimators=50, random_state=0),
    'Light GBM': LGBMClassifier(),

    }
    columns = ['classifier', 'train_score','test_score',
               'precision','recall','f1_score','cohens_kappa',
               'roc_auc']

    # 1-D label arrays avoid column-vector warnings in the estimators/metrics.
    y_test = np.ravel(np.asarray(y_test))
    y_train_flat = np.ravel(Y_train)

    # No-skill baseline for the ROC plot: a constant score for every sample.
    ns_probs = [0 for _ in range(len(y_test))]

    rows = []
    for key, classifier in dict_classifiers.items():
        classifier.fit(X_train, y_train_flat)
        y_pred = classifier.predict(X_test)
        # keep probabilities for the positive outcome only
        lr_probs = classifier.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, lr_probs)

        if verbose:
            print(confusion_matrix(y_test, y_pred))
            ns_auc = roc_auc_score(y_test, ns_probs)
            print('No Skill: ROC AUC=%.3f' % (ns_auc))
            print(key+' : ROC AUC=%.3f' % (auc))
            # calculate roc curves
            ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
            lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
            # plot the roc curve for the model
            pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
            pyplot.plot(lr_fpr, lr_tpr, marker='.', label=key)
            pyplot.xlabel('False Positive Rate')
            pyplot.ylabel('True Positive Rate')
            pyplot.legend()
            pyplot.show()

            print("Classification Report"+" "+key)
            print(classification_report(y_test, y_pred, labels=[1, 0]))

        train_score = classifier.score(X_train, y_train_flat)
        test_score = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        kappa = cohen_kappa_score(y_test, y_pred)

        rows.append({
            'classifier': key,
            'train_score': train_score,
            'test_score': test_score,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'cohens_kappa': kappa,
            'roc_auc': auc,
        })

        if verbose:
            # accuracy: (tp + tn) / (p + n)
            print('Accuracy: %f' % test_score)
            # precision tp / (tp + fp)
            print('Precision: %f' % precision)
            # recall: tp / (tp + fn)
            print('Recall: %f' % recall)
            # f1: 2 tp / (2 tp + fp + fn)
            print('F1 score: %f' % f1)
            print('Cohens kappa: %f' % kappa)
            print('ROC_AUC: %f' % auc)

    return pd.DataFrame(rows, columns=columns)

#### Regression

In [None]:
def regression(xtrain,ytrain,xtest,ytest):
    """Fit a fixed set of regressors and tabulate RMSE and R2 on the test data.

    Fixes compared with the previous version:
      * models are fitted on ``xtrain``/``ytrain`` and scored against
        ``ytest`` (the code referenced the undefined names ``x``, ``y`` and
        ``y_test``);
      * the value reported as RMSE is now the square root of the mean
        squared error (the raw MSE used to be reported);
      * the result table is built from row dicts instead of writing strings
        into a zero-filled float frame.

    Args:
        xtrain: training features.
        ytrain: training targets.
        xtest: test features.
        ytest: test targets.

    Returns:
        DataFrame with the columns ``classifier``, ``RMSE`` and ``R2_Score``,
        one row per regressor.
    """
    dict_regressors = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=0.1),
    "Lasso Regression": Lasso(alpha=0.1),
    "Random Forest Regressor": RandomForestRegressor(max_depth=2, random_state=0),
    "Elastic Net Regressor": ElasticNet(random_state=0),
    }

    rows = []
    for key, regressor in dict_regressors.items():
        print(key)
        model = regressor.fit(xtrain, ytrain)
        y_predicted = model.predict(xtest)
        # model evaluation
        rmse = np.sqrt(mean_squared_error(ytest, y_predicted))
        r2 = r2_score(ytest, y_predicted)
        print('Root mean squared error: ', rmse)
        print('R2 score: ', r2)

        rows.append({'classifier': key, 'RMSE': rmse, 'R2_Score': r2})

    return pd.DataFrame(rows, columns=['classifier','RMSE','R2_Score'])


#### Helper Functions to get the relevent dataframes with the combinations

In [10]:
def df_wcv_wfs(data,y):
    """Tabulate the baseline run (no CV, no feature selection).

    ``run_algorithms`` is expected to return a pair; its second element is
    turned into a frame with one row per key.
    """
    _, scores = run_algorithms(data,y)
    table = pd.DataFrame.from_dict(scores, orient='index')
    table.index.name = 'Results without CV and FS'
    return table

def df_cv(data,y):
    """Tabulate the cross-validated run as a frame with one row per key."""
    cv_scores = run_algorithms_cv(data,y)
    table = pd.DataFrame.from_dict(cv_scores, orient='index')
    table.index.name = 'Results with CV'
    return table

def df_rffs(data, y):
    """Tabulate the run with RFFS feature selection (threshold 5, no CV).

    ``MachineLearningwithRFFS`` is expected to return a pair; its second
    element is turned into a frame with one row per key.
    """
    _, scores = MachineLearningwithRFFS(data, y, threshold=5, algo_list=get_supported_algorithms())
    table = pd.DataFrame.from_dict(scores, orient='index')
    table.index.name = 'Results with FS'
    return table

def df_RFFS_CV(data,y):
    """Tabulate the run with RFFS feature selection (threshold 5) and CV."""
    outcome = MachineLearningwithRFFS_CV(
        data, y, threshold=5, algo_list=get_supported_algorithms(), regression=False
    )
    table = pd.DataFrame.from_dict(outcome['results'], orient='index')
    table.index.name = 'Results with FS and CV'
    return table

def df_RFE(data,y):
    """Tabulate the run with RFE feature selection (RandomForestClassifier, no CV).

    Element ``[1]`` of the ``'results'`` entry is what gets tabulated.
    """
    outcome = GenericREFS(
        data, y, algo_list=get_supported_algorithms(), re_algo=RandomForestClassifier
    )
    table = pd.DataFrame.from_dict(outcome['results'][1], orient='index')
    table.index.name = 'Results with RFE'
    return table

def df_RFE_CV(data,y):
    """Tabulate the run with RFECV feature selection (RandomForestClassifier) and CV."""
    outcome = GenericREFS_CV(
        data,
        y,
        algo_list=get_supported_algorithms(),
        regression=False,
        re_algo=RandomForestClassifier,
    )
    table = pd.DataFrame.from_dict(outcome['results'], orient='index')
    table.index.name = 'Results with RFE and CV'
    return table

def df_PCA(data,y):
    """Tabulate the run on 5 PCA components without CV."""
    outcome = PCA_FS(data, y, 5, algo_list=get_supported_algorithms())
    table = pd.DataFrame.from_dict(outcome['results'], orient='index')
    table.index.name = 'Results with PCA '
    return table

def df_PCA_CV(data,y):
    """Tabulate the run on 5 PCA components with CV."""
    outcome = PCA_FS_CV(data, y, 5, algo_list=get_supported_algorithms())
    table = pd.DataFrame.from_dict(outcome['results'], orient='index')
    table.index.name = 'Results with PCA and CV '
    return table

from sklearn.ensemble import RandomForestClassifier
def all_combinations(data,y):
    """Run all eight experiment variants in order and return their tables.

    Progress is reported with ``Done <n>`` after each variant.

    Returns:
        Tuple of eight DataFrames: baseline, CV, RFFS, RFFS+CV, RFE, RFE+CV,
        PCA, PCA+CV.
    """
    builders = (
        df_wcv_wfs,
        df_cv,
        df_rffs,
        df_RFFS_CV,
        df_RFE,
        df_RFE_CV,
        df_PCA,
        df_PCA_CV,
    )
    tables = []
    for step, build in enumerate(builders, start=1):
        tables.append(build(data, y))
        print('Done ' + str(step))
    return tuple(tables)

#### Hyperparameter Tuning Functions

In [8]:
def best_params_SVM(data,y):
    """Random-search hyperparameters for ``LogisticRegression``.

    NOTE(review): despite the function name, the estimator tuned here is
    LogisticRegression, not an SVM - confirm which one was intended.

    The search covers solver, penalty and C, scored by F1 with repeated
    stratified 10-fold CV (3 repeats). The best score and parameters are
    printed.

    Args:
        data: frame containing the features and the label column.
        y: name of the label column.

    Returns:
        The fitted ``RandomizedSearchCV`` (previously nothing was returned,
        so the tuned parameters could not be reused programmatically).
    """
    X = data.drop(columns = y)
    # A 1-D Series avoids the column-vector warning raised on every fit.
    target = data[y]
    model = LogisticRegression()
    solvers = ['newton-cg', 'lbfgs', 'liblinear']
    penalty = ['l2']
    c_values = [100, 10, 1.0, 0.1, 0.01]
    # define the search space
    grid = dict(solver=solvers,penalty=penalty,C=c_values)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = RandomizedSearchCV(estimator=model, param_distributions =grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
    grid_result = grid_search.fit(X, target)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    return grid_result
def best_params_KNN(data,y):
    """Random-search hyperparameters for ``KNeighborsClassifier``.

    The search covers n_neighbors (odd values 1-19), weights and metric,
    scored by F1 with repeated stratified 10-fold CV (3 repeats). The best
    score and parameters are printed.

    Args:
        data: frame containing the features and the label column.
        y: name of the label column.

    Returns:
        The fitted ``RandomizedSearchCV`` (previously nothing was returned,
        so the tuned parameters could not be reused programmatically).
    """
    X = data.drop(columns =y)
    # A 1-D Series avoids the column-vector warning raised on every fit.
    target = data[y]
    # define models and parameters
    model = KNeighborsClassifier()
    n_neighbors = range(1, 21, 2)
    weights = ['uniform', 'distance']
    metric = ['euclidean', 'manhattan', 'minkowski']
    # define the search space
    grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = RandomizedSearchCV(estimator=model, param_distributions =grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
    grid_result = grid_search.fit(X, target)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    return grid_result

     
def best_params_SVC(data,y):
    """Random-search hyperparameters for ``SVC``.

    The search covers kernel, C and gamma, scored by F1 with repeated
    stratified 10-fold CV (3 repeats). The best score and parameters are
    printed.

    Args:
        data: frame containing the features and the label column.
        y: name of the label column.

    Returns:
        The fitted ``RandomizedSearchCV`` (previously nothing was returned,
        so the tuned parameters could not be reused programmatically).
    """
    X = data.drop(columns = y)
    # A 1-D Series avoids the column-vector warning raised on every fit.
    target = data[y]
    # define model and parameters
    model = SVC()
    kernel = ['poly', 'rbf', 'sigmoid']
    C = [50, 10, 1.0, 0.1, 0.01]
    gamma = ['scale']
    # define the search space
    grid = dict(kernel=kernel,C=C,gamma=gamma)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = RandomizedSearchCV(estimator=model, param_distributions =grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
    grid_result = grid_search.fit(X, target)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    return grid_result




def best_params_RF(data,y):
    """Random-search hyperparameters for ``RandomForestClassifier``.

    The search covers n_estimators and max_features, scored by F1 with
    repeated stratified 10-fold CV (3 repeats). The best score and
    parameters are printed.

    Args:
        data: frame containing the features and the label column.
        y: name of the label column.

    Returns:
        The fitted ``RandomizedSearchCV`` (previously nothing was returned,
        so the tuned parameters could not be reused programmatically).
    """
    X = data.drop(columns = y)
    # A 1-D Series avoids the column-vector warning raised on every fit.
    target = data[y]
    # define models and parameters
    model = RandomForestClassifier()
    n_estimators = [10, 100, 1000]
    max_features = ['sqrt', 'log2']
    # define the search space
    grid = dict(n_estimators=n_estimators,max_features=max_features)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = RandomizedSearchCV(estimator=model, param_distributions =grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
    grid_result = grid_search.fit(X, target)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    return grid_result



def best_param_gbc(data,y):
    """Random-search hyperparameters for ``GradientBoostingClassifier``.

    The search covers learning_rate, n_estimators, subsample and max_depth,
    scored by F1 with repeated stratified 10-fold CV (3 repeats). The best
    score and parameters are printed.

    Args:
        data: frame containing the features and the label column.
        y: name of the label column.

    Returns:
        The fitted ``RandomizedSearchCV`` (previously nothing was returned,
        so the tuned parameters could not be reused programmatically).
    """
    X = data.drop(columns = y)
    # A 1-D Series avoids the column-vector warning raised on every fit.
    target = data[y]
    # define models and parameters
    model = GradientBoostingClassifier()
    n_estimators = [10, 100, 1000]
    learning_rate = [0.001, 0.01, 0.1]
    subsample = [0.5, 0.7, 1.0]
    max_depth = [3, 7, 9]
    # define the search space
    grid = dict(learning_rate=learning_rate, n_estimators=n_estimators, subsample=subsample, max_depth=max_depth)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    grid_search = RandomizedSearchCV(estimator=model, param_distributions =grid, n_jobs=-1, cv=cv, scoring='f1',error_score=0)
    grid_result = grid_search.fit(X, target)
    # summarize results
    print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
    return grid_result

