# In [None]:
from sklearn import metrics
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import xgboost


import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype
import time
import sys
import warnings
# Module-level side effect: when the interpreter was started without any
# explicit warning options (sys.warnoptions is empty), install a blanket
# "ignore" filter so that no warnings are shown.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    
def fastml(problem_type, models, y, drops, tresh_unique, tresh_missing, train, test=None):
    '''
    Pre-process the data, fit every model on a train/validation split and compare their scores.

    The dependent variable is separated from the training frame. Train and test features are
    then stacked so that column deletion, missing-value filling and dummification are applied
    identically to both, and are split apart again afterwards. 25% of the training rows
    (random_state = 42) are held out as the validation set.

    Inputs :
    -------
    problem_type : 'Regression' or 'Classification'
    models  :  [LinearRegression(), KNeighborsRegressor(), RandomForestRegressor()] choose any of these for regression
               [LogisticRegression(), RandomForestClassifier(), KNeighborsClassifier(),
               GaussianNB(), LinearSVC(), SGDClassifier(), DecisionTreeClassifier(), GradientBoostingClassifier()]
               choose any of these for classification
    y       : Column name of the dependent variable as a string
    drops   : List of column names to be removed prior to modeling
    tresh_unique   : A non-numeric column with more unique items than this threshold is dropped.
                     Any other non-boolean column with at most this many unique items is dummified.
    tresh_missing  : If the % of missing items in a column reaches this threshold, the column is
                     replaced by a boolean '<column>_na' indicator
    train   : Training set (dataframe); it is copied, the caller's frame is not modified
    test    : Testing set (dataframe) is the data on which the models need to be applied.
              All the columns (except the dependent variable) need to be the same as in the
              training set. Optional: pass None (the default) or an empty dataframe to skip
              the prediction step.

    Outputs :
    --------
    data    : A list of various datasets processed and predicted as part of the analysis
              data[0] --> X_train - dataframe of features of training set
              data[1] --> X_val - dataframe of features of validation set
              data[2] --> y_train - dependent variable of training set
              data[3] --> y_val - dependent variable of validation set
              data[4] --> X_test - dataframe of the features of the test set on which the models
                          were applied ([] when no test set was given)
              data[5] --> y_pred - dataframe containing predictions based on the test data set,
                          one column per model (empty when no test set was given)

    df_eval : A dataframe comparing scores among the various models (see calcs).

    Raises :
    -------
    ValueError : if y is not a column of train, or if problem_type is not one of the two
                 supported values.

    Examples :
    ---------
    For titanic dataset,

    >>> train = pd.read_csv(path+'train.csv')
    >>> test = pd.read_csv(path+'test.csv')

    >>> models = [LogisticRegression(), RandomForestClassifier(), KNeighborsClassifier()]
    >>> drops = ['PassengerId']

    >>> data, df_eval = fastml('Classification', models, 'Survived', drops, 20, 40, train, test)

    >>> print(df_eval)

              LogisticRegression  RandomForestClassifier  KNeighborsClassifier
    R2_train                0.82                    0.98                  0.79
    R2_val                  0.79                    0.80                  0.69
    Time                   27.88                  212.43                 34.91
    '''

    # Fail with an exception rather than terminating the interpreter.
    if y not in train.columns:
        raise ValueError('Dependent variable not found in the train data frame')

    df_train = train.copy()
    y_train = df_train[y]
    df_train.drop(y, axis=1, inplace=True)

    has_test = test is not None and not test.empty

    # Stack train on top of test so both receive exactly the same column
    # transformations; the first len(df_train) rows remain the training rows.
    if has_test:
        df = pd.concat([df_train, test], axis=0)
    else:
        df = df_train

    df = del_cols(df, drops, tresh_unique, tresh_missing)
    df = fill_missing(df)
    df = dummify(df, tresh_unique)

    n_train = len(df_train)
    X_train = df[:n_train]
    X_test = df[n_train:] if has_test else []

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 42)

    df_eval, y_pred = calcs(problem_type, models, X_train, X_val, y_train, y_val, X_test)

    data = [X_train, X_val, y_train, y_val, X_test, y_pred]

    return data, df_eval


def calcs(problem_type, models, X_train, X_val, y_train, y_val, X_test):
    '''
    Fit each model on the training split, score it on the validation split and, when test
    rows are available, predict on them.

    Inputs :
    -------
    problem_type : 'Regression' or 'Classification'
    models  : list of estimator objects exposing fit, predict and score
    X_train, y_train : features / dependent variable used for fitting
    X_val, y_val     : features / dependent variable used for validation
    X_test  : features to predict on; None or anything of length 0 skips the prediction

    Outputs :
    --------
    df_eval : dataframe with one column per model (named after the text before the first '('
              in str(model)) and one row per metric, rounded to 4 decimals:
              'R2_train'  - model.score on the training split
              'R2_val'    - model.score on the validation split for regression,
                            accuracy_score for classification
              'MAE_val', 'RMSE_val' - regression only
              'Time'      - milliseconds elapsed since fitting started
    y_pred  : dataframe with one column of test predictions per model (empty when the
              prediction was skipped)

    Raises :
    -------
    ValueError : if problem_type is not 'Regression' or 'Classification'.
    '''

    # Validate before doing any work, so no model is fitted for nothing.
    if problem_type not in ('Regression', 'Classification'):
        raise ValueError('Problem type has to be either Classification or Regression')

    decimals = 4
    has_test = X_test is not None and len(X_test) > 0
    eval_frames = []
    y_test = {}

    for model in models:

        tic = time.time()
        model = model.fit(X_train, y_train)
        pred_val = model.predict(X_val)

        if problem_type == 'Regression':
            evaluation = {
                            'R2_train' : round(model.score(X_train,y_train),decimals),
                            'R2_val' : round(model.score(X_val,y_val),decimals),
                            'MAE_val' : round(metrics.mean_absolute_error(y_val,pred_val),decimals),
                            'RMSE_val' : round(np.sqrt(metrics.mean_squared_error(y_val,pred_val)),decimals),
                            'Time' : round((time.time() - tic)*1000,decimals)
                         }
        else:
            evaluation = {
                            'R2_train' : round(model.score(X_train, y_train),decimals),
                            'R2_val' : round(metrics.accuracy_score(y_val,pred_val),decimals),
                            'Time' : round((time.time() - tic)*1000,decimals)
                         }

        model_name = str(model).split('(')[0]
        eval_frames.append(pd.DataFrame.from_dict(evaluation, orient='index', columns=[model_name]))

        # Predicting on an empty test set would fail inside the estimator.
        if has_test:
            y_test[model_name] = model.predict(X_test)

    # pd.concat refuses an empty list, so fall back to an empty frame.
    df_eval = pd.concat(eval_frames, axis=1) if eval_frames else pd.DataFrame()
    y_pred = pd.DataFrame.from_dict(y_test, orient='columns')

    return df_eval, y_pred

def display_all(df):
    '''
    Show a dataframe through the notebook's display() with the pandas row and
    column display limits temporarily raised to 1000 each.
    '''
    limit = 1000
    wide_view = pd.option_context('display.max_rows', limit, 'display.max_columns', limit)
    with wide_view:
        display(df)
        
def num_cols(df):
    '''
    Return, in column order, the names of the columns whose dtype pandas
    reports as numeric (is_numeric_dtype).
    '''
    return [label for label in df.columns if is_numeric_dtype(df[label])]
      
def cat_cols(df):
    '''
    Return, in column order, the names of the columns whose dtype pandas does
    not report as numeric (is_numeric_dtype); these are treated as categorical.
    '''
    return [label for label in df.columns if not is_numeric_dtype(df[label])]

def del_cols(df, drops, tresh_unique=20, tresh_missing=50):
    '''
    Remove columns from df based on any of the following three conditions:
    1. columns that are part of the list "drops"
    2. non-numeric columns which have more unique items than the threshold value (tresh_unique);
       a missing value counts as one unique item
    3. columns whose % of missing values is greater than or equal to the threshold value
       (tresh_missing); each of these is replaced by a boolean '<column>_na' column
       flagging the rows where the value was missing

    df is modified in place and the same object is returned.
    A KeyError is raised if "drops" names a column that df does not have.
    '''
    # Drop the user-listed columns first. Doing it after the cardinality pass
    # would raise KeyError for a listed column that pass had already removed.
    df.drop(drops, axis=1, inplace=True)

    # Same "categorical" criterion as cat_cols(): not a numeric dtype.
    high_cardinality = [col for col in df.columns
                        if not is_numeric_dtype(df[col])
                        and len(df[col].unique()) > tresh_unique]
    df.drop(high_cardinality, axis=1, inplace=True)

    # df.columns is rebound on every drop; this loop walks the Index object
    # taken before the loop, so the '_na' columns added below are not revisited.
    for col in df.columns:
        missing = df[col].isnull()
        if missing.mean() * 100 >= tresh_missing:
            # str() keeps this working for non-string column labels.
            df[str(col) + '_na'] = missing
            df.drop(col, axis=1, inplace=True)

    return df

def dummify(df, tresh_unique=20, inplace=True):
    '''
    One-hot encode every non-boolean column that has at most tresh_unique unique items.

    The selected columns are converted to strings, encoded with pd.get_dummies
    (drop_first=True, so the first level of each column is omitted) and the resulting
    dummy columns are appended after the remaining columns.

    df           : dataframe to encode
    tresh_unique : maximum number of unique items (a missing value counts as one) for a
                   column to be encoded
    inplace      : if True, the encoded source columns are also removed from the frame
                   passed in; if False, that frame is left untouched

    Returns a dataframe holding the untouched columns followed by the dummy columns.
    When no column qualifies, df itself is returned unchanged.
    '''
    cols = [col for col in df.columns
            if df[col].dtypes.name != 'bool' and len(df[col].unique()) <= tresh_unique]

    # Nothing to encode: return early so get_dummies is never handed a zero-column frame.
    if not cols:
        return df

    dummies = pd.get_dummies(df[cols].astype(str), columns=cols, drop_first=True)

    if inplace:
        df.drop(cols, axis=1, inplace=True)
        remaining = df
    else:
        # Keep the result of drop(); discarding it would leave the raw
        # columns in the output next to their dummies.
        remaining = df.drop(cols, axis=1)

    return pd.concat([remaining, dummies], axis=1)

def fill_missing(df):
    '''
    Replace every column that contains missing values by two new columns:
    '<column>_fill' - the values with the gaps filled by the column mean (numeric
                      columns) or by the first mode (all other columns)
    '<column>_na'   - boolean flag marking the rows that were missing

    The new columns are appended at the end and the source column is removed.
    df is modified in place and the same object is returned.
    '''
    for name in df.columns:
        column = df[name]
        missing_mask = column.isnull()
        if not missing_mask.any():
            continue

        replacement = column.mean() if is_numeric_dtype(column) else column.mode()[0]

        df[name + '_fill'] = column.fillna(replacement)
        df[name + '_na'] = missing_mask
        df.drop(name, axis=1, inplace=True)

    return df