In [None]:
## set up dependencies
import pandas as pd
import numpy as np
from numpy import vstack
from numpy import hstack
from numpy import asarray

## plot method
import matplotlib.pyplot as plt
import seaborn as sns

## Image and HTML
from IPython.display import Image
from IPython.core.display import HTML 
from IPython.display import display

## Metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, classification_report, precision_score, ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler
# from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

# SKlearn Models
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

## feature ranking
from sklearn.feature_selection import SelectFromModel
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import itertools

## Boosting Method
from xgboost import XGBClassifier

## regex 
import re

## resample technique
from sklearn.utils import resample

## time 
import time

%matplotlib inline 

In [None]:
def upsample(df, target, flag):
    """Balance the classes of column ``target`` by oversampling minorities.

    Every class other than the most frequent one is resampled with
    replacement until it has as many rows as the most frequent class.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the column ``target``.
    target : str
        Name of the class column to balance.
    flag : bool
        When true, draw a count plot of the class distribution before
        balancing.

    Returns
    -------
    pandas.DataFrame
        The majority rows followed by the upsampled rows of each other
        class. The original index values are kept (so they repeat for
        duplicated rows). If ``target`` has no non-null values, a copy of
        ``df`` is returned unchanged.
    """
    class_counts = df[target].value_counts()

    # Nothing to balance (empty frame or all-NaN target); indexing the first
    # class below would otherwise raise IndexError.
    if class_counts.empty:
        return df.copy()

    if flag:
        plt.figure(figsize=(8, 4))
        ax = sns.countplot(x=target, data=df)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=30, ha="right")
        # The title previously used the undefined name `target_`.
        ax.set_title('How many counts for categorical feature: {}'.format(target))

    # value_counts() lists the most frequent class first.
    class_labels = class_counts.index
    majority = class_labels[0]
    # Positional access: label lookup is ambiguous when labels are integers.
    majority_size = int(class_counts.iloc[0])

    balanced_parts = [df[df[target] == majority]]

    for minority in class_labels[1:]:
        minority_rows = df[df[target] == minority]
        balanced_parts.append(
            resample(
                minority_rows,
                replace=True,             # sample with replacement
                n_samples=majority_size,  # match the majority class size
                random_state=42,          # reproducible results
            )
        )

    return pd.concat(balanced_parts)


def flatten_list(x):
    """Flatten one level of nesting in a list.

    Elements that are themselves lists are spliced into the result; every
    other element (including tuples) is kept as is. Only the first level is
    expanded, e.g. ``[1, [2, 3], 4]`` becomes ``[1, 2, 3, 4]``.
    """
    return [
        leaf
        for item in x
        for leaf in (item if isinstance(item, list) else [item])
    ]

def forward_include(clf,
                    df_x,
                    df_y,
                    select_feature,
                    all_features,
                    score):
    """Greedy forward inclusion: keep adding the best feature while it helps.

    Repeatedly calls ``forward_onestep`` and accepts its result for as long
    as a feature was actually added.

    Parameters
    ----------
    clf : classifier passed through to ``forward_onestep``.
    df_x, df_y : data used to train and score the model.
    select_feature : list
        Features already included.
    all_features : list
        All candidate features.
    score : float
        Best score so far (for classification: accuracy).

    Returns
    -------
    tuple
        ``(selected features, best score)``. When no feature can be added,
        the incoming ``select_feature`` and ``score`` are returned.
    """
    while True:
        n_before = len(select_feature)
        select_feature, score_predict = forward_onestep(clf,
                                                        df_x, df_y,
                                                        select_feature, all_features,
                                                        score)

        # Stop on "no feature added" rather than on the -1 sentinel that
        # forward_onestep returns: comparing the sentinel with a score that
        # is <= -1 (e.g. -np.inf) used to loop forever and overwrite the
        # reported score with the sentinel.
        if len(select_feature) == n_before:
            return select_feature, score

        score = score_predict


def forward_onestep(clf,
                    df_x,
                    df_y,
                    select_feature,
                    all_features,
                    score):
    """Try one forward-inclusion move; used by the forward-inclusion helpers.

    Each feature not yet selected is added in turn to the current selection
    and scored with the mean of a 5-fold ``cross_val_score``. The best
    candidate is accepted when its score is at least ``score``.

    Parameters
    ----------
    clf : classifier handed to ``cross_val_score``.
    df_x, df_y : data X, y that will be put into the model.
    select_feature : list
        Features already included. This list is not modified.
    all_features : list
        All candidate features. An entry may itself be a list of column
        names; entries are flattened before indexing ``df_x``.
    score : float
        Best score so far (for classification: accuracy).

    Returns
    -------
    tuple
        ``(new selection, its score)`` when a feature was added, where the
        new selection is a new list. Otherwise ``(select_feature, -1)``,
        with ``-1`` as the "no improvement" sentinel.
    """
    candidates = [feat for feat in all_features if feat not in select_feature]
    if not candidates:
        return select_feature, -1

    candidate_scores = []
    for feat in candidates:
        # Build a fresh list per trial instead of appending to and popping
        # from the caller's list.
        columns = flatten_list(list(select_feature) + [feat])
        candidate_scores.append(
            np.mean(cross_val_score(clf, df_x[columns], df_y, cv=5))
        )

    best = int(np.argmax(candidate_scores))
    if candidate_scores[best] >= score:
        return list(select_feature) + [candidates[best]], candidate_scores[best]

    return select_feature, -1
    


def forward_include2(clf,
                     df_x,
                     df_y,
                     select_feature,
                     all_features,
                     score):
    """Forward inclusion that tries every remaining feature as a first move.

    For each feature not yet selected, a copy of the current selection is
    extended with it, scored by the mean of a 5-fold ``cross_val_score``,
    and then grown greedily with ``forward_onestep`` while the score does
    not fall below the running best. The best resulting selection is
    returned.

    Parameters
    ----------
    clf : classifier handed to ``cross_val_score``.
    df_x, df_y : data used to train and score the model.
    select_feature : list of features already included.
    all_features : list of all candidate features.
    score : best score so far (for classification: accuracy).

    Returns
    -------
    tuple
        ``(selected features, best score)``. When no feature is left to
        try, the incoming ``select_feature`` and ``score`` are returned.

    The candidate selections and their scores are printed before returning.
    """
    remaining = [feat for feat in all_features if feat not in select_feature]
    if not remaining:
        return select_feature, score

    candidate_sets = []
    candidate_scores = []

    for seed in remaining:
        trial = list(select_feature) + [seed]
        running_best = score
        trial_score = np.mean(
            cross_val_score(clf, df_x[flatten_list(trial)], df_y, cv=5)
        )

        # NOTE(review): if the first score is already below `score`, this
        # loop never runs and the trial (seed included) is recorded with the
        # incoming `score` -- confirm this is intended.
        while trial_score >= running_best:
            running_best = trial_score
            trial, trial_score = forward_onestep(clf,
                                                 df_x,
                                                 df_y,
                                                 trial,
                                                 all_features,
                                                 running_best)

        candidate_sets.append(trial)
        candidate_scores.append(running_best)

    winner = np.argmax(candidate_scores)

    print(candidate_sets)
    print(candidate_scores)
    return candidate_sets[winner], candidate_scores[winner]
    


def backward_elimination(clf,
                         df_x,
                         df_y,
                         select_feature,
                         score):
    """Try to drop one feature from the selection without losing score.

    Every subset obtained by removing exactly one entry of
    ``select_feature`` is scored with the mean of a 5-fold
    ``cross_val_score``. The best subset is accepted when its score is at
    least ``score``.

    Parameters
    ----------
    clf : classifier handed to ``cross_val_score``.
    df_x, df_y : data used to train and score the model.
    select_feature : list
        Features currently included. An entry may itself be a list of
        column names; entries are flattened before indexing ``df_x``.
    score : float
        Best score so far (for classification: accuracy).

    Returns
    -------
    tuple
        ``(features, score, removed)``. ``removed`` is True when a feature
        was dropped; otherwise the inputs are returned with False.
    """
    # With fewer than two features there is nothing meaningful to remove:
    # an empty selection makes combinations() raise on r=-1, and a single
    # feature would leave zero columns to score.
    if len(select_feature) < 2:
        return select_feature, score, False

    subset_size = len(select_feature) - 1
    subsets = [list(combo)
               for combo in itertools.combinations(select_feature, subset_size)]

    subset_scores = [
        np.mean(cross_val_score(clf, df_x[flatten_list(subset)], df_y, cv=5))
        for subset in subsets
    ]

    best = int(np.argmax(subset_scores))
    if subset_scores[best] >= score:
        return subsets[best], subset_scores[best], True

    return select_feature, score, False
    


def stepwise(clf, df_x, df_y, all_features):
    """Stepwise selection: forward inclusion followed by backward elimination.

    Starting from an empty selection and a score of ``-inf``, each round
    runs ``forward_include2`` and then applies ``backward_elimination``
    repeatedly while it keeps removing a feature and at least two features
    remain. Features dropped during a round are taken out of the candidate
    pool. The search ends when a round's elimination phase leaves the
    selection unchanged.

    Parameters
    ----------
    clf : classifier passed to the helper functions.
    df_x, df_y : data used to train and score the model.
    all_features : list of all candidate features.

    Returns
    -------
    tuple
        ``(selected features, best score)``.

    Progress is printed after each inclusion and elimination step.
    """
    select_feature = []
    score = -np.inf

    while True:
        select_feature, score = forward_include2(
            clf, df_x, df_y, select_feature, all_features, score
        )
        print('select_feature after forward inclusion: ', select_feature)
        print('current score is :', score)

        # Snapshot of the selection before pruning, to detect changes.
        after_forward = list(select_feature)

        keep_pruning = True
        while keep_pruning and len(select_feature) >= 2:
            select_feature, score, keep_pruning = backward_elimination(
                clf, df_x, df_y, select_feature, score
            )
            print('select_feature after backward elimination: ', select_feature)
            print('current score is :', score)

        if after_forward == select_feature:
            return select_feature, score

        # Features pruned in this round are not offered again.
        dropped = [feat for feat in after_forward if feat not in select_feature]
        all_features = [feat for feat in all_features if feat not in dropped]
