# Useful Functions for Data Analysis

In [1]:
# Importing basic functions
import pandas as pd
import missingno as mano
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math as mt 
import statistics as st
from numpy.random import seed
from numpy.random import randn
from scipy.stats import shapiro

In [60]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold

### Function 1: Data Read

In [3]:
def file_todataframe(file, f_type='csv'):
    """Load a tabular file into a pandas DataFrame.

    Args:
        file: Path (or any object the pandas reader accepts) of the file to
            load, e.g. ``'D:/data/dataread.csv'``.
        f_type: File type, one of ``'csv'``, ``'excel'`` or ``'json'``.
            Defaults to ``'csv'``.

    Returns:
        The DataFrame produced by the matching ``pd.read_*`` function, or
        ``None`` (after printing a message) when ``f_type`` is not supported.
    """
    readers = {
        'csv': pd.read_csv,
        'excel': pd.read_excel,
        'json': pd.read_json,
    }
    reader = readers.get(f_type) if isinstance(f_type, str) else None
    if reader is None:
        # An unsupported type used to fall off the end of the function
        # silently; keep returning None but tell the caller why.
        print("Invalid file type, select 'csv','excel','json'")
        return None
    return reader(file)

### Function 2: Data Details

In [4]:
def df_details(d_f, n):
    """Print column dtypes, the shape, and the first and last ``n`` rows.

    Args:
        d_f: DataFrame to summarise.
        n: Number of rows to show from the top and from the bottom.

    Returns:
        None. Everything is written to the output via ``print``/``display``.
    """
    try:
        # ``display`` is only injected as a global inside IPython/Jupyter;
        # import it explicitly so the function also works when this code is
        # imported as a plain module.
        from IPython.display import display
    except ImportError:
        display = print

    print('Data Types of Column: \n', d_f.dtypes)
    print('\n Size of Datarame: ', d_f.shape)
    print('\n Top and bottom ', n, ' rows: \n')
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the two
    # slices on every pandas version.
    display(pd.concat([d_f.head(n), d_f.tail(n)]))

### Function 3: Dropping Columns

In [5]:
def col_drop(d_f, col_del, typ):
    """Return a copy of ``d_f`` without the requested columns.

    Args:
        d_f: Source DataFrame (not modified).
        col_del: Columns to remove. Column labels when ``typ == 0``
            (e.g. ``['Names', 'Sales']``), integer positions when
            ``typ == 1`` (e.g. ``[1, 3, 5]``).
        typ: ``0`` to interpret ``col_del`` as labels, ``1`` as positions.

    Returns:
        A new DataFrame with the columns dropped. For any other ``typ`` the
        input frame is returned unchanged.
    """
    if typ == 0:
        return d_f.drop(col_del, axis=1)
    if typ == 1:
        # Bug fix: positions must be resolved against the frame that was
        # passed in (``d_f``), not against a global named ``df``.
        return d_f.drop(d_f.columns[col_del], axis=1)
    return d_f

### Function 4: Dropping Rows

In [6]:
def row_drop(d_f,row_del,col_ref):
    """Drop every row whose ``col_ref`` entry equals ``row_del``.

    Args:
        d_f: Source DataFrame (not modified).
        row_del: Value that marks a row for removal.
        col_ref: Column in which ``row_del`` is looked up.

    Returns:
        A new DataFrame without the matching index labels.
    """
    hits = d_f[col_ref] == row_del
    labels_to_remove = d_f[hits].index
    return d_f.drop(labels_to_remove)

### Function 5: Missing Values

In [7]:
def miss_ch(d_f):
    """Report how much of ``d_f`` is missing.

    Prints the number of rows that contain no nulls at all, then displays
    the per-column null counts. Relies on the ``display`` global that
    IPython/Jupyter provides.

    Args:
        d_f: DataFrame to inspect.
    """
    complete_rows = d_f.dropna()
    print('Available data with no nulls: ', complete_rows.shape[0])
    nulls_per_column = d_f.isnull().sum()
    display('Deatils of Null values column wise', nulls_per_column)
    

### Function 6: Missing Values Analysis using missingno

In [8]:
def miss_viz(d_f, col_int='all', graph='all'):
    """Draw missingno charts for the missing values in ``d_f``.

    Args:
        d_f: DataFrame to visualise.
        col_int: Columns of interest: ``'all'`` (default), a single column
            name, or any iterable of column names (list, Index, ...).
        graph: Chart types to draw: ``'all'`` (default), a single name, or
            an iterable drawn from ``'bar'``, ``'matrix'``, ``'heatmap'``,
            ``'dendrogram'``.

    Returns:
        None. Prints a message and returns early on an unknown column or
        chart type.
    """
    av_gp = ['bar', 'matrix', 'heatmap', 'dendrogram']
    col_nam = d_f.columns

    # ``x == 'all'`` on an Index/ndarray yields an array whose truth value is
    # ambiguous, so only a real string is treated as the 'all' sentinel.
    if isinstance(col_int, str) and col_int == 'all':
        col_int = col_nam
    else:
        if isinstance(col_int, str):
            # A bare name would otherwise be validated character by character.
            col_int = [col_int]
        if not all(i in col_nam for i in col_int):
            print("Invalid column name")
            return

    if isinstance(graph, str) and graph == 'all':
        graph = av_gp
    else:
        if isinstance(graph, str):
            graph = [graph]
        if not all(i in av_gp for i in graph):
            print("Invalid Graph type, select 'bar','matrix','heatmap','dendrogram'")
            return

    # Imported only once the arguments are known to be valid.
    import missingno as mano

    for gp in graph:
        # Each chart name is also the name of the missingno plotting function.
        getattr(mano, gp)(d_f[col_int])
    

### Function 7: Filling Missing Values

In [9]:
def fill_miss(d_f, col_int='all', metd=None):
    """Fill missing values in the selected columns.

    Args:
        d_f: Source DataFrame (not modified).
        col_int: Columns of interest: ``'all'`` (default), a single column
            name, or an iterable of column names.
        metd: How to fill:
            * ``None`` (default): do nothing.
            * a number: replace nulls with that constant.
            * ``'ffill'`` / ``'pad'``: propagate the last valid value forward.
            * ``'bfill'`` / ``'backfill'``: use the next valid value.
            * ``'linear'``: linear interpolation.

    Returns:
        A new DataFrame holding only the selected columns with the fill
        applied. ``d_f`` itself is returned untouched when ``metd`` is
        ``None`` or when a column name / fill type is invalid (a message is
        printed in the invalid cases).
    """
    import numbers

    col_nam = d_f.columns
    # ``col_int == 'all'`` is ambiguous for Index/ndarray arguments, so the
    # sentinel is only recognised on real strings.
    if isinstance(col_int, str) and col_int == 'all':
        col_int = col_nam
    else:
        if isinstance(col_int, str):
            col_int = [col_int]
        if not all(i in col_nam for i in col_int):
            print("Invalid column name")
            return d_f

    if metd is None:
        return d_f

    selected = d_f[col_int]
    # bool is a subclass of int but is not a meaningful fill constant here.
    if isinstance(metd, numbers.Real) and not isinstance(metd, bool):
        return selected.fillna(metd)
    if isinstance(metd, str):
        if metd == 'linear':
            return selected.interpolate(method='linear')
        # Bug fix: every method used to be executed as 'ffill'. The dedicated
        # ffill()/bfill() calls also avoid the deprecated fillna(method=...).
        if metd in ('ffill', 'pad'):
            return selected.ffill()
        if metd in ('bfill', 'backfill'):
            return selected.bfill()

    print("Invalid fill type")
    return d_f
    

### Function 8: Numerical Data Analysis

In [10]:
def data_num(d_f, col_int='all', func='all', scat=None):
    """Run descriptive plots and tests on the numeric columns of ``d_f``.

    Args:
        d_f: DataFrame to analyse.
        col_int: Numeric columns of interest: ``'all'`` (default), a single
            column name, or an iterable of names.
        func: Analyses to run: ``'all'`` (default), a single name, or an
            iterable drawn from ``'hist'``, ``'boxplot'``, ``'scatter'``,
            ``'describe'``, ``'normality'``.
        scat: Column plotted on the y axis of the scatter plots. Defaults to
            the first column of ``d_f``.

    Returns:
        None. Results are printed / plotted; a message is printed and the
        function returns early on an invalid column or analysis name.
    """
    from scipy.stats import shapiro
    try:
        # ``display`` only exists as a global inside IPython/Jupyter.
        from IPython.display import display
    except ImportError:
        display = print

    av_func = ['hist', 'boxplot', 'scatter', 'describe', 'normality']
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    col_nam = d_f.select_dtypes(include=numerics).columns

    # Checking parameter: column names. Only a real string can be the 'all'
    # sentinel; comparing an Index/ndarray with == is ambiguous in an ``if``.
    if isinstance(col_int, str) and col_int == 'all':
        col_int = col_nam
    else:
        if isinstance(col_int, str):
            col_int = [col_int]
        if not all(i in col_nam for i in col_int):
            print("Invalid column name")
            return

    # Checking parameter: requested analyses. The original compared the
    # undefined names ``av_gp``/``graph`` here and raised NameError.
    if isinstance(func, str) and func == 'all':
        func = av_func
    else:
        if isinstance(func, str):
            func = [func]
        if not all(i in av_func for i in func):
            print("Invalid Graph type, select 'hist','boxplot','scatter','describe','normality'")
            return

    if scat is None:
        scat = d_f.columns[0]

    for fn in func:
        if fn == 'describe':
            display("Statictical Details", d_f[col_int].describe())
            continue
        for col in col_int:
            if fn == 'normality':
                print("Normality Test for: ", col)
                stat, p = shapiro(d_f[col])
                if p > 0.05:
                    print('Sample looks Gaussian. Statistics=%.3f, p=%.3f' % (stat, p))
                else:
                    print('Sample does not look Gaussian. Statistics=%.3f, p=%.3f' % (stat, p))
            elif fn == 'scatter':
                display(col)
                plt.scatter(d_f[col], d_f[scat])
                plt.show()
            else:
                # 'hist' and 'boxplot' are also the pyplot function names.
                display(col)
                getattr(plt, fn)(d_f[col])
                plt.show()

### Function 9: Categorical Data Analysis

In [11]:
def data_cat(d_f, col_int='all', bar=None):
    """Draw bar charts for the categorical columns of ``d_f``.

    Args:
        d_f: DataFrame to analyse.
        col_int: Categorical (object/category dtype) columns of interest:
            ``'all'`` (default), a single column name, or an iterable of
            names.
        bar: Optional numeric column. When given, each categorical column is
            plotted against it with ``plt.bar``; otherwise the value counts
            of each categorical column are plotted.

    Returns:
        None. Prints a message and returns early when a column is not
        categorical or ``bar`` is not numeric.
    """
    col_nam = d_f.select_dtypes(include=['object', 'category']).columns
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    col_num = d_f.select_dtypes(include=numerics).columns

    # Checking parameter: column names. ``col_int == 'all'`` is ambiguous
    # for Index/ndarray arguments, so only strings are compared.
    if isinstance(col_int, str) and col_int == 'all':
        col_int = col_nam
    else:
        if isinstance(col_int, str):
            col_int = [col_int]
        if not all(i in col_nam for i in col_int):
            print("Invalid column name")
            return

    if bar is not None and bar not in col_num:
        print("Only numeric column for Bar")
        return

    # Imported only after validation so bad arguments fail fast.
    import matplotlib.pyplot as plt

    if bar is not None:
        for col in col_int:
            plt.bar(d_f[col], d_f[bar])
            plt.show()
        return

    for col in col_int:
        d_f[col].value_counts().plot(kind='bar')
        plt.show()
    

### Function 10: Column Type Change

In [12]:
def col_dtype(d_f, col_int, dtyp=int):
    """Return a copy of ``d_f`` with the given columns cast to ``dtyp``.

    Args:
        d_f: Source DataFrame (not modified; a deep copy is converted).
        col_int: Iterable of column names to convert, e.g.
            ``['Sale', 'Customer']``.
        dtyp: Target type. Either one of the strings ``'int16'``, ``'int32'``,
            ``'int64'``, ``'float16'``, ``'float32'``, ``'float64'``,
            ``'int'``, ``'float'``, ``'str'``, ``'category'`` or one of the
            builtin types ``int``, ``float``, ``str``. Defaults to ``int``.

    Returns:
        The converted copy, or ``None`` (after printing a message) when a
        column name or the data type is invalid. Values that cannot be
        converted are left as they are (``errors='ignore'``).
    """
    # Equivalent to deep_copy(d_f); done inline so this function does not
    # depend on a helper that is defined further down in the notebook.
    d_f = d_f.copy(deep=True)
    # Bug fix: the builtin types are whitelisted too. The default ``int``
    # used to be rejected because only strings were listed.
    av_fun = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64',
              'int', 'float', 'str', 'category', int, float, str]
    col_nam = d_f.columns

    # Checking parameter: column names
    if not all(i in col_nam for i in col_int):
        print("Invalid column name")
        return
    # Checking parameter: target data type
    if dtyp not in av_fun:
        print("Invalid data type")
        return

    d_f[col_int] = d_f[col_int].astype(dtyp, errors='ignore')
    return d_f

### Function 11: Column Operations

In [13]:
def col_opre(d_f, col_int='all', opr=None, val=None):
    """Apply a text operation to string (object dtype) columns, in place.

    NOTE(review): another function with the same name ``col_opre`` (numeric
    operations) is defined later in this file; whichever definition runs
    last wins, so this one is shadowed unless one of them is renamed.

    Args:
        d_f: DataFrame to modify (columns are overwritten in place).
        col_int: Object-dtype columns of interest: ``'all'`` (default), a
            single column name, or an iterable of names.
        opr: ``'str_replace'`` (substring replace via ``Series.str.replace``),
            ``'rm_space'`` (remove every space character) or ``'chg_value'``
            (whole-value replace via ``Series.replace``).
        val: ``[old, new]`` pair used by ``'str_replace'`` and
            ``'chg_value'``.

    Returns:
        ``d_f`` after the operation; ``d_f`` unchanged (with a message) for
        an unknown ``opr``; ``None`` (with a message) for an invalid column.
    """
    col_nam = d_f.select_dtypes(include=['object']).columns

    # Checking parameter: column names
    if isinstance(col_int, str) and col_int == 'all':
        col_int = col_nam
    else:
        if isinstance(col_int, str):
            col_int = [col_int]
        if not all(i in col_nam for i in col_int):
            print("Invalid column name")
            return

    if opr == 'str_replace':
        def transform(series):
            return series.str.replace(val[0], val[1])
    elif opr == 'rm_space':
        def transform(series):
            return series.str.replace(' ', '')
    elif opr == 'chg_value':
        def transform(series):
            return series.replace(val[0], val[1])
    else:
        print("Invalid Opeation")
        return d_f

    # Bug fix: the ``.str`` accessor only exists on a Series, so the
    # operation is applied column by column instead of on ``d_f[col_int]``.
    for col in col_int:
        d_f[col] = transform(d_f[col])
    return d_f

### Function 12: Basic Data Analysis

In [14]:
def col_opre(d_f, col_int='all', opr=None, val=None):
    """Apply an arithmetic operation to columns of ``d_f``, in place.

    NOTE(review): an earlier function in this file is also called
    ``col_opre`` (string operations); this definition replaces it once it
    runs. Consider renaming one of the two.

    Args:
        d_f: DataFrame to modify (columns are overwritten in place).
        col_int: Column name or list of column names. ``'all'`` (default)
            selects every numeric column.
        opr: ``'add'``, ``'sub'``, ``'mul'`` or ``'div'``.
        val: Right-hand operand applied to the selected columns.

    Returns:
        ``d_f`` after the operation, or ``d_f`` unchanged (with a message)
        when ``opr`` is not one of the supported operations.
    """
    import operator

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    col_num = d_f.select_dtypes(include=numerics).columns

    # Bug fix: the default 'all' used to be looked up as a literal column
    # name (KeyError); it now expands to the numeric columns.
    if isinstance(col_int, str) and col_int == 'all':
        col_int = col_num

    operations = {
        'add': operator.add,
        'sub': operator.sub,
        'mul': operator.mul,
        'div': operator.truediv,
    }
    apply_op = operations.get(opr) if isinstance(opr, str) else None
    if apply_op is None:
        print("Invalid Opeation")
        return d_f

    d_f[col_int] = apply_op(d_f[col_int], val)
    return d_f

### Function 13: Deep DataFrame Copy

In [15]:
def deep_copy(d_f):
    """Return an independent deep copy of ``d_f``.

    Args:
        d_f: DataFrame to duplicate.

    Returns:
        The result of ``d_f.copy(deep=True)``; editing it leaves ``d_f``
        untouched.
    """
    duplicate = d_f.copy(deep=True)
    return duplicate

### Function 14: Categorical to Numeric

In [109]:
def cat_num(d_f, col_int, coding_type='label', contain=None):
    """Encode categorical (object dtype) columns as numbers.

    Args:
        d_f: DataFrame to encode. For every coding type except ``'onehot'``
            the selected columns are overwritten in place.
        col_int: Object-dtype columns of interest: ``'all'``, a single
            column name, or an iterable of names.
        coding_type: One of
            * ``'label'`` (default): integer category codes per column.
            * ``'binary'``: 1 where the text contains ``contain``, else 0.
            * ``'ordinal'``: scikit-learn ``OrdinalEncoder``.
            * ``'onehot'``: scikit-learn ``OneHotEncoder``.
        contain: Substring/pattern searched for by ``'binary'`` coding.

    Returns:
        ``d_f`` with the encoded columns for ``'label'``, ``'binary'`` and
        ``'ordinal'``; the encoder output (not a DataFrame) for ``'onehot'``;
        ``d_f`` unchanged (with a message) for an unknown coding type or a
        missing ``contain``; ``None`` (with a message) for an invalid column.
    """
    col_nam = d_f.select_dtypes(include=['object']).columns

    # Checking parameter: column names. Normalised to a list so the
    # per-column loops and the 2-D encoder inputs below both work.
    if isinstance(col_int, str) and col_int == 'all':
        col_int = list(col_nam)
    else:
        col_int = [col_int] if isinstance(col_int, str) else list(col_int)
        if not all(i in col_nam for i in col_int):
            print("Invalid column name")
            return

    if coding_type == 'label':
        # Bug fix: ``.cat`` exists on a Series only, so encode per column.
        for col in col_int:
            d_f[col] = d_f[col].astype('category').cat.codes
        return d_f

    if coding_type == 'binary':
        if contain is None:
            print("'contain' is required for binary coding")
            return d_f
        # Bug fix: ``.str`` exists on a Series only, so encode per column.
        for col in col_int:
            d_f[col] = np.where(d_f[col].str.contains(contain), 1, 0)
        return d_f

    if coding_type == 'ordinal':
        from sklearn.preprocessing import OrdinalEncoder
        # Bug fix: ``d_f[[col_int]]`` nested the list one level too deep.
        d_f[col_int] = OrdinalEncoder().fit_transform(d_f[col_int])
        return d_f

    if coding_type == 'onehot':
        from sklearn.preprocessing import OneHotEncoder
        return OneHotEncoder().fit_transform(d_f[col_int])

    # The old message read "Invalid Ending Method" and went through the
    # notebook-only ``display``; use print like the other validation errors.
    print("Invalid Encoding Method")
    return d_f

### Function 15: ML Algorithms

In [17]:
def ml_algo(x, y, algo='decisiontree', task='Reg', n=3):
    """Split the data, fit one scikit-learn model and report hold-out metrics.

    NOTE(review): a second ``ml_algo`` taking pre-split data is defined later
    in this file and replaces this definition once it runs.

    Args:
        x: 2-D feature matrix accepted by scikit-learn.
        y: Target values aligned with ``x``.
        algo: ``'decisiontree'``, ``'randomforest'`` or ``'knn'``.
        task: ``'Reg'`` for regression or ``'Class'`` for classification.
        n: Hyper-parameter forwarded to the model: ``max_depth`` for the
            tree, ``n_estimators`` for the forest, ``n_neighbors`` for k-NN.

    Returns:
        * ``task == 'Reg'``: ``(mae, mse, r_2, r_2adj, model)``.
        * ``task == 'Class'``:
          ``(c_m, c_r, acc_sc, auc_roc, logloss, model)``. ``auc_roc`` uses
          the probability of the second class, i.e. it assumes a binary
          target.
        * ``None`` (after printing a message) for an unknown ``algo`` or
          ``task``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.33, random_state=1)

    if task == 'Reg':
        regressors = {
            'decisiontree': lambda: DecisionTreeRegressor(max_depth=n),
            'randomforest': lambda: RandomForestRegressor(n_estimators=n, random_state=42),
            'knn': lambda: KNeighborsRegressor(n_neighbors=n),
        }
        if algo not in regressors:
            print("Invalid Algorithm")
            return
        model = regressors[algo]()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r_2 = r2_score(y_test, y_pred)
        # Bug fix: ``multioutput='variance_weighted'`` is not an adjusted
        # R^2. Apply the usual correction for the number of predictors.
        n_obs = len(y_test)
        n_feat = np.shape(X_test)[1]
        dof = n_obs - n_feat - 1
        r_2adj = 1 - (1 - r_2) * (n_obs - 1) / dof if dof > 0 else float('nan')
        return mae, mse, r_2, r_2adj, model

    if task == 'Class':
        classifiers = {
            'decisiontree': lambda: DecisionTreeClassifier(max_depth=n),
            'randomforest': lambda: RandomForestClassifier(n_estimators=n),
            'knn': lambda: KNeighborsClassifier(n_neighbors=n),
        }
        if algo not in classifiers:
            print("Invalid Algorithm")
            return
        model = classifiers[algo]()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        c_m = confusion_matrix(y_test, y_pred)
        c_r = classification_report(y_test, y_pred)
        acc_sc = accuracy_score(y_test, y_pred)
        # Bug fix: AUC was measured on the training split and log-loss on
        # hard labels. Both now use predicted probabilities on the test split.
        proba = model.predict_proba(X_test)
        auc_roc = roc_auc_score(y_test, proba[:, 1])
        logloss = log_loss(y_test, proba, labels=model.classes_)
        return c_m, c_r, acc_sc, auc_roc, logloss, model

    print("Invalid Task")
    return
        

### Function 16: ANOVA Test

In [18]:
def anova(d_f, col_int='all', col_main=None):
    """Run a one-way ANOVA of ``col_main`` against each column in ``col_int``.

    For every factor column an OLS model ``col_main ~ C(factor)`` is fitted
    with statsmodels and its type-2 ANOVA table is displayed.

    Args:
        d_f: DataFrame holding the data.
        col_int: Factor columns: a single column name, an iterable of names,
            or ``'all'`` (default) for every object/category dtype column
            other than ``col_main``.
        col_main: Name of the numeric response column (required).

    Returns:
        None. Prints a message and returns early when ``col_main`` is
        missing / not numeric or a factor column does not exist.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    col_num = d_f.select_dtypes(include=numerics).columns
    col_nam = d_f.columns

    # The response of the OLS model has to be an existing numeric column.
    if col_main is None or col_main not in col_num:
        print("Invalid column name: col_main must be a numeric column")
        return

    # Checking parameter: column names. The original referenced an
    # undefined ``col_nam`` here and always raised NameError.
    if isinstance(col_int, str) and col_int == 'all':
        categorical = d_f.select_dtypes(include=['object', 'category']).columns
        col_int = [c for c in categorical if c != col_main]
    else:
        if isinstance(col_int, str):
            col_int = [col_int]
        if not all(i in col_nam for i in col_int):
            print("Invalid column name")
            return

    # Heavy / optional imports only once the arguments are valid.
    import statsmodels.api as sm
    from statsmodels.formula.api import ols
    try:
        # ``display`` only exists as a global inside IPython/Jupyter.
        from IPython.display import display
    except ImportError:
        display = print

    for col in col_int:
        # Q("...") quotes names that are not valid Python identifiers
        # (spaces etc.); it is now applied to the response as well.
        formula = 'Q("' + col_main + '") ~ C(Q("' + col + '"))'
        model = ols(formula, data=d_f).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        print("\nAnova =>", col_main, " - ", col)
        display(anova_table)

### Function 17: Correlation Heat Map

In [19]:
def corr_hmap(d_f):
    """Draw a heat map of the pairwise correlations in ``d_f``.

    Only numeric/boolean columns are correlated. Rows and columns of the
    correlation matrix that are entirely NaN are dropped before plotting.

    Args:
        d_f: DataFrame whose correlations are plotted.

    Returns:
        None. The heat map is drawn on the current matplotlib figure.
    """
    import seaborn as sns
    sns.set(rc={'figure.figsize': (15, 8)})

    # pandas >= 2.0 no longer drops non-numeric columns in corr() and no
    # longer accepts positional axis/how in dropna(); select and name them
    # explicitly so the function works on old and new pandas alike.
    numeric = d_f.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr().dropna(axis=1, how='all').dropna(axis=0, how='all')

    ax = sns.heatmap(
        corr,
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
    )
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=90,
        horizontalalignment='right',
    )
    ax.set_yticklabels(
        ax.get_yticklabels(),
        rotation=0,
        horizontalalignment='right',
    )

### Function 18: Scatter Plot

In [20]:
def scatter_plot(d_f, col_int=None):
    """Scatter-plot one numeric column against another.

    Args:
        d_f: DataFrame holding the data.
        col_int: Exactly two numeric column names, ``[x_column, y_column]``.

    Returns:
        None. Prints ``"Invalid column name"`` and returns without plotting
        when ``col_int`` is missing, does not have two entries, or names a
        non-numeric / unknown column.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    col_nam = d_f.select_dtypes(include=numerics).columns

    # Bug fix: the three checks were joined with ``&``, which evaluated all
    # of them (crashing on None) and rejected the input only when every
    # check failed at once. Any single failure must reject.
    if (col_int is None
            or len(col_int) != 2
            or not all(i in col_nam for i in col_int)):
        print("Invalid column name")
        return

    x_name, y_name = col_int[0], col_int[1]
    # Set the figure defaults before drawing so they can apply to the figure
    # created by this call rather than only to later ones.
    plt.rcParams.update({'figure.figsize': (10, 8), 'figure.dpi': 100})
    plt.scatter(d_f[x_name], d_f[y_name])
    plt.title(x_name + ' vs ' + y_name + ' Scatter plot')
    plt.xlabel(x_name + ' X- Value')
    plt.ylabel(y_name + ' Y- Value')
    plt.show()

## Template Extension

### 1) Anova, T-test, correlation heatmaps, normality tests
ANOVA, the correlation heat map and the normality test were already implemented above (compiled here once again).

In [21]:
def anova(d_f, col_int='all', col_main=None):
    """Run a one-way ANOVA of ``col_main`` against each column in ``col_int``.

    For every factor column an OLS model ``col_main ~ C(factor)`` is fitted
    with statsmodels and its type-2 ANOVA table is displayed.

    Args:
        d_f: DataFrame holding the data.
        col_int: Factor columns: a single column name, an iterable of names,
            or ``'all'`` (default) for every object/category dtype column
            other than ``col_main``.
        col_main: Name of the numeric response column (required).

    Returns:
        None. Prints a message and returns early when ``col_main`` is
        missing / not numeric or a factor column does not exist.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    col_num = d_f.select_dtypes(include=numerics).columns
    col_nam = d_f.columns

    # The response of the OLS model has to be an existing numeric column.
    if col_main is None or col_main not in col_num:
        print("Invalid column name: col_main must be a numeric column")
        return

    # Checking parameter: column names. The original referenced an
    # undefined ``col_nam`` here and always raised NameError.
    if isinstance(col_int, str) and col_int == 'all':
        categorical = d_f.select_dtypes(include=['object', 'category']).columns
        col_int = [c for c in categorical if c != col_main]
    else:
        if isinstance(col_int, str):
            col_int = [col_int]
        if not all(i in col_nam for i in col_int):
            print("Invalid column name")
            return

    # Heavy / optional imports only once the arguments are valid.
    import statsmodels.api as sm
    from statsmodels.formula.api import ols
    try:
        # ``display`` only exists as a global inside IPython/Jupyter.
        from IPython.display import display
    except ImportError:
        display = print

    for col in col_int:
        # Q("...") quotes names that are not valid Python identifiers
        # (spaces etc.); it is now applied to the response as well.
        formula = 'Q("' + col_main + '") ~ C(Q("' + col + '"))'
        model = ols(formula, data=d_f).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        print("\nAnova =>", col_main, " - ", col)
        display(anova_table)

## T Test
def t_test(d_f, col_ind=(1, 2)):
    """Compute the two-sample t statistic for two columns of ``d_f``.

    The statistic is ``(mean1 - mean2) / sqrt(se1**2 + se2**2)`` where each
    ``se`` is the sample standard deviation divided by ``sqrt(n)`` (the
    unpooled form, so equal variances are not assumed).

    Args:
        d_f: DataFrame holding the samples.
        col_ind: Positional indices of the two columns to compare.
            Defaults to columns 1 and 2.

    Returns:
        The t statistic as a float. It is also printed, as before; earlier
        versions returned nothing, so the value could not be reused.

    Raises:
        statistics.StatisticsError: if a column has fewer than two values.
        ZeroDivisionError: if both columns have zero variance.
    """
    # tolist() converts numpy scalars to plain Python numbers, which the
    # ``statistics`` module handles reliably.
    data1 = d_f.iloc[:, col_ind[0]].tolist()
    data2 = d_f.iloc[:, col_ind[1]].tolist()
    # calculate means
    mean1, mean2 = st.mean(data1), st.mean(data2)
    # calculate sample standard deviations
    std1, std2 = st.stdev(data1), st.stdev(data2)
    # calculate standard errors
    n1, n2 = len(data1), len(data2)
    se1, se2 = std1 / mt.sqrt(n1), std2 / mt.sqrt(n2)
    # standard error on the difference between the samples
    sed = mt.sqrt(se1 ** 2.0 + se2 ** 2.0)
    # calculate the t statistic
    t_stat = (mean1 - mean2) / sed
    print('T Test Statistics=%.3f' % (t_stat))
    return t_stat

## Normality Test
def norm(d_f,col_ind=[1]):
    """Run the Shapiro-Wilk normality test on columns selected by position.

    For each position in ``col_ind`` the test statistic and p-value are
    printed, followed by the verdict at the 5% level, and a histogram of
    the column is drawn.

    Args:
        d_f: DataFrame holding the data.
        col_ind: Positional indices of the columns to test.
    """
    alpha = 0.05
    for position in col_ind:
        column = d_f.iloc[:, position]
        stat, p = shapiro(column)
        print('Statistics=%.3f, p=%.3f' % (stat, p))
        # interpret
        verdict = ('Sample looks Gaussian (fail to reject H0)'
                   if p > alpha
                   else 'Sample does not look Gaussian (reject H0)')
        print(verdict)
        column.hist()
        
def corr_hmap(d_f):
    """Draw a heat map of the pairwise correlations in ``d_f``.

    Only numeric/boolean columns are correlated. Rows and columns of the
    correlation matrix that are entirely NaN are dropped before plotting.

    Args:
        d_f: DataFrame whose correlations are plotted.

    Returns:
        None. The heat map is drawn on the current matplotlib figure.
    """
    import seaborn as sns
    sns.set(rc={'figure.figsize': (15, 8)})

    # pandas >= 2.0 no longer drops non-numeric columns in corr() and no
    # longer accepts positional axis/how in dropna(); select and name them
    # explicitly so the function works on old and new pandas alike.
    numeric = d_f.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr().dropna(axis=1, how='all').dropna(axis=0, how='all')

    ax = sns.heatmap(
        corr,
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
    )
    ax.set_xticklabels(
        ax.get_xticklabels(),
        rotation=90,
        horizontalalignment='right',
    )
    ax.set_yticklabels(
        ax.get_yticklabels(),
        rotation=0,
        horizontalalignment='right',
    )


### 2) Classification and Regression: Remaining Algorithms


In [110]:
def ml_algo(X_train, X_test, y_train, y_test, algo='decisiontree', task='Reg', n=3):
    """Fit one scikit-learn model on pre-split data and report test metrics.

    NOTE(review): an earlier ``ml_algo(x, y, ...)`` in this file has the same
    name; this definition replaces it once it runs.

    Args:
        X_train, X_test: 2-D feature matrices for fitting / evaluation.
        y_train, y_test: Matching targets.
        algo: Model to use.
            * ``task == 'Reg'``: ``'decisiontree'``, ``'randomforest'``,
              ``'knn'``, ``'lreg'``, ``'svm'``, ``'gboost'``, ``'adaboost'``.
            * ``task == 'Class'``: ``'decisiontree'``, ``'randomforest'``,
              ``'knn'``, ``'gboost'``, ``'adaboost'``, ``'svc'``, ``'naive'``,
              ``'xgboost'``, ``'mlp'``, ``'logit'``.
        task: ``'Reg'`` for regression or ``'Class'`` for classification.
        n: ``max_depth`` for the tree, ``n_estimators`` for the forest,
            ``n_neighbors`` for k-NN. Ignored by the other models.

    Returns:
        * ``task == 'Reg'``: ``(mae, mse, r_2, r_2adj, model)``.
        * ``task == 'Class'``: ``(c_m, c_r, acc_sc, auc_roc, logloss, model,
          pre_l, recall_l)`` with precision and recall in percent.
          ``auc_roc``, precision and recall assume a binary target.
        * ``None`` (after printing a message) for an unknown ``algo`` or
          ``task``.
    """
    if task == 'Reg':
        regressors = {
            'decisiontree': lambda: DecisionTreeRegressor(max_depth=n),
            'randomforest': lambda: RandomForestRegressor(n_estimators=n, random_state=42),
            'knn': lambda: KNeighborsRegressor(n_neighbors=n),
            'lreg': lambda: LinearRegression(),
            'svm': lambda: SVR(kernel="rbf"),
            'gboost': lambda: GradientBoostingRegressor(),
            'adaboost': lambda: AdaBoostRegressor(random_state=0, n_estimators=100),
        }
        if algo not in regressors:
            print("Invalid Algorithm")
            return
        model = regressors[algo]()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r_2 = r2_score(y_test, y_pred)
        # Bug fix: ``multioutput='variance_weighted'`` is not an adjusted
        # R^2. Apply the usual correction for the number of predictors.
        n_obs = len(y_test)
        n_feat = np.shape(X_test)[1]
        dof = n_obs - n_feat - 1
        r_2adj = 1 - (1 - r_2) * (n_obs - 1) / dof if dof > 0 else float('nan')
        return mae, mse, r_2, r_2adj, model

    if task == 'Class':
        classifiers = {
            'decisiontree': lambda: DecisionTreeClassifier(max_depth=n),
            'randomforest': lambda: RandomForestClassifier(n_estimators=n),
            'knn': lambda: KNeighborsClassifier(n_neighbors=n),
            'gboost': lambda: GradientBoostingClassifier(),
            'adaboost': lambda: AdaBoostClassifier(n_estimators=100, random_state=0),
            # probability=True is required for predict_proba, which the
            # AUC / log-loss computation below relies on.
            'svc': lambda: svm.SVC(probability=True),
            'naive': lambda: GaussianNB(),
            # NOTE(review): XGBClassifier is not imported anywhere in the
            # visible source, so this option raises NameError unless
            # ``from xgboost import XGBClassifier`` is added to the imports.
            'xgboost': lambda: XGBClassifier(random_state=1, learning_rate=0.01),
            'mlp': lambda: MLPClassifier(hidden_layer_sizes=5),
            'logit': lambda: LogisticRegression(),
        }
        if algo not in classifiers:
            print("Invalid Algorithm")
            return
        model = classifiers[algo]()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        c_m = confusion_matrix(y_test, y_pred)
        c_r = classification_report(y_test, y_pred)
        acc_sc = accuracy_score(y_test, y_pred)
        # Bug fix: AUC was measured on the training split and log-loss on
        # hard labels. Both now use predicted probabilities on the test split.
        proba = model.predict_proba(X_test)
        auc_roc = roc_auc_score(y_test, proba[:, 1])
        logloss = log_loss(y_test, proba, labels=model.classes_)
        # Precision and recall, in percent
        pre_l = precision_score(y_test, y_pred) * 100
        recall_l = recall_score(y_test, y_pred) * 100
        return c_m, c_r, acc_sc, auc_roc, logloss, model, pre_l, recall_l

    print("Invalid Task")
    return

### Feature selection algorithms
Selecting features according to the k highest scores was already implemented in the last assignment.<br>
Now Implementing RFFS

In [43]:
def feature_selectk(d_f, f_s):
    """Select the ``f_s`` best features with an ANOVA F-test (SelectKBest).

    Args:
        d_f: DataFrame with the output variable in the first column and the
            candidate features in all remaining columns.
        f_s: Number of features to keep (``k`` of ``SelectKBest``).

    Returns:
        ``(x_fs, y, fs)``: the reduced feature matrix, the target as a
        one-column DataFrame, and the fitted ``SelectKBest`` object.
    """
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_classif

    # Bug fix: the slice end was ``len(d_f) - 1``, i.e. based on the number
    # of ROWS, which silently dropped features whenever the frame had fewer
    # rows than columns. Use every column after the target, as
    # feature_select_rffs does.
    x = d_f.iloc[:, 1:]
    y = d_f.iloc[:, :1]
    fs = SelectKBest(score_func=f_classif, k=f_s)
    # learn relationship from the data; scikit-learn expects a 1-D target
    fs.fit(x, y.values.ravel())
    # transform the input data
    x_fs = fs.transform(x)
    return x_fs, y, fs

def feature_select_rffs(d_f, reg=True):
    """Select features by random-forest importance (``SelectFromModel``).

    Args:
        d_f: DataFrame with the output variable in the first column and the
            candidate features in all remaining columns.
        reg: ``True`` (default) to rank with a ``RandomForestRegressor``,
            ``False`` to rank with a ``RandomForestClassifier``.

    Returns:
        Index of the selected feature column names (also printed).
    """
    from sklearn.feature_selection import SelectFromModel

    x = d_f.iloc[:, 1:]
    # 1-D target: passing a one-column DataFrame makes scikit-learn emit a
    # DataConversionWarning on every fit.
    y = d_f.iloc[:, 0]
    if reg:
        estimator = RandomForestRegressor(n_estimators=50)
    else:
        estimator = RandomForestClassifier(n_estimators=50)
    sel = SelectFromModel(estimator)
    sel.fit(x, y)
    selected_feat = x.columns[sel.get_support()]
    print(selected_feat)
    return selected_feat

def importantfeat_rffs(d_f, reg=True):
    """Print and plot random-forest feature importances.

    Args:
        d_f: DataFrame with the output variable in the first column and the
            features in all remaining columns.
        reg: ``True`` (default) to fit a ``RandomForestRegressor``, ``False``
            to fit a ``RandomForestClassifier``.

    Returns:
        None. One ``Feature: i, Score: s`` line is printed per feature
        (``i`` is the feature's position) and a bar chart is shown.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor
    from matplotlib import pyplot

    # define dataset
    X = d_f.iloc[:, 1:]
    y = d_f.iloc[:, 0]
    # Bug fix: ``reg`` used to be ignored and a regressor was always fitted.
    model = RandomForestRegressor() if reg else RandomForestClassifier()
    model.fit(X, y)
    importance = model.feature_importances_
    # summarize feature importance
    for i, v in enumerate(importance):
        print('Feature: %0d, Score: %.5f' % (i, v))
    # plot feature importance
    pyplot.bar(range(len(importance)), importance)
    pyplot.show()

### 4) Cross Validation

In [80]:
def ml_algo_cv(X, y, algo='decisiontree', task='Reg', n=3, split=10, kftype='skfold'):
    """Cross-validate a classifier and average its metrics over the folds.

    Each fold is evaluated with ``ml_algo(trainX, testX, trainY, testY, ...)``
    (the pre-split variant defined in this file).

    Args:
        X: Feature DataFrame (indexed positionally with ``.iloc``).
        y: Target Series/DataFrame aligned with ``X``.
        algo: Algorithm name forwarded to ``ml_algo``.
        task: Forwarded to ``ml_algo``. Only ``'Class'`` is supported because
            the averaged metrics are classification metrics.
        n: Model hyper-parameter forwarded to ``ml_algo``.
        split: Number of folds.
        kftype: ``'skfold'`` (StratifiedKFold, default) or ``'kfold'``.

    Returns:
        ``(accuracy, auc, precision, recall)`` averaged over all folds, or
        ``None`` (after printing a message) for an unsupported ``task`` /
        ``kftype`` or when ``ml_algo`` rejects the algorithm.
    """
    if task != 'Class':
        # ml_algo returns a different, shorter tuple for regression, which
        # cannot be unpacked below.
        print("Cross validation is only available for task = 'Class'")
        return

    if kftype == 'skfold':
        kf = StratifiedKFold(n_splits=split, random_state=None, shuffle=False)
    elif kftype == 'kfold':
        # KFold is not part of the module-level imports.
        from sklearn.model_selection import KFold
        kf = KFold(n_splits=split, random_state=None, shuffle=False)
    else:
        print("Invalid k-fold type, select 'skfold','kfold'")
        return

    acc_sum = auc_sum = pre_sum = recall_sum = 0
    folds = 0
    for train_index, test_index in kf.split(X, y):
        trainX, testX = X.iloc[train_index], X.iloc[test_index]
        trainY, testY = y.iloc[train_index], y.iloc[test_index]
        # Bug fix: ``n`` was hard-coded to 3 instead of being forwarded.
        result = ml_algo(trainX, testX, trainY, testY, algo=algo, task=task, n=n)
        if result is None:
            return
        cm, cr, acc, auc, ll, mdl, pre, recall = result
        acc_sum += acc
        auc_sum += auc
        pre_sum += pre
        recall_sum += recall
        folds += 1

    # Bug fix: the AUC average used only the last fold's value (``auc/i``).
    return acc_sum / folds, auc_sum / folds, pre_sum / folds, recall_sum / folds

### 6) Class Imbalance

In [86]:
def class_imbalance(d_f,up=0.75,down=0.75,minor=1,col=None):
    """Rebalance a dataset by resampling both classes.

    Rows where ``d_f[col] == minor`` form the minority class; all other rows
    form the majority class. The minority class is drawn with replacement
    and the majority class without replacement, both to a size expressed as
    a fraction of the majority class.

    Args:
        d_f: Source DataFrame (not modified).
        up: Size of the resampled minority class as a fraction of the
            majority class size (rounded up).
        down: Size of the resampled majority class as a fraction of its
            original size (rounded up).
        minor: Label value that identifies the minority class.
        col: Name of the label column.

    Returns:
        The resampled majority rows followed by the resampled minority rows.
        The new class counts are printed.
    """
    majority_rows = d_f[d_f[col]!=minor]
    minority_rows = d_f[d_f[col]==minor]

    majority_size = len(majority_rows)
    n_minority = mt.ceil(majority_size * up)
    n_majority = mt.ceil(majority_size * down)

    # Minority class: sample with replacement, fixed seed for reproducibility
    minority_resampled = resample(
        minority_rows,
        replace=True,
        n_samples=n_minority,
        random_state=123,
    )
    # Majority class: sample without replacement, same fixed seed
    majority_resampled = resample(
        majority_rows,
        replace=False,
        n_samples=n_majority,
        random_state=123,
    )

    # Majority rows first, then minority rows
    balanced = pd.concat([majority_resampled, minority_resampled])
    # Display new class counts
    print(balanced[col].value_counts())
    return balanced

### 07) Other Helper Function
Confusion Matrix Heat Map

In [87]:
def con_mat(con_mat, lab=('x-label', 'y-label')):
    """Draw a confusion matrix as an annotated heat map.

    Args:
        con_mat: Confusion matrix of integer counts (e.g. the output of
            ``sklearn.metrics.confusion_matrix``).
        lab: Class names used as tick labels on both axes, in matrix order.

    Returns:
        None. The heat map is drawn on the current matplotlib figure.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    ax = plt.subplot()
    # Bug fix: the body plotted an undefined global ``cm`` instead of the
    # matrix that was passed in. annot=True writes the counts into the cells.
    sns.heatmap(con_mat, annot=True, fmt="d", ax=ax, cmap="YlGnBu")

    # labels, title and ticks
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    tick_names = list(lab)
    ax.xaxis.set_ticklabels(tick_names)
    ax.yaxis.set_ticklabels(tick_names)

### 08 - 12) ML Combination

In [108]:
def ml_com(d_f, colum, algor='decisiontree', fs=True, cv=True, ci=True):
    """Run a classification pipeline with optional pre-processing steps.

    The steps are applied in this order, each only when its flag is true:
    class rebalancing (``ci``), random-forest feature selection (``fs``),
    and k-fold cross validation (``cv``). Without ``cv`` a single 67/33
    train/test split is evaluated.

    Args:
        d_f: DataFrame containing the features and the label column.
        colum: Name of the label column.
        algor: Algorithm name forwarded to ``ml_algo`` / ``ml_algo_cv``.
        fs: FS - apply feature selection (``feature_select_rffs``).
        cv: CV - evaluate with 10-fold stratified cross validation.
        ci: CI - rebalance the classes first (``class_imbalance`` with
            ``minor=1``).

    Returns:
        ``(acc, auc, pre, recall)`` for the fitted classifier; averaged over
        the folds when ``cv`` is true.
    """
    # NOTE(review): resampling happens before the train/test or fold split,
    # so duplicated minority rows can end up on both sides of a split and
    # inflate the scores.
    if ci:
        data = class_imbalance(d_f, up=0.75, down=0.75, minor=1, col=colum)
    else:
        data = d_f

    # Bug fix: ``colum`` is a column NAME (it is also used as ``d_f[colum]``),
    # so it must be dropped by label (typ=0), not by position (typ=1).
    x = col_drop(data, colum, typ=0)
    y = data[colum]

    if fs:
        # feature_select_rffs expects the target in the first column.
        target_first = data[[colum] + [c for c in data.columns if c != colum]]
        fet = feature_select_rffs(target_first, reg=False)
        x = x[fet]

    # Bug fix: the branches were chosen with ``~flag``; on a Python bool
    # ``~`` yields -1/-2 and is never falsy, so only four of the eight flag
    # combinations did what the flags said. Each flag is now applied
    # independently.
    if cv:
        acc, auc, pre, recall = ml_algo_cv(
            x, y, algo=algor, task='Class', n=3, split=10, kftype='skfold')
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            x, y, test_size=0.33, random_state=1)
        cm, cr, acc, auc, ll, model, pre, recall = ml_algo(
            X_train, X_test, y_train, y_test, algo=algor, task='Class', n=3)

    return acc, auc, pre, recall
