In [9]:
import numpy as np
import os
import glob
#import torch
from numpy import genfromtxt
import data_loader as dl
import pandas as pd
from sklearn.model_selection import cross_val_score
#from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
#from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
#from statsmodels.tools import categorical
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import sklearn

In [2]:
def compute_accuracy(predn, truth):
    '''
    Computes classification accuracy given 2 numpy arrays: the predicted class labels and the
    ground-truth class labels.
    
    Args:
        predn : numpy ndarray of predicted class labels (any integer dtype)
        truth : ground truth numpy ndarray of class labels (any integer dtype), same shape as predn
        
    Returns:
        accuracy : Float value denoting classification accuracy, i.e. the number of matching
                   entries divided by the length of the first axis
                   
    Raises:
        AssertionError : if an argument is not an ndarray, the shapes differ, or a dtype is not
                         an integer type
    '''
    
    assert isinstance(predn,np.ndarray) and isinstance(truth,np.ndarray)
    assert(len(predn) == len(truth))
    # Equal lengths are not enough: (n,) compared against (n,1) would broadcast to an (n,n)
    # comparison and silently produce a meaningless accuracy.
    assert predn.shape == truth.shape, "predn and truth must have identical shapes"
    # Accept every integer dtype (int32, int64, ...) instead of only int64; label arrays are not
    # guaranteed to be 64-bit on every platform or after every encoder.
    assert np.issubdtype(predn.dtype,np.integer) and np.issubdtype(truth.dtype,np.integer)
    
    n_samples = predn.shape[0]
    # Summing the boolean match mask counts the correct predictions.
    return (np.sum(predn == truth)/n_samples)

In [3]:
# Create the project-local data loader (data_loader.DataLoader) for the attrition CSV.
# The path is relative, so it is resolved against the notebook's working directory.
handle = dl.DataLoader('./WA_Fn-UseC_-HR-Employee-Attrition.csv')

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [4]:
# Pull two splits from the loader; each call is unpacked into four values.
# NOTE(review): presumably these are (features, labels, fnames, newfnames) and the default mode is
# the training split - confirm against data_loader.
# The second call rebinds fnames/newfnames, so the values kept are those of the 'testing' call.
train, ltrain,fnames,newfnames = handle.get_data()
test,ltest,fnames,newfnames = handle.get_data(mode = 'testing')

In [5]:
class MLClassifier():
    '''
    Multipurpose classifier wrapper: holds one train/test split and fits one of several
    scikit-learn estimators on it, reporting train and test accuracy to a text file (and
    optionally to the screen). The most recently fitted estimator is kept in self._forest.
    '''
    
    def __init__(self,Xtrain,ytrain,Xtest,ytest,fnames = None,newfnames = None):
        '''
        Initialize a multipurpose classifier from training data,training labels, testing data and testing
        labels. Provide the arguments fnames and newfnames from the data loader function.
        
        Args:
            Xtrain : Training matrix in the form n_samples x n_features
            ytrain : Labels of training matrix, first dimension n_samples (int64)
            Xtest : Testing matrix in the form of n_samples x  n_features
            ytest : Labels of testing matrix, first dimension n_samples (int64)
            fnames : Provide the fnames returned by the data loader (Only if data needs to be plotted)
            newfnames : Provide the newfnames returned by the data loader (Only if data needs to be plotted)
        Returns:
            Instance of the multipurpose classifier
        Raises:
            AssertionError : if an input is not an ndarray, a data matrix is not 2-D, sample counts
                             disagree, or a label array is not int64
        '''
        
        assert isinstance(Xtrain,np.ndarray) and isinstance(ytrain,np.ndarray) and isinstance(Xtest,np.ndarray) and isinstance(ytest,np.ndarray)
        assert (len(Xtrain.shape) == 2) and (len(Xtest.shape) == 2)
        assert (Xtrain.shape[0] == ytrain.shape[0]) and (Xtest.shape[0] == ytest.shape[0])
        
        assert(ytrain.dtype == 'int64') and (ytest.dtype == 'int64')
        
        self._train = Xtrain
        self._test  = Xtest
        self._ltrain = ytrain
        self._ltest = ytest
        # Last fitted estimator; None until one of the classifier methods has been called.
        self._forest = None
        self._fnames = fnames
        self._newfnames = newfnames
        
    @staticmethod
    def _validate_mode(mode):
        '''Asserts that mode is one of the two supported reporting modes, 'v' or 'q'.'''
        assert isinstance(mode,str)
        assert (mode == 'v') or (mode == 'q')
        
    def _fit_and_report(self,model,label,fname,mode):
        '''
        Fits model on the training split, scores it on both splits, stores it as the current
        estimator and writes the accuracy report.
        
        Args:
            model : unfitted estimator exposing fit(X, y) and predict(X)
            label : classifier name inserted into the report text
            fname : path of the text file that receives the report (overwritten)
            mode : 'v' also prints the report on screen, 'q' only writes the file
        '''
        model.fit(self._train,self._ltrain)
        test_acc = compute_accuracy(model.predict(self._test),self._ltest)
        train_acc = compute_accuracy(model.predict(self._train),self._ltrain)
        self._forest = model
        
        report = "Training accuracy for %s Classifier is %2.5f\nTesting Accuracy is %2.5f"%(label,train_acc,test_acc)
        # The file is written in both modes; only the on-screen echo depends on mode.
        with open(fname,'w') as f:
            print(report,file = f)
        if(mode == 'v'):
            print(report)
            
    @staticmethod
    def _aggregate_importances(importances,fnames):
        '''
        Collapses per-column importances into the aggregated vector used for ranking.
        
        Columns whose name does not end in '0' are kept as they are, in order. Columns whose name
        ends in '0' are grouped by the part of the name before the first '_' and each group is
        replaced by the mean of its values. Group means are appended after the kept columns in
        order of first appearance.
        
        Args:
            importances : iterable of per-column importance values
            fnames : iterable of column names, aligned with importances
        Returns:
            numpy ndarray of aggregated importances
        '''
        kept = []
        groups = dict()  # group key -> [running sum, member count]
        for value,name in zip(importances,fnames):
            if(name[-1] != '0'):
                kept.append(value)
                continue
            key = name.split('_')[0]
            if key in groups:
                groups[key][0] += value
                groups[key][1] += 1
            else:
                groups[key] = [value,1]
        return np.asarray(kept + [total/count for total,count in groups.values()])
        
    def ExtraTrees(self,mode = 'v',random_state = None):
        '''
        Builds an extra trees classifier and outputs accuracies into a text file
        
        Args:
            mode : v (default) : Accuracies printed on screen
                   q : Quiet mode
            random_state : optional seed forwarded to the estimator for reproducible runs
                           (default None keeps the unseeded behaviour)
        Returns:
            Alters the instance of the object to make it into an extra trees classifier. Changes differ only
            when PlotFeatureImportance is called.
        
        '''
        self._validate_mode(mode)
        forest = ExtraTreesClassifier(n_estimators = 25,max_features= None,oob_score = True,bootstrap= True,random_state = random_state)
        self._fit_and_report(forest,'ExtraTrees','ExtraTrees_result.txt',mode)
            
    def RandomForest(self,mode = 'v',random_state = None):
        '''
        Builds an random forest classifier and outputs accuracies into a text file
        
        Args:
            mode : v (default) : Accuracies printed on screen
                   q : Quiet mode
            random_state : optional seed forwarded to the estimator for reproducible runs
                           (default None keeps the unseeded behaviour)
        Returns:
            Alters the instance of the object to make it into an random forest classifier. Changes differ only
            when PlotFeatureImportance is called.
        
        '''
        self._validate_mode(mode)
        forest = RandomForestClassifier(n_estimators = 20,max_depth= None,max_features= None,oob_score = True,class_weight = 'balanced',random_state = random_state)
        self._fit_and_report(forest,'RandomForest','RandomForest_result.txt',mode)
            
    def LinearSVM(self,mode = 'v'):
        '''
        Builds an linear support vector machine classifier and outputs accuracies into a text file
        
        Args:
            mode : v (default) : Accuracies printed on screen
                   q : Quiet mode
        Returns:
            Alters the instance of the object to make it into an linear support vector machine classifier. Changes differ only
            when PlotFeatureImportance is called.
        
        '''
        self._validate_mode(mode)
        forest = LinearSVC(dual = False,max_iter = 7000,C=0.1,penalty='l1')
        self._fit_and_report(forest,'LinearSVM','LinearSVM_result.txt',mode)
    
    def RadSVM(self,mode = 'v'):
        '''
        Builds an Radial basis function SVM and outputs accuracies into a text file
        
        Args:
            mode : v (default) : Accuracies printed on screen
                   q : Quiet mode
        Returns:
            Alters the instance of the object to make it into an Radial basis function SVM. Changes differ only
            when PlotFeatureImportance is called.
        
        '''
        self._validate_mode(mode)
        forest = SVC(gamma = 'scale')
        self._fit_and_report(forest,'Radial Basis Function SVM','RadSVM_result.txt',mode)
    
    def LogRegression(self,mode = 'v'):
        '''
        Builds an Logistic Regression Classifier and outputs accuracies into a text file
        
        Args:
            mode : v (default) : Accuracies printed on screen
                   q : Quiet mode
        Returns:
            Alters the instance of the object to make it into an Logistic Regression Classifier. Changes differ only
            when PlotFeatureImportance is called.
        
        '''
        self._validate_mode(mode)
        forest = LogisticRegression(solver = 'lbfgs', max_iter = 500)
        self._fit_and_report(forest,'Logistic Regression','LogRegression_result.txt',mode)
    
    def PlotFeatureImportances(self):
        '''
        Plots the feature importances only for the extra trees or random forest classifier
        
        Args:
            None
        Returns:
            None
        Raises:
            AssertionError : if no estimator has been fitted yet, the fitted estimator is not an
                             extra trees / random forest classifier, or fnames / newfnames were
                             not supplied to the constructor
        Side Effects:
            Prints the ranking of (at most) the 10 most important aggregated features, draws a bar
            chart of them and saves it to './feature_importance'.
        
        '''
        assert self._forest is not None
        # isinstance on the imported classes instead of the private module path
        # sklearn.ensemble.forest, which newer scikit-learn releases no longer provide.
        assert isinstance(self._forest,(ExtraTreesClassifier,RandomForestClassifier))
        assert self._fnames is not None and self._newfnames is not None, "fnames and newfnames are required for plotting"
 
        forest = self._forest
        fnames = self._fnames
        newfnames = self._newfnames
        
        importances = self._aggregate_importances(forest.feature_importances_,fnames)
        # Aggregate every tree the same way before taking the spread, so that the error bars are
        # indexed consistently with the aggregated importances (the raw per-column std is not).
        per_tree = np.asarray([self._aggregate_importances(tree.feature_importances_,fnames)
                               for tree in forest.estimators_])
        std = np.std(per_tree,axis=0)
        
        indices = np.argsort(importances)[::-1]
        indices = indices[:10]
        # Fewer than 10 aggregated features is possible; never index past what exists.
        n_top = len(indices)
        
        fig,ax = plt.subplots()
        tosave  = './feature_importance'
        # Print the feature ranking
        print("Feature ranking:")

        for f in range(n_top):
            print("%d. feature %d - %s (%f)" % (f + 1, indices[f], newfnames[indices[f]],importances[indices[f]]))

        # Plot the feature importances of the forest
        ax.set_title("Feature importances")
        ax.bar(range(n_top), importances[indices],yerr=std[indices], align="center",
               color="r")
        ax.set_xticks(range(n_top))
        ax.set_xticklabels(indices)
        ax.set_xlim([-1, n_top])
        fig.savefig(tosave)
    