# Class Imbalance

Import libraries

In [7]:
import math

import numpy as np
import pandas as pd

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split
import sklearn.datasets as datasets
from sklearn import metrics
from sklearn import preprocessing


Ensemble 
Includes functions for 10-fold cross validation, ranking

In [8]:
class Ensembles:

    def __init__(self):
        self.bagging = None
        self.boosting = None
        self.randomForest = None
        self.voting = None
        self.ensembles = ["AdaBoost", "Bagging", "RandomForest", "Voting"]
        self.iterations = 10

        # Results
        self.accuracy_results = pd.DataFrame(
            columns=['Dataset', 'AdaBoost', 'Bagging', 'RandomForest', 'Voting'])

        # Data
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

        # Prediction data
        self.y_predicted_train = None
        self.y_predicted_test = None

    def train(self, data_x, data_y, boosting=True, bagging=True, randomForest=True, voting=True, dataset=""):
        self.dataset = dataset
        self.setupResults()

        # Execute 10-fold cross validation and keep the mean score of train and test accuracy
        for i in range(0, self.iterations):
            # Create new data split
            self.createData(data_x, data_y)
            # Train and predict using Boosting ensemble
            if boosting is True:
                # Create model
                self.boosting = AdaBoostClassifier(random_state=0)
                # Train predict and log results for this model
                self.fitPredict(self.boosting, "AdaBoost")
            # Train and predict using RandomForest ensemble
            if randomForest is True:
                # Create model
                self.randomForest = RandomForestClassifier(
                    random_state=0, n_estimators=50)
                # Train predict and log results for this model
                self.fitPredict(self.randomForest, "RandomForest")
            # Train and predict using Bagging ensemble
            if bagging is True:
                # Create model
                self.bagging = BaggingClassifier(
                    random_state=0)
                # Train predict and log results for this model
                self.fitPredict(self.bagging, "Bagging")
            if voting is True:
                # Train and predict using Voting ensemble
                rf = RandomForestClassifier(random_state=0, n_estimators=10)
                knn = KNeighborsClassifier()
                svc = SVC(random_state=0, gamma='auto')
                mnb = MultinomialNB()

                self.voting = VotingClassifier(estimators=[(
                    'Random Forests', rf), ('KNeighbors', knn), ('SVC', svc), ('MultinomialNB', mnb)], voting='hard')
                # Train predict and log results for this model
                self.fitPredict(self.voting, "Voting")

        # Calculate mean for each ensemble
        self.meanResults()

        # Log final results
        for ensembe in self.ensembles:
            self.logEnsembleResults(ensembe)
        self.logResults()
        self.printEnsembleResults()

    def fitPredict(self, model, name):
        # Train model
        model.fit(self.x_train, self.y_train)
        # Predict
        self.y_predicted_train = model.predict(self.x_train)
        self.y_predicted_test = model.predict(self.x_test)
        # Log results
        self.addResults(name)

    def createData(self, data_x, data_y):
        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            data_x, data_y, test_size=0.30, random_state=0)

    def addResults(self, name):
        result_train = metrics.accuracy_score(self.y_train, self.y_predicted_train)
        result_test = metrics.accuracy_score(self.y_test, self.y_predicted_test)
        self.results[name]["train"] += result_train
        self.results[name]["test"] += result_test

    def meanResults(self):
        for ensemble in self.ensembles:
            self.results[ensemble]["train"] /= self.iterations
            self.results[ensemble]["test"] /= self.iterations
        self.calculateRanking()

    def calculateRanking(self):
        ensembles = self.ensembles.copy()
        i = 1
        while len(ensembles) > 0:
            max_ensemble = None
            duplicates = 0
            for ensemble in ensembles:
                if max_ensemble == None:
                    max_ensemble = ensemble
                else:
                    if self.results[max_ensemble]["test"] < self.results[ensemble]["test"]:
                        max_ensemble = ensemble
                        duplicates = 0
                    elif self.results[max_ensemble]["test"] == self.results[ensemble]["test"]:
                        duplicates += 1
            if duplicates == 0:
                self.ranking[max_ensemble] = " (" + str(i) + ")"
                ensembles.remove(max_ensemble)
            else:
                duplicates += 1
                ranking = i + (1 / duplicates)
                j = 0
                while duplicates > 0:
                    if self.results[max_ensemble]["test"] == self.results[ensembles[j]]["test"]:
                        self.ranking[ensembles[j]] = " (" + str(ranking) + ")"
                        ensembles.remove(ensembles[j])
                        duplicates -= 1
                    else:
                        j += 1
            i += 1

    def logResults(self):
        self.accuracy_results = self.accuracy_results.append({
            'Dataset': self.dataset,
            self.ensembles[0]: str(float("%0.4f" % self.results[self.ensembles[0]]["test"])) + self.ranking[self.ensembles[0]],
            self.ensembles[1]: str(float("%0.4f" % self.results[self.ensembles[1]]["test"])) + self.ranking[self.ensembles[1]],
            self.ensembles[2]: str(float("%0.4f" % self.results[self.ensembles[2]]["test"])) + self.ranking[self.ensembles[2]],
            self.ensembles[3]: str(float("%0.4f" % self.results[self.ensembles[3]]["test"])) + self.ranking[self.ensembles[3]]}, ignore_index=True)

    def logEnsembleResults(self, name):
        self.ensemble_results = self.ensemble_results.append({'Algorithm': name, 'Mean accuracy train':  float(
            "%0.3f" % self.results[name]["train"]), 'Mean accuracy test': float("%0.3f" % self.results[name]["test"])}, ignore_index=True)

    def printEnsembleResults(self):
        # Print final results
        print(self.ensemble_results)

    def printResults(self):
        # Print final results
        print(self.accuracy_results)
        print("")

    def setupResults(self):
        self.ensemble_results = pd.DataFrame(
            columns=['Algorithm', 'Mean accuracy train', 'Mean accuracy test'])
        self.results = {"Bagging": {"train": 0, "test": 0}, "AdaBoost": {
            "train": 0, "test": 0}, "Voting": {"train": 0, "test": 0}, "RandomForest": {"train": 0, "test": 0}}
        self.ranking = {}


function to scale data between 0 - 1

In [9]:
def scaleData(x):
    min_max_scaler = preprocessing.MinMaxScaler()
    scaled_x = min_max_scaler.fit_transform(x)
    return scaled_x

Runs the ensemble classifiers for the given dataset

In [10]:
def runEnsembleForDataset(ensemble, dataset=None, name="", data_x=None, data_y=None):
    if dataset is not None:
        data_x = dataset['data']
        data_y = dataset['target']
    data_x = scaleData(data_x)
    ensemble.train(data_x, data_y, dataset=name)

partA function creates the ClassImbalance object, loads each data and calls runEnsembleForDataset function to run the ensemble for each dataset

In [11]:
def partA():
    """ Assignment Part A """

    # Create ensemble object
    ensemble = Ensembles()

    print("1.Iris dataset")
    runEnsembleForDataset(ensemble, dataset=datasets.load_iris(), name="iris")
    print("")

    print("2.Wine dataset")
    runEnsembleForDataset(ensemble, dataset=datasets.load_wine(), name="wine")
    print("")

    print("3.Digits dataset")
    runEnsembleForDataset(ensemble, dataset=datasets.load_digits(), name="digits")
    print("")

    print("4.Breast cancer dataset")
    runEnsembleForDataset(ensemble, dataset=datasets.load_breast_cancer(), name="breast cancer")
    print("")

    print("5.Abalone dataset")
    # read data/abalone.data
    data = pd.read_csv("data/abalone.data", sep=",")
    data_x = data.values[:, :-1]
    data_y = data.values[:, -1].astype('int')
    for i in range(0, len(data_x)):
        if data_x[i, 0] == 'M':
            data_x[i, 0] = 0
        elif data_x[i, 0] == 'F':
            data_x[i, 0] = 1
        else:
            data_x[i, 0] = 2
    runEnsembleForDataset(ensemble, data_x=data_x, data_y=data_y, name="abalone")
    print("")

    print("6.Heart dataset")
    # read data/heart.csv
    data = pd.read_csv("data/heart.csv", sep=",")
    data_x = data.values[:, :-1]
    data_y = data.values[:, -1]
    runEnsembleForDataset(ensemble, data_x=data_x, data_y=data_y, name="heart")
    print("")

    print("7.Glass dataset")
    # read data/glass.data
    data = pd.read_csv("data/glass.data", sep=",")
    data_x = data.values[:, :-1]
    data_y = data.values[:, -1]
    runEnsembleForDataset(ensemble, data_x=data_x, data_y=data_y, name="glass")
    print("")

    print("8.Transfusion dataset")
    # read data/transfusion.data
    data = pd.read_csv("data/transfusion.data", sep=",")
    data_x = data.values[:, :-1]
    data_y = data.values[:, -1]
    runEnsembleForDataset(ensemble, data_x=data_x, data_y=data_y, name="transfusion")
    print("")

    print("9.Starcraft dataset")
    # read data/SkillCraft1_Dataset.csv
    data = pd.read_csv("data/SkillCraft1_Dataset.csv", sep=",", na_values=['?'])
    data_x = data.iloc[:, 2:]
    data_x = data_x.fillna(data_x.mean())
    data_x = data_x.values
    data_y = data.values[:, 1]
    runEnsembleForDataset(ensemble, data_x=data_x, data_y=data_y, name="starcraft")
    print("")

    print("10.Credit Card dataset")
    # read data/creditcard.csv
    data = pd.read_csv("data/creditcard.csv", sep=",")
    data_x = data.values[:100000, :-1]
    data_y = data.values[:100000, -1]
    runEnsembleForDataset(ensemble, data_x=data_x, data_y=data_y, name="credit card")
    print("")

    ensemble.printResults()

Run partA function

In [12]:
partA()

1.Iris dataset
      Algorithm  Mean accuracy train  Mean accuracy test
0      AdaBoost                0.962               0.911
1       Bagging                0.981               0.956
2  RandomForest                1.000               0.978
3        Voting                0.962               0.978

2.Wine dataset
      Algorithm  Mean accuracy train  Mean accuracy test
0      AdaBoost                0.976               0.889
1       Bagging                1.000               0.981
2  RandomForest                1.000               1.000
3        Voting                0.984               1.000

3.Digits dataset
      Algorithm  Mean accuracy train  Mean accuracy test
0      AdaBoost                0.296               0.259
1       Bagging                0.998               0.924
2  RandomForest                1.000               0.972
3        Voting                0.986               0.963

4.Breast cancer dataset
      Algorithm  Mean accuracy train  Mean accuracy test
0      AdaBoos



      Algorithm  Mean accuracy train  Mean accuracy test
0      AdaBoost                0.220               0.204
1       Bagging                0.979               0.224
2  RandomForest                1.000               0.246
3        Voting                0.465               0.231

6.Heart dataset




      Algorithm  Mean accuracy train  Mean accuracy test
0      AdaBoost                0.947               0.753
1       Bagging                0.984               0.753
2  RandomForest                1.000               0.852
3        Voting                0.878               0.802

7.Glass dataset
      Algorithm  Mean accuracy train  Mean accuracy test
0      AdaBoost                0.819               0.812
1       Bagging                1.000               0.984
2  RandomForest                1.000               0.984
3        Voting                0.839               0.828

8.Transfusion dataset




      Algorithm  Mean accuracy train  Mean accuracy test
0      AdaBoost                0.793               0.760
1       Bagging                0.927               0.711
2  RandomForest                0.941               0.729
3        Voting                0.776               0.729

9.Starcraft dataset
      Algorithm  Mean accuracy train  Mean accuracy test
0      AdaBoost                0.218               0.217
1       Bagging                0.992               0.407
2  RandomForest                1.000               0.422
3        Voting                0.604               0.358

10.Credit Card dataset
      Algorithm  Mean accuracy train  Mean accuracy test
0      AdaBoost                0.999               0.999
1       Bagging                1.000               1.000
2  RandomForest                1.000               1.000
3        Voting                0.999               0.999

         Dataset      AdaBoost       Bagging  RandomForest        Voting
0           iris    0.9111