# Task 2 - Binary Classification Problem
**Given the training and validation datasets, http://bit.ly/widebot-new-binclf-data , Create
and train a machine learning model using the training set that performs well on the
validation set. You should decide on the metrics of "performance" yourself, We will assess
your decision.
It is up to you to use any of the following languages: [Python, Scala, Java, R]. We
appreciate a small write up of the observations and your thoughts to follow your thought
process.**


In [46]:
# Import the dependencies
import tensorflow as tf
 
import numpy as np
import pandas as pandas
import matplotlib.pyplot as plt
from tensorflow import keras
import random
from sklearn.impute import SimpleImputer
import csv as csv
import gc
gc.enable()
 
print(tf.__version__)

2.3.0


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# # download the dataset.
#!wget https://drive.google.com/u/0/uc?id=1JBnuP1GTXvhiTb80OEzN13uV7wMLkktg&export=download
# # unzip the dataset to google drive.
#!unzip uc?id=1JBnuP1GTXvhiTb80OEzN13uV7wMLkktg -d /content/drive/My\ Drive/wideBot

In [None]:
# class to load the dataset
# and create 2 generators
# 1) train generator : it will generate the train dataset samples to train the model.el during the training.
class DataLoader():
  def __init__(self, oversampling= False, shuffle = True):  
    names = ["variable1", "variable2", "variable3", "variable4", "variable5", "variable6", "variable7", "variable8",
             "variable9", "variable10", "variable11", "variable12", "variable13", "variable14", "variable15",
             "variable17", "variable18", "variable19", "classLabel"]
    
    self.train_x, self.train_y = DataLoader.gen_data('/content/drive/My Drive/wideBot/training.csv')

    valid_data = pandas.read_csv('/content/drive/My Drive/wideBot/validation.csv', names=names , sep=";")
    dataset = DataLoader.remove_NAN(valid_data)
    dataset = DataLoader.normalize_dataframe(dataset)
    valid_data = DataLoader.map_yes_no(dataset)

    valid_data = valid_data.sample(frac=1).reset_index(drop=True)
    valid_x = valid_data.drop(['classLabel'], axis=1)
    self.valid_x = DataLoader.fix_encode_test(valid_x, 'valid')
    self.valid_y = valid_data['classLabel']
    
  def gen_data(url):
    names = ["variable1", "variable2", "variable3", "variable4", "variable5", "variable6", "variable7", "variable8",
             "variable9", "variable10", "variable11", "variable12", "variable13", "variable14", "variable15",
             "variable17", "variable18", "variable19", "classLabel"]

    train_data = pandas.read_csv(url, names=names , sep=";")
    dataset = DataLoader.remove_NAN(train_data)
    dataset = DataLoader.normalize_dataframe(dataset)
    train_data = DataLoader.map_yes_no(dataset)

    train_data = train_data.sample(frac=1).reset_index(drop=True)
    train_x = train_data.drop(['classLabel'], axis=1)
    train_x = DataLoader.fix_encode_test(train_x, 'train')
    train_y = train_data['classLabel']
    return train_x, train_y

  def fix_encode_test(df, distrib):
    if distrib == 'valid':
        df['variable4_l'] = 0
        df['variable5_gg'] = 0
        df['variable6_r'] = 0
        df['variable7_o'] = 0
        df['variable13_p'] = 0
    elif distrib == 'train':
        df['variable4_l'] = df.pop('variable4_l')
        df['variable5_gg'] = df.pop('variable5_gg')
        df['variable6_r'] = df.pop('variable6_r')
        df['variable7_o'] = df.pop('variable7_o')
        df['variable13_p'] = df.pop('variable13_p')

    return df

  def remove_NAN(df):
    # Drop NAs from variables of binary values
    df = df.dropna(subset=['variable1', 'variable4', 'variable5', 'variable6', 'variable7'])

    # Drop NAs from variables with low number of NAs
    df = df.dropna(subset=['variable2', 'variable14', 'variable17'])

    # Split Columns with comma (,) in its data
    df['variable2_x'], df['variable2_y'] = df['variable2'].str.split(',', 1).str
    df['variable3_x'], df['variable3_y'] = df['variable3'].str.split(',', 1).str
    df['variable8_x'], df['variable8_y'] = df['variable8'].str.split(',', 1).str

    # Removed Column: Variable18 which had a lot of NANs (2000+)
    # Rearrange columns after modifications
    cols = ['variable1', 'variable2_x', 'variable2_y', 'variable3_x', 'variable3_y', 'variable4', 'variable5',
            'variable6', 'variable7', 'variable8_x', 'variable8_y', 'variable9', 'variable10', 'variable11',
            'variable12', 'variable13', 'variable14', 'variable15', 'variable17', 'variable19', 'classLabel']
    df = df[cols]

    num_col = ['variable2_x', 'variable2_y', 'variable3_x', 'variable3_y', 'variable8_x', 'variable8_y', 'variable11', 'variable14', 'variable15', 'variable17', 'variable19']

    for i in num_col:
        df[i] = df[i].astype(float)

    # Remove NAs from columns with # of NAs of 100+
    df['variable2_x'] = df['variable2_x'].fillna((df['variable2_x'].mean()))
    df['variable3_x'] = df['variable3_x'].fillna((df['variable3_x'].mean()))
    df['variable8_x'] = df['variable8_x'].fillna((df['variable8_x'].mean()))
    df['variable2_y'] = df['variable2_y'].fillna((df['variable2_y'].mean()))
    df['variable3_y'] = df['variable3_y'].fillna((df['variable3_y'].mean()))
    df['variable8_y'] = df['variable8_y'].fillna((df['variable8_y'].mean()))

    return df

  def normalize_dataframe(df):
    col_to_normalize = ['variable2_x', 'variable2_y', 'variable3_x', 'variable3_y', 'variable8_x', 'variable8_y',
                     'variable11', 'variable14', 'variable15', 'variable17', 'variable19']
    for i in col_to_normalize:
        df[i] = (df[i] - df[i].mean()) / df[i].std()
    return df

  def map_yes_no(df):
    cols_to_one_hot = ['variable1', 'variable4', 'variable5', 'variable6', 'variable7', 'variable9', 'variable10', 'variable12', 'variable13']
    for i in cols_to_one_hot:
        dummies = pandas.get_dummies(df[i], prefix=i, drop_first=False)
        df = pandas.concat([df, dummies], axis=1)
        df = df.drop([i], axis=1)
    df['classLabel'] = df['classLabel'].map({'yes.': 1, 'no.': 0})
    # Rearrange columns
    df['classLabel'] = df.pop('classLabel')
    return df

  #return : train generator
  def get_train_generator(self):
    return self.train_x, self.train_y, self.valid_x, self.valid_y
 
  #return : validation generator
  def get_valid_generator(self):
    return self.valid_generator
 
  #return : test generator
  def get_test_generator(self):
    return self.test_generator
 
  def get_class_weight(self):
    return self.class_weight
 
  def __del__(self):
    del self.csv_train
    del self.csv_test
    del self.train_generator
    del self.valid_generator
    del self.test_generator

In [None]:
dataLoader = DataLoader()

In [92]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import random

def print_model_specs(predictions, Y_test):
    print("Accuracy is: ", accuracy_score(Y_test, predictions))
    print("------------------------------------------------------------------")
    print(classification_report(Y_test, predictions))

def choose_features(train_x, test_x, features):
    """
    This function is used after testing with multiple set of features in the drop_features method,
    and then choosing the set of features with the highest accuracy.
    """
    ret_train_x = pandas.DataFrame()
    ret_test_x = pandas.DataFrame()

    for i in features:
        ret_train_x[i] = train_x[i]
        ret_test_x[i] = test_x[i]

    return ret_train_x, ret_test_x

def neural_model(X_train, X_test, Y_train):
    classifier = MLPClassifier(solver='lbfgs', alpha=0.1, hidden_layer_sizes=(5, 5, 5, 2), random_state=1)
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(X_test)
    return classifier, predictions


In [112]:
import random

train_x, train_y, test_x, test_y = DataLoader().get_train_generator()
all_columns = ['variable2_x', 'variable2_y', 'variable3_x', 'variable3_y',
       'variable8_x', 'variable8_y', 'variable11', 'variable14', 'variable15',
       'variable17', 'variable19', 'variable1_a', 'variable1_b', 'variable4_u',
       'variable4_y', 'variable5_g', 'variable5_p', 'variable6_W',
       'variable6_aa', 'variable6_c', 'variable6_cc', 'variable6_d',
       'variable6_e', 'variable6_ff', 'variable6_i', 'variable6_j',
       'variable6_k', 'variable6_m', 'variable6_q', 'variable6_x',
       'variable7_bb', 'variable7_dd', 'variable7_ff', 'variable7_h',
       'variable7_j', 'variable7_n', 'variable7_v', 'variable7_z',
       'variable9_f', 'variable9_t', 'variable10_f', 'variable10_t',
       'variable12_f', 'variable12_t', 'variable13_g', 'variable13_s',
       'variable4_l', 'variable5_gg', 'variable6_r', 'variable7_o',
       'variable13_p']
chosen = random.choices(all_columns, k=30)

train_x, test_x = choose_features(train_x, test_x, chosen)
classifier, predictions = neural_model(train_x, test_x, train_y)
print_model_specs(predictions, test_y)
pandas.set_option('display.expand_frame_repr', False)
print(chosen)

Exception ignored in: <bound method DataLoader.__del__ of <__main__.DataLoader object at 0x7fd5b9414470>>
Traceback (most recent call last):
  File "<ipython-input-84-f5c1b2570ef1>", line 115, in __del__
AttributeError: csv_train


Accuracy is:  0.8219895287958116
------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.92      0.72      0.81        99
           1       0.75      0.93      0.83        92

    accuracy                           0.82       191
   macro avg       0.84      0.83      0.82       191
weighted avg       0.84      0.82      0.82       191

['variable7_bb', 'variable1_b', 'variable6_d', 'variable4_y', 'variable6_d', 'variable7_n', 'variable6_i', 'variable6_cc', 'variable8_x', 'variable13_s', 'variable14', 'variable4_u', 'variable10_t', 'variable6_e', 'variable8_x', 'variable6_q', 'variable5_p', 'variable9_t', 'variable6_i', 'variable7_bb', 'variable6_c', 'variable6_W', 'variable5_p', 'variable17', 'variable2_y', 'variable1_b', 'variable7_n', 'variable13_s', 'variable4_u', 'variable7_o']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [94]:
Accuracy is:  0.8010471204188482
['variable4_y', 'variable17', 'variable6_x', 'variable1_a', 'variable5_p', 'variable5_g', 'variable8_x', 'variable6_cc', 'variable12_f', 'variable13_s', 'variable6_m', 
 'variable7_dd', 'variable7_n', 'variable9_f', 'variable1_a', 'variable7_h', 'variable6_c', 'variable3_x', 'variable9_t', 'variable6_r', 'variable6_c', 'variable7_h',
 'variable13_p', 'variable7_j', 'variable8_x', 'variable6_e', 'variable1_a', 'variable6_x', 'variable11', 'variable10_f']
