<a href="https://colab.research.google.com/github/farhanfuadabir/SHL2020/blob/master/SHL_split_and_batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem; the data-loading cell below
# reads its pickle files from a folder under this mount point.
# This triggers an interactive authorization prompt on first run.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
def createBatch(data, batch_size = 1000, random_state=None):

  """
  Randomly selects `batch_size` entries of each label from the dataset.

  The labels are taken from the values actually present in `data.label`
  (NaN labels are ignored), so they do not have to be the contiguous
  integers 1..k.

  Parameters:

      data          : The dataset to create small batch from. 
                      This must be a DataFrame with a `label` column.
      batch_size    : Number of rows to draw for each label.
      random_state  : Seed for the random number generator (if int), or numpy 
                      RandomState object.
  
  Returns:

      DataFrame holding the sampled rows, grouped by label in ascending
      label order. The original index values and column dtypes are kept.

  Raises:

      ValueError (from `DataFrame.sample`) if any label has fewer than
      `batch_size` rows, because sampling is done without replacement.
  """

  # Local import so the function does not depend on a later cell having
  # imported pandas into the global namespace.
  import pandas as pd

  y = data.label
  labels = sorted(y.dropna().unique())
  if not labels:
    # Nothing to sample from: return an empty frame with the same columns.
    return data.iloc[0:0].copy()

  # Build every per-label sample first and concatenate once; this avoids the
  # quadratic cost (and dtype loss) of growing from an empty DataFrame.
  samples = [
      data.loc[(y == label).to_numpy(), :].sample(n=batch_size,
                                                  random_state=random_state)
      for label in labels
  ]
  return pd.concat(samples, axis=0)


def random_split_half(data, random_state=123):

  """
  Splits the dataset into two halves, stratified by label.

  For every label present in `data.label` (NaN labels are ignored), half of
  the rows (`frac=0.5`) are drawn at random into the first half; every
  remaining row goes into the second half.

  Parameters:

      data          : The dataset to split. This must be a DataFrame with a
                      `label` column.
      random_state  : Seed for the random number generator (if int), or numpy 
                      RandomState object.

  Returns:

      (first_half, second_half) : Two DataFrames. The first is grouped by
      label in ascending label order; the second keeps the row order of
      `data`. Together they contain every row of `data` exactly once.
  """

  import numpy as np
  import pandas as pd

  y = data.label
  labels = sorted(y.dropna().unique())

  # Sample row *positions* rather than rows. The complement can then be taken
  # positionally, which stays correct when the data contains duplicated rows
  # or a non-unique index (a drop_duplicates-based complement would silently
  # delete genuinely duplicated rows).
  positions = pd.Series(np.arange(len(data)))
  picked = [
      positions[(y == label).to_numpy()]
      .sample(frac=0.5, random_state=random_state)
      .to_numpy()
      for label in labels
  ]
  chosen = np.concatenate(picked) if picked else np.empty(0, dtype=np.intp)

  in_first_half = np.zeros(len(data), dtype=bool)
  in_first_half[chosen] = True

  newDataBatch1 = data.iloc[chosen]
  newDataBatch2 = data.iloc[~in_first_half]
  return newDataBatch1, newDataBatch2


def process_train_validation(trainSet, valSet=None, trainBatch=1000,
                             splitValSet=False, random_state=1234, 
                             removeConstantColumn=True, scaleFeatures=True,
                             noTestLabel=False):

  """
  Prepares feature matrices and label vectors for training and evaluation.

  Steps, in order:
    1. Optionally down-sample the train set to `trainBatch` rows per label
       (via `createBatch`).
    2. Optionally move a random, label-stratified half of the validation set
       into the train set (via `random_split_half`).
    3. Separate the `label` column from the features.
    4. Optionally drop feature columns that are constant in the train set;
       the same columns are dropped from the validation set.
    5. Optionally scale features to [-1, 1] with a MinMaxScaler that is
       fitted on the train set only and then applied to the validation set.

  Parameters:

      trainSet              : Training DataFrame with a `label` column.
      valSet                : Validation/test DataFrame. Required. Must have
                              a `label` column unless `noTestLabel` is True.
      trainBatch            : Rows to sample per label from `trainSet`, or
                              None to keep the train set as given.
      splitValSet           : If True, half of `valSet` is merged into the
                              train set and only the other half is returned
                              for validation.
      random_state          : Seed used for batching and splitting.
      removeConstantColumn  : If True, drop columns that are constant in the
                              train features.
      scaleFeatures         : If True, min-max scale the features. The
                              feature matrices are then numpy arrays instead
                              of DataFrames.
      noTestLabel           : If True, `valSet` is treated as unlabeled and
                              no validation labels are returned.

  Returns:

      (X_train, y_train, X_val, y_val), or (X_train, y_train, X_val) when
      `noTestLabel` is True.

  Raises:

      ValueError if `valSet` is None.
  """

  import pandas as pd

  if valSet is None:
    # The default exists only for signature compatibility; every code path
    # below needs a validation frame.
    raise ValueError('valSet must be a DataFrame; got None')

  print('Given Train Set Shape: ', trainSet.shape)
  print('Given Validation Set Shape: ', valSet.shape)

  if trainBatch is not None:
    print('\nCreating train batch...', end=' ')
    trainSet = createBatch(trainSet, batch_size=trainBatch, 
                          random_state=random_state)
    print('Done | Shape: ', trainSet.shape)

  if splitValSet:
    print('Merging train batch with half validation set...', end=' ')
    valTrain, valSet = random_split_half(valSet, random_state=random_state)
    trainSet = pd.concat([trainSet, valTrain], axis = 0)
    print('Done | Shape: ', trainSet.shape)

  X_train = trainSet.drop('label', axis=1)
  y_train = trainSet.label
  if noTestLabel:
    X_val, y_val = valSet, None
  else:
    X_val = valSet.drop('label', axis=1)
    y_val = valSet.label

  if removeConstantColumn:
    # A column is kept when at least one row differs from the first row.
    # The decision is made on the train features only and then applied to
    # both sets, so both end up with the same columns in the same order.
    varies = (X_train != X_train.iloc[0]).any()
    kept_columns = X_train.columns[varies.to_numpy()]
    X_train = X_train.loc[:, kept_columns]
    X_val = X_val.loc[:, kept_columns]
  
  if scaleFeatures:
    # Imported lazily so scikit-learn is only needed when scaling is used.
    from sklearn.preprocessing import MinMaxScaler

    scaler = MinMaxScaler(feature_range=(-1,1))
    X_train = scaler.fit_transform(X_train)
    # Re-use the train-set minima/maxima. Re-fitting on the validation data
    # would map the two sets onto different scales and leak validation
    # statistics into preprocessing. Values may fall outside [-1, 1].
    X_val = scaler.transform(X_val)

  print('\nX_train Shape: ', X_train.shape, ' |  y_train Shape: ', y_train.shape)
  if noTestLabel:
    print('X_val Shape: ', X_val.shape)
  else:
    print('X_val Shape: ', X_val.shape, ' |  y_val Shape: ', y_val.shape)

  print('\nInstances of each Label of the Train Set: ')
  print(y_train.value_counts())

  if noTestLabel:
    return X_train, y_train, X_val

  print('\nInstances of each Label of the Validation Set: ')
  print(y_val.value_counts())

  return X_train, y_train, X_val, y_val
  
  

In [None]:
import pandas as pd
import numpy as np
from joblib import load, dump

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

# Location and naming of the pickled feature files. Every file is named
# <prefix><position>_DATA.pickle inside `path`.
path = '/content/drive/My Drive/SHL Features Pickle/'
train1Prefix = 'validation_2019'
train2Prefix = 'validation_2020'
train3Prefix = 'test_2019'
testPrefix = 'test_2020'
valPrefix = 'train_2020'
positions = ['_hand', '_bag', '_torso', '_hips']
position_val = ['_hand', '_hips']


def _read_feature_pickle(prefix, pos):
  """Reads one <prefix><pos>_DATA.pickle file from `path`, logging its shape."""
  file_name = prefix + pos + '_DATA.pickle'
  print('Unpickling from: ' + file_name + ' ...', end=' ')
  # NOTE: unpickling can execute arbitrary code; only load trusted files.
  frame = pd.read_pickle(path + file_name)
  print('Done | Shape: ', frame.shape)
  return frame


def _load_positions(prefix, position_list):
  """Loads the files of every listed position and stacks them row-wise."""
  frames = [_read_feature_pickle(prefix, pos) for pos in position_list]
  # A single concat replaces the removed (pandas >= 2.0) and quadratic
  # DataFrame.append-in-a-loop pattern; the index is renumbered as before.
  combined = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
  print(prefix, ' shape: ', combined.shape)
  return combined


def _drop_nan_rows(frame, set_name):
  """Returns `frame` without rows containing NaN, reporting if any were found."""
  if frame.isna().any().any():
    print('\nnan Detected in ' + set_name + ' set')
    # Drop nan rows
    print('Dropping nan rows...', end=' ')
    frame = frame.dropna()
    print('Done | Shape: ', frame.shape)
  return frame


# Train sets 1 and 2: all four positions each.
data_train1 = _load_positions(train1Prefix, positions)


data_train2 = _load_positions(train2Prefix, positions)


# Train set 3: only the hand position is loaded.
data_train3 = _read_feature_pickle(train3Prefix, '_hand')
print(train3Prefix, ' shape: ', data_train3.shape)

data_train = pd.concat([data_train1, data_train2, data_train3], axis=0)
print('\n\ndata_train shape: ', data_train.shape, end='\n\n')

# Validation set: hand and hips positions.
data_val = _load_positions(valPrefix, position_val)


# Test set: only the hand position is loaded.
data_test = _read_feature_pickle(testPrefix, '_hand')


# Check for nan and drop the affected rows. Names are rebound instead of
# mutated in place so that re-running the cell is harmless.
data_train = _drop_nan_rows(data_train, 'Train')

data_val = _drop_nan_rows(data_val, 'Validation')

data_test = _drop_nan_rows(data_test, 'Test')


Using TensorFlow backend.


Unpickling from: validation_2019_hand_DATA.pickle ... Done | Shape:  (12177, 496)
Unpickling from: validation_2019_bag_DATA.pickle ... Done | Shape:  (12177, 496)
Unpickling from: validation_2019_torso_DATA.pickle ... Done | Shape:  (12177, 496)
Unpickling from: validation_2019_hips_DATA.pickle ... Done | Shape:  (12177, 496)
validation_2019  shape:  (48708, 496)
Unpickling from: validation_2020_hand_DATA.pickle ... Done | Shape:  (28789, 496)
Unpickling from: validation_2020_bag_DATA.pickle ... Done | Shape:  (28789, 496)
Unpickling from: validation_2020_torso_DATA.pickle ... Done | Shape:  (28789, 496)
Unpickling from: validation_2020_hips_DATA.pickle ... Done | Shape:  (28789, 496)
validation_2020  shape:  (115156, 496)
Unpickling from: test_2019_hand_DATA.pickle ... Done | Shape:  (55811, 496)
test_2019  shape:  (55811, 496)


data_train shape:  (219675, 496)

Unpickling from: train_2020_hand_DATA.pickle ... Done | Shape:  (196072, 496)
Unpickling from: train_2020_hips_DATA.pickle 

In [None]:
# Build the evaluation split: train on the full validation_2020 frame
# (no per-label batching, no merging of validation rows) and evaluate on the
# train_2020 frame.
X_train, y_train, X_test, y_test = process_train_validation(
    trainSet=data_train2,
    valSet=data_val,
    trainBatch=None,
    splitValSet=False,
    random_state=1234,
)


Given Train Set Shape:  (115156, 496)
Given Validation Set Shape:  (392143, 496)

X_train Shape:  (115156, 452)  |  y_train Shape:  (115156,)
X_val Shape:  (392143, 452)  |  y_val Shape:  (392143,)

Instances of each Label of the Train Set: 
1.0    23868
2.0    20900
7.0    17448
8.0    17368
5.0    16380
4.0     9628
6.0     7344
3.0     2220
Name: label, dtype: int64

Instances of each Label of the Validation Set: 
5.0    63466
7.0    62506
6.0    56652
2.0    49088
1.0    48912
8.0    47668
4.0    46946
3.0    16905
Name: label, dtype: int64


In [None]:
import time
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest (bootstrap samples and feature
# sub-sampling) reproducible across kernel restarts; without it every run
# produces a different model and different metrics.
clf = RandomForestClassifier(n_estimators=300, verbose=True, n_jobs=-1,
                             random_state=1234)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Evaluate Algorithm
print("\n\nConfusion Matrix: \n\n",confusion_matrix(y_test,y_pred))
print("\n\nReport: \n\n",classification_report(y_test,y_pred))
print("Accuracy: ",accuracy_score(y_test,y_pred))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 10.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 16.2min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    3.9s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:   16.2s
[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed:   24.7s finished




Confusion Matrix: 

 [[44880   649     0    98   316    58  1314  1597]
 [ 1952 44469   113  1685   229   139   114   387]
 [   26  9004  7854    11     8     0     2     0]
 [ 1570 24121   939 15715  1736  2000    90   775]
 [ 4069   458     0   487 15313 20906 15909  6324]
 [ 6960   409     0   153  6237 20392 16576  5925]
 [ 9831   504     0    71  1015   396 29755 20934]
 [ 4413   220     0    29   744   120  9275 32867]]


Report: 

               precision    recall  f1-score   support

         1.0       0.61      0.92      0.73     48912
         2.0       0.56      0.91      0.69     49088
         3.0       0.88      0.46      0.61     16905
         4.0       0.86      0.33      0.48     46946
         5.0       0.60      0.24      0.34     63466
         6.0       0.46      0.36      0.41     56652
         7.0       0.41      0.48      0.44     62506
         8.0       0.48      0.69      0.56     47668

    accuracy                           0.54    392143
   macro avg 

In [None]:
def print_unique_count(X):
  """Prints a two-column integer table: each distinct value of X and its count."""
  values, occurrences = np.unique(X, return_counts=True)
  # One row per distinct value; both columns are cast to int for display.
  table = np.column_stack((values, occurrences)).astype(int)
  print(table)

# Show the predicted and the true label distributions one after the other.
for heading, labels_to_count in (
    ('\n\ny_pred value_counts: \n', y_pred),
    ('\n\ny_test value_counts: \n', y_test),
):
  print(heading)
  print_unique_count(labels_to_count)



y_pred value_counts: 

[[    1 73701]
 [    2 79834]
 [    3  8906]
 [    4 18249]
 [    5 25598]
 [    6 44011]
 [    7 73035]
 [    8 68809]]


y_test value_counts: 

[[    1 48912]
 [    2 49088]
 [    3 16905]
 [    4 46946]
 [    5 63466]
 [    6 56652]
 [    7 62506]
 [    8 47668]]


In [None]:
# `clf` was fitted on features derived from data_train2, so the unlabeled test
# set has to be preprocessed against that same frame. Using a different
# training frame here could select different columns (and a different scaling
# reference) than the ones the model was trained on.
_, _, X_test_final = process_train_validation(data_train2, data_test, 
                                                    trainBatch=None, 
                                                    splitValSet=False,
                                                    noTestLabel=True, 
                                                    random_state=1234)

y_pred_final = clf.predict(X_test_final)
print('\n\ny_pred value_counts: \n')
print_unique_count(y_pred_final)

Given Train Set Shape:  (219675, 496)
Given Validation Set Shape:  (57573, 495)

X_train Shape:  (219675, 452)  |  y_train Shape:  (219675,)
X_val Shape:  (57573, 452)

Instances of each Label of the Train Set: 
1.0    43790
2.0    36141
5.0    35659
7.0    29558
8.0    26482
6.0    20753
4.0    20440
3.0     6852
Name: label, dtype: int64


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.8s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    3.4s




y_pred value_counts: 

[[    1  9580]
 [    2 15958]
 [    3     5]
 [    4  2874]
 [    5  7518]
 [    6  3036]
 [    7  4238]
 [    8 14364]]


[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed:    5.2s finished
