# Augmentation_1
A modified version of SMOTE in which extra “variation” is added to each synthetic sample, scaled by the standard deviation of each feature.

## Import libraries

In [42]:
# Import libraries
import pandas as pd
import numpy as np
import random as rand

In [43]:
# Load the dataset and lift pandas' display limits so whole frames are shown.
data = pd.read_csv("./dataset.csv")
# Use the full option names: abbreviated names are resolved by pattern
# matching and can break once a pattern matches more than one option.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## Separate Petting/Non-Petting

In [44]:
# Split the rows by the Petting label: 1 = petting, 0 = not petting.
# Author's note: Petting has more datasets than other modalities.
# .copy() makes each subset an independent frame, so any later in-place
# edit is unambiguous and does not trigger pandas' SettingWithCopyWarning.
petting_d = data[data['Petting'] == 1].copy()
non_petting_d = data[data['Petting'] == 0].copy()

In [45]:
# Datasets preprocessing
def data_preprocessing(data):
    """Return a copy of ``data`` with a fresh 0..n-1 index and no label columns.

    The input frame is left untouched, so the function can be called
    repeatedly on the same frame and always yields the same result.

    Args:
        data: DataFrame containing the label columns 'Eating / Cooking',
            'Moved It', 'Petting', 'TV / Radio' and 'Talking'.

    Returns:
        A new DataFrame holding only the remaining columns.

    Raises:
        KeyError: if any of the label columns is missing.
    """
    label_columns = ['Eating / Cooking', 'Moved It', 'Petting', 'TV / Radio', 'Talking']
    # drop=True discards the old index instead of inserting it as a column,
    # so nothing has to be mutated in place and then dropped again.
    return data.reset_index(drop=True).drop(columns=label_columns)

## SMOTE

In [46]:
class SMOTE:
    """SMOTE-style oversampler that perturbs samples with per-feature std dev.

    Each synthetic sample starts from a randomly chosen real sample and is
    shifted, feature by feature, by a random fraction of (average difference
    to its k nearest neighbours + ``pad`` * standard deviation of the feature).

    Args:
        data: DataFrame whose columns are all numeric features.
        k: number of nearest neighbours per sample; 1 <= k < number of rows.
        pad: multiplier applied to each feature's standard deviation.
        synth_number: number of synthetic samples to create.
    """

    def __init__(self, data, k, pad, synth_number):
        self.data = data
        self.k = k
        self.pad = pad
        self.synth_n = synth_number

    def making_list(self, data):
        """Return the rows of ``data`` as lists, each prefixed with its position."""
        return [[pos] + row for pos, row in enumerate(data.values.tolist())]

    def eucledian(self, p1, p2):
        """Return the Euclidean distance between two numpy vectors."""
        return np.sqrt(np.sum((p1 - p2) ** 2))

    def _nearest_neighbors(self, features):
        """Return, for every row, the positions of its k closest other rows."""
        neighbors = []
        for pos in range(len(features)):
            dists = np.sqrt(((features - features[pos]) ** 2).sum(axis=1))
            # A stable sort keeps ties in row order; the row itself is removed
            # explicitly rather than assumed to sort into first place.
            order = np.argsort(dists, kind='stable')
            neighbors.append(order[order != pos][:self.k])
        return neighbors

    def smote_al(self):
        """Return the data followed by ``synth_n`` synthetic rows.

        The DataFrame passed to the constructor is not modified; the result
        is a new frame with a fresh 0..n-1 index, also stored in ``self.data``.
        Randomness comes from the ``random`` module, so seeding it makes the
        output reproducible.

        Raises:
            ValueError: if ``k`` is not in the range 1 <= k < number of rows.
        """
        features = self.data.to_numpy(dtype=float)
        n_rows, n_feats = features.shape
        k = self.k
        if not 1 <= k < n_rows:
            raise ValueError(
                f"k must satisfy 1 <= k < number of rows ({n_rows}), got {k}"
            )

        neighbors = self._nearest_neighbors(features)
        spread = features.std(axis=0) * self.pad       # padding term per feature
        magnitudes = np.abs(features)

        synth = []
        for _ in range(self.synth_n):
            pos = rand.randint(0, n_rows - 1)          # random base sample
            gaps = np.array([rand.uniform(-1.0, 1.0) for _ in range(n_feats)])
            # NOTE(review): the original comment called this the "absolute
            # difference", but the formula is the difference of absolute
            # values (|neighbour| - |sample|). Kept as is; confirm the intent.
            diff = (magnitudes[neighbors[pos]] - magnitudes[pos]).sum(axis=0) / k
            synth.append(features[pos] + (diff + spread) * gaps)

        if not synth:
            self.data = self.data.reset_index(drop=True)
            return self.data

        # Concatenating once (instead of assigning rows by label) cannot
        # overwrite existing rows, whatever index the input carries.
        synth_df = pd.DataFrame(np.vstack(synth), columns=self.data.columns)
        self.data = pd.concat([self.data, synth_df], ignore_index=True)
        return self.data

In [47]:
# Merging petting and non_petting datasets into one dataframe
def into_one_dataframe(k, pad, synth_size, t_1, t_0, extra_positive=5):
    """Augment both classes with SMOTE and return them as one labelled frame.

    Args:
        k: number of nearest neighbours passed to SMOTE.
        pad: standard-deviation multiplier passed to SMOTE.
        synth_size: number of synthetic rows to create for the Petting == 0 class.
        t_1: feature frame of the Petting == 1 samples.
        t_0: feature frame of the Petting == 0 samples.
        extra_positive: additional synthetic rows for the Petting == 1 class.
            The default of 5 follows the original note that Petting == 0 has
            5 more datasets.

    Returns:
        The augmented Petting == 1 rows followed by the augmented
        Petting == 0 rows, with a 'Petting' label column added.
    """
    # SMOTE.smote_al may append rows to the frame it is given, so work on
    # copies and leave the caller's frames alone.
    petting_data = SMOTE(t_1.copy(), k, pad, synth_size + extra_positive).smote_al()
    non_petting_data = SMOTE(t_0.copy(), k, pad, synth_size).smote_al()
    petting_data['Petting'] = 1
    non_petting_data['Petting'] = 0
    return pd.concat([petting_data, non_petting_data])

## Random Forest

In [48]:
#Import libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.metrics import roc_curve, roc_auc_score

In [49]:
def random_forest(data, test_size=0.2, random_state=None):
    """Train a random forest on ``data`` and score it on a held-out split.

    Args:
        data: DataFrame with a 'Petting' target column; every other column
            is used as a feature.
        test_size: share of rows held out for testing (default 0.2).
        random_state: seed forwarded to both the split and the forest.
            The default ``None`` keeps the runs unseeded; pass an int to get
            reproducible scores.

    Returns:
        Tuple ``(accuracy, roc_auc)`` measured on the test split.
    """
    # Split the data into features (X) and target (y)
    X = data.drop('Petting', axis=1)
    y = data['Petting']

    # NOTE(review): the split is not stratified, so a small test fold could
    # contain a single class, for which ROC AUC is undefined.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    rf = RandomForestClassifier(random_state=random_state)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    y_pred_prob = rf.predict_proba(X_test)[:, 1]   # probability of Petting == 1
    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    return accuracy, roc_auc

## Main Function

In [50]:
def main(k, pad, synth_size, t_1, t_0):
    """Augment both classes, fit a random forest, return (accuracy, ROC AUC).

    ``k``, ``pad`` and ``synth_size`` are forwarded to the augmentation step;
    ``t_1`` / ``t_0`` are the Petting == 1 / Petting == 0 feature frames.
    """
    augmented = into_one_dataframe(k, pad, synth_size, t_1, t_0)
    accuracy, auc = random_forest(augmented)
    return accuracy, auc

## Check ACC and AUC for each parameter set

In [51]:
def check_acc_auc(k, pad, synth_size, t_1, t_0, n_runs=5):
    """Average accuracy and ROC AUC of ``main`` over several repetitions.

    Args:
        k, pad, synth_size: augmentation parameters forwarded to ``main``.
        t_1: feature frame of the Petting == 1 samples.
        t_0: feature frame of the Petting == 0 samples.
        n_runs: number of repetitions to average over (default 5).

    Returns:
        Tuple ``(average_accuracy, average_auc)``; both are also printed,
        rounded to five decimals.

    Raises:
        ValueError: if ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be at least 1, got {n_runs}")

    acc_total = 0
    auc_total = 0
    for _ in range(n_runs):
        # Fresh copies each time, so every run starts from the same data.
        acc, auc = main(k, pad, synth_size, t_1.copy(), t_0.copy())
        acc_total += acc
        auc_total += auc
    acc_Avg = acc_total / n_runs
    auc_Avg = auc_total / n_runs
    print('for k: ', k,' pad: ', pad,' synth_size: ', synth_size)
    print('average_accuaracy : ', round(acc_Avg, 5))
    print('average_auc : ', round(auc_Avg, 5))
    return acc_Avg, auc_Avg

In [52]:
# Collects one [k, pad, synth_size, average_accuracy, average_auc] row per sweep entry.
data_list = list()
# Preprocess each class subset, Petting == 1 first and then Petting == 0.
petting, non_petting = (
    data_preprocessing(subset) for subset in (petting_d, non_petting_d)
)

In [53]:
# Parameter sweep, one entry per run: [k, pad, synth_size].
# Every combination of k in (2, 3, 4), pad in (1, 2) and synth_size in
# (40, 80) appears five times in a row, with k varying slowest.
params_list = [
    [k, pad, synth_size]
    for k in (2, 3, 4)
    for pad in (1, 2)
    for synth_size in (40, 80)
    for _ in range(5)
]

In [54]:
# Evaluate every parameter set and record it together with its averaged scores.
for param in params_list:
    temp_acc, temp_auc = check_acc_auc(*param[:3], petting, non_petting)
    data_list.append([*param[:3], temp_acc, temp_auc])

for k:  2  pad:  1  synth_size:  40
average_accuaracy :  0.84
average_auc :  0.91037
for k:  2  pad:  1  synth_size:  40
average_accuaracy :  0.77
average_auc :  0.87101
for k:  2  pad:  1  synth_size:  40
average_accuaracy :  0.85
average_auc :  0.9295
for k:  2  pad:  1  synth_size:  40
average_accuaracy :  0.77
average_auc :  0.82515
for k:  2  pad:  1  synth_size:  40
average_accuaracy :  0.87
average_auc :  0.8596
for k:  2  pad:  1  synth_size:  80
average_accuaracy :  0.89444
average_auc :  0.95071
for k:  2  pad:  1  synth_size:  80
average_accuaracy :  0.88333
average_auc :  0.93931
for k:  2  pad:  1  synth_size:  80
average_accuaracy :  0.91667
average_auc :  0.94863
for k:  2  pad:  1  synth_size:  80
average_accuaracy :  0.88889
average_auc :  0.94656
for k:  2  pad:  1  synth_size:  80
average_accuaracy :  0.9
average_auc :  0.95448
for k:  2  pad:  2  synth_size:  40
average_accuaracy :  0.74
average_auc :  0.86318
for k:  2  pad:  2  synth_size:  40
average_accuaracy : 

In [55]:
# Tabulate the collected results, one row per entry of data_list.
acc_auc_summary_df = pd.DataFrame(
    data_list,
    columns=['k', 'pad', 'synth_size', 'average_accuracy', 'average_auc'],
)
# Bare expression: displays the frame as the cell's output.
acc_auc_summary_df

Unnamed: 0,k,pad,synth_size,average_accuracy,average_auc
0,2,1,40,0.84,0.910365
1,2,1,40,0.77,0.87101
2,2,1,40,0.85,0.929504
3,2,1,40,0.77,0.825149
4,2,1,40,0.87,0.859596
5,2,1,80,0.894444,0.950712
6,2,1,80,0.883333,0.939307
7,2,1,80,0.916667,0.948627
8,2,1,80,0.888889,0.946558
9,2,1,80,0.9,0.954477


In [56]:
# Save the summary table next to the notebook.
acc_auc_summary_df.to_csv(path_or_buf='./augmentation_1.csv')