In [1]:
#import package

%matplotlib inline
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import csv
from numpy import genfromtxt
from math import sqrt
import math
from sklearn.cluster import KMeans
import random
import warnings
warnings.filterwarnings('ignore')

## Data Extraction

In [2]:
# This method joins/subtracts the input features
def concatenate ( arg1, arg2,arg3):
    sol = arg1 + arg2
    sol.append(int(arg3))
    return sol

def subtraction ( arg1, arg2, arg3):
    sol = [i - j for i, j in zip(arg1, arg2)]
    sol.append(int(arg3))
    return sol

In [3]:
#This method saves the shuffled and cleaned file in the target location
def save_file(filename, content):
    with open(filename, 'w', newline='') as fi:
        writer = csv.writer(fi)
        writer.writerows(content)
    fid = open(filename, "r")
    li = fid.readlines()
    fid.close()
    random.shuffle(li)
    fid = open(filename, "w")
    fid.writelines(li)
    fid.close()
    print(str(filename) + " is created for " + str(len(content)) + " entries")


In [4]:
#This method extracts the features from Human observed dataset
def human_feature():
    raw_data = []
    featureid = []
    featureset = []
    with open("./Dataset/HumanObserved/HumanObserved-Features-Data.csv", 'r') as fi:
        reader = csv.reader(fi)
        for row in reader: 
            x = []
            featureid.append(row[1])
            raw_data.append(row)   
            for i in range(9):
                x.append(float(row[i+2]))
            featureset.append(x)
    return featureid,featureset

In [5]:
#This method extracts the features from for same and different files and saves them in the specified path
def human_concat_sub(featureid,featureset):
    concatenation_raw_data = []
    subtraction_raw_data = []
    with open("Dataset/HumanObserved/same_pairs.csv", 'r') as fi:
        reader = csv.reader(fi)
        for row in reader:
            sign1 = featureset[featureid.index(row[0])]
            sign2 = featureset[featureid.index(row[1])]
            sign3 = row[2]
            concatenation_raw_data.append(concatenate(sign1,sign2,sign3))
            subtraction_raw_data.append(subtraction(sign1,sign2,sign3))
    with open("Dataset/HumanObserved/diffn_pairs.csv", 'r') as fi:
        reader = csv.reader(fi)
        for row in reader:
            sign1 = featureset[featureid.index(row[0])]
            sign2 = featureset[featureid.index(row[1])]
            sign3 = row[2]     
            concatenation_raw_data.append(concatenate(sign1,sign2,sign3))
            subtraction_raw_data.append(subtraction(sign1,sign2,sign3))
    save_file("Dataset-cleaned/human-feature-concatenation.csv",concatenation_raw_data)
    save_file("Dataset-cleaned/human-feature-subtraction.csv",subtraction_raw_data)

In [6]:
#This method extracts the features from Gradient Stochastic concavity  dataset
def gsc_feature():
    raw_data = []
    featureid = []
    featureset = []
    with open("./Dataset/GradientStructuralConcavity/GSC-Features.csv", 'r') as fi:
        reader = csv.reader(fi)
        for row in reader: 
            x = []
            featureid.append(row[0])
            raw_data.append(row)   
            for i in range(512):
                x.append(float(row[i+1]))
            featureset.append(x)
    return featureid,featureset


In [7]:
#This method extracts the features from for same and different files and saves them in the specified path
def gsc_concat_sub(featureid,featureset):
    concatenation_raw_data = []
    subtraction_raw_data = []
    with open("Dataset/GradientStructuralConcavity/same_pairs.csv", 'r') as fi:
        reader = csv.reader(fi)
        for row in reader:
            sign1 = featureset[featureid.index(row[0])]
            sign2 = featureset[featureid.index(row[1])]
            sign3 = row[2]
            concatenation_raw_data.append(concatenate(sign1,sign2,sign3))
            subtraction_raw_data.append(subtraction(sign1,sign2,sign3))
    with open("Dataset/GradientStructuralConcavity/diffn_pairs.csv", 'r') as fi:
        reader = csv.reader(fi)
        for row in reader:
            sign1 = featureset[featureid.index(row[0])]
            sign2 = featureset[featureid.index(row[1])]
            sign3 = row[2]     
            concatenation_raw_data.append(concatenate(sign1,sign2,sign3))
            subtraction_raw_data.append(subtraction(sign1,sign2,sign3))
    save_file("Dataset-cleaned/gsc-feature-concatenation.csv",concatenation_raw_data)
    save_file("Dataset-cleaned/gsc-feature-subtraction.csv",subtraction_raw_data)


In [8]:
#This method extracts all features from Human observed and GSC dataset. It returns the feature and target set for selected datasets
def data_extraction():
    human_feature_id,human_feature_val  = human_feature()
    human_concat_sub(human_feature_id,human_feature_val)

    gsc_feature_id,gsc_feature_val  = gsc_feature()
    gsc_concat_sub(gsc_feature_id,gsc_feature_val)

    return human_feature_id,human_feature_val,gsc_feature_id ,gsc_feature_val

In [9]:
human_feature_id,human_feature_val,gsc_feature_id ,gsc_feature_val = data_extraction()

Dataset-cleaned/human-feature-concatenation.csv is created for 1582 entries
Dataset-cleaned/human-feature-subtraction.csv is created for 1582 entries
Dataset-cleaned/gsc-feature-concatenation.csv is created for 10000 entries
Dataset-cleaned/gsc-feature-subtraction.csv is created for 10000 entries


## Linear Regression

In [10]:
# This method reads the target file for all samples and appends the content to variable, and returns it as a list
def GetTargetVector(filePath,position):
    t = []
    with open(filePath, 'r') as f:
        reader = csv.reader(f)
        for row in reader:  
            t.append(int(row[int(position)]))
    print(" Target generated for  " + str(filePath))
    return t

In [11]:
# Dataset is seperated into training, validation and test. 
TrainingPercent = 80
ValidationPercent = 10
TestPercent = 10
# Regularization parameter to minimize error 
C_Lambda = 0.02
#M is the number of basis function
M = 4
# φ is a vector of M basis functions
PHI = []


In [12]:
RawTarget_human_concatenation = GetTargetVector('Dataset-cleaned/human-feature-concatenation.csv','18')
RawTarget_human_subtraction = GetTargetVector('Dataset-cleaned/human-feature-subtraction.csv','9')
RawTarget_gradient_concatenation = GetTargetVector('Dataset-cleaned/gsc-feature-concatenation.csv','1024')
RawTarget_gradient_subtraction = GetTargetVector('Dataset-cleaned/gsc-feature-subtraction.csv','512')

 Target generated for  Dataset-cleaned/human-feature-concatenation.csv
 Target generated for  Dataset-cleaned/human-feature-subtraction.csv
 Target generated for  Dataset-cleaned/gsc-feature-concatenation.csv
 Target generated for  Dataset-cleaned/gsc-feature-subtraction.csv


In [13]:
#A matrix is built for the feature set. 
def GenerateRawData(filePath,position):  
    dataMatrix = [] 
    with open(filePath, 'r') as fi:
        reader = csv.reader(fi)
        for row in reader:
            dataRow = []   
            for i in range(int(position)):
                dataRow.append(float(row[i]))
            dataMatrix.append(dataRow)   
    
    dataMatrix = np.transpose(dataMatrix)        
    print(" Raw data generated for  " + str(filePath))
    return dataMatrix

In [14]:
RawData_human_concatenation = GenerateRawData('Dataset-cleaned/human-feature-concatenation.csv','18')
RawData_human_subtraction = GenerateRawData('Dataset-cleaned/human-feature-subtraction.csv','9')
RawData_gradient_concatenation = GenerateRawData('Dataset-cleaned/gsc-feature-concatenation.csv','1024')
RawData_gradient_subtraction = GenerateRawData('Dataset-cleaned/gsc-feature-subtraction.csv','512')

 Raw data generated for  Dataset-cleaned/human-feature-concatenation.csv
 Raw data generated for  Dataset-cleaned/human-feature-subtraction.csv
 Raw data generated for  Dataset-cleaned/gsc-feature-concatenation.csv
 Raw data generated for  Dataset-cleaned/gsc-feature-subtraction.csv


In [15]:
# This method gets the target value for training set appends the content to variable, and returns it as a list
# 80% of the actual data (test data) is used here, to compose Traininglength
def GenerateTrainingTarget(rawTraining):
    TrainingLen = int(math.ceil(len(rawTraining)*(TrainingPercent*0.01)))
    t           = rawTraining[:TrainingLen]
    return t

# This method constructs the design matrix for training data
# 80% of the actual data (test data) is used construct the feature datamatrix
def GenerateTrainingDataMatrix(rawData):
    T_len = int(math.ceil(len(rawData[0])*0.01*TrainingPercent))
    t = rawData[:,0:T_len]
    return t

#The method assigns target for validation and test dataset. 
# 10% of the actual data (validation data) is used here
def GenerateValTargetVector(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData)*ValPercent*0.01))
    V_End = TrainingCount + valSize 
    t =rawData[TrainingCount+1:V_End]
    return t

# This method constructs the design matrix for validation and test data
# 10% of the actual data (validation data) is used here
def GenerateValData(rawData, ValPercent, TrainingCount): 
    valSize = int(math.ceil(len(rawData[0])*ValPercent*0.01))
    V_End = TrainingCount + valSize
    dataMatrix = rawData[:,TrainingCount+1:V_End]
    return dataMatrix

In [16]:
#Preparing Training dataset
TrainingTarget_human_concatenation = np.array(GenerateTrainingTarget(RawTarget_human_concatenation))
TrainingTarget_human_subtraction = np.array(GenerateTrainingTarget(RawTarget_human_subtraction))
TrainingTarget_gradient_concatenation = np.array(GenerateTrainingTarget(RawTarget_gradient_concatenation))
TrainingTarget_gradient_subtraction = np.array(GenerateTrainingTarget(RawTarget_gradient_subtraction))


#It generates a matrix for training set
TrainingData_human_concatenation   = GenerateTrainingDataMatrix(RawData_human_concatenation)
TrainingData_human_subtraction   = GenerateTrainingDataMatrix(RawData_human_subtraction)
TrainingData_gradient_concatenation   = GenerateTrainingDataMatrix(RawData_gradient_concatenation)
TrainingData_gradient_subtraction   = GenerateTrainingDataMatrix(RawData_gradient_subtraction)


In [17]:
#Preparing validating dataset

ValDataAct_human_concatenation = np.array(GenerateValTargetVector(RawTarget_human_concatenation,ValidationPercent, (len(TrainingTarget_human_concatenation))))
ValDataAct_human_subtraction = np.array(GenerateValTargetVector(RawTarget_human_subtraction,ValidationPercent, (len(TrainingTarget_human_subtraction))))
ValDataAct_gradient_concatenation = np.array(GenerateValTargetVector(RawTarget_gradient_concatenation,ValidationPercent, (len(TrainingTarget_gradient_concatenation))))
ValDataAct_gradient_subtraction = np.array(GenerateValTargetVector(RawTarget_gradient_subtraction,ValidationPercent, (len(TrainingTarget_gradient_subtraction))))


ValData_human_concatenation    = GenerateValData(RawData_human_concatenation,ValidationPercent, (len(TrainingTarget_human_concatenation)))
ValData_human_subtraction    = GenerateValData(RawData_human_subtraction,ValidationPercent, (len(TrainingTarget_human_subtraction)))
ValData_gradient_concatenation    = GenerateValData(RawData_gradient_concatenation,ValidationPercent, (len(TrainingTarget_gradient_concatenation)))
ValData_gradient_subtraction    = GenerateValData(RawData_gradient_subtraction,ValidationPercent, (len(TrainingTarget_gradient_subtraction)))


In [18]:

#Preparing Testing dataset
TestDataAct_human_concatenation = np.array(GenerateValTargetVector(RawTarget_human_concatenation,TestPercent, (len(TrainingTarget_human_concatenation)+len(ValDataAct_human_concatenation))))
TestDataAct_human_subtraction = np.array(GenerateValTargetVector(RawTarget_human_subtraction,TestPercent, (len(TrainingTarget_human_subtraction)+len(ValDataAct_human_subtraction))))
TestDataAct_gradient_concatenation = np.array(GenerateValTargetVector(RawTarget_gradient_concatenation,TestPercent, (len(TrainingTarget_gradient_concatenation)+len(ValDataAct_gradient_concatenation))))
TestDataAct_gradient_subtraction = np.array(GenerateValTargetVector(RawTarget_gradient_subtraction,TestPercent, (len(TrainingTarget_gradient_subtraction)+len(ValDataAct_gradient_subtraction))))



#It generates a matrix for validation set
TestData_human_concatenation = GenerateValData(RawData_human_concatenation,TestPercent, (len(TrainingTarget_human_concatenation)+len(ValDataAct_human_concatenation)))
TestData_human_subtraction = GenerateValData(RawData_human_subtraction,TestPercent, (len(TrainingTarget_human_subtraction)+len(ValDataAct_human_subtraction)))
TestData_gradient_concatenation = GenerateValData(RawData_gradient_concatenation,TestPercent, (len(TrainingTarget_gradient_concatenation)+len(ValDataAct_gradient_concatenation)))
TestData_gradient_subtraction = GenerateValData(RawData_gradient_subtraction,TestPercent, (len(TrainingTarget_gradient_subtraction)+len(ValDataAct_gradient_subtraction)))

In [19]:
#Kmeans cluster stores M centroids
#We have M number of individual clusters, and M centroids, with each  
#Mu contains centroids for each basis function, containing detail about all features  
#Mu is the cordinate of centroid for each cluster
kmeans_human_concatenation = KMeans(n_clusters=M, random_state=0).fit(np.transpose(TrainingData_human_concatenation))
Mu_human_concatenation = kmeans_human_concatenation.cluster_centers_

kmeans_human_subtraction = KMeans(n_clusters=M, random_state=0).fit(np.transpose(TrainingData_human_subtraction))
Mu_human_subtraction = kmeans_human_subtraction.cluster_centers_

kmeans_gradient_concatenation = KMeans(n_clusters=M, random_state=0).fit(np.transpose(TrainingData_gradient_concatenation))
Mu_gradient_concatenation = kmeans_gradient_concatenation.cluster_centers_

kmeans_gradient_subtraction = KMeans(n_clusters=M, random_state=0).fit(np.transpose(TrainingData_gradient_subtraction))
Mu_gradient_subtraction = kmeans_gradient_subtraction.cluster_centers_

In [20]:
#This method generates the variance matrix. It contains 
#So in order to make matrix multiplication feasible, we expand the variance vector (varVect)
#Only the diagonal is filled and rest is zero entries. 
#Thus the bigsigma contains only diagonal entries (variance) and the rest are zero (covariance).

def GenerateBigSigma(Data,TrainingPercent,flag):
    BigSigma    = np.zeros((len(Data),len(Data)))
    #Calculate the transpose of RawData, to get matrix of dimensions entries*features
    DataT       = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))    
    varVect     = []
    for i in range(0,len(DataT[0])):
        vct = []
        for j in range(0,int(TrainingLen)):
            vct.append(Data[i][j])
            #Returns the variance of the array elements
        varVect.append(np.var(vct))
    #Varvect contains the variance
    #Bigsigma is feature*feature matrix, extended from varvect. Bigsigma only has diagonal filled and rest is zero
    if(flag == 0):
        for j in range(len(Data)):
            BigSigma[j][j] = varVect[j]
    else:
        for j in range(len(Data)):
            BigSigma[j][j] = varVect[j] + 0.25
    BigSigma = np.dot(100,BigSigma)
    #Bigsigma returns the variance matrix
    return BigSigma


In [21]:
#Variance matrix is found using Bigsigma method 
#Variance being a scalar, we generate a Matrix by multiplying variance value for each feature into Identity vector
#Bigsigma is feature*feature matrix, extended from varvect. Bigsigma only has diagonal filled and rest is zero
BigSigma_human_concatenation     = GenerateBigSigma(RawData_human_concatenation, TrainingPercent,0)
BigSigma_human_subtraction     = GenerateBigSigma(RawData_human_subtraction, TrainingPercent,0)
BigSigma_gradient_concatenation     = GenerateBigSigma(RawData_gradient_concatenation, TrainingPercent,1)
BigSigma_gradient_subtraction     = GenerateBigSigma(RawData_gradient_subtraction, TrainingPercent,1)

In [22]:
# Find the scalar product of Gaussian radial basis functions
# Finds  (((x − µj)^-1) (Σ(^-1)(x − µj))) value
def GetScalar(DataRow,MuRow, BigSigInv):  
    R = np.subtract(DataRow,MuRow)
    T = np.dot(BigSigInv,np.transpose(R))  
    L = np.dot(R,T)
    return L

def GetRadialBasisOut(DataRow,MuRow, BigSigInv):
    # Find the Gaussian radial basis functions  is φj (x)
    phi_x = math.exp(-0.5*GetScalar(DataRow,MuRow,BigSigInv))
    return phi_x



# GetPhiMatrix finds Gaussian radial basis functions  is φj (x) = exp (−1/2 (((x − µj)^-1) (Σ(^-1)(x − µj)))) 
def GetPhiMatrix(Data, MuMatrix, BigSigma, TrainingPercent = 80):
    #Datatranspose is transpose matrix of RawData
    DataT = np.transpose(Data)
    TrainingLen = math.ceil(len(DataT)*(TrainingPercent*0.01))
    # PHI is filled with zeros with dimensions of TrainingLen and MuMatrix 
    PHI = np.zeros((int(TrainingLen),len(MuMatrix)))
    # BigSigInv is the inverse of BigSigma of same dimensions feature*feature
    BigSigInv = np.linalg.inv(BigSigma)
    for  C in range(0,len(MuMatrix)):
        for R in range(0,int(TrainingLen)):
            # For M values of Mumatrix and all sample values of Training, generate PHI
            PHI[R][C] = GetRadialBasisOut(DataT[R], MuMatrix[C], BigSigInv)
    return PHI

In [23]:
TRAINING_PHI_human_concatenation = GetPhiMatrix(RawData_human_concatenation, Mu_human_concatenation, BigSigma_human_concatenation, TrainingPercent)
TRAINING_PHI_human_subtraction = GetPhiMatrix(RawData_human_subtraction, Mu_human_subtraction, BigSigma_human_subtraction, TrainingPercent)
TRAINING_PHI_gradient_concatenation = GetPhiMatrix(RawData_gradient_concatenation, Mu_gradient_concatenation, BigSigma_gradient_concatenation, TrainingPercent)
TRAINING_PHI_gradient_subtraction = GetPhiMatrix(RawData_gradient_subtraction, Mu_gradient_subtraction, BigSigma_gradient_subtraction, TrainingPercent)

In [24]:
# Pseudo inverse is found, because the matrix is not a square singular matrix. So  Moore-Penrose pseudo-inverse formula is applied 
# w∗ = (λI + (Φ^T)Φ)^(-1))(Φ^T)t

def GetWeightsClosedForm(PHI, T, Lambda):
    #I is created using the np.identity function as a M*M matrix.
    Lambda_I = np.identity(len(PHI[0]))
    #λI is determined for the I matrix of size M*M
    for i in range(0,len(PHI[0])):
        Lambda_I[i][i] = Lambda
    # Φ^(-1) is found using np functional calls
    # Finds the transpose of the Φ and has dimensions (M, sample)
    PHI_T       = np.transpose(PHI)
    # (Φ^(-1)).Φ is determined
    # (Φ^(-1)).Φ is reduced to a (M, M) matrix
    PHI_SQR     = np.dot(PHI_T,PHI)
    # (λI + (Φ^T)Φ is calculated 
    # Again PHI_SQR_LI becomes (M, M) matrix
    PHI_SQR_LI  = np.add(Lambda_I,PHI_SQR)
    # (λI + (Φ^T)Φ ^(-1)
    # PHI_SQR_INV is again a matrix of size (M, M)
    PHI_SQR_INV = np.linalg.inv(PHI_SQR_LI)
    # (λI + (Φ^T)Φ)^(-1))(Φ^T)
    # INTER becomes (M, sample) because Φ^T is (M, sample)
    INTER       = np.dot(PHI_SQR_INV, PHI_T)
    # (λI + (Φ^T)Φ)^(-1))(Φ^T)t is determined
    # w becomes (M,) because of the dot product being involved
    W           = np.dot(INTER, T)
    ##print ("Training Weights Generated..")
    return W

In [25]:
#Calculate weights for the training dataset
W_human_concatenation  = GetWeightsClosedForm(TRAINING_PHI_human_concatenation,TrainingTarget_human_concatenation,(C_Lambda))
W_human_subtraction  = GetWeightsClosedForm(TRAINING_PHI_human_subtraction,TrainingTarget_human_subtraction,(C_Lambda))

#Calculate weights for the training dataset
W_gradient_concatenation  = GetWeightsClosedForm(TRAINING_PHI_gradient_concatenation,TrainingTarget_gradient_concatenation,(C_Lambda))
W_gradient_subtraction  = GetWeightsClosedForm(TRAINING_PHI_gradient_subtraction,TrainingTarget_gradient_subtraction,(C_Lambda))




In [26]:
#Calculate the PHI matrix for test and validation set for all 4 cases
TEST_PHI_human_concatenation     = GetPhiMatrix(TestData_human_concatenation, Mu_human_concatenation, BigSigma_human_concatenation, 100)
VAL_PHI_human_concatenation      = GetPhiMatrix(ValData_human_concatenation, Mu_human_concatenation, BigSigma_human_concatenation, 100)

TEST_PHI_human_subtraction     = GetPhiMatrix(TestData_human_subtraction, Mu_human_subtraction, BigSigma_human_subtraction, 100)
VAL_PHI_human_subtraction      = GetPhiMatrix(ValData_human_subtraction, Mu_human_subtraction, BigSigma_human_subtraction, 100)


TEST_PHI_gradient_concatenation     = GetPhiMatrix(TestData_gradient_concatenation, Mu_gradient_concatenation, BigSigma_gradient_concatenation, 100)
VAL_PHI_gradient_concatenation      = GetPhiMatrix(ValData_gradient_concatenation, Mu_gradient_concatenation, BigSigma_gradient_concatenation, 100)

TEST_PHI_gradient_subtraction     = GetPhiMatrix(TestData_gradient_subtraction, Mu_gradient_subtraction, BigSigma_gradient_subtraction, 100)
VAL_PHI_gradient_subtraction      = GetPhiMatrix(ValData_gradient_subtraction, Mu_gradient_subtraction, BigSigma_gradient_subtraction, 100)


In [27]:
def GetValTest(VAL_PHI,W):
    Y = np.dot(W,np.transpose(VAL_PHI))
    return Y

In [28]:
#Find the error mean square value
def GetErms(VAL_TEST_OUT,ValDataAct):
    sum = 0.0
    t=0
    accuracy = 0.0
    counter = 0
    val = 0.0
    for i in range (1,len(VAL_TEST_OUT)):
        sum = sum + math.pow((ValDataAct[i] - VAL_TEST_OUT[i]),2)
        # Compare the target value and predicted value
        if(int(np.around(VAL_TEST_OUT[i], 0)) == ValDataAct[i]):
            counter+=1
    accuracy = (float((counter*100))/float(len(VAL_TEST_OUT)))
    return (str(accuracy) + ',' +  str(math.sqrt(sum/len(VAL_TEST_OUT))))

In [29]:
#SGC has large datasamples, which would be ignored by the ususal SGD, so we consider batching to use all data samples 
#The weights are updated after each batch
def SGD_batch(filename,W,TrainingTarget,TRAINING_PHI,VAL_PHI,TEST_PHI,ValDataAct,TestDataAct):
       
    #Use the weights from SGD method
    W_Now        = np.dot(220, W)
    La           = 2
    
    L_Erms_Val   = []
    L_Erms_TR    = []
    L_Erms_Test  = []
    W_Mat        = []
    epoch = 500
    batch_size = 100
    
    
    for j in range(1,epoch):     
        start = 0
        end = batch_size
        learningRate = 1/j
        for i in range(int(len(TrainingTarget) / batch_size)):
            Delta_E_D = - np.dot((TrainingTarget[start:end] - np.dot(np.transpose(W_Now),np.transpose(TRAINING_PHI[start:end]))),TRAINING_PHI[start:end])
            La_Delta_E_W  = np.dot(La,W_Now)
            Delta_E       = np.add(Delta_E_D,La_Delta_E_W)
            #Error Delta value would increase, so we find mean to minimize this overshoot
            mean = len(TrainingTarget)/batch_size
            Delta_E = Delta_E / int(mean)            
            Delta_W = -np.dot(learningRate,Delta_E)
            W_T_Next = W_Now + Delta_W
            W_Now    = W_T_Next
            start = start + batch_size
            end = end + batch_size
               
        #-----------------TrainingData Accuracy---------------------#
        TR_TEST_OUT   = GetValTest(TRAINING_PHI,W_T_Next) 
        Erms_TR       = GetErms(TR_TEST_OUT,TrainingTarget)
        L_Erms_TR.append(float(Erms_TR.split(',')[1]))

        #-----------------ValidationData Accuracy---------------------#
        VAL_TEST_OUT  = GetValTest(VAL_PHI,W_T_Next) 
        Erms_Val      = GetErms(VAL_TEST_OUT,ValDataAct)
        L_Erms_Val.append(float(Erms_Val.split(',')[1]))

        #-----------------TestingData Accuracy---------------------#
        TEST_OUT      = GetValTest(TEST_PHI,W_T_Next) 
        Erms_Test = GetErms(TEST_OUT,TestDataAct)
        L_Erms_Test.append(float(Erms_Test.split(',')[1]))    
    

    print (filename + " Accuracy Training   = " + str(float(Erms_TR.split(',')[0])))
    print (filename + " Accuracy Validation = " + str(float(Erms_Val.split(',')[0])))
    print (filename + " Accuracy Testing    = " + str(float(Erms_Test.split(',')[0])))
    print ("-------------------------------------------------------------------------")

In [30]:
def SGD(filename,W,TrainingTarget,TRAINING_PHI,VAL_PHI,TEST_PHI,ValDataAct,TestDataAct):
    #Update the weights SGD method
    W_Now        = np.dot(220, W)
    La           = 2
    
    L_Erms_Val   = []
    L_Erms_TR    = []
    L_Erms_Test  = []
    W_Mat        = []

    for i in range(1,400):
        learningRate = 1/i
        #print ('---------Iteration: ' + str(i) + '--------------')
        Delta_E_D     = -np.dot((TrainingTarget[i] - np.dot(np.transpose(W_Now),TRAINING_PHI[i])),TRAINING_PHI[i])
        La_Delta_E_W  = np.dot(La,W_Now)
        Delta_E       = np.add(Delta_E_D,La_Delta_E_W)    
        Delta_W       = -np.dot(learningRate,Delta_E)
        W_T_Next      = W_Now + Delta_W
        W_Now         = W_T_Next

        #-----------------TrainingData Accuracy---------------------#
        TR_TEST_OUT   = GetValTest(TRAINING_PHI,W_T_Next) 
        Erms_TR       = GetErms(TR_TEST_OUT,TrainingTarget)
        L_Erms_TR.append(float(Erms_TR.split(',')[1]))

        #-----------------ValidationData Accuracy---------------------#
        VAL_TEST_OUT  = GetValTest(VAL_PHI,W_T_Next) 
        Erms_Val      = GetErms(VAL_TEST_OUT,ValDataAct)
        L_Erms_Val.append(float(Erms_Val.split(',')[1]))

        #-----------------TestingData Accuracy---------------------#
        TEST_OUT      = GetValTest(TEST_PHI,W_T_Next) 
        Erms_Test = GetErms(TEST_OUT,TestDataAct)
        L_Erms_Test.append(float(Erms_Test.split(',')[1]))

    print (filename + " Accuracy Training   = " + str(float(Erms_TR.split(',')[0])))
    print (filename + " Accuracy Validation = " + str(float(Erms_Val.split(',')[0])))
    print (filename + " Accuracy Testing    = " + str(float(Erms_Test.split(',')[0])))
    print ("-------------------------------------------------------------------------")

In [31]:
print ('----------Linear Regression Gradient Descent Solution--------------------')
SGD('human_concatenation',W_human_concatenation,TrainingTarget_human_concatenation,TRAINING_PHI_human_concatenation,VAL_PHI_human_concatenation,TEST_PHI_human_concatenation,ValDataAct_human_concatenation,TestDataAct_human_concatenation)
SGD('human_subtraction',W_human_subtraction,TrainingTarget_human_subtraction,TRAINING_PHI_human_subtraction,VAL_PHI_human_subtraction,TEST_PHI_human_subtraction,ValDataAct_human_subtraction,TestDataAct_human_subtraction)

SGD_batch('gradient_concatenation',W_gradient_concatenation,TrainingTarget_gradient_concatenation,TRAINING_PHI_gradient_concatenation,VAL_PHI_gradient_concatenation,TEST_PHI_gradient_concatenation,ValDataAct_gradient_concatenation,TestDataAct_gradient_concatenation)
SGD_batch('gradient_subtraction',W_gradient_subtraction,TrainingTarget_gradient_subtraction,TRAINING_PHI_gradient_subtraction,VAL_PHI_gradient_subtraction,TEST_PHI_gradient_subtraction,ValDataAct_gradient_subtraction,TestDataAct_gradient_subtraction)


----------Linear Regression Gradient Descent Solution--------------------
human_concatenation Accuracy Training   = 50.07898894154818
human_concatenation Accuracy Validation = 49.36708860759494
human_concatenation Accuracy Testing    = 49.044585987261144
-------------------------------------------------------------------------
human_subtraction Accuracy Training   = 49.842022116903635
human_subtraction Accuracy Validation = 47.46835443037975
human_subtraction Accuracy Testing    = 54.140127388535035
-------------------------------------------------------------------------
gradient_concatenation Accuracy Training   = 54.5
gradient_concatenation Accuracy Validation = 56.45645645645646
gradient_concatenation Accuracy Testing    = 55.85585585585586
-------------------------------------------------------------------------
gradient_subtraction Accuracy Training   = 72.6875
gradient_subtraction Accuracy Validation = 73.47347347347348
gradient_subtraction Accuracy Testing    = 74.5745745745745

## Logistic Regression

In [32]:
#Sigmoid method is implemented 
def sigmoid(val):
    return 1 / (1 + np.exp(-val))

In [34]:
#find the accuracy of the model
def logistic_accuracy(features,labels,weight,filename):
    #The predicted value is the product of feature and weight. The bias is added to the feature set initially
    #Bias is a row filled with ones
    accuracy_list = np.dot(np.hstack((np.ones((features.shape[0], 1)),features)), weight)
    #The predicted unnormalized value is passed to sigmoid method
    predicted_list = np.round(sigmoid(accuracy_list))
    #If the predicted matches the actual target then their count is counted
    accuracy = (predicted_list == labels).sum().astype(float) / len(predicted_list)
    print(" Logistic regression accuracy of " + str(filename) + ' is ' + str(accuracy))
    return accuracy_list, predicted_list, accuracy
    

In [35]:
#Generate target method extracts the features and labels into 2 seperate lists
def generate_target(filename,delimiter_value):
    file_raw_data = genfromtxt(filename, delimiter=',')
    features,emptyarray,labels_temp = np.hsplit(file_raw_data, np.array([int(delimiter_value), int(delimiter_value)]))
    labels = np.asarray(np.asarray(labels_temp).ravel().tolist(), dtype=np.float32)
    #labels and features are the two required nparray
    print(str(filename) + " is processed for logistic regression")
    return features,labels




In [36]:
# This method constructs the design matrix for training data
# 90% of the actual data (test data) is used construct the feature datamatrix
def LogisticGenerateTraining(rawTraining):
    TrainingLen = int(math.ceil(len(rawTraining)*(0.90)))
    t           = rawTraining[:TrainingLen]
    return t


#The method assigns target and feature for est dataset. 
# 10% of the actual data is used here
def LogisticGenerateTesting(rawData, TrainingCount): 
    valSize = int(math.ceil(len(rawData)*0.10))
    V_End = TrainingCount + valSize 
    t =rawData[TrainingCount+1:V_End]
    return t

In [37]:
#This method finds the weight gradient and returns it
def find_logistic_gradient(features, labels):
    bias = np.ones((features.shape[0], 1))
    features = np.hstack((bias, features))
        
    weight = np.zeros(features.shape[1])
    for i in range(1,500):
        learning_rate = 1 / i
        unnormalized_prediction = np.dot(features, weight)
        prediction = sigmoid(unnormalized_prediction)

        # Update weights with log likelihood gradient
        gradient = np.dot(features.T, (labels - prediction))
        weight += learning_rate * gradient
        # Print log-likelihood every so often
    #print(" The final log likelihood is  " + str(log_likelihood_function(features, labels, weight)))
    
    return weight

In [38]:
#Generate features and labels
human_con_features,human_con_labels = generate_target('Dataset-cleaned/human-feature-concatenation.csv',18)
human_sub_features,human_sub_labels = generate_target('Dataset-cleaned/human-feature-subtraction.csv',9)

gsc_con_features,gsc_con_labels = generate_target('Dataset-cleaned/gsc-feature-concatenation.csv',1024)
gsc_sub_features,gsc_sub_labels = generate_target('Dataset-cleaned/gsc-feature-subtraction.csv',512)

Dataset-cleaned/human-feature-concatenation.csv is processed for logistic regression
Dataset-cleaned/human-feature-subtraction.csv is processed for logistic regression
Dataset-cleaned/gsc-feature-concatenation.csv is processed for logistic regression
Dataset-cleaned/gsc-feature-subtraction.csv is processed for logistic regression


In [39]:


#Preparing Training dataset
human_con_labels_train = np.array(LogisticGenerateTraining(human_con_labels))
human_sub_labels_train = np.array(LogisticGenerateTraining(human_sub_labels))
gsc_con_labels_train = np.array(LogisticGenerateTraining(gsc_con_labels))
gsc_sub_labels_train = np.array(LogisticGenerateTraining(gsc_sub_labels))


#It generates a matrix for training set
human_con_features_train   = np.array(LogisticGenerateTraining(human_con_features))
human_sub_features_train   = np.array(LogisticGenerateTraining(human_sub_features))
gsc_con_features_train   = np.array(LogisticGenerateTraining(gsc_con_features))
gsc_sub_features_train   = np.array(LogisticGenerateTraining(gsc_sub_features))


#Preparing Testing dataset
human_con_labels_test = np.array(LogisticGenerateTesting(human_con_labels,len(human_con_labels_train)))
human_sub_labels_test = np.array(LogisticGenerateTesting(human_sub_labels,len(human_sub_labels_train)))
gsc_con_labels_test = np.array(LogisticGenerateTesting(gsc_con_labels,len(gsc_con_labels_train)))
gsc_sub_labels_test = np.array(LogisticGenerateTesting(gsc_sub_labels,len(gsc_sub_labels_train)))


#It generates a matrix for Testing set
human_con_features_test   = np.array(LogisticGenerateTesting(human_con_features,len(human_con_features_train)))
human_sub_features_test   = np.array(LogisticGenerateTesting(human_sub_features,len(human_sub_features_train)))
gsc_con_features_test   = np.array(LogisticGenerateTesting(gsc_con_features,len(gsc_con_features_train)))
gsc_sub_features_test   = np.array(LogisticGenerateTesting(gsc_sub_features,len(gsc_sub_features_train)))

In [40]:
#Generate weights 
w_human_con = find_logistic_gradient(human_con_features_train,human_con_labels_train)
w_human_sub = find_logistic_gradient(human_sub_features_train,human_sub_labels_train)
w_gsc_con = find_logistic_gradient(gsc_con_features_train,gsc_con_labels_train)
w_gsc_sub = find_logistic_gradient(gsc_sub_features_train,gsc_sub_labels_train)

In [41]:
#Find the accuracy
accuracy_human_con = logistic_accuracy(human_con_features_test,human_con_labels_test,w_human_con,'Human concatenation')
accuracy_human_sub = logistic_accuracy(human_sub_features_test,human_sub_labels_test,w_human_sub,'Human subtraction')
accuracy_gsc_con = logistic_accuracy(gsc_con_features_test,gsc_con_labels_test,w_gsc_con,'GSC concatenation')
accuracy_gsc_sub = logistic_accuracy(gsc_sub_features_test,gsc_sub_labels_test,w_gsc_sub,'GSC subtraction')

 Logistic regression accuracy of Human concatenation is 0.5031847133757962
 Logistic regression accuracy of Human subtraction is 0.47770700636942676
 Logistic regression accuracy of GSC concatenation is 0.7947947947947948
 Logistic regression accuracy of GSC subtraction is 0.5865865865865866


## Tensorflow

In [42]:
import tensorflow as tf
from tqdm import tqdm_notebook
from keras.utils import np_utils

Using TensorFlow backend.


In [43]:
#Create column header for the input dataset
def create_array_header(chars,position):
    em = []
    for j in chars:
        for i in range(position):
            em.append(j+str(i))
    em.append('target')
    return em        

In [44]:
#Generate result for vectors
#Results are made into a list from a int, because of hot shot encoding
def create_target_array(original_array):
    iterable = []
    for iter_item in original_array:
        iterator = []
        if(iter_item == 1):
            iterator = iterator + [0,1] 
        else:
            iterator = iterator + [1,0]
        iterable.append(iterator)
    return iterable

In [45]:
#extract the features from the files and shuffle/split the file into training and test data sets 
def tf_feature(filename,columnnames):
    filecontent = pd.read_csv(filename)
    filecontent.columns = columnnames
    feature_id = filecontent.drop(labels=['target'], axis=1).values
    feature_target = filecontent.target.values
    
    np.random.seed(5)
    tf.set_random_seed(5)
    #replace is set as False to avoid sampling twice
    train_index = np.random.choice(len(feature_id), round(len(feature_id) * 0.75), replace=False)
    test_index = np.array(list(set(range(len(feature_id))) - set(train_index)))
    train_feature = (feature_id[train_index])
    train_target = (feature_target[train_index])
    test_feature = (feature_id[test_index])
    test_target = (feature_target[test_index])    
    return train_feature,train_target,test_feature,test_target

In [46]:
def tensorflow_result(array_sequence,filename, position,final_append):
    train_feature,train_target_temp,test_feature,test_target_temp = tf_feature(filename, array_sequence)
    train_target = create_target_array(train_target_temp)
    test_target = create_target_array(test_target_temp)
    training_epochs = 200
    batch_size = 100
    #the train and test sets are obtained
    #batch and epochs are initializd
    #hidden layer is initialized with no of features
    hidden_layer_1 = int(position) 
    #hidden layer is initialized with no of features
    input_layer = int(position)
    #There are 2 prediction classes, so it is 2
    classes = 2
    learning_rate = 0.001
    
    #initialized the  input of tfgraph
    x = tf.placeholder("float", [None, input_layer])
    y = tf.placeholder("float", [None, classes])

    #set values for weight and bias (input and output later)
    weights = {'h1': tf.Variable(tf.random_normal([input_layer, hidden_layer_1])),'out': tf.Variable(tf.random_normal([hidden_layer_1, classes]))}
    biases = {'b1': tf.Variable(tf.random_normal([hidden_layer_1])),'out': tf.Variable(tf.random_normal([classes]))}

    #find the hidden layer. the product of input, weight and bias
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.sigmoid(layer_1)
    #this is passed to sigmoid method
    
    #This is the output later, prod of weight, bias abd hiddenlayer
    pred = tf.matmul(layer_1, weights['out']) + biases['out'] 
    
    #choose an appropriate cost and optimizer method
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    
    #initializing the session
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        #training model
        for epoch in range(training_epochs):
            avg_cost = 0
            #extract the feature and target of the batch and find the cost and optimization
            total_batch = int(len(train_feature)/batch_size)
            train_feature_batch = np.array_split(train_feature, total_batch)
            train_target_batch = np.array_split(train_target, total_batch)
            for i in range(total_batch):
                batch_feature, batch_target = train_feature_batch[i], train_target_batch[i]
                _, c = sess.run([optimizer, cost], feed_dict={x: batch_feature,y: batch_target})
            avg_cost += c / total_batch
            #this is the average loss of the iteration

        #test the model built and find its accuracy
        correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        print("Tensorflow Accuracy for " + final_append + " ", accuracy.eval({x: test_feature, y: test_target}))
        global result 
        result = tf.argmax(pred, 1).eval({x: test_feature, y: test_target})

In [47]:
array_sequence = create_array_header(['a','b'],9)
tensorflow_result(array_sequence,"Dataset-cleaned/human-feature-concatenation.csv",18,'Human Concatination')
array_sequence = create_array_header(['a'],9)
tensorflow_result(array_sequence,"Dataset-cleaned/human-feature-subtraction.csv",9,'Human Subtraction')
array_sequence = create_array_header(['a','b'],512)
tensorflow_result(array_sequence,"Dataset-cleaned/gsc-feature-concatenation.csv",1024,'GSC Concatenation')
array_sequence = create_array_header(['a'],512)
tensorflow_result(array_sequence,"Dataset-cleaned/gsc-feature-subtraction.csv",512,'GSC Subtraction')


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Tensorflow Accuracy for Human Concatination  0.5164557
Tensorflow Accuracy for Human Subtraction  0.5493671
Tensorflow Accuracy for GSC Concatenation  0.7616
Tensorflow Accuracy for GSC Subtraction  0.5964
