In [104]:
#SVM
import numpy as np
import csv
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from time import time

class SVM():
        
    def read_data(self, file_string):
        raw_data = []
        #read data in form of list of lists
        with open(file_string, 'r') as f:   
            raw = csv.reader(f, delimiter = '\n')           
            for row in raw:
                raw_data.append(row[0].split(', '))
            # print(raw_data)
        return raw_data

    def __init__(self):
            self.is_continous = [1,0,1,0,1,0,0,0,0,0,1,1,1,0]
            self.discrete = {1:['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov', 'Without-pay', 'Never-worked'],
             3:['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters', '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool'],
            5:['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed', 'Married-spouse-absent', 'Married-AF-spouse'],
            6:['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Armed-Forces'],
            7:['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'],
            8:['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'],
            9:['Male', 'Female'],
            13:['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany', 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland', 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', 'Holand-Netherlands'],
            14:['0','1']}
            self.att_average = {}
            self.sigma = 0.7 #fixed after analyzing accuracies for different sigmas
            self.q = 0       #fixed after analyzing accuracies for different q
            
    def cls_avg(self, data, attr): 
        #function to calculate average of a particular attribute(ignoring '?' values)
        if(attr in self.att_average.keys()):
            return self.att_average[attr]
        
        final_data = data[:]
        cls_sum = 0.0
        for record in data:
            if(record[attr]!='?'):
                cls_sum += float(record[attr])
        cls_sum = cls_sum/len(data)
        self.att_average[attr] = cls_sum
        return cls_sum
         
    def normalise(self,data):
        final_data = data[:]
        # print(final_data)
        maxi = np.amax(np.array(data),axis = 0)
        mini = np.amin(np.array(data), axis = 0)
        
        # mean_lst = np.mean(np.array(data), axis = 0)
        # std_lst = np.std(np.array(data),axis = 0)
        # use (x-mean)/std for better generalisation,will take more training time
        
        for record in final_data:
            for i in range(len(final_data[0])):
                if(i!=14):
                    record[i] = float(record[i])/(maxi[i]-mini[i])
        return final_data
            
    def change_to_continous(self, data, attr):
        #change attribute number attr to continous and normalise
        for record in data:
            if(record[int(attr)]=='?'):
                continue
            record[int(attr)] = float(self.discrete[int(attr)].index(record[int(attr)]) + 1)/len(self.discrete[int(attr)])
        return data
    
    def handle_exceptions(self,data):
        #give average value of the attribute if the value is faulty
        for record in data:
            for i in range(len(record)):
                if(record[i] == '?'):
                    record[i] = self.cls_avg(data,i)
        return data
    
    def process_data(self, data):
        final_data = data[:]
        r = len(final_data[0])
        for i in range(r):
            if(self.is_continous[i]==0):
                final_data = self.change_to_continous(final_data, i)
        print("Changed")
        final_data = self.handle_exceptions(final_data)
        print("Handeled")
        
        for record in final_data:
            for i in range(len(record)):
                record[i] = float(record[i])
        

        final_data = self.normalise(final_data)
        print("Normalised")
        return final_data
    
    def clean_data(self,data,string):
        final_data = data[:]
        final_data = self.process_data(final_data)
        return final_data
    
    def kernel_lin(self,X,Y):
        return np.dot(np.array(X),np.transpose(Y))
    
    def kernel_pol(self,X,Y):
        xy = np.dot(X,np.transpose(Y))
        # print(xy)
        xy += 1
        # print(xy)
        return np.power(xy,self.q)
        
    def kernel_gauss(self,X,Y):
        x_x = np.zeros((X.shape[0],1))
        y_y = np.zeros((1,Y.shape[0]))
        
        for i in range(X.shape[0]):
            x_x[i][0] = np.dot(X[i],np.transpose(X[i]))
        
        for i in range(Y.shape[0]):
            y_y[0][i] = np.dot(Y[i],np.transpose(Y[i]))
                
        X_Y = 2*np.dot(X,np.transpose(Y))
        
        return np.exp((X_Y-x_x-y_y)/float((self.sigma * self.sigma)))
    
    def test_kernels(self, x_train, labels, k):
        # print(self.sigma)
        print("\nLinear kernel")
        estimator = SVC(kernel = self.kernel_lin)
        start = time()
        x_train_temp = x_train
        labels_temp = labels
        accuracy = cross_val_score(estimator,x_train_temp,labels_temp,cv=k)
        stop = time()
        print("Time: ", stop-start, "Accuracy: ",sum(accuracy)/5)
    
        print("\nPolynomial kernel")
        estimator = SVC(kernel = self.kernel_pol)
        for i in range(5):    
            start = time()
            x_train_temp = x_train
            labels_temp = labels
            self.q = i+1
            accuracy = cross_val_score(estimator,x_train_temp,labels_temp,cv=k)
            stop = time()
            print("q = ", self.q, "Time: ", stop-start, "Accuracy: ",sum(accuracy)/5)

        print("\nGaussian Kernel")
        estimator = SVC(kernel = self.kernel_gauss)
        for i in range(9):
            start = time()
            x_train_temp = x_train
            labels_temp = labels
            self.sigma = float(i+1)/10.0
            accuracy = cross_val_score(estimator,x_train_temp,labels_temp,cv=k)
            stop = time()
            print("sigma = ", self.sigma, "Time: ", stop-start, "Accuracy: ",sum(accuracy)/5)

In [105]:
class MultiKernelfixedrules(object):
    def __init__(self, kernels, X=None, Y = None):
        self.kernels = kernels
        self.X = X
        self.Y = Y
        # giving weights according to the accuracies. accuracy[kernel_method]/sum(accuracy)
        self.w =[0.3312,0.3355,0.3332] 
        
    def multiKernelFixed(self, X, Y): #kernel to be used
        return sum([self.w[i] * self.kernels[i](X,Y) for i in range(3)])

    def validation(self):  # Testing purposes
        print("\nMultiKernelFixed: ")
        multi = SVC(kernel=self.multiKernelFixed)
        start = time()
        scores = cross_val_score(multi, self.X, self.Y, cv=5)
        stop = time()
        print('accuracy=', sum(scores)/5, 'time', stop-start)


In [106]:
class MultiKernelheuristic(object):
    def __init__(self, kernels, X=None, Y = None):
        self.kernels = kernels
        self.X = X
        self.Y = Y
        self.nm = [0,0,0]
        self.flag = 0
        
    def heuristic(self,X,Y):
        #to compute coefficients of each gram matrix obtained from seperate kernels
        y_y= np.dot(Y,np.transpose(Y))
        
        for i in range(3):
            gram = self.kernels[i](X,Y)
            frob_k_yy = np.sum(np.multiply(gram,y_y))
            frob_k_k = np.sum(np.multiply(gram,gram))
            
            self.nm[i] = frob_k_yy/(len(self.X) * np.sqrt(frob_k_k))
        self.flag = 1
        s = sum(self.nm)
        self.nm = [x/s for x in self.nm]
        
    def multi_heuristic(self,X,Y): # kernel to be used
        if(not self.flag):
            self.heuristic(X,Y)
        return sum([self.nm[i] * self.kernels[i](X,Y) for i in range(3)])
    
    def validation(self):
        print("\nMultiKernelheuristic: ")
        multi = SVC(kernel=self.multi_heuristic)
        start = time()
        scores = cross_val_score(multi, self.X, self.Y, cv=5)
        stop = time()
        print('accuracy=', sum(scores)/5, 'time', stop-start)
        

In [107]:
class check_class(object):
    def __init__(self,data,labels,s):
        self.x_train = data
        self.labels=labels
        self.s = s
    
    def check_seperate_kernel(self):
        x_trains_temp = self.x_train
        labels_temp = self.labels
        self.s.test_kernels(x_trains_temp,labels_temp,5)
        
    def check_multi_fixed(self):
        x_trains_temp = self.x_train
        labels_temp = self.labels
        mkfr = MultiKernelfixedrules([self.s.kernel_lin,self.s.kernel_pol,self.s.kernel_gauss],x_trains_temp,labels_temp)
        mkfr.validation()
    
    def check_multi_heuristic(self):
        x_trains_temp = self.x_train
        labels_temp = self.labels
        mkh = MultiKernelheuristic([self.s.kernel_lin,self.s.kernel_pol,self.s.kernel_gauss],x_trains_temp,labels_temp)
        mkh.validation()

In [108]:
def main():
    s = SVM()
    raw_data = (s.read_data('/home/desmond/Programming/MachineLearning/AML/PA1/SVM/data/train.csv'))
    
    x_train = np.array(raw_data)[:,:-1].tolist()
    labels = np.array(raw_data)[:,len(raw_data[0])-1].tolist()
    data = s.process_data(x_train)
    for i in range(len(labels)):
        labels[i] = int(labels[i]) 
     
    #getting time and accuracy for every method
    check = check_class(data, labels,s)
    check.check_seperate_kernel()
    check.check_multi_fixed()
    check.check_multi_heuristic()

main()

Changed
Handeled
Normalised

Linear kernel
Time:  3.026012420654297 Accuracy:  0.836250183252

Polynomial kernel
q =  1 Time:  6.155317544937134 Accuracy:  0.836125261328
q =  2 Time:  6.827847242355347 Accuracy:  0.84787526543
q =  3 Time:  25.294585943222046 Accuracy:  0.846999718017
q =  4 Time:  47.95439553260803 Accuracy:  0.84350034126
q =  5 Time:  177.61670446395874 Accuracy:  0.836750730274

Gaussian Kernel
sigma =  0.1 Time:  18.25573420524597 Accuracy:  0.785622667138
sigma =  0.2 Time:  15.058085680007935 Accuracy:  0.815996036522
sigma =  0.3 Time:  12.446285963058472 Accuracy:  0.830624087548
sigma =  0.4 Time:  11.7430739402771 Accuracy:  0.836499714502
sigma =  0.5 Time:  11.401437044143677 Accuracy:  0.837499089795
sigma =  0.6 Time:  11.013935565948486 Accuracy:  0.840375340869
sigma =  0.7 Time:  10.880332231521606 Accuracy:  0.841375497315
sigma =  0.8 Time:  10.83888053894043 Accuracy:  0.841251122315
sigma =  0.9 Time:  10.82202672958374 Accuracy:  0.840876590967


### Comparison of accuracies and training time between Linear, Polynomial and Gaussian Kernels:

#### Linear-
1. Best(only) accuracy is 83.62%.
2. Training time is 3.02s

#### Polynomial
1. Best accuracy is reported for q = 2, and is equal to 84.78%.
2. Training Time for q = 2 is 6.82s

#### Gaussian
1. Best accuracy is reported for sigma = 0.7, and is equal to 84.14%.
2. Training time for sigma = 0.7 is 10.88s

     As we can see from above data accrucay follows order Polynomial > Gaussian > Linear and training time follows the order Gaussian > Polynomial > Linear.

### Comparison between MultiKernelFixedRule with Seperate kernels

#### Accuracy:
1. MultiKernelFixed rule gives accuracy of 83.87%
2. As we can observe accuracy is lower than that of Polynomial but higher than Linear.

#### Time:
1. MultiKernelFixed rule takes 88.71s which is higher than each kernel seperate.


### Comparison of MultiKernelHeuristic with all other methods

#### Accuracy:
1. Accuracy is 83.85 which is comparable to MultiKernelFixedRule and lesser than Polynomial kernel.

#### Time:
1. Training Time is 125.89s which is more than every other method used