In [1]:
# encoding=utf-8
import time
import random
import logging

import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

from generate_dataset import *



In [20]:
class SVM(object):
    
    def __init__(self,kernel='linear',epsilon = 0.001):
        self.kernel = kernel
        self.epsilon = epsilon
    
    def _init_parameters(self, features, lables):
        '''
        Initialize parameters
        '''
        self.X = features
        self.Y = lables
        
        self.b = 0.0
        self.n = len(features[0])
        self.N = len(features)
        self.alpha = [0.0] * self.N
        self.E = [self._E_(i) for i in xrange(self.N)]
        
        #may need to optimize?
        self.C = 1000
        
        #loop for training
        self.Max_Iteration = 5000
    
    def _E_(self,i):
        '''
        Equation(7.105)
        '''
        return self._g_(i)-self.Y[i]
    
    def _g_(self,i):
        '''
        Equation(7.104)
        '''
        result = self.b
        
        for j in xrange(self.N):
            result += self.alpha[j] * self.Y[j] * self._K_(self.X[i],self.X[j])
        
        return result
    
    def _K_(self,x1,x2):
        '''
        Kernel function
        '''
        if self.kernel == 'linear':
            return np.sum([x1[k] * x2[k] for k in xrange(self.n)])
        if self.kernel == 'poly':
            return (np.sum([x1[k] * x2[k] for k in xrange(self.n)])+1)**3
    
    def _satisfy_KKT(self,i):
        '''
        Note that the check is within the tolerance of epsilon
        Equation (7.111-7.113)
        '''
        ygx = self.Y[i] * self._g_(i)
        
        if abs(self.alpha[i]) < self.epsilon:
            return ygx > 1 or ygx == 1
        elif abs(self.alpha[i] - self.C) < self.epsilon:
            return ygx <1 or ygx == 1
        else:
            return abs(ygx-1) < self.epsilon
    
    def _select_two_parameters(self):
        '''
        Select two variables
        Based on method 7.4.2
        '''
        
        index_list = [i for i in xrange(self.N)]
        
        i1_lst_1 = filter(lambda i: self.alpha[i] > 0 and self.alpha[i] < self.C, index_list)
        i1_lst_2 = list(set(index_list) - set(i1_lst_1))
        
        i1_lst = i1_lst_1
        i1_lst.extend(i1_lst_2) 
        
        for i in i1_lst:
            if self._satisfy_KKT(i):
                continue
            
            E1 = self.E[i]
            _max_ = (0,0)
            
            for j in index_list:
                if i == j:
                    continue
                
                E2 = self.E[j]
                
                #P.129 
                #Way to choose second variable 
                if abs(E1 - E2) > _max_[0]:
                    _max_ = (abs(E1 - E2),j)
            
            return i, _max_[1] 
        
    def train(self, features, lables):
        '''
        features = train_features
        labels = train_labels
        '''
        self._init_parameters(features,lables)
        
        for times in xrange(self.Max_Iteration):
            #7.4.2 
            i1, i2 = self._select_two_parameters()
            
            #p.126
            #how to set L&H
            L = max(0, self.alpha[i2]-self.alpha[i1])
            H = min(self.C, self.C + self.alpha[i2] - self.alpha[i1])
            
            if self.Y[i1] == self.Y[i2]:
                L = max(0, self.alpha[i2] + self.alpha[i1] - self.C)
                H = min(self.C, self.alpha[i2] + self.alpha[i1])
            
            E1 = self.E[i1]
            E2 = self.E[i2] 
            eta = self._K_(self.X[i1],self.X[i1]) + self._K_(self.X[i2],self.X[i2]) - \
                  2 * self._K_(self.X[i1],self.X[i2])  #Equation(7.107) 
            
            alpha2_new_unc = self.alpha[i2] + self.Y[i2] * (E1-E2) / eta #Equation(7.106) 
            
            #Equation(7.108)
            alpha2_new = 0
            if alpha2_new_unc > H:
                alpha2_new = H
            elif alpha2_new_unc < L:
                alpha2_new = L
            else:
                alpha2_new = alpha2_new_unc
                
            #Equation(7.109)
            alpha1_new = self.alpha[i1] + self.Y[i1] * self.Y[i2] * (self.alpha[i2]-alpha2_new)
            
            #P.130
            #Equation(7.115)&(7.116)
            b_new = 0
            b1_new = -E1 - self.Y[i1]*self._K_(self.X[i1],self.X[i1])*(alpha1_new-self.alpha[i1]) -\
                     self.Y[i2]*self._K_(self.X[i2],self.X[i1])*(alpha2_new-self.alpha[i2]) + self.b
            
            b2_new = -E2 - self.Y[i1]*self._K_(self.X[i1],self.X[i2])*(alpha1_new-self.alpha[i1]) -\
                     self.Y[i2]*self._K_(self.X[i2],self.X[i2])*(alpha2_new-self.alpha[i2]) + self.b

            
            if alpha1_new > 0 and alpha1_new < self.C:
                b_new = b1_new
            elif alpha2_new > 0 and alpha2_new < self.C:
                b_new = b2_new
            else:
                b_new = (b1_new + b2_new)/2
            
            #update alpha and b
            self.alpha[i1] = alpha1_new
            self.alpha[i2] = alpha2_new
            self.b = b_new
            
            self.E[i1] = self._E_(i1)
            self.E[i2] = self._E_(i2) 
    
    def _predict_(self, feature):
        result = self.b
        
        for i in xrange(self.N):
            result += self.alpha[i]*self.Y[i]*self._K_(feature,self.X[i])
        
        if result >= 0:
            return 1 
        else:
            return -1
    
    def predict(self, features):
        results = []
        
        for feature in features:
            results.append(self._predict_(feature))
        
        return results

        
    

In [21]:
if __name__ == "__main__":
    print "Start reading data."
    
    time1 = time.time()
    
    #split 2/3 as training data, 1/3 as testing data
    train_features, train_labels, test_features, test_labels = generate_dataset(2000,visualization=False)
    
    time2 = time.time()
    print "Reading data cost ", time2-time1, ' seconds.','\n'
    
    print "Start training model."
    svm = SVM()
    svm.train(train_features,train_labels)
    
    time3 = time.time()
    print "Training model cost ", time3-time2, ' seconds.', '\n' 
    
    print "Start predicting."
    test_predict = svm.predict(test_features)
    time4 = time.time()
    print "Predicting data cost ", time4-time3, ' seconds.','\n'
    
    score = accuracy_score(test_labels,test_predict)
    print "SVM got accuracy score: ", score

Start reading data.
Reading data cost  2.99990582466  seconds. 

Start training model.
Training model cost  385.221678019  seconds. 

Start predicting.
Predicting data cost  14.0144660473  seconds. 

SVM got accuracy score:  0.89039039039
