#Assignment - 3: Markov Sampling for letter-recognition.csv

In [4]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale

In [5]:
dataset = pd.read_csv('/content/sample_data/letter-recognition.csv')
dataset.columns = ['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar','ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge','xedgey', 'yedge', 'yedgex']

###Parameter Initialisation

In [6]:
def initialiseParams(dataset):
  sampled = pd.DataFrame(columns = dataset.columns)
  uniqueCharacter = list(np.sort(dataset['letter'].unique()))
  countClass = len(uniqueCharacter)
  limit = 100
  m = countClass*limit
  charNo = {}
  c = 0
  for i in uniqueCharacter:
      charNo[i] = c
      c = c + 1
  listOfM = {i:0 for i in uniqueCharacter}
  return sampled, countClass, limit, charNo, m, listOfM

sampled, countClass, limit, charNo, m, listOfM = initialiseParams(dataset)


In [7]:
K, Q, accuracy = 5, 1.2, 0

In [8]:
X = dataset.drop("letter", axis=1)
y = dataset['letter']

scaledX = scale(X)

xTrain, xTest, yTrain, yTest = train_test_split(scaledX, y, test_size = 0.9, random_state = 101)
linearModel = SVC(kernel='linear')
linearModel.fit(xTrain, yTrain)

# predict
predY = linearModel.predict(xTest)
predY

array(['P', 'E', 'M', ..., 'Q', 'F', 'X'], dtype=object)

In [9]:
# Chosing a random sample as first of markov chain
i = np.random.randint(dataset.shape[0])
z0 = dataset.iloc[i]
y0 = linearModel.predict(np.array([z0.drop('letter')]))[0]
if m%countClass==0:
    listOfM[z0['letter']]+=1

In [10]:
d={}
for i, val in z0.items():
    print(i, val)
    d[i] = val
sampled.append(d, ignore_index=True)
sampled

letter W
xbox 2
ybox 0
width 2
height 0
onpix 1
xbar 7
ybar 8
x2bar 4
y2bar 0
xybar 7
x2ybar 8
xy2bar 8
xedge 6
xedgey 9
yedge 0
yedgex 8


Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex


In [11]:
predProb=[]

Utility Function for getting class index

In [12]:
def getCharNo(num):
    return charNo[num]

Utility Function for Loop Condition

In [13]:
def checkExistence(dic, limit):
    for i,val in dic.items():
        if val < limit:
            return True
    return False            

Utility Loss Function

In [14]:
def lossFunction(actual,pred):
    if actual == pred:
        return 1.0
    return np.exp(-2)

In [15]:
lst=[]

Running loop till the limit is reached for every class

In [16]:
count = 0
while checkExistence(listOfM,limit):

    i=np.random.randint(dataset.shape[0])
    while i in lst:
        i=np.random.randint(dataset.shape[0])
    z1=dataset.iloc[i]
    y1=linearModel.predict(np.array([z1.drop('letter')]))[0]
    n=lossFunction(z1['letter'],y1)
    d=lossFunction(z0['letter'],y0)
    p=n/d
    # count = count + 1
    # if count%2000 == 0:
    #   print(count)

    if accuracy==K:
        accuracy=0
        p2=Q*p
        p2=min(p2,1)
        predProb.append([z1['letter'],y1,p2])
        sampled=sampled.append(z1)
        z0=z1
        listOfM[z1['letter']]+=1
        accuracy = accuracy + 1
        lst.append(i)
    elif p==1 and z0['letter']==z1['letter']:
        n=np.exp(-getCharNo(y1)*getCharNo(z1['letter']))
        d=np.exp(-getCharNo(y0)*getCharNo(z0['letter']))

        p1=n/d
        p1=min(p1,1)
        predProb.append([z1['letter'],y1,p1])
        sampled=sampled.append(z1)
        z0=z1
        listOfM[z1['letter']]+=1
        accuracy = accuracy + 1
        lst.append(i)
    elif p<1:
        predProb.append([z1['letter'],y1,p])
        sampled=sampled.append(z1)
        z0=z1
        listOfM[z1['letter']]+=1
        accuracy = accuracy + 1
        lst.append(i)
    elif p==1 and z0['letter']!=z1['letter']:
        predProb.append([z1['letter'],y1,p])
        sampled=sampled.append(z1)
        z0=z1
        listOfM[z1['letter']]+=1
        accuracy = accuracy + 1
        lst.append(i)

In [17]:
sampled

Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
1199,R,3,9,4,6,4,6,8,8,4,7,5,8,2,7,5,11
19449,J,5,9,6,7,4,10,4,6,5,8,6,5,2,8,4,6
16833,V,7,10,7,8,3,3,11,4,4,10,12,8,2,10,1,8
8915,W,3,6,4,4,4,8,7,6,2,6,7,8,5,9,4,6
13110,K,4,8,6,6,4,3,8,2,7,10,11,12,3,8,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15882,H,6,8,9,10,9,7,4,4,2,6,4,6,8,6,11,7
334,J,2,4,4,3,1,8,8,2,7,15,5,8,0,7,1,8
5140,K,6,11,9,8,7,6,6,1,6,9,6,10,5,7,5,8
11259,U,5,5,6,4,3,5,8,5,8,10,8,9,3,9,3,5


In [18]:
print(len(predProb))
predProb

10051


[['R', 'C', 1.0],
 ['J', 'C', 1.0],
 ['V', 'C', 1.0],
 ['W', 'C', 1.0],
 ['K', 'C', 1.0],
 ['Q', 'C', 1],
 ['W', 'C', 1.0],
 ['L', 'C', 1.0],
 ['K', 'C', 1.0],
 ['T', 'C', 1.0],
 ['I', 'C', 1],
 ['K', 'C', 1.0],
 ['L', 'C', 1.0],
 ['M', 'C', 1.0],
 ['G', 'C', 1.0],
 ['P', 'C', 1],
 ['N', 'C', 1.0],
 ['Q', 'C', 1.0],
 ['E', 'C', 1.0],
 ['D', 'C', 1.0],
 ['C', 'C', 1],
 ['J', 'C', 0.1353352832366127],
 ['J', 'C', 1.0],
 ['G', 'C', 1.0],
 ['S', 'C', 1.0],
 ['M', 'C', 1],
 ['N', 'C', 1.0],
 ['T', 'C', 1.0],
 ['A', 'C', 1.0],
 ['K', 'C', 1.0],
 ['X', 'C', 1],
 ['D', 'C', 1.0],
 ['J', 'C', 1.0],
 ['G', 'C', 1.0],
 ['F', 'C', 1.0],
 ['Z', 'C', 1],
 ['X', 'C', 1.0],
 ['I', 'C', 1.0],
 ['F', 'C', 1.0],
 ['I', 'C', 1.0],
 ['N', 'E', 1],
 ['W', 'V', 1.0],
 ['I', 'C', 1.0],
 ['I', 'C', 1.0],
 ['D', 'C', 1.0],
 ['I', 'C', 1],
 ['P', 'C', 1.0],
 ['Z', 'C', 1.0],
 ['S', 'C', 1.0],
 ['A', 'C', 1.0],
 ['M', 'C', 1],
 ['S', 'C', 1.0],
 ['E', 'C', 1.0],
 ['K', 'C', 1.0],
 ['P', 'C', 1.0],
 ['O', 'C', 1],

In [19]:
sampled.to_csv('/content/sample_data/markovSamplesLetter.csv')

In [20]:
prob=[]
for i in predProb:
    prob.append(i[2])
print(len(prob))
print(prob)
print(sampled.shape)
print(type(sampled))
print(sampled['xbar'])

10051
[1.0, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 0.1353352832366127, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 0.1353352832366127, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 1.0, 1, 1.0, 1.0, 1.0, 

In [21]:
sampled['probability']=prob

In [22]:
sampled.to_csv("/content/sample_data/markovSamplesLetterProbability.csv")

In [23]:
for i in lst:
    dataset=dataset.drop([i])
dataset.to_csv('/content/sample_data/remainingLetter.csv')

In [24]:

train = pd.read_csv("/content/sample_data/markovSamplesLetter.csv")
test = pd.read_csv("/content/sample_data/remainingLetter.csv")

In [25]:
train

Unnamed: 0.1,Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,1199,R,3,9,4,6,4,6,8,8,4,7,5,8,2,7,5,11
1,19449,J,5,9,6,7,4,10,4,6,5,8,6,5,2,8,4,6
2,16833,V,7,10,7,8,3,3,11,4,4,10,12,8,2,10,1,8
3,8915,W,3,6,4,4,4,8,7,6,2,6,7,8,5,9,4,6
4,13110,K,4,8,6,6,4,3,8,2,7,10,11,12,3,8,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10046,15882,H,6,8,9,10,9,7,4,4,2,6,4,6,8,6,11,7
10047,334,J,2,4,4,3,1,8,8,2,7,15,5,8,0,7,1,8
10048,5140,K,6,11,9,8,7,6,6,1,6,9,6,10,5,7,5,8
10049,11259,U,5,5,6,4,3,5,8,5,8,10,8,9,3,9,3,5


In [26]:
print(train.shape,test.shape)

(10051, 18) (9949, 18)


In [27]:
train

Unnamed: 0.1,Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,1199,R,3,9,4,6,4,6,8,8,4,7,5,8,2,7,5,11
1,19449,J,5,9,6,7,4,10,4,6,5,8,6,5,2,8,4,6
2,16833,V,7,10,7,8,3,3,11,4,4,10,12,8,2,10,1,8
3,8915,W,3,6,4,4,4,8,7,6,2,6,7,8,5,9,4,6
4,13110,K,4,8,6,6,4,3,8,2,7,10,11,12,3,8,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10046,15882,H,6,8,9,10,9,7,4,4,2,6,4,6,8,6,11,7
10047,334,J,2,4,4,3,1,8,8,2,7,15,5,8,0,7,1,8
10048,5140,K,6,11,9,8,7,6,6,1,6,9,6,10,5,7,5,8
10049,11259,U,5,5,6,4,3,5,8,5,8,10,8,9,3,9,3,5


In [28]:
test

Unnamed: 0.1,Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,12,G,4,9,6,7,6,7,8,6,2,6,5,11,4,8,7,8
2,15,F,6,9,5,4,3,10,6,3,5,10,5,7,3,9,6,9
3,17,C,7,10,5,5,2,6,8,6,8,11,7,11,2,8,5,9
4,20,J,1,3,2,2,1,8,8,2,5,14,5,8,0,7,0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9944,19989,P,2,1,3,2,1,4,10,3,5,10,8,5,0,9,3,7
9945,19990,W,3,8,5,6,5,11,11,2,2,5,8,7,7,12,1,7
9946,19992,E,4,9,5,6,3,5,9,2,10,10,8,9,2,8,5,5
9947,19994,T,5,8,7,7,7,7,9,4,8,7,7,8,3,10,8,6


In [29]:
train = train.drop(train.columns[[0]], axis=1)
test = test.drop(test.columns[[0]], axis=1)
train.columns = ['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar','ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge','xedgey', 'yedge', 'yedgex']
test.columns = ['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar','ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge','xedgey', 'yedge', 'yedgex']

In [30]:
X_train = np.array(train.drop("letter", axis = 1))
y_train = np.array(train["letter"])

X_test = np.array(test.drop("letter", axis = 1))
y_test = np.array(test["letter"])

In [38]:
def hellinger(x1, x2):
    return np.sqrt(np.dot(x1,x2.T))

In [39]:
from sklearn.metrics.pairwise import chi2_kernel

In [40]:
class KernelManager:
  def __init__(self, xTrain, yTrain, xTest, yTest):
    self.xTrain = xTrain
    self.yTrain = yTrain
    self.xTest = xTest
    self.yTest = yTest

  def linearKernel(self):
    modelLinear = SVC(kernel='linear')
    initialText = "The accuracy using linear kernel : "
    return modelLinear, initialText

  def rbfKernel(self):
    modelLinear = SVC(kernel='rbf')
    initialText = "The accuracy using rbf kernel : "
    return modelLinear, initialText

  def chiSquaredKernel(self):
    modelLinear = SVC(kernel=chi2_kernel)
    initialText = "The accuracy using chi-squared kernel : "
    return modelLinear, initialText

  def hellingerKernel(self):
    modelLinear = SVC(kernel=hellinger)
    initialText = "The accuracy using hellinger kernel : "
    return modelLinear, initialText

  def findAccuracy(self, modelLinear, initialText):
    modelLinear.fit(self.xTrain, self.yTrain)
    y_pred = modelLinear.predict(self.xTest)
    accuracy = metrics.accuracy_score(y_true=self.yTest, y_pred=y_pred)
    print(initialText, accuracy, "\n") 
    return accuracy


In [41]:
manager = KernelManager(X_train, y_train, X_test, y_test)
acc_array = []

In [42]:
model_linear, txt = manager.linearKernel()
acc_array.append(str(1 - float(manager.findAccuracy(model_linear, txt)))[:4])

The accuracy using linear kernel :  0.8459141622273595 



In [43]:
model_linear, txt = manager.rbfKernel()
acc_array.append(str(1 - float(manager.findAccuracy(model_linear, txt)))[:4])

The accuracy using rbf kernel :  0.8978791838375716 



In [44]:
model_linear, txt = manager.chiSquaredKernel()
acc_array.append(str(1 - float(manager.findAccuracy(model_linear, txt)))[:4])

The accuracy using chi-squared kernel :  0.9526585586491104 



In [45]:
model_linear, txt = manager.hellingerKernel()
acc_array.append(str(1 - float(manager.findAccuracy(model_linear, txt)))[:4])

The accuracy using hellinger kernel :  0.7822896773545079 



In [46]:
acc_array.append("NaN")
acc_array

['0.15', '0.10', '0.04', '0.21', 'NaN']

In [47]:
table = pd.DataFrame(columns=['Kernel', 'KPCA', 'SVDD', 'OCSVM', 'OCSSVM', 'OCSSVM with SMO', 'MS_SVM'])
table['Kernel'] = ['Linear', 'RBF', 'Intersection', 'Hellinger', 'χ2']
table['KPCA'] = ['0.02', '0.05', '0.18', '0.01', '0.18']
table['SVDD'] = ['0.09', '0.07', '0.01', '0.02', '0.00']
table['OCSVM'] = ['0.01', '0.14', '0.04', '0.02', '0.02']
table['OCSSVM'] = ['0.07', '0.09', '0.26', '0.13', '0.18']
table['OCSSVM with SMO'] = ['0.04', '0.04', '0.22', '0.10', '0.17']
table['MS_SVM'] = acc_array
table

Unnamed: 0,Kernel,KPCA,SVDD,OCSVM,OCSSVM,OCSSVM with SMO,MS_SVM
0,Linear,0.02,0.09,0.01,0.07,0.04,0.15
1,RBF,0.05,0.07,0.14,0.09,0.04,0.1
2,Intersection,0.18,0.01,0.04,0.26,0.22,0.04
3,Hellinger,0.01,0.02,0.02,0.13,0.1,0.21
4,χ2,0.18,0.0,0.02,0.18,0.17,
