In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import xml.etree.ElementTree as ET  
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from statistics import mean
import math
import os
import pandas as pd
import scipy.stats as stats
import csv
from scipy.io import savemat

In [3]:
def extract_features(path,r):
  features={}

  tree = ET.parse(path)
  root = tree.getroot()
  a=[]
  b=[]
  word=[]
  all_x=[]
  all_y=[]
  all_num=[]
  x_coordinates = []
  y_coordinates = []
  num=[]
  for i in range(1,len(root)):
    for x in root[i]:
      if (x.text=='\n\t\t\t'):
        continue
      else:
        l = list(map(str,x.text.split(',')))
        l = [x.strip() for x in l]
        for x in l:
          xy = list(map(int,x.split(' ')))
          x_coordinates.append(xy[0])
          y_coordinates.append(xy[1])
          num.append(i)
        a.append(x_coordinates)
        b.append(y_coordinates)
        word.append(num)
        x_coordinates = []
        y_coordinates = []
        num=[]  
  for i in range(len(a)):
    for j in range(len(a[i])):
      all_x.append(a[i][j])
      all_y.append(b[i][j])
      all_num.append(word[i][j])
  features['word']= all_num
  #Time Stamp
  time_stamp = np.arange(0,len(all_x)*0.02,0.02)

  #Speed
  tracewise_speed=[0]
  speed=[]
  whole_speed=[]
  for i in range(len(a)):
    for j in range(1,len(a[i])):
      if len(a[i][j:])>=r:
        tracewise_speed.append(((a[i][j]-a[i][j-1])**2+(b[i][j]-b[i][j-1])**2)**(0.5)/(0.02*r))
      else:
        tracewise_speed.append(0)
    speed.append(tracewise_speed)
    tracewise_speed=[0]
  for i in range(len(speed)):
    for j in range(len(speed[i])):
      whole_speed.append(speed[i][j])
  features['speed']=whole_speed

  #writing direction
  tracewise_cos=[0]
  tracewise_sin=[0]
  cos=[]
  sin=[]
  whole_cos=[]
  whole_sin=[]
  for i in range(len(a)):
    for j in range(1,len(a[i])):
      if len(a[i][j:])>=r:
        l=((a[i][j]-a[i][j-1])**2+(b[i][j]-b[i][j-1])**2)**(0.5)
        tracewise_cos.append((a[i][j]-a[i][j-1])/l)
        tracewise_sin.append((b[i][j]-b[i][j-1])/l)
      else:
        tracewise_cos.append(0)
        tracewise_sin.append(0)
    cos.append(tracewise_cos)
    sin.append(tracewise_sin)
    tracewise_cos=[0]
    tracewise_sin=[0]
  for i in range(len(cos)):
    for j in range(len(cos[i])):
      whole_cos.append(cos[i][j])
      whole_sin.append(sin[i][j])
  features['writing_direction_x']=whole_cos
  features['writing_direction_y']=whole_sin

  #Curvature
  cur_x=[]
  cur_y=[]
  tracewise_cur_x=[0]
  tracewise_cur_y=[0]
  whole_cur_x=[]
  whole_cur_y=[]
  for i in range(len(cos)):
    for j in range(1,len(cos[i])):
      if len(cos[i][j:])>=r:
        tracewise_cur_x.append(np.cos(cos[i][j-1])*np.cos(cos[i][j])+np.sin(cos[i][j-1])*np.sin(cos[i][j]))
        tracewise_cur_y.append(np.cos(sin[i][j-1])*np.sin(sin[i][j])-np.sin(sin[i][j-1])*np.cos(sin[i][j]))
      else:
        tracewise_cur_x.append(0)
        tracewise_cur_y.append(0)
    cur_x.append(tracewise_cur_x)
    cur_y.append(tracewise_cur_y)
    tracewise_cur_x=[0]
    tracewise_cur_y=[0]
  for i in range(len(cur_x)):
    for j in range(len(cur_x[i])):
      whole_cur_x.append(cur_x[i][j])
      whole_cur_y.append(cur_y[i][j])
  features['curvature_x']=whole_cur_x
  features['curvature_y']=whole_cur_y
  

  #Vicinity Aspect
  temp_x=[]
  temp_y=[]
  left_x=[]
  right_x=[]
  left_y=[]
  right_y=[]
  var=[]
  vc=[]
  vl=[]
  vs_cos=[]
  vs_sin=[]
  tracewise_var=[]
  for i in range(len(a)):
    for j in range(len(a[i])):
      if len(a[i])>=(2*r+1):
        if len(a[i][:j])>r:
          left_x=a[i][j-r:j]
        elif len(a[i][:j])<=r:
          left_x=a[i][:j]
        if len(a[i][j+1:])>r:
          right_x=a[i][j+1:j+r+1]
        elif len(a[i][j+1:])<=r:
          right_x=a[i][j+1:]
      elif len(a[i])<(2*r+1):
        if len(a[i][:j])>r:
          left_x=a[i][j-r:j]
        elif len(a[i][:j])<=r:
          left_x=a[i][:j]
        if len(a[i][j+1:])>r:
          right_x=a[i][j+1:j+r+1]
        elif len(a[i][j+1:])<=r:
          right_x=a[i][j+1:]

      if len(b[i])>=(2*r+1):
        if len(b[i][:j])>r:
          left_y=b[i][j-r:j]
        elif len(b[i][:j])<=r:
          left_y=b[i][:j]
        if len(b[i][j+1:])>r:
          right_y=b[i][j+1:j+r+1]
        elif len(b[i][j+1:])<=r:
          right_y=b[i][j+1:]
      elif len(b[i])<(2*r+1):
        if len(b[i][:j])>r:
          left_y=b[i][j-r:j]
        elif len(b[i][:j])<=r:
          left_y=b[i][:j]
        if len(b[i][j+1:])>r:
          right_y=b[i][j+1:j+r+1]
        elif len(b[i][j+1:])<=r:
          right_y=b[i][j+1:]

      left_x.append(a[i][j])
      left_y.append(b[i][j])
      temp_x=left_x+right_x
      temp_y=left_y+right_y
      dx=max(temp_x)-min(temp_x)
      dy=max(temp_y)-min(temp_y)
      if dx+dy==0:
        var.append(0)
      else:
        var.append((dy-dx)/(dx+dy))
      
      #vicinity_curliness
      dist=[]
      for k in range(1,len(temp_x)):
        dist.append(((temp_x[k]-temp_x[k-1])**2+(temp_y[k]-temp_y[k-1])**2)**(0.5))
      if dx+dy==0:
        vc.append(0)
      else:
        vc.append(sum(dist)/max(dx,dy))
      dist=[]

  features['vicinity_aspect']=var
  features['vicinity_curliness']=vc

  return(features)

In [4]:
def normalization(fea):
  fea_norm={}
  sp=fea['speed'];
  fea_norm["speed"] = stats.zscore(sp)
  wdx = fea['writing_direction_x'];
  fea_norm["writing_direction_x"] = stats.zscore(wdx)
  wdy = fea['writing_direction_y'];
  fea_norm["writing_direction_y"] = stats.zscore(wdy)
  cx = fea['curvature_x'];
  fea_norm["curvature_x"] = stats.zscore(cx)
  cy = fea['curvature_y'];
  fea_norm["curvature_y"] = stats.zscore(cy)
  va = fea['vicinity_aspect'];
  fea_norm["viscinity_aspect"] = stats.zscore(va)
  vc = fea['vicinity_curliness'];
  fea_norm["viscinity_curliness"] = stats.zscore(vc)
  fea_norm['word']= fea['word']
  
  return fea_norm


In [5]:
folder='/content/drive/MyDrive/Datasets/query'

In [8]:
writers=[1,2,3]

In [13]:
with open('/content/drive/MyDrive/Datasets/2 Features_Codebook Descriptor/Gap_1/Data_1.csv', 'w') as f:
  csvwriter = csv.writer(f)
  a=[[],[],[],[],[],[],[],[],[],[]]
  for x in writers:
    writer_path = os.path.join(folder,str(x))
    for sample in os.listdir(writer_path):
      sample_path=os.path.join(writer_path,sample)
      print(sample_path)
      features = extract_features(sample_path,1)
      normalized = normalization(features)
      c=0
      for key, values in normalized.items():
        if c==7:
          break
        a[c].extend(normalized[key])
        count=len(normalized[key])
        c+=1
      a[9].extend(normalized['word'])
      if sample[1]=='.':
        doc=sample[0]
      else:
        doc=sample[:2]
      doc_num=[doc]*count
      a[8].extend(doc_num)
      if writer_path[-2:-1]=='/':
        writer = writer_path[-1:]
      else:
        writer = writer_path[-2:]
      writer_num=[writer]*len(a[0])
      a[7].extend(writer_num)
  for x in a:
    csvwriter.writerow(x)

/content/drive/MyDrive/Datasets/query/1/62.inkml
/content/drive/MyDrive/Datasets/query/1/7.inkml
/content/drive/MyDrive/Datasets/query/1/2.inkml
/content/drive/MyDrive/Datasets/query/1/44.inkml
/content/drive/MyDrive/Datasets/query/1/6.inkml
/content/drive/MyDrive/Datasets/query/1/48.inkml
/content/drive/MyDrive/Datasets/query/1/46.inkml
/content/drive/MyDrive/Datasets/query/1/65.inkml
/content/drive/MyDrive/Datasets/query/1/56.inkml
/content/drive/MyDrive/Datasets/query/1/68.inkml
/content/drive/MyDrive/Datasets/query/1/9.inkml
/content/drive/MyDrive/Datasets/query/1/8.inkml
/content/drive/MyDrive/Datasets/query/1/38.inkml
/content/drive/MyDrive/Datasets/query/1/28.inkml
/content/drive/MyDrive/Datasets/query/1/26.inkml
/content/drive/MyDrive/Datasets/query/1/50.inkml
/content/drive/MyDrive/Datasets/query/1/4.inkml
/content/drive/MyDrive/Datasets/query/1/18.inkml
/content/drive/MyDrive/Datasets/query/1/30.inkml
/content/drive/MyDrive/Datasets/query/1/22.inkml
/content/drive/MyDrive/Dat

In [14]:
a=[[],[],[],[],[],[],[],[],[],[]]

In [15]:
a=[[],[],[],[],[],[],[],[],[],[]]
with open('/content/drive/MyDrive/Datasets/2 Features_Codebook Descriptor/Gap_1/Data_1.csv') as file:
  csvreader=csv.reader(file)
  c=0
  for row in csvreader:
    a[c].extend(row)
    c+=1

In [16]:
from numpy import asarray
t = np.transpose(asarray(a))
print(t.shape)

(10,)


  


In [17]:
new=np.zeros(t.shape,dtype='float')

In [18]:
for i in range(len(t)):
  for j in range(len(t[i])):
    new[i][j]=t[i][j]

TypeError: ignored

In [None]:
t=[[],[],[],[],[],[],[],[],[],[]]

In [None]:
new.shape

In [None]:
dic = {'data':new}

In [None]:
savemat('/content/drive/MyDrive/Datasets/2 Features_Codebook Descriptor/Gap_1/Data_1.mat',dic)

In [None]:
folder='/content/drive/MyDrive/Datasets/query'

In [None]:
nf=[0]*43
for x in os.listdir(folder):
  writer_path=os.path.join(folder,x)
  nf[int(x)-1]=len(os.listdir(writer_path))

In [None]:
print(nf)

[70, 91, 76, 75, 69, 54, 68, 14, 49, 70, 35, 51, 8, 55, 89, 96, 58, 76, 49, 59, 22, 8, 54, 57, 55, 70, 75, 75, 53, 55, 88, 77, 97, 82, 41, 81, 9, 74, 37, 31, 65, 40, 58]


In [None]:
dic = {'data':nf}

In [None]:
savemat('/content/drive/MyDrive/Datasets/2 Features_Codebook Descriptor/Gap_1/files.mat',dic)