In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
import os
import numpy as np
from preprocessing import *
from features_extraction import *

from glob import glob

In [None]:
def extract_strokes(sample):
  """Read the pen trajectory stored in a line-strokes XML file.

  Args:
    sample: Path (or open file object) of the XML document; it is handed
      straight to ``xml.etree.ElementTree.parse``.

  Returns:
    A flat list of ``[x, y, pen_up]`` triples covering every point of every
    stroke in document order. ``pen_up`` is 1 on the last point of each
    stroke and 0 everywhere else.

  Raises:
    ValueError: if the document root has no ``StrokeSet`` child.
    KeyError: if a point lacks an ``x`` or ``y`` attribute.
  """
  stroke_set = ET.parse(sample).getroot().find("StrokeSet")
  if stroke_set is None:
    raise ValueError(f"no StrokeSet element found in {sample!r}")

  strokes = []
  for stroke_node in stroke_set:
    points = [[int(p.attrib['x']), int(p.attrib['y']), 0] for p in stroke_node]
    if not points:
      # A stroke without points has no final point to flag; skipping it also
      # avoids indexing into an empty list when it is the first stroke.
      continue
    points[-1][-1] = 1  # pen-up on the last point of this stroke
    strokes.extend(points)
  return strokes

In [None]:
# Unpack the raw archives: the transcriptions (ascii) and the pen-stroke
# recordings (lineStrokes).
# NOTE(review): no -C target is given, so tar extracts relative to the current
# working directory, while the following cells look under data/ascii and
# data/lineStrokes — confirm the archives' internal layout puts the files there.
!tar -xf "data/ascii-all.tar.gz"
!tar -xf "data/lineStrokes-all.tar.gz"

In [None]:

def _collect_files(root, pattern):
  """Return the files matching `pattern` in `root` and in each of its subdirectories."""
  found = []
  for dirpath, _dirnames, _filenames in os.walk(root):
    found.extend(glob(os.path.join(dirpath, pattern)))
  return found

# Stroke recordings (XML) first, then the transcription files (TXT).
PATH = 'data/lineStrokes'
xml_files = _collect_files(PATH, '*.xml')
PATH = 'data/ascii'
txt_files = _collect_files(PATH, '*.txt')

In [None]:
#  Clean up the text and format it to be ready for preprocessing.
#  Every transcription file is scanned for its "CSR:" marker line; the line right
#  after the marker is skipped and each following line becomes one record paired
#  with the stroke file `<stem>-NN.xml` (NN = 1-based line number, zero padded).
data = []
for txt_file in txt_files:
  with open(txt_file) as f:
    lines = f.readlines()

  # Accept the marker with or without surrounding whitespace instead of relying
  # on a bare `except:` that only retried one exact alternative spelling.
  indx = next((n for n, text in enumerate(lines) if text.strip() == 'CSR:'), None)
  if indx is None:
    raise ValueError(f"no 'CSR:' marker found in {txt_file}")

  # The stroke files mirror the transcription tree; the stem is the same for
  # every line of this file, so build it once.
  xml_stem = txt_file.replace('ascii','lineStrokes').replace('.txt','')

  # NOTE(review): blank lines after the marker still produce records (with an
  # empty transcript); they are only dropped later if their XML file is missing.
  for i, line in enumerate(lines[indx+2:]):
    data.append({'file_path': f'{xml_stem}-{i+1:02}.xml',
                 'transcript': line.replace('\n','')})
    

In [None]:
# Tabulate the records and flag which of the expected stroke files are on disk.
df = pd.DataFrame(data)
df['exists'] = df['file_path'].map(os.path.exists)

In [None]:
# Tally how many expected stroke files are present (True) versus missing (False).
df['exists'].value_counts()

True     12187
False     1021
Name: exists, dtype: int64

In [None]:
# Preview the first rows: stroke-file path, transcript line and existence flag.
df.head()

Unnamed: 0,file_path,transcript,exists
0,/content/lineStrokes/f07/f07-417/f07-417z-01.xml,Having exhausted their invention in the,True
1,/content/lineStrokes/f07/f07-417/f07-417z-02.xml,"preparation of stimulants for the palate, they...",True
2,/content/lineStrokes/f07/f07-417/f07-417z-03.xml,fresh ground and called another sense to their...,True
3,/content/lineStrokes/f07/f07-417/f07-417z-04.xml,delicate application of odours and richly-dist...,True
4,/content/lineStrokes/f07/f07-417/f07-417z-05.xml,"perfumes, these refined voluptuaries aroused t...",True


In [None]:
# Drop every row whose stroke file is missing on disk.
data = df.loc[df['exists']]

In [None]:
# Names of the preprocessing steps and of the features to extract; they are
# passed to preprocess_handwriting and calculate_feature_vector_sequence.
NORM_ARGS = [
    "origin",
    "smooth",
    "slope",
    "resample",
    "slant",
    "height",
]
FEAT_ARGS = [
    "x_cor",
    "y_cor",
    "penup",
    "dir",
    "curv",
    "vic_aspect",
    "vic_curl",
    "vic_line",
    "vic_slope",
    "bitmap",
]

In [None]:
#  Iterate over the data, preprocess and extract the features, then save each
#  sample's features to its own binary file.
BIN_DIR = 'data/bin_files'
# Create the output directory up front: writing into a missing directory fails.
os.makedirs(BIN_DIR, exist_ok=True)

for file_path in data['file_path']:
  strokes = np.array(extract_strokes(file_path))
  ink = preprocess_handwriting(strokes, NORM_ARGS)
  feat = calculate_feature_vector_sequence(ink, FEAT_ARGS)
  # <name>.xml -> data/bin_files/<name>.bin, using os.path so the directory
  # part is stripped whatever separator the platform uses.
  stem = os.path.splitext(os.path.basename(file_path))[0]
  outfilename = os.path.join(BIN_DIR, stem + '.bin')
  # NOTE(review): assuming feat is a NumPy array, tofile writes raw values only;
  # shape and dtype are not stored, so readers must know them — confirm.
  feat.tofile(outfilename)

In [None]:
# Preview the filtered table (only rows whose stroke file exists).
data.head()

Unnamed: 0,file_path,transcript,exists
0,/content/lineStrokes/f07/f07-417/f07-417z-01.xml,Having exhausted their invention in the,True
1,/content/lineStrokes/f07/f07-417/f07-417z-02.xml,"preparation of stimulants for the palate, they...",True
2,/content/lineStrokes/f07/f07-417/f07-417z-03.xml,fresh ground and called another sense to their...,True
3,/content/lineStrokes/f07/f07-417/f07-417z-04.xml,delicate application of odours and richly-dist...,True
4,/content/lineStrokes/f07/f07-417/f07-417z-05.xml,"perfumes, these refined voluptuaries aroused t...",True


In [None]:
# Save the table to an Excel workbook. Besides file_path and transcript, the
# 'exists' column and the DataFrame index are written too (to_excel defaults).
data.to_excel('data/iam_data.xlsx')