In [66]:
from pathlib import Path

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xmltodict

from compare_genes import getScores
from xmlparse import loadRef, loadGeometryBases, getXmlScore, minXml
from score_strokes import alignStrokes, strokeErrorMatrix
from exhaustive import computeExhaustive, exhaustScore, exhaustScoreAlignment

## Scoring functions

In [67]:
def heuristic(ref_char, ref_data, char_data):
    """Score every gene in ``char_data`` against one reference character.

    For each gene a stroke error matrix is built with ``strokeErrorMatrix``.
    Two candidate stroke maps are derived from it greedily: one driven by the
    per-row minima and one driven by the per-column minima. Each distinct map
    is converted to a 1-based alignment, turned into XML with ``minXml`` and
    scored with ``getXmlScore``; the highest score is kept for that gene.

    Parameters
    ----------
    ref_char
        Reference character identifier, passed through to ``minXml``.
    ref_data
        Three-item sequence ``(ref_geometry, ref_progress_percentage,
        output_size)``. ``output_size`` is not used by this function.
    char_data
        Six-item sequence. Items 0, 2, 3 and 5 are iterated in parallel as the
        per-gene ``(strokes, p_strokes)`` pairs, bases, stroke sets and file
        names; items 1 and 4 are ignored.

    Returns
    -------
    dict
        Maps each gene file name to its best score. If none of a gene's
        alignments could be scored, the value is ``nan`` so the result keeps
        one entry per gene.
    """
    ref_geometry, ref_progress_percentage, _ = ref_data
    g_data, _, base_data, stroke_sets, _, f_names = char_data
    heuristic_scores = {}
    for (geometry_length, bases, stroke_set, f_name) in zip(g_data, base_data, stroke_sets, f_names):
        strokes, p_strokes = geometry_length
        base_matrix = strokeErrorMatrix(strokes, ref_geometry, p_strokes, ref_progress_percentage)
        error_maps = np.copy(base_matrix)
        row_stroke_map = np.full(len(strokes), -1)
        col_stroke_map = np.full(len(strokes), -1)
        row_mins = np.min(error_maps, axis=1)
        col_mins = np.min(error_maps, axis=0)
        compare_scores = []
        stroke_maps = {}
        # Iterate over every smallest error per row
        for row_min in range(len(ref_geometry)):
            coords = np.argwhere(error_maps == row_mins[row_min])
            if len(coords) > 1: # In cases where there are identical error values
                # NOTE(review): if no tied coordinate passes this check, `loc`
                # keeps its value from a previous iteration (or is unbound on
                # the very first one) - confirm the intended fallback.
                for coord in coords:
                    if not np.any(row_stroke_map == coord[0]):
                        loc = coord
                        break
            else:
                loc = coords[0] # Find [row, col] index of current smallest error
            while row_stroke_map[loc[1]] != -1: # Make sure there's no overlap
                # Mask the taken cell with a large sentinel and retry in the same row.
                error_maps[loc[0]][loc[1]] = 10000
                loc[1] = np.argmin(error_maps[loc[0]])
            row_stroke_map[loc[1]] = loc[0]
        if np.array2string(row_stroke_map) not in stroke_maps:
            stroke_maps[np.array2string(row_stroke_map)] = row_stroke_map
        # example: row 0's smallest error is at index 2 and so stroke_map[2] = 0
        # but row 4's smallest error is also at index 2
        # take row 0, recalculate the smallest error excluding index 2,
        # but it's too difficult so just permutation of all overlaps and rearrange them
        error_maps = np.copy(base_matrix)
        for col_min in range(len(ref_geometry)):
            coords = np.argwhere(error_maps == col_mins[col_min])
            if len(coords) > 1: # In cases where there are identical error values
                # NOTE(review): this tie check differs from the row pass above
                # and can leave `loc` stale when no coordinate matches - confirm.
                for coord in coords:
                    if col_stroke_map[coord[1]-1] != -1:
                        loc = coord
                        break
            else:
                loc = coords[0] # Find [row, col] index of current smallest error
            while np.any(col_stroke_map == loc[0]): # Make sure there's no overlap
                error_maps[loc[0]][loc[1]] = 10000
                loc[0] = np.argmin(error_maps[:, loc[1]])
            col_stroke_map[loc[1]] = loc[0]
        if np.array2string(col_stroke_map) not in stroke_maps:
            stroke_maps[np.array2string(col_stroke_map)] = col_stroke_map
        for s in stroke_maps.values():
            # Drop unassigned (-1) slots and shift to 1-based stroke indices.
            heuristic_alignment = np.delete(s, np.where(s == -1))+1
            heuristic_xml = minXml(ref_char, bases, stroke_set, heuristic_alignment) #5408.2.8.gene returning stroke order containing only 5 elements
            try:
                heuristic_score = getXmlScore(heuristic_xml)
            except Exception:
                # Skip this alignment; appending here would reuse a score from
                # an earlier alignment/gene or hit an unbound name.
                print("err:", f_name)
                continue
            compare_scores.append(heuristic_score)
        # Keep one entry per gene even when every alignment failed to score.
        heuristic_scores[f_name] = max(compare_scores) if compare_scores else float("nan")
    return heuristic_scores

def loadRef2(han_char, ref_dir = "Reference"):
    """Load a reference character definition from ``{ref_dir}/{han_char}.han``.

    The file is parsed with ``xmltodict``. Every forward point of every stroke
    is shifted so the bounds' left/bottom corner becomes the origin.

    Parameters
    ----------
    han_char
        Name of the reference file without the ``.han`` extension.
    ref_dir
        Directory that contains the ``.han`` file.

    Returns
    -------
    tuple
        ``(stroke_list, frac_dists, scale)`` where ``stroke_list`` holds one
        array of ``(x, y)`` points per stroke, ``frac_dists`` holds one array
        of ``@fractionalDistance`` values per stroke, and ``scale`` is the
        ``(width, height)`` of the bounds truncated to ints.
    """
    stroke_list = []
    frac_dists = []
    ref_path = f"{ref_dir}/{han_char}.han"
    # Context manager so the file handle is closed even if parsing fails.
    with open(ref_path, "r") as ref_file:
        ref_xml = ref_file.read()
    root = xmltodict.parse(ref_xml)
    bounds = root["hanDefinition"]["bounds"]
    x_min, y_min, x_max, y_max = (float(bounds["@left"]), float(bounds["@bottom"]), float(bounds["@right"]), float(bounds["@top"]))
    scale = (int(x_max-x_min), int(y_max-y_min))
    strokes = root["hanDefinition"]["strokes"]["stroke"]
    # xmltodict yields a dict (not a list) when there is only one child element.
    if isinstance(strokes, dict):
        strokes = [strokes]
    for stroke in strokes:
        point_entries = stroke["points"]["forward"]["pointDistance"]
        # Same single-child case as above, for a stroke with one point.
        if isinstance(point_entries, dict):
            point_entries = [point_entries]
        point_arr = []
        frac_arr = []
        for point in point_entries:
            point_arr.append((float(point["@x"])-x_min,
                              float(point["@y"])-y_min))
            frac_arr.append(float(point["@fractionalDistance"]))
        stroke_list.append(np.array(point_arr))
        frac_dists.append(np.array(frac_arr))
    return stroke_list, frac_dists, scale

## Loading genes from the directory and storing scores
This will compare all of the genes in "Genes/sixgenes/test" with all of the archetypes in "Reference/6-stroke_characters"

`all_scores` is a 2-D list that holds all of the scores. Note that each han character is a row rather than a column.

`ref_chars` holds the names of all of the han character archetypes (the reference file names without their extension).

### Note about loadRef2!
Holiday's original code didn't include `loadRef2`; I added it. It is exactly the same as `loadRef`, except that `ref_path` doesn't include the `{han_char[0]}000` part. I did this so I wouldn't have to create all of those folders in my test directory.

In [68]:
# Collect the archetype names found under ref_dir, then score every gene in
# data_dir against each archetype. all_scores gets one entry per archetype.
all_scores = []
f_names = []
# Start from an empty list so this cell runs on a fresh kernel and does not
# accumulate duplicates when it is re-run.
ref_chars = []
ref_dir = f'{str(Path.home())}/Stylus_Scoring_Generalization/NewRef' # archetype directory
# A separate name for the walked files keeps f_names reserved for gene files.
for _, _, ref_files in os.walk(ref_dir):
    ref_chars.extend(ref_file.split(".")[0] for ref_file in ref_files)
# Dot-files such as ".ipynb_checkpoints" split to an empty stem; drop those.
ref_chars = list(filter(None, ref_chars))
if '.ipynb_checkpoints' in ref_chars:
    ref_chars.remove('.ipynb_checkpoints')
# NOTE(review): data_dir is not defined in this cell; it must already exist in
# the kernel (presumably the gene directory) - define it in a config cell.
for char in ref_chars:
    han_char = char[:4]
    ref_g, ref_l, output_size = loadRef2(han_char, ref_dir)
    char_data = loadGeometryBases(data_dir, output_size)
    f_names = os.listdir(data_dir)
    f_names.sort()
    heuristic_scores = heuristic(han_char, [ref_g, ref_l, output_size], char_data).values()
    all_scores.append(heuristic_scores)

## Generating a table

In [69]:
# Build a table with one row per gene and one column per archetype, then
# write it to new_heuristic.csv.
xml_dir = "GenXml"
# Strip the last 5 characters of each file name (presumably the ".gene"
# extension - confirm) to get the gene label.
gene_names = [name[:len(name) - 5] for name in f_names]
# Not used below; kept in case a later cell reads it.
all_scores_t = np.array(all_scores).T
# Label column identifying which gene each row belongs to.
genes = pd.Series(gene_names, name="Genes")
# Each entry of all_scores already holds one archetype's scores across the
# genes, so pair it with its archetype directly; list() materialises
# dict-view entries into a plain column.
# NOTE(review): row order assumes the scores come back in the same order as
# the sorted f_names - verify against loadGeometryBases.
f = {char: list(scores) for (char, scores) in zip(ref_chars, all_scores)}
frame = pd.DataFrame(f)
# Prepend the gene labels to the score columns.
result = pd.concat([genes, frame], axis=1)
table = pd.DataFrame(result)
# index must be the boolean False; the string 'false' is truthy and would
# still write the index column.
table.to_csv('new_heuristic.csv', index=False)
