In [17]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from compare_genes import getScores
from xmlparse import loadRef, loadRef2, loadGeometryBases, getXmlScore, minXml
from score_strokes import alignStrokes
from exhaustive import computeExhaustive, exhaustScore, exhaustScoreAlignment

## Scoring functions

In [18]:
# Obtaining scores through heuristic algorithm without getScores
def heuristicScores(algo, ref_char, char_data):
    ref_g, ref_l, output_size = loadRef2(ref_char, "Reference/6-stroke_characters")
    g_data, _, base_data, stroke_sets, _, f_names = char_data
    heuristic_scores = []
    for (gl, bases, stroke_set, f_name) in zip(g_data, base_data, stroke_sets, f_names):
        g, l = gl
        heuristic_alignment = np.array(algo(g, ref_g, l, ref_l))+1
        heuristic_xml = minXml(ref_char, bases, stroke_set, heuristic_alignment)
        heuristic_score = getXmlScore(heuristic_xml)
        heuristic_scores.append(heuristic_score)
    return heuristic_scores

# Obtaining scores through heuristic algorithm with getScores
def heuristicScoresShort(algo, ref_char, data_dir):
    heuristic_scores, _, marks = getScores(algo, ref_char, data_dir)
    return heuristic_scores, marks
#the below function doesn't work properly because scores doesn't have the same number of elements as exhaustive_scores. Anisa has a more-working
#version of this code, for code that actually checks multiple archetypes
#However, it's supposed to use heuristicScores to generate scores with all of the archetypes in ref_dir, then find the greatest one for that character
#and return it
def heuristicScoresWrap(algo, ref_dir, char_data):
    max_score = -1
    ref_directs = os.listdir(f"{ref_dir}")
    ref_directs.sort()
    for ref_direct in ref_directs:
        ref_list = os.listdir(f"{ref_dir}/{ref_direct}")
        ref_list.sort()
        
        for ref_char in ref_list:
            print(ref_char)
            ref_charShort = ref_char[:4:]
            print(ref_charShort)
            score = heuristicScores(algo, ref_charShort, char_data)
            scores = []
            for i in score:
                if (i > max_score):
                    max_score = i
            scores.append(max_score)
    return scores
    
# Obtaining scores through exhaustive search
def exhaustiveScores(ref_char, char_data, data_dir):
    ref_g, ref_l, output_size = loadRef2(ref_char, "Reference/6-stroke_characters")
    g_data, han_chars, base_data, _, _, f_names = char_data
    exhaustive_scores = []
    for (gl, han_char, bases, f_name) in zip(g_data, han_chars, base_data, f_names):
        g, l = gl
        original_score = exhaustScore(ref_char, f_name, data_dir, force_refresh=True, save=False)
        exhaustive_scores.append(original_score)
    return exhaustive_scores
    

## Loading genes from the directory and storing scores
This will compare all of the genes in "Genes/sixgenes/test" with all of the archetypes in "Reference/6-stroke_characters"

all_scores is a 2-d list that holds all of the score numbers. Unfortunately, each han character is a row instead of a column

ref_chars holds all of the han character gene names

In [19]:
all_scores = []
data_dir = "Genes/sixgenes/test"
ref_dir = "Reference/6-stroke_characters"
ref_chars = os.listdir(ref_dir)
for char in ref_chars:
    han_char = char[:4:]
    ref_g, ref_l, output_size = loadRef2(han_char, ref_dir)
    char_data = loadGeometryBases(data_dir, output_size)
    f_names = []
    f_names = os.listdir(data_dir)
    f_names.sort()
    heuristic_scores = heuristicScores(alignStrokes, han_char, char_data)
    all_scores.append(heuristic_scores)

## Generating a table

In [20]:
xml_dir = "GenXml"
#gene_names = [f_name.split(".")[0] for (i, f_name) in enumerate(f_names)]
gene_names = f_names
all_scores_t = np.array(all_scores).T
#below code makes a label for which gene is which
genes = pd.Series(f_names, name="Genes")
frame = {}
frame = pd.DataFrame.from_records(all_scores_t, columns=ref_chars)
#this allows the label for the genes to be added to the data frame
result = pd.concat([genes, frame], axis=1)
table = pd.DataFrame(result)
table


Unnamed: 0,Genes,6709.han,81F3.han,5408.han,5217.han,5B89.han,4EFB.han,5728.han,5B57.han,6210.han
0,4EFB.2.6.gene,0.001285854,5.803043e-05,0.0001412032,4.5e-05,2.72082e-06,0.208971,0.008413,3.294523e-06,0.00104835
1,5408.2.9.gene,0.001601488,0.001167325,0.2063964,4.9e-05,1.709877e-06,3.1e-05,0.000218,9.207894e-08,2.271252e-05
2,56E0.2.14.gene,0.0002622283,0.0001638655,5.017359e-07,4e-06,3.577765e-07,0.003016,0.000284,4.073994e-07,0.0005689849
3,5728.2.7.gene,0.005117442,5.074766e-07,5.306386e-08,0.000422,0.0001036769,0.082959,0.209811,0.0001699518,0.0003490108
4,5B57.2.15.gene,9.053703e-11,5.116237e-05,4.037064e-13,2.9e-05,3.876555e-05,4.8e-05,2e-06,0.2009396,5.884607e-08
5,5B57.2.3.gene,2.079295e-06,2.800913e-07,2.939109e-11,7.8e-05,9.414226e-05,1e-05,2e-06,0.2019582,1.993413e-05
