In [79]:
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from handcoded_tokenizer import STLTokenizer
import ast
import torch

In [24]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides NumPy runtime warnings (e.g. division by zero
# in the metric functions) and NLTK's BLEU n-gram warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [74]:
# Read one formulae table per training step; each variable is named after the
# step number embedded in its CSV file name.
s_7200, s_8000, s_9600, s_10000, s_10400, s_12000 = [
    pd.read_csv(csv_path)
    for csv_path in (
        'formulae_mining/step_7200_formulae.csv',
        'formulae_mining/step_8000_formulae.csv',
        'formulae_mining/step_9600_formulae.csv',
        'formulae_mining/step_10000_formulae.csv',
        'formulae_mining/step_10400_formulae.csv',
        'formulae_mining/step_12000_formulae.csv',
    )
]

In [73]:
# Show the last rows of the step-7200 table.
# NOTE(review): the recorded output has stacked "Unnamed: 0*" columns and a
# final row whose values do not line up with the headers — presumably the CSV
# was re-saved with its index and ends in a truncated record; verify the file
# before trusting metrics computed from it.
s_7200.tail()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Gold Formula,Generated Formula,Embedding Gold Formula,Embedding Generated Formula
196,196,196.0,"always[6,17] ( x_2 <= 0.5562 )","always[17,21] ( x_2 <= 0.4811 )","[0.026494910940527916, 0.014440430328249931, 0...","[0.023633580654859543, 0.013538600876927376, 0..."
197,197,197.0,( ( x_2 <= 0.0842 and x_1 <= -0.4233 ) or ( x_...,( x_2 >= -0.088 or ( x_2 >= -0.088 or ( x_2 >=...,"[0.08846453577280045, 0.029129117727279663, 0....","[0.011584503576159477, 0.00930745154619217, 0...."
198,198,198.0,"( x_1 <= -0.2029 until[8,19] not ( x_2 >= -0.0...",( x_1 <= -0.0401 and ( x_2 <= -0.081 and x_2 <...,"[0.0009152419515885413, 0.0021476703695952892,...","[0.0010684828739613295, 0.0023432974703609943,..."
199,199,199.0,"( x_2 <= -2.3411 until[15,17] ( x_1 >= 0.538 u...",( x_2 <= -1.0841 and x_2 <= -1.181 ),"[0.0003933501720894128, 0.0013355386909097433,...","[0.0006671140436083078, 0.0017945998115465045,..."
200,7005710602,0.000102,0.00032462782110087574,0.0025793707463890314,"0.011999044567346573]""",


In [47]:
def token_division(input_string):
    """Tokenise ``input_string`` and drop padding tokens.

    A fresh ``STLTokenizer`` is built from ``tokenizer_files/tokenizer.json``
    on every call; whatever its ``tokenize`` method yields is returned as a
    list, minus every element equal to the string ``"pad"``.
    """
    stl_tokenizer = STLTokenizer('tokenizer_files/tokenizer.json')

    kept_tokens = []
    for token in stl_tokenizer.tokenize(input_string):
        if token != "pad":
            kept_tokens.append(token)
    return kept_tokens

In [22]:
def bleu_score(dataset):
    """Summarise sentence-level BLEU between gold and generated formulae.

    Every formula is tokenised with ``token_division``. The gold token list is
    used as the single reference and the generated token list as the
    hypothesis for NLTK's ``sentence_bleu``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``"Gold Formula"`` and
        ``"Generated Formula"``.

    Returns
    -------
    tuple
        ``(min, mean, max)`` of the per-row BLEU scores.

    Raises
    ------
    ValueError
        Raised by NumPy's min reduction when ``dataset`` has no rows.
    """
    bleu_scores = []

    # Walk the column values positionally. ``dataset[col][idx]`` is a label
    # lookup and raises KeyError (or picks the wrong row) as soon as the
    # frame's index is not exactly 0..n-1, e.g. after filtering rows.
    for gold_formula, generated_formula in zip(
        dataset["Gold Formula"], dataset["Generated Formula"]
    ):
        gold = token_division(gold_formula)
        generated = token_division(generated_formula)

        # sentence_bleu takes a *list of references*, each itself a token
        # list. Passing the bare token list would make every gold token
        # string its own "reference sentence" and score against characters.
        bleu_scores.append(sentence_bleu([gold], generated))

    return np.min(bleu_scores), np.mean(bleu_scores), np.max(bleu_scores)

In [36]:
def exact_match(dataset):
    """Summarise position-wise token agreement between gold and generated formulae.

    Every formula is tokenised with ``token_division``. For each row the score
    is the number of positions at which the gold and generated tokens are
    equal (compared over the shorter of the two sequences) divided by the
    number of gold tokens.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``"Gold Formula"`` and
        ``"Generated Formula"``.

    Returns
    -------
    tuple
        ``(min, mean, max)`` of the per-row match fractions.

    Raises
    ------
    ValueError
        Raised by NumPy's min reduction when ``dataset`` has no rows.
    """
    percentage = []

    # Walk the column values positionally. ``dataset[col][idx]`` is a label
    # lookup and fails when the frame's index is not exactly 0..n-1.
    for gold_formula, generated_formula in zip(
        dataset["Gold Formula"], dataset["Generated Formula"]
    ):
        gold = token_division(gold_formula)
        generated = token_division(generated_formula)

        if not gold:
            # No gold tokens to divide by: count it as a full match only when
            # the generated side is empty as well.
            percentage.append(1.0 if not generated else 0.0)
            continue

        match_count = sum(
            1 for gold_token, gen_token in zip(gold, generated)
            if gold_token == gen_token
        )
        percentage.append(match_count / len(gold))

    # Aggregate only after every row has been scored; returning from inside
    # the loop would report statistics for the first row alone.
    return np.min(percentage), np.mean(percentage), np.max(percentage)

In [58]:
def cosine_similarity(dataset):
    """Summarise cosine similarity between gold and generated embeddings.

    Both embedding columns hold Python-literal text (e.g. ``"[0.1, 0.2]"``)
    that is parsed with ``ast.literal_eval`` before the similarity is
    computed as ``dot(gold, gen) / (||gold|| * ||gen||)``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``"Embedding Gold Formula"`` and
        ``"Embedding Generated Formula"``.

    Returns
    -------
    tuple
        ``(min, mean, max)`` of the per-row cosine similarities.

    Raises
    ------
    ValueError
        Raised by ``ast.literal_eval`` for a cell that is not a valid literal,
        or by NumPy's min reduction when ``dataset`` has no rows.
    """
    similarities = []

    # Walk the column values positionally. ``dataset[col][idx]`` is a label
    # lookup and fails when the frame's index is not exactly 0..n-1.
    for gold_text, gen_text in zip(
        dataset["Embedding Gold Formula"], dataset["Embedding Generated Formula"]
    ):
        gold = ast.literal_eval(gold_text)
        gen = ast.literal_eval(gen_text)

        dot_product = np.dot(gold, gen)
        gold_norm = np.linalg.norm(gold)
        gen_norm = np.linalg.norm(gen)

        similarities.append(dot_product / (gold_norm * gen_norm))

    return np.min(similarities), np.mean(similarities), np.max(similarities)

In [92]:
def euclidean_distance(dataset):
    """Summarise Euclidean distance between gold and generated embeddings.

    Both embedding columns hold Python-literal text (e.g. ``"[0.1, 0.2]"``)
    that is parsed with ``ast.literal_eval``, turned into torch tensors and
    compared with ``torch.dist`` (default 2-norm).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must provide the columns ``"Embedding Gold Formula"`` and
        ``"Embedding Generated Formula"``.

    Returns
    -------
    tuple
        ``(min, mean, max)`` of the per-row distances, as NumPy scalars in the
        dtype torch produced for the distances.

    Raises
    ------
    ValueError
        Raised by ``ast.literal_eval`` for a cell that is not a valid literal,
        or by NumPy's min reduction when ``dataset`` has no rows.
    """
    distances = []

    # Walk the column values positionally. ``dataset[col][idx]`` is a label
    # lookup and fails when the frame's index is not exactly 0..n-1.
    for gold_text, generated_text in zip(
        dataset["Embedding Gold Formula"], dataset["Embedding Generated Formula"]
    ):
        gold = torch.tensor(ast.literal_eval(gold_text))
        generated = torch.tensor(ast.literal_eval(generated_text))

        distances.append(torch.dist(gold, generated))

    # Convert the 0-d tensors to one NumPy array explicitly, once, instead of
    # letting each reduction below coerce the tensor list on its own.
    distance_values = np.asarray([distance.numpy() for distance in distances])

    return np.min(distance_values), np.mean(distance_values), np.max(distance_values)

In [93]:
# (min, mean, max) embedding distance for the step-7200 formulae table.
euclidean_distance(s_7200)

(0.09882245, 2.5938642, 9.606708)

In [94]:
# (min, mean, max) embedding distance for the step-12000 formulae table.
euclidean_distance(s_12000)

(0.031583704, 2.009391, 8.319774)