<h2><center>Test scripts for amrlib and Smatch APIs</center></h2>

#### Imports, setup and paths:

In [1]:
import re
import spacy
import amrlib
import pandas as pd
from pathlib import Path

# Run amrlib's spaCy extension setup, then load the small English spaCy pipeline.
amrlib.setup_spacy_extension()
nlp = spacy.load('en_core_web_sm')

# Root folder for every dataset used below, relative to the working directory.
ds_path = Path('.').joinpath('datasets')

#### 1.0 Unzipping functions:

In [8]:
import os
import gzip
import tarfile
from pathlib import Path

def extract_all_gz(source_filepath, dest_filepath, block_size=65536):
    """Decompress a gzip file into ``dest_filepath``, streaming it chunk by chunk.

    Args:
        source_filepath: Path of the gzip-compressed file to read.
        dest_filepath: Path of the output file. It is opened in ``'wb'`` mode,
            so existing content is replaced.
        block_size: Number of decompressed bytes requested per read.
    """
    with gzip.open(source_filepath, 'rb') as packed, open(dest_filepath, 'wb') as unpacked:
        # An empty read signals the end of the compressed stream.
        for chunk in iter(lambda: packed.read(block_size), b''):
            unpacked.write(chunk)


def extract_all_tar_gz(source_dir):
    """Extract every gzip-compressed tarball located directly inside ``source_dir``.

    Only regular files whose names end in ``.tar.gz`` or ``.tgz`` are opened.
    Sub-directories and unrelated files are skipped, so the function can be run
    again on a directory that already holds previously extracted content.
    Each archive is unpacked into ``source_dir`` itself.

    Args:
        source_dir: Directory to scan (not searched recursively).

    Raises:
        tarfile.TarError: If a matching file is not a readable gzip tarball.
        OSError: If ``source_dir`` cannot be listed or a file cannot be read or written.
    """
    root = Path(source_dir)
    # The listing is taken once, before anything is extracted; sorting makes
    # the processing order deterministic.
    for entry in sorted(os.listdir(root)):
        archive = root/entry
        if not (archive.is_file() and entry.lower().endswith(('.tar.gz', '.tgz'))):
            continue
        with tarfile.open(archive, 'r:gz') as f:
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this on archives from a trusted source.
            f.extractall(source_dir)
        
#extract_all_gz('datasets/ppdb/ppdb-2.0-s-all.gz', 'datasets/ppdb/ppdb-2.0-s-all')
#extract_all_tar_gz(r'C:\ProgramData\Anaconda3\envs\dlp38\Lib\site-packages\amrlib\data')

#### 2.0 Loading SICK-trial:

In [2]:
sick_path = ds_path/'sts'/'SICK_trial.txt'
# Tab-separated file with a header row; only the first four columns are loaded.
sick = pd.read_csv(sick_path, header=0, sep='\t', usecols=list(range(4)))
# Rescale the relatedness score by 5; the scaled value is later compared with the Smatch F-score.
# NOTE(review): if the raw scores span 1-5 this gives [0.2, 1.0] rather than [0, 1] - confirm that is intended.
sick['relatedness_score'] = sick['relatedness_score']/5

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
sick.info()
sick.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pair_ID            500 non-null    int64  
 1   sentence_A         500 non-null    object 
 2   sentence_B         500 non-null    object 
 3   relatedness_score  500 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 15.8+ KB
None


Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score
0,4,The young boys are playing outdoors and the ma...,There is no boy playing outdoors and there is ...,0.72
1,24,A person in a black jacket is doing tricks on ...,A skilled person is riding a bicycle on one wheel,0.68
2,105,Four children are doing backbends in the gym,Four girls are doing backbends and playing out...,0.76
3,116,A player is throwing the ball,Two teams are competing in a football match,0.58
4,119,Five children are standing in front of a woode...,Five children are standing in a wooden hut,0.84


In [3]:
import smatch
from smatch import score_amr_pairs
# Module-level smatch switch: with False, score_amr_pairs yields one
# (precision, recall, f-score) tuple per AMR pair instead of one overall score.
smatch.single_score = False

# Parse both sentence columns into AMR graphs with amrlib's sentence-to-graph model.
stog = amrlib.load_stog_model()
sents_a, sents_b = sick['sentence_A'].to_list(), sick['sentence_B'].to_list()
sents_a_amr = stog.parse_sents(sents_a, add_metadata=False)
sents_b_amr = stog.parse_sents(sents_b, add_metadata=False)

sick_amr_dir = ds_path/'sts'

def save_amr(amrs, filepath):
    """Write AMR graphs to ``sick_amr_dir/filepath``, each followed by a blank line.

    Args:
        amrs: Iterable of AMR graphs; each item is written with ``print``.
        filepath: File name (or relative path) below ``sick_amr_dir``.
    """
    # Explicit UTF-8 so the file content does not depend on the platform's
    # default encoding (e.g. cp1252 on Windows).
    with open(sick_amr_dir/filepath, 'w', encoding='utf-8') as f:
        for amr in amrs:
            print(amr, file=f, end='\n\n')

save_amr(sents_a_amr, 'sents_a_amr.txt')
save_amr(sents_b_amr, 'sents_b_amr.txt')

# Read the files back with the same encoding they were written in.
with open(sick_amr_dir/'sents_a_amr.txt', encoding='utf-8') as f1, \
        open(sick_amr_dir/'sents_b_amr.txt', encoding='utf-8') as f2:
    f_scores = [f_score for (_, _, f_score) in score_amr_pairs(f1, f2)]

# Every sentence pair needs exactly one score before the column can be attached.
if len(f_scores) != len(sick):
    raise ValueError(
        f'Smatch produced {len(f_scores)} scores for {len(sick)} sentence pairs; '
        'check the saved AMR files for missing or malformed graphs.'
    )

sick['f_score'] = f_scores
# Per-pair squared error; the column is named 'mse' but no averaging happens here.
sick['mse'] = (sick['relatedness_score'] - sick['f_score'])**2

sick.head()

Loading model C:\ProgramData\Anaconda3\envs\dlp38\Lib\site-packages\amrlib\data\model_stog\model.pt


Unnamed: 0,pair_ID,sentence_A,sentence_B,relatedness_score,f_score,mse
0,4,The young boys are playing outdoors and the ma...,There is no boy playing outdoors and there is ...,0.72,0.8,0.0064
1,24,A person in a black jacket is doing tricks on ...,A skilled person is riding a bicycle on one wheel,0.68,0.26087,0.17567
2,105,Four children are doing backbends in the gym,Four girls are doing backbends and playing out...,0.76,0.434783,0.105766
3,116,A player is throwing the ball,Two teams are competing in a football match,0.58,0.235294,0.118822
4,119,Five children are standing in front of a woode...,Five children are standing in a wooden hut,0.84,0.8,0.0016


In [19]:
# Save the scored pairs as a tab-separated file, largest squared error first.
(
    sick
    .sort_values(by='mse', ascending=False)
    .to_csv(
        sick_amr_dir/'SICK_trial_AMR_SMATCH.tsv',
        sep='\t',
        float_format='%.3f',
        index=False,
    )
)