In [1]:
import pickle

# Load the evaluation labels into the notebook-level name `eval_label`.
# (A `global` declaration is unnecessary here: a top-level assignment in a
# cell already binds a module-level name.)
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load label files that come from a trusted source.
with open('./test_label.pkl', 'rb') as f:
    eval_label = pickle.load(f)

In [2]:
from inference_roberta_v6 import *
# The two numeric settings handed to inference(); their meaning is defined in
# inference_roberta_v6 -- presumably decision thresholds, TODO confirm.
INFERENCE_PARAMS = [0.20907902353747032, 0.15113470895883632]

tmp_preds = inference(INFERENCE_PARAMS, eval_label)
print(len(tmp_preds), len(eval_label))

# Fraction of evaluation ids that received a prediction.
# Computed in plain Python on purpose: passing dict views to np.isin wraps each
# view in a 0-d object array, so the result is a single "are the key sets
# equal" boolean rather than a per-id coverage ratio.
n_covered = sum(k in tmp_preds for k in eval_label)
coverage = n_covered / len(eval_label) if eval_label else 0.0
print(f'All predicted: {coverage}')

# Keep only the ids that are evaluated. Ids without a prediction are left out
# instead of raising KeyError; run_eval scores a missing prediction as 0.
predictions = {k: tmp_preds[k] for k in eval_label if k in tmp_preds}

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaM

  0%|          | 0/7652 [00:00<?, ?it/s]

7652 7652
All predicted: 1.0


In [3]:
import re

import numpy as np
from nltk.stem.porter import *
from rouge import Rouge
from tqdm.auto import tqdm

# Ids that run_eval leaves out of the aggregate scores.
# NOTE(review): the reason for excluding them is not recorded here --
# presumably they break the scorer; confirm before changing the list.
problem_pid = [
    3511353, 3315219, 3611751, 3219211,
    3352309, 3223953, 3519874, 3357186,
    3588814, 3328210, 3076965,
]

# Shared scorer / stemmer instances used by eval_score.
stemmer = PorterStemmer()
rouge = Rouge()

# Matches every character that is neither a word character nor whitespace.
_PUNCT_RE = re.compile(r'[^\w\s]')


def _stem_phrase(phrase):
    """Replace punctuation with spaces and stem each whitespace-separated token."""
    return ' '.join(stemmer.stem(_PUNCT_RE.sub(' ', token)) for token in phrase.split())


def eval_score(reference, hypothesis):
    """Score a list of extracted strings against a list of ground-truth strings.

    Every string is normalised with ``_stem_phrase``; duplicate hypotheses
    (after normalisation) are collapsed. Each hypothesis/reference pair is
    scored once with ROUGE-L and both its precision and recall are reused.

    Parameters
    ----------
    reference : list of str
        Ground-truth strings.
    hypothesis : list of str or None
        Extracted strings. ``None`` is treated like an empty list.

    Returns
    -------
    (recall, precision) : tuple of float
        recall    -- for every reference, the best ROUGE-L recall over all
                     hypotheses, averaged over the references.
        precision -- for every unique hypothesis, the best ROUGE-L precision
                     over all references, summed and divided by the number of
                     hypotheses as passed in.
        ``(0.0, 0.0)`` is returned when either list is empty.
    """
    if hypothesis is None:
        hypothesis = []

    # Nothing can match when one side is empty. Returning early also avoids
    # dividing by zero / calling max() on an empty sequence.
    if len(reference) == 0 or len(hypothesis) == 0:
        return 0.0, 0.0

    stem_r = [_stem_phrase(x) for x in reference]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # floating-point sums below do not depend on hash ordering.
    stem_h = list(dict.fromkeys(_stem_phrase(x) for x in hypothesis))

    # pair_scores[i][j] is the ROUGE-L result for hypothesis i vs reference j.
    pair_scores = [
        [rouge.get_scores(h, r)[0]['rouge-l'] for r in stem_r]
        for h in stem_h
    ]

    best_precision = [max(cell['p'] for cell in row) for row in pair_scores]
    best_recall = [
        max(row[j]['r'] for row in pair_scores)
        for j in range(len(stem_r))
    ]

    recall = sum(best_recall) / len(reference)
    # NOTE(review): the denominator is the raw hypothesis count, not the
    # de-duplicated one, so repeated extractions lower precision. Kept as-is
    # to leave reported numbers unchanged -- confirm this is intended.
    precision = sum(best_precision) / len(hypothesis)

    return recall, precision

def run_eval(predictions, labels=None):
    """Average eval_score over the labelled ids.

    Parameters
    ----------
    predictions : dict
        Maps an id to its list of extracted strings. Ids that are absent (or
        map to ``None``) count as recall 0 and precision 0.
    labels : dict, optional
        Maps an id to its list of ground-truth strings. Defaults to the
        notebook-level ``eval_label``.

    Returns
    -------
    dict
        ``{'Recall': float, 'Precision': float}`` -- the means over every id
        that is not listed in ``problem_pid``. Both values are 0.0 when no id
        is scored.
    """
    if labels is None:
        labels = eval_label
    skipped = set(problem_pid)

    scores = []
    for k, v in tqdm(labels.items()):

        if k in skipped:
            continue

        p = predictions.get(k)

        if p is not None:
            recall, precision = eval_score(v, p)
        else:
            recall, precision = 0, 0
        scores.append([recall, precision])

    # np.mean over an empty array yields a scalar NaN, which cannot be
    # unpacked into two values; report zeros instead.
    if not scores:
        return {'Recall': 0.0, 'Precision': 0.0}

    mean_recall, mean_precision = np.mean(np.array(scores), 0).tolist()
    return {'Recall': mean_recall, 'Precision': mean_precision}

In [4]:
results = run_eval(predictions)

# F1 is the harmonic mean of recall and precision; it is defined as 0 when
# both are 0 so the cell does not fail with ZeroDivisionError.
pr_sum = results['Recall'] + results['Precision']
results['F1'] = (2*results['Recall']*results['Precision'])/pr_sum if pr_sum > 0 else 0.0
print(results)

  0%|          | 0/7652 [00:00<?, ?it/s]

{'Recall': 0.8771889869764969, 'Precision': 0.7894659451981241, 'F1': 0.8310188502153947}


In [5]:
N_DISPLAY = 30
DISPLAY_SEED = 0  # set to None to draw a different sample on every run

# Only sample ids that run_eval actually scores (it skips problem_pid).
candidate_ids = [k for k in eval_label if k not in problem_pid]
n_shown = min(N_DISPLAY, len(candidate_ids))

# replace=False so the same id is never printed twice.
rng = np.random.default_rng(DISPLAY_SEED)
display_sample = rng.choice(candidate_ids, size=n_shown, replace=False) if n_shown else []

for k in display_sample:
    # A missing prediction is shown (and scored) as an empty extraction.
    p = predictions.get(k) or []
    l = eval_label[k]
    R, P = eval_score(l, p)

    print(k)
    print(f'Ground Truth: {l}')
    print(f'Extraction: {p}')
    print(f'Recall: {R}; Precision: {P}')
    print('-----------------------------------------')

3505823
Ground Truth: ['FIVN']
Extraction: ['FIVN']
Recall: 1.0; Precision: 1.0
-----------------------------------------
3392712
Ground Truth: ['Elastron', 'Exxon Mobil', 'Kumho Polychem', 'Riken', 'Teknor Apex']
Extraction: ['Elastron', 'Kumho Polychem', 'Ravago', 'Riken', 'SO.F.TER Group', 'Santoprene', 'Teknor Apex']
Recall: 0.8; Precision: 0.5714285714285714
-----------------------------------------
3669133
Ground Truth: ['1stDibs', 'ByteDance', 'Cameo', 'Canva', 'Care.com', 'Depop', 'Etsy', 'Grailed', 'Havenly', 'Instacart', 'Instagram', 'Kickstarter', 'Minted', 'Patreon', 'Roblox', 'Service Titan', 'Snap', 'StockX', 'StyleSeat', 'Substack', 'TikTok', 'Toptal', 'Upwork']
Extraction: ['Cameo', 'Canva', 'Depop', 'Etsy', 'Grailed', 'Instacart', 'Kickstarter', 'Patreon', 'Service Titan', 'Snap', 'StockX', 'Substack', 'Upwork']
Recall: 0.5652173913043478; Precision: 1.0
-----------------------------------------
3625477
Ground Truth: ['AdvancedMD', 'InSync', 'Net Health', 'NetSmart', '

In [6]:
# model = relation_model().to(device)
# model.load_state_dict(torch.load(f'{output_dir}/{model_type}_0.pt'))
# model = model.eval()    