# ANNs-unidirectional predict sentence score

Use GPT-2 to compute the probability of a given sentence.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/giuliarambelli/Event_Knowledge_Model_Comparison/blob/master/ANNs-unidirectional-predict-sentence.ipynb) 

In [1]:
import os
import math
import pandas as pd

## 1. Load the dataset(s)

In [2]:
# Load (id, sentence) pairs from a dataset frame (input files must have the first 2 columns for id and sentence)
def load_data(df):
    """Extract ids and period-terminated sentences from a dataset frame.

    Columns are read by position, so the frame works whether it was loaded
    with ``header=None`` (integer labels) or with named columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column holds the item id and whose second column
        holds the sentence as a string.

    Returns
    -------
    tuple
        ``(ids, sents)``: two lists of equal length, in row order. A sentence
        that does not already end with ``'.'`` gets ``' .'`` appended.
    """
    ids = []
    sents = []
    # name=None yields plain tuples, so row[0] / row[1] are always positional.
    for row in df.itertuples(index=False, name=None):
        sent = row[1]
        ids.append(row[0])
        # endswith() is safe on an empty string, unlike indexing sent[-1].
        sents.append(sent if sent.endswith('.') else sent + ' .')
    return (ids, sents)

In [3]:
# Every dataset is a header-less, tab-separated file; read them with identical
# options, in the order DTFit, EV1, EventsAdapt.
dtfit, ev1, new_ev = (
    pd.read_csv(path, sep='\t', header=None)
    for path in (
        'datasets/DTFit_vassallo_deps.txt',
        'datasets/ev1_deps.txt',
        'datasets/newsentences_EventsAdapt.txt',
    )
)

In [4]:
# Preview the first rows to check that the file parsed into an id column and a sentence column.
new_ev.head()

Unnamed: 0,0,1
0,0,The raider caught the illness .
1,1,The illness caught the raider .
2,2,The illness was caught by the raider .
3,3,The raider was caught by the illness .
4,4,The marauder contracted the disease .


In [5]:
# Map each dataset label to its (ids, sentences) pair as returned by load_data.
# The label is reused later as the prefix of the output file name.
datasets = {
    label: load_data(frame)
    for label, frame in (
        ('ev1', ev1),
        ('dtfit', dtfit),
        ('new-EventsAdapt', new_ev),
    )
}

## 2. TASK: Run Sequential word prediction (unidirectional ANNs)

In [None]:
!pip install lm-scorer
import torch, sys
import numpy as np
from lm_scorer.models.auto import AutoLMScorer as LMScorer

In [None]:
def gpt(sentences, version):
    """Score every sentence with a pretrained lm-scorer model.

    Parameters
    ----------
    sentences : iterable of str
        Sentences to score.
    version : str
        Model name handed to ``LMScorer.from_pretrained`` (e.g. ``"gpt2"``).

    Returns
    -------
    list
        One ``sentence_score(..., reduce="prod")`` value per input sentence,
        in input order.
    """
    # Use the first CUDA device when one is available, otherwise the CPU.
    if torch.cuda.is_available():
        device = "cuda:0"
    else:
        device = "cpu"

    # The model is loaded on every call, once per requested version.
    model = LMScorer.from_pretrained(version, device=device)
    return [model.sentence_score(text, reduce="prod") for text in sentences]

In [None]:
#!pip install ipywidgets

In [None]:
# Model checkpoint used for scoring.
# supported models => ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "distilgpt2"]
version = 'gpt2'

# Directory that receives the per-dataset probability files; created if missing.
out_dir = 'gpt2-probs/'
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Score every dataset with the selected model and write one TSV per dataset:
# <out_dir>/<dataset>.<version>.sentence-prob.txt with columns id, sentence, score.
for dataset_name in datasets:
    print('Processing: ', dataset_name)
    # Use the loop variable: the previous `name` was never defined in this notebook.
    ids, sents = datasets[dataset_name]
    probs = gpt(sents, version)
    out = os.path.join(out_dir, '{}.{}.sentence-prob.txt'.format(dataset_name, version))
    # Report the destination path rather than dumping the whole score list.
    print('Write ', out)
    # Open the path computed above: the previous `out_lastw` was never defined.
    with open(out, 'w') as fout:
        for i, sent, score in zip(ids, sents, probs):
            fout.write('{}\t{}\t{}\n'.format(i, sent, score))