In [1]:
from transformers import (
    GPT2Config,
    GPT2Tokenizer,
    GPT2DoubleHeadsModel,
    GPT2LMHeadModel,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

In [6]:
model = '../models/pretrained/gpt2-large'
device = 'cpu'
config = GPT2Config.from_pretrained(model)
tokenizer = GPT2Tokenizer.from_pretrained(model)
model = GPT2LMHeadModel.from_pretrained(model, config=config)
model.config.pad_token_id = model.config.eos_token_id
model.to(device);

### Grammaticality

In [4]:
texts = ['i would like to thank you mr chairman', 
         'i would liking to thanks you mr chair in', 'thnks chair' ]
losses = []
for text in texts:
    tokens_tensor = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")    
    loss = model(tokens_tensor, labels=tokens_tensor)[0]
    losses.append(loss)
    print(f'{text} loss={loss:.3f}')

i would like to thank you mr chairman loss=4.593
i would liking to thanks you mr chair in loss=7.030
thnks chair loss=8.403


### Content

In [12]:
prompt = 'I have a math test tomorrow. '
texts = ['I might fail the test.', 
         'I have the chance to impress my parents if I do well.', 
         'I have the chance to impress my parents.',
         'I think I will cook dinner now.',
         'My future job will be hard.',
         'I went to the grocery store.',
         'I need to study hard.',
         'My dog is chewing on his toy.',
         'My dog was chewing on his toy and then he got up and chased the cat around the house.']
texts = [prompt+text for text in texts]
losses = []
for text in texts:
    tokens_tensor = tokenizer.encode(text, add_special_tokens=False, return_tensors="pt")    
    loss = model(tokens_tensor, labels=tokens_tensor)[0]
    losses.append(loss)
    print(f'{text} loss={loss:.3f}')

I have a math test tomorrow. I might fail the test. loss=3.165
I have a math test tomorrow. I have the chance to impress my parents if I do well. loss=2.807
I have a math test tomorrow. I have the chance to impress my parents. loss=3.121
I have a math test tomorrow. I think I will cook dinner now. loss=3.649
I have a math test tomorrow. My future job will be hard. loss=3.673
I have a math test tomorrow. I went to the grocery store. loss=3.045
I have a math test tomorrow. I need to study hard. loss=3.049
I have a math test tomorrow. My dog is chewing on his toy. loss=3.679
I have a math test tomorrow. My dog was chewing on his toy and then he got up and chased that cat around the house. loss=3.371


### Lm Scorer?

In [1]:
 #(https://github.com/simonepri/lm-scorer)

### Entailment Scorer

- https://huggingface.co/datasets/snli
- https://huggingface.co/datasets/swag
- https://cims.nyu.edu/~sbowman/multinli/

#### as entailment

In [8]:
from sentence_transformers import CrossEncoder
from scipy.special import softmax

In [5]:
#model = CrossEncoder('cross-encoder/nli-distilroberta-base')

In [4]:
#model.save_pretrained('../models/pretrained/nli-distilroberta-base')

In [2]:
model = CrossEncoder('../models/pretrained/nli-distilroberta-base')

In [23]:
scores = model.predict([('A man is eating pizza', 'A man eats something'), 
                        ('A black race car starts up in front of a crowd of people.',
                         'A man is driving down a lonely road.')])
scores

array([[-3.4729373 ,  3.718417  ,  0.30496567],
       [ 4.732198  , -3.256608  , -2.0175304 ]], dtype=float32)

In [19]:
prompt = 'I have a math test tomorrow.'
texts = ['I do not have a math test tomorrow.',
         'I have no school tomorrow.',
         'I might fail the test.', 
         'I have the chance to impress my parents if I do well.', 
         'I have the chance to impress my parents.',
         'I think I will cook dinner now.',
         'My future job will be hard.',
         'I went to the grocery store.',
         'I need to study hard.',
         'My dog is chewing on his toy.',
         'My dog was chewing on his toy and then he got up and chased the cat around the house.']

pairs = [(prompt, text) for text in texts]
pairs

[('I have a math test tomorrow.', 'I do not have a math test tomorrow.'),
 ('I have a math test tomorrow.', 'I have no school tomorrow.'),
 ('I have a math test tomorrow.', 'I might fail the test.'),
 ('I have a math test tomorrow.',
  'I have the chance to impress my parents if I do well.'),
 ('I have a math test tomorrow.', 'I have the chance to impress my parents.'),
 ('I have a math test tomorrow.', 'I think I will cook dinner now.'),
 ('I have a math test tomorrow.', 'My future job will be hard.'),
 ('I have a math test tomorrow.', 'I went to the grocery store.'),
 ('I have a math test tomorrow.', 'I need to study hard.'),
 ('I have a math test tomorrow.', 'My dog is chewing on his toy.'),
 ('I have a math test tomorrow.',
  'My dog was chewing on his toy and then he got up and chased the cat around the house.')]

In [20]:
scores = model.predict(pairs)

In [21]:
softmax(scores, axis=1).round(2)

array([[1.  , 0.  , 0.  ],
       [0.99, 0.  , 0.01],
       [0.  , 0.01, 0.99],
       [0.  , 0.01, 0.99],
       [0.  , 0.01, 0.99],
       [0.99, 0.  , 0.01],
       [0.01, 0.01, 0.99],
       [0.99, 0.  , 0.01],
       [0.01, 0.03, 0.95],
       [1.  , 0.  , 0.  ],
       [0.99, 0.  , 0.01]], dtype=float32)

In [22]:
for ti, text in enumerate(texts):
    print(f'{prompt} {text} {scores[ti,:].round(2)}')
#scores

I have a math test tomorrow. I do not have a math test tomorrow. [ 4.24 -2.36 -2.21]
I have a math test tomorrow. I have no school tomorrow. [ 3.98 -2.87 -1.29]
I have a math test tomorrow. I might fail the test. [-2.34 -0.94  3.62]
I have a math test tomorrow. I have the chance to impress my parents if I do well. [-2.32 -1.02  3.7 ]
I have a math test tomorrow. I have the chance to impress my parents. [-1.91 -1.37  3.61]
I have a math test tomorrow. I think I will cook dinner now. [ 3.96 -3.82 -0.4 ]
I have a math test tomorrow. My future job will be hard. [-1.48 -1.67  3.43]
I have a math test tomorrow. I went to the grocery store. [ 3.89 -3.78 -0.35]
I have a math test tomorrow. I need to study hard. [-1.56 -0.71  2.63]
I have a math test tomorrow. My dog is chewing on his toy. [ 4.53 -3.7  -1.44]
I have a math test tomorrow. My dog was chewing on his toy and then he got up and chased the cat around the house. [ 4.05 -4.28 -0.24]


#### as zero-shot

In [43]:
from transformers import pipeline
import numpy as np

In [26]:
classifier = pipeline("zero-shot-classification", model='../models/pretrained/nli-distilroberta-base')

sent = "Apple just announced the newest iPhone X"
candidate_labels = ["technology", "sports", "politics"]
res = classifier(sent, candidate_labels)
print(res)

{'sequence': 'Apple just announced the newest iPhone X', 'labels': ['technology', 'sports', 'politics'], 'scores': [0.9906043410301208, 0.005863797850906849, 0.003531870199367404]}


In [30]:
classifier = pipeline("zero-shot-classification", model='../models/pretrained/nli-distilroberta-base')

sent = "I have a math test tomorrow. I do not have a math test tomorrow."
sent = "I have a math test tomorrow. I need to study hard."
candidate_labels = ["probable", "improbable"]
res = classifier(sent, candidate_labels)
print(res)

{'sequence': 'I have a math test tomorrow. I need to study hard.', 'labels': ['improbable', 'probable'], 'scores': [0.5948284268379211, 0.40517154335975647]}


In [32]:
sent = "I have a math test tomorrow."
candidate_labels = ["I went to the park."]
res = classifier(sent, candidate_labels)
print(res)

{'sequence': 'I have a math test tomorrow.', 'labels': ['I went to the park.'], 'scores': [0.008240976370871067]}


In [53]:
prompt = "I have a math test tomorrow."
for ti, text in enumerate(texts):
    res = classifier(prompt, text)
    score = np.round(res['scores'][0],2)
    print(f'{prompt} {text} {score}')

I have a math test tomorrow. I do not have a math test tomorrow. 0.0
I have a math test tomorrow. I have no school tomorrow. 0.01
I have a math test tomorrow. I might fail the test. 0.38
I have a math test tomorrow. I have the chance to impress my parents if I do well. 0.54
I have a math test tomorrow. I have the chance to impress my parents. 0.4
I have a math test tomorrow. I think I will cook dinner now. 0.0
I have a math test tomorrow. My future job will be hard. 0.36
I have a math test tomorrow. I went to the grocery store. 0.01
I have a math test tomorrow. I need to study hard. 0.47
I have a math test tomorrow. My dog is chewing on his toy. 0.0
I have a math test tomorrow. My dog was chewing on his toy and then he got up and chased the cat around the house. 0.0


In [54]:
res = classifier(prompt, texts)
res

{'sequence': 'I have a math test tomorrow.',
 'labels': ['I need to study hard.',
  'I have the chance to impress my parents if I do well.',
  'I might fail the test.',
  'My future job will be hard.',
  'I have the chance to impress my parents.',
  'I have no school tomorrow.',
  'I do not have a math test tomorrow.',
  'My dog is chewing on his toy.',
  'I went to the grocery store.',
  'I think I will cook dinner now.',
  'My dog was chewing on his toy and then he got up and chased the cat around the house.'],
 'scores': [0.24693726003170013,
  0.15573418140411377,
  0.15039072930812836,
  0.13005255162715912,
  0.11426583677530289,
  0.05318985879421234,
  0.04382216930389404,
  0.0410136841237545,
  0.03034592792391777,
  0.02107062004506588,
  0.013177117332816124]}

In [85]:
texts = ['I do not have a math test tomorrow.',
         'I have no school tomorrow.',
         'I might fail the test.', 
         'I have the chance to impress my parents if I do well.', 
         'I have the chance to impress my parents.',
         'I think I will cook dinner now.',
         'My future job will be hard.',
         'I went to the grocery store.',
         'I need to study hard.',
         'My dog is chewing on his toy.',
         'My dog chased a cat around in the park.',
        # 'I have a toothache.',
         'I went to the dentist',
         'I went to the doctor']

In [86]:
# prompt = "I walked my dog to the park."
# for ti, text in enumerate(texts):
#     res = classifier(prompt, text)
#     score = np.round(res['scores'][0],2)
#     print(f'{prompt} {text} {score}')

In [89]:
#prompt = "I went to the dentist."
#prompt = "I have a toothache"
prompt = "I need to have surgery."
res = classifier(prompt, texts)
res 

{'sequence': 'I need to have surgery.',
 'labels': ['My future job will be hard.',
  'I went to the doctor',
  'I might fail the test.',
  'I do not have a math test tomorrow.',
  'I need to study hard.',
  'I went to the dentist',
  'I have no school tomorrow.',
  'I went to the grocery store.',
  'My dog is chewing on his toy.',
  'I have the chance to impress my parents if I do well.',
  'I have the chance to impress my parents.',
  'My dog chased a cat around in the park.',
  'I think I will cook dinner now.'],
 'scores': [0.29420918226242065,
  0.2702980041503906,
  0.11532612144947052,
  0.07720661163330078,
  0.06659335643053055,
  0.058381590992212296,
  0.030719276517629623,
  0.018948888406157494,
  0.018022341653704643,
  0.014573161490261555,
  0.01224702037870884,
  0.012176312506198883,
  0.011298167519271374]}

In [92]:
#np.sum(res['scores'])

In [96]:
scores = []
for ti, text in enumerate(texts):
    res = classifier(prompt, text)
    score = np.round(res['scores'][0],2)
    scores.append(score)
    print(f'{prompt} {text} {score}')

I need to have surgery. I do not have a math test tomorrow. 0.02
I need to have surgery. I have no school tomorrow. 0.01
I need to have surgery. I might fail the test. 0.47
I need to have surgery. I have the chance to impress my parents if I do well. 0.0
I need to have surgery. I have the chance to impress my parents. 0.0
I need to have surgery. I think I will cook dinner now. 0.0
I need to have surgery. My future job will be hard. 0.77
I need to have surgery. I went to the grocery store. 0.0
I need to have surgery. I need to study hard. 0.02
I need to have surgery. My dog is chewing on his toy. 0.0
I need to have surgery. My dog chased a cat around in the park. 0.0
I need to have surgery. I went to the dentist 0.06
I need to have surgery. I went to the doctor 0.46


In [98]:
print(np.array(scores)/np.sum(scores))

[0.01104972 0.00552486 0.25966851 0.         0.         0.
 0.42541436 0.         0.01104972 0.         0.         0.03314917
 0.25414365]


In [110]:
texts = ['I have a math test.', 'I went to the doctors.', 'I had surgery.' ,
'I finished the credit studies for my AP classes.']
for text in texts:
    prompt = text
    text_tmp = texts.copy()
    text_tmp.remove(text)
    res = classifier(prompt, text_tmp)
    print(res)
    print()

{'sequence': 'I have a math test.', 'labels': ['I finished the credit studies for my AP classes.', 'I had surgery.', 'I went to the doctors.'], 'scores': [0.42004308104515076, 0.30406567454338074, 0.2758912444114685]}

{'sequence': 'I went to the doctors.', 'labels': ['I had surgery.', 'I finished the credit studies for my AP classes.', 'I have a math test.'], 'scores': [0.8455312252044678, 0.07818540185689926, 0.07628338038921356]}

{'sequence': 'I had surgery.', 'labels': ['I went to the doctors.', 'I have a math test.', 'I finished the credit studies for my AP classes.'], 'scores': [0.9386029839515686, 0.03370364382863045, 0.027693290263414383]}

{'sequence': 'I finished the credit studies for my AP classes.', 'labels': ['I have a math test.', 'I went to the doctors.', 'I had surgery.'], 'scores': [0.4430335462093353, 0.369669646024704, 0.18729688227176666]}



In [111]:
texts = ['I have a math test.', 'I will go to the doctors.', 'I need surgery.' ,
'I will finish the credit studies for my AP classes.']
for text in texts:
    prompt = text
    text_tmp = texts.copy()
    text_tmp.remove(text)
    res = classifier(prompt, text_tmp)
    print(res)
    print()

{'sequence': 'I have a math test.', 'labels': ['I will finish the credit studies for my AP classes.', 'I will go to the doctors.', 'I need surgery.'], 'scores': [0.5052475929260254, 0.2510741949081421, 0.24367819726467133]}

{'sequence': 'I will go to the doctors.', 'labels': ['I need surgery.', 'I will finish the credit studies for my AP classes.', 'I have a math test.'], 'scores': [0.809511125087738, 0.1013128012418747, 0.0891759917140007]}

{'sequence': 'I need surgery.', 'labels': ['I will go to the doctors.', 'I have a math test.', 'I will finish the credit studies for my AP classes.'], 'scores': [0.8930190801620483, 0.06989777833223343, 0.037083085626363754]}

{'sequence': 'I will finish the credit studies for my AP classes.', 'labels': ['I have a math test.', 'I will go to the doctors.', 'I need surgery.'], 'scores': [0.4230411648750305, 0.39691033959388733, 0.18004855513572693]}



### as zero-shot, for selecting sentences

In [5]:
from transformers import pipeline
import numpy as np
from pathlib import Path

In [2]:
classifier = pipeline("zero-shot-classification", model='../models/pretrained/nli-distilroberta-base')


In [46]:
save_folder = '/home/cgagne/cvar_generation/data/results/single_sentences_I_1'
filename = 'round1_ends.txt'

f = open(Path(save_folder) / filename, "r")
sentences = f.readlines()
sentences = [s.replace('\n','') for s in sentences]

filename = 'prompt_list.txt'
f = open(Path(save_folder) / filename, "r")
sentences2 = f.readlines()
sentences2 = [s.replace('\n','') for s in sentences2]

sentences.extend(sentences2)

In [47]:
len(sentences)

84

In [58]:
# select starting thought
s1 = np.random.choice(sentences)
print(s1)
print('---')

# consider possible next thoughts (should be all, but simplify to 20 for now)
s2_candidates = np.random.choice(sentences, size=len(sentences))
s2_candidates = sentences.copy()
s2_candidates.remove(s1)
res = classifier(s1, s2_candidates)
probs = res['scores'] # calculate probabilities 
probs = probs/np.sum(probs)
for l,s in zip(res['labels'][0:10],res['scores'][0:10]):
    print(f'{l} {s:.3}')
#s2 = np.random.choice(s2_candidates, p=probs)
#print(s2)

I won't slack off.
---
So on and so forth. 0.0795
I did this. 0.0639
Nothing seemed to have changed. 0.0609
We call that work. 0.0495
So that if anyone found me, if you found me, he or she would know my name. 0.0397
Please let me get some speed to finish. 0.0342
I was the same man in the past. 0.0312
I was wearing my hosiery. 0.029
I need to think carefully. 0.028
I have to get my concentration back. 0.0252


### sentiment scorerers

In [70]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax

In [59]:
save_folder = '/home/cgagne/cvar_generation/data/results/single_sentences_I_1'
filename = 'round1_ends.txt'

f = open(Path(save_folder) / filename, "r")
sentences = f.readlines()
sentences = [s.replace('\n','') for s in sentences]

filename = 'prompt_list.txt'
f = open(Path(save_folder) / filename, "r")
sentences2 = f.readlines()
sentences2 = [s.replace('\n','') for s in sentences2]

sentences.extend(sentences2)

In [60]:
len(sentences)

84

In [102]:
scorer = '/home/cgagne/cvar_generation/models/pretrained/nlptown-bert-base-multilingual-uncased-sentiment'
scorer = '/home/cgagne/cvar_generation/models/pretrained/cardiffnlp-twitter-roberta-base-sentiment'
scorer_tokenizer = AutoTokenizer.from_pretrained(scorer)
scorer_model = AutoModelForSequenceClassification.from_pretrained(scorer)
scorer_model.to('cpu');

In [103]:
output = scorer_model(**scorer_tokenizer(sentences, return_tensors='pt', padding=True))

In [112]:
output = scorer_model(**scorer_tokenizer(sentences, return_tensors='pt', padding=True))
probs = softmax(output[0].detach().cpu().numpy(),axis=1)

if probs.shape[1]==5:
    scores = np.dot(probs,np.arange(-2,3))
else:
    scores = np.dot(probs,np.arange(-1,2))
#scores = np.argmax(probs,axis=1)
sort_idx = np.argsort(scores)
scores_sorted = [scores[i] for i in sort_idx]
sentences_sorted = [sentences[i] for i in sort_idx]

In [113]:
for s,sc in zip(sentences_sorted, scores_sorted):
    print(f'{s} {sc:.3f}')

I might even fail the test. -0.930
I might fail the test. -0.915
I now realize I have dog allergies. -0.887
I burned me way too much calories. -0.877
The test can be problematic. -0.870
I need more surgery. -0.818
I slept so badly at first, I fell asleep in the toilet,' said Cobain. -0.803
I need surgery. -0.802
I'm dropping out. -0.785
I don't think I can. -0.663
That is neither of your business Nick replied. -0.474
I have a slow breakup. -0.458
I never saw those road rage incidents, he told a local news outlet. -0.407
Nothing seemed to have changed. -0.377
This is where it stopped. -0.341
I went to the doctors. -0.247
We've got to fix up the house. -0.239
You aren't that stupid, Bob said sarcastically before notifying Chuck with a yes. -0.237
I was the same man in the past. -0.211
I need to think carefully. -0.207
I have a math test. -0.189
I emailed my parents. -0.187
I'm seriously starving, but I still want to win just one game. -0.174
Well, you get the point, Jones said. -0.165
I 

In [115]:
#probs