In [16]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import itertools

from torch import nn

import tqdm

from nltk import sent_tokenize as nltk_tokenizer

## Data Preparation

In [5]:
def flatten_data(csv,use_feedback=False):
    df = pd.read_csv(csv)
    new_df = []#pd.DataFrame(columns=['id','prompt','gen_text','error','feedback','severity','text_before_span','span','text_after_span'])

    for i in tqdm.tqdm(range(len(df)),desc='unrolling data'):
        id = df.loc[i]['id']
        prompt = df.loc[i]['prompt']
        gen_text = df.loc[i]['generation']#.replace(u'\xa0', u' ').replace(u'  ', u' ')
        feedbacks = eval(df.loc[i]['responses'])
        if not use_feedback:
            dic = {"id":id,
                  "prompt":prompt,
                  "gen_text":gen_text,
                  "error":"NA",
                  "feedback":"NA",
                  "severity":"NA",
                  "span_beg":"NA",
                  "span_end":"NA",
                  "span":"NA"}

            new_df.append(dic)
            continue
        else:
            for response in feedbacks:
                if len(response)==0:
                    continue
                for r in response:

                    error = r[0]
                    feedback = r[1].replace("_SEP_",",").replace("_QUOTE_",'"')
                    severity = r[2]
                    beg = r[3]
                    end = r[4]

                    span = gen_text[beg:end]

                    dic = {"id":id,
                          "prompt":prompt,
                          "gen_text":gen_text,
                          "error":error,
                          "feedback":feedback,
                          "severity":severity,
                          "span_beg":beg,
                          "span_end":end,
                          "span":span}

                    new_df.append(dic)

    return pd.DataFrame(new_df)


In [21]:
data = flatten_data('../../../grouped_data.csv',use_feedback=True)

# language_errors = ['Grammar_Usage', 'Off-prompt', 'Redundant', 'Self-contradiction', 'Incoherent']
# data = data[data['error'].isin(language_errors)]
# data = data[data['severity']>1]

unrolling data: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1308/1308 [00:01<00:00, 1110.10it/s]


In [22]:
len(data)

41862

In [24]:
data[data['error']=='Redundant']

Unnamed: 0,id,prompt,gen_text,error,feedback,severity,span_beg,span_end,span
5,0,"In the wild, animals display tender moments of...",To honor the effort he put into his latest set...,Redundant,this may just be overly nit-picky but these tw...,1,127,268,A lion and its cub enjoy a tender moment toget...
9,0,"In the wild, animals display tender moments of...",To honor the effort he put into his latest set...,Redundant,It does not need to go that much into detail b...,1,178,268,The lion's paws rest on top of the front paws ...
12,0,"In the wild, animals display tender moments of...",To honor the effort he put into his latest set...,Redundant,I think this is very mild and could almost not...,1,178,268,The lion's paws rest on top of the front paws ...
28,1,The long-rumored Apple car might finally becom...,"According to the Financial Times, Apple's been...",Redundant,repeats concept.,1,168,183,autonomous car.
71,2,Earbuds and headphones are among the most pers...,Wifi range and device integration features als...,Redundant,It's assumed that a person wearing hearing aid...,1,148,165,with hearing loss
...,...,...,...,...,...,...,...,...,...
41692,1301,Contrary reports that she died after a stray b...,She was shot in the head by a stray bullet dur...,Redundant,The selected span contains the same informati...,1,401,433,she was hit by the stray bullet.
41694,1301,Contrary reports that she died after a stray b...,She was shot in the head by a stray bullet dur...,Redundant,"Repeats the first sentence, but sounds appropr...",1,401,433,she was hit by the stray bullet.
41698,1301,Contrary reports that she died after a stray b...,She was shot in the head by a stray bullet dur...,Redundant,This is already implied that it was a stray bu...,1,420,433,stray bullet.
41770,1304,Donald Trump's Space Force is preparing to act...,The Space Fence is a $1.6 billion project that...,Redundant,This is being repeated in the prompt. Given th...,1,340,486,President Donald Trump's Space Force is prepar...


In [8]:
data['span_len'] = data['span'].apply(lambda x: len(x))
#px.histogram(data['span_len'],nbins=100)

In [9]:
data[data['span_len']>20].shape

(0, 10)

In [10]:
data = data[data['span_len']>20]

In [11]:
data['error'].value_counts()

Series([], Name: error, dtype: int64)

In [17]:
data['gen_sentences'] = data['gen_text'].apply(lambda x: nltk_tokenizer(x))
data['span_is_sentence'] = [1 if x in y else 0 for x,y in zip(data['span'],data['gen_sentences'])]

In [18]:
data['span_is_sentence'].value_counts()

Series([], Name: span_is_sentence, dtype: int64)

In [19]:
i = 392
#data.loc[i]['gen_sentences'] = [s.strip() for s in data.loc[i]['gen_sentences']]
s = " ".join(data.iloc[i]['gen_sentences'])
s

IndexError: single positional indexer is out-of-bounds

In [15]:
data.iloc[i]['gen_text']

IndexError: single positional indexer is out-of-bounds

In [14]:
data.iloc[i]['span']

'Will Starbucks launch vegan options at all of its more than 5,000 locations in the United States?'

In [15]:
data.iloc[i][['span_beg','span_end']]

span_beg    499
span_end    596
Name: 1037, dtype: object

In [16]:
s[data.iloc[i]['span_beg']:data.iloc[i]['span_end']]==data.iloc[i]['span']

False

In [17]:
##check to see if the above technique is working fine for most data points
data['tech_works'] = [" ".join(data.iloc[i]['gen_sentences'])[data.iloc[i]['span_beg']:data.iloc[i]['span_end']] == data.iloc[i]['span'] for i in range(len(data))]

In [18]:
data['tech_works'].value_counts()

True     8401
False     183
Name: tech_works, dtype: int64

In [19]:
data = data[data['tech_works']==True]

In [20]:
data = data.reset_index()

In [21]:
def label_err_sentence(sentences,span_beg,span_end,multi_class=False,error_type=None):
    
    output = []
    total_len = sum([len(s) for s in sentences]) + len(sentences) - 1 #total sentences length + (sentences-1) spaces
    
    idx = 0
    
    if multi_class:
        label = error_type+1
    else:
        label = 1
    
    for s in sentences:
        if idx>=span_beg and idx<=span_end:
            output.append(label)
        elif idx+len(s)>=span_beg and idx+len(s)<=span_end:
            output.append(label)
        elif idx<=span_beg and idx+len(s)>=span_end:
            output.append(label)
        else:
            output.append(0)
        
        idx += len(s)+1
            
    return output

In [22]:
i = 101
data.iloc[i]['gen_sentences']

["Given the fact that lots of cuts are a part of some diets, it seems obvious that this year could end up being A New Year's Eve for many people, either in class or in life.",
 '* Runewee County is the majority weight loss state for about 65 percent of adults and deaths are linked to losing other lifestyle factors, such as diabetes or atopic dermatitis.',
 'MD Anderson University obesity research from 2012 shows that African American males who were obese between 22 and 40 were more likely to end up with skin, dental and vision problems than their white, "newborn."']

In [23]:
data.iloc[i]['span']

'African American males who were obese between 22 and 40 were more likely to end up with skin, dental and vision problems than their white,'

In [24]:
label_err_sentence(data.iloc[i]['gen_sentences'],data.iloc[i]['span_beg'],data.iloc[i]['span_end'],multi_class=True,error_type=1)

[0, 0, 2]

In [25]:
data['error'].value_counts()

Off-prompt            2445
Redundant             2131
Incoherent            1900
Grammar_Usage         1076
Self-contradiction     849
Name: error, dtype: int64

In [26]:
ERROR_MAP = {'Redundant':0,
             'Off-prompt':1,
             'Grammar_Usage':2,
             'Incoherent':3,
             'Self-contradiction':4,
             'Needs_Google':5,
             'Technical_Jargon':6,
             'Commonsense':7,
             'Encyclopedic':8,
             'Bad_Math':9}

INV_ERROR_MAP = {v:k for k,v in ERROR_MAP.items()}

In [67]:
rating_scores = {'Excellent':3 , 'Acceptable':2 , 'Could be Improved':1, 'Bad': -1}

def process_df(df):
    df['list_feedback'] = df['feedback'].apply(lambda x: [ r + "___" + e for r,e in zip(x['rating'],x['explanation']) ])
    df['sampled_feedback'] = df['list_feedback'].apply(lambda x: np.random.choice(x).split("___") )
    df['rating_score'] = df['sampled_feedback'].apply(lambda x: rating_scores[x[0]])
    df['rating'] = df['sampled_feedback'].apply(lambda x: x[0])
    df['explanation'] = df['sampled_feedback'].apply(lambda x: x[1])
    return df

In [68]:
train_df = process_df(pd.DataFrame(dataset['train']))
val_df = process_df(pd.DataFrame(dataset['validation']))
test_df = process_df(pd.DataFrame(dataset['test']))

In [69]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset,DataLoader

# Load model from HuggingFace Hub
bert_chkpt = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(bert_chkpt)
model = AutoModel.from_pretrained(bert_chkpt)

In [70]:
tokenizer.all_special_tokens

['<s>', '</s>', '[UNK]', '<pad>', '<mask>']

In [71]:
train_df.head()

Unnamed: 0,question,answer,feedback,list_feedback,sampled_feedback,rating_score,rating,explanation
0,How do I get help finding a job?,Coronavirus (COVID-19) information for job see...,"{'rating': ['Excellent', 'Could be Improved'],...",[Excellent___Has a link to detailed informatio...,"[Could be Improved, This answer provides a lin...",1,Could be Improved,"This answer provides a link for job searches, ..."
1,How do I get help finding a job?,Coronavirus (COVID-19) information for job see...,"{'rating': ['Excellent', 'Excellent'], 'explan...",[Excellent___A link to a job search website is...,"[Excellent, A link to a job search website is ...",3,Excellent,"A link to a job search website is included, as..."
2,How do I get help finding a job?,Coronavirus (COVID-19) information and support...,"{'rating': ['Bad', 'Acceptable'], 'explanation...",[Bad___Talks about tax credits for businesses ...,"[Bad, Talks about tax credits for businesses t...",-1,Bad,Talks about tax credits for businesses that hi...
3,If I am in Australia on a worker holiday marke...,Frequently Asked Questions\nWorking holiday ma...,"{'rating': ['Could be Improved', 'Acceptable']...",[Could be Improved___Answer is about Working H...,"[Could be Improved, Answer is about Working Ho...",1,Could be Improved,"Answer is about Working Holiday Makers, but do..."
4,If I am in Australia on a worker holiday marke...,Frequently Asked Questions\nCOVID-19 Pandemic ...,"{'rating': ['Bad', 'Could be Improved'], 'expl...",[Bad___Discusses pandemic visas. Doesn't menti...,"[Could be Improved, This answer is very vague ...",1,Could be Improved,This answer is very vague and does not answer ...


In [72]:
train_df['answer'].loc[0]

'Coronavirus (COVID-19) information for job seekers\nExisiting job seekers\nIf you are a current job seeker or participant, this fact sheet provides\nimportant information about mutual obligation requirements, appointments with\nyour provider, and what to do if you are self-isolating:\n\nInformation for job seekers and participants\n\nIf you are participating in the ParentsNext program, this fact sheet provides\nimportant information about your activities and appointments.\n\n\nInformation for ParentsNext participants\n\n\nParentsNext participants Frequently Asked Questions\n\n\nIf you are a New Business Assistance with NEIS participant, these Frequently\nAsked Questions (FAQ) provides information about accessing the Coronavirus\nSupplement and what support is available during this time:\n\nNew Business Assistance with NEIS participants - Frequently Asked Questions\n\nIf you are a New Business Assistance with NEIS provider, these Frequently\nAsked Questions (FAQ) provides information a

In [73]:
tokenizer('Hello, how are you doing?'+ f" {tokenizer.eos_token} " + "Hemlooooo",add_special_tokens=True,return_tensors='pt', return_length=1)

{'input_ids': tensor([[    0,  7596,  1014,  2133,  2028,  2021,  2729,  1033,     2, 19614,
          4139,  9545,  9545,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'length': tensor([14])}

In [74]:
from nltk import tokenize as nltk_tokenizer
len(nltk_tokenizer.sent_tokenize(train_df['answer'].loc[0]))

3

In [75]:
tok_inp = tokenizer(nltk_tokenizer.sent_tokenize(train_df['answer'].loc[0]),add_special_tokens=False,return_token_type_ids=True)#,max_length=200,padding='max_length')
tok_inp

{'input_ids': [[21891, 23354, 1010, 2526, 17262, 1015, 2543, 1011, 2596, 2009, 3109, 24075, 4658, 17421, 3440, 3109, 24075, 2069, 2021, 2028, 1041, 2787, 3109, 29448, 2034, 13184, 1014, 2027, 2759, 7127, 3644, 2594, 2596, 2059, 8207, 14991, 5922, 1014, 14655, 2011, 2119, 10806, 1014, 2002, 2058, 2004, 2083, 2069, 2021, 2028, 2973, 1015, 11167, 22252, 1028, 2596, 2009, 3109, 24075, 2002, 6822, 2069, 2021, 2028, 8023, 2003, 2000, 3012, 2642, 18417, 2569, 1014, 2027, 2759, 7127, 3644, 2594, 2596, 2059, 2119, 3454, 2002, 14655, 1016], [2596, 2009, 3012, 2642, 18417, 6822, 3012, 2642, 18417, 6822, 4707, 2360, 3984, 2069, 2021, 2028, 1041, 2051, 2453, 5379, 2011, 11269, 2487, 13184, 1014, 2126, 4707, 2360, 3984, 1010, 6908, 4164, 1011, 3644, 2596, 2059, 3233, 2079, 2000, 21891, 23354, 12452, 2002, 2058, 2494, 2007, 2804, 2080, 2027, 2055, 1028, 2051, 2453, 5379, 2011, 11269, 2487, 6822, 1015, 4707, 2360, 3984, 2069, 2021, 2028, 1041, 2051, 2453, 5379, 2011, 11269, 2487, 10806, 1014, 2126, 47

In [54]:
tokenizer.sep_token

'</s>'

# NP Extraction

In [28]:
import benepar, spacy
benepar.download('benepar_en3')
nlp = spacy.load('en_core_web_md')
if spacy.__version__.startswith('2'):
        nlp.add_pipe(benepar.BeneparComponent("benepar_en3"))
else:
    nlp.add_pipe("benepar", config={"model": "benepar_en3"})

In [31]:
doc = nlp(train_df['answer'].loc[0].replace('\n',' '))
nps = []
for np in doc.noun_chunks:
    nps.append(np.text)

set(nps)

{'Coronavirus (COVID-19) information',
 'FAQ',
 'Information',
 'NEIS participant',
 'NEIS participants',
 'NEIS provider',
 'NEIS providers',
 'New Business Assistance',
 'New Enterprise Incentive Scheme',
 'ParentsNext participants',
 'Questions',
 'a New Business Assistance',
 'a current job seeker',
 'appointments',
 'important information',
 'information',
 'job seekers',
 'mutual obligation requirements',
 'participant',
 'participants',
 'the Coronavirus Supplement',
 'the Coronavirus situation',
 'the ParentsNext program',
 'these Frequently Asked Questions',
 'this fact sheet',
 'this time',
 'what',
 'what support',
 'you',
 'your activities',
 'your provider'}

In [216]:
import tqdm

class feedback_QA_dataset(Dataset):
    
    def __init__(self,df,max_length=500):
        self.df = df
        self.max_len = max_length
        self.data = []
        
        for i in tqdm.tqdm(range(len(self.df)),desc='vectorizing..'):
            
            d = {}
            
            tok_question = tokenizer(self.df.iloc[i]['question'], add_special_tokens=False)
            tok_answer = tokenizer(self.df.iloc[i]['answer'], add_special_tokens=False, max_length=self.max_len-len(tok_question['input_ids']), padding='max_length', truncation='only_first')
            tok_feedback = tokenizer(self.df.iloc[i]['explanation'], add_special_tokens=False, max_length=self.max_len, padding='max_length', truncation='only_first')
            
            d['sentence'] = [tokenizer.bos_token_id] + tok_question['input_ids'] + [tokenizer.sep_token_id]*2 + tok_answer['input_ids']
            d['sentence_attn'] = [1] + tok_question['attention_mask'] + [1,1] + tok_answer['attention_mask']
            d['feedback'] = tok_feedback['input_ids']
            d['feedback_attn'] = tok_feedback['attention_mask']
            
            d['sentence_pool_mask'] = [0] + [0]*len(tok_question['input_ids']) + [0,0] + tok_answer['attention_mask']
            d['feedback_pool_mask'] = tok_feedback['attention_mask']
            
            answer_phrases = nltk_tokenizer.sent_tokenize(self.df.iloc[i]['answer'])
            tok_phrases = tokenizer(answer_phrases,add_special_tokens=False,return_token_type_ids=True)
            
            d['answer_phrases_pool_mask'] = []
            
            for j in range(len(answer_phrases)):
                answer_phrases_attn_mask = tok_phrases['token_type_ids'].copy()
                answer_phrases_attn_mask[j] = tok_phrases['attention_mask'][j].copy()
                answer_phrases_attn_mask = list(itertools.chain.from_iterable(answer_phrases_attn_mask))
                pad_len = len(tok_answer['attention_mask']) - len(answer_phrases_attn_mask)
                answer_phrases_attn_mask += [0]*pad_len
                
                answer_phrase_pool_mask = [0] + [0]*len(tok_question['input_ids']) + [0,0] + answer_phrases_attn_mask
                
                d['answer_phrases_pool_mask'].append(answer_phrase_pool_mask)
            
            if len(d['answer_phrases_pool_mask'][0])>len(d['sentence_pool_mask']):
                continue
                
            self.data.append(d)

    def add_neg_samples(self):
        for i in tqdm.tqdm(range(self.__len__()),desc='adding neg samples...'):
            self.data[i]['feedback_set'] = [self.data[i]['feedback']]
            self.data[i]['feedback_attn_set'] = [self.data[i]['feedback_attn']]
            self.data[i]['feedback_pool_mask_set'] = [self.data[i]['feedback_pool_mask']]
            L = list(range(self.__len__()))
            L.remove(i)
            neg_samples_idx = np.random.choice(L,size=4)
            for n_id in neg_samples_idx:
                self.data[i]['feedback_set'].append(self.data[n_id]['feedback'])
                self.data[i]['feedback_attn_set'].append(self.data[n_id]['feedback_attn'])
                self.data[i]['feedback_pool_mask_set'].append(self.data[n_id]['feedback_pool_mask'])
            for k in self.data[i].keys():
                self.data[i][k] = torch.tensor(self.data[i][k])
                
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self,idx):
        return self.data[idx]

In [217]:
train_dataset = feedback_QA_dataset(train_df)
train_dataset.add_neg_samples()
valid_dataset = feedback_QA_dataset(val_df)
valid_dataset.add_neg_samples()
test_dataset = feedback_QA_dataset(test_df)
test_dataset.add_neg_samples()

vectorizing..: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5660/5660 [00:18<00:00, 299.12it/s]
adding neg samples...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 5279/5279 [00:22<00:00, 230.19it/s]
vectorizing..: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1410/1410 [00:04<00:00, 290.53it/s]
adding neg samples...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1316/1316 [00:05<00:00, 252.15it/s]
vectorizing..: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1995/1995 [00:07<00:00, 260.13it/s]
adding neg samples...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1835/1835 [00:07<00:00

In [218]:
train_DL = DataLoader(train_dataset,batch_size=1,shuffle=True)
valid_DL = DataLoader(valid_dataset,batch_size=1,shuffle=True)
test_DL = DataLoader(test_dataset,batch_size=1,shuffle=False)

In [219]:
for b in train_DL:
    for k in b.keys():
        print(k,b[k].shape)
    break

sentence torch.Size([1, 503])
sentence_attn torch.Size([1, 503])
feedback torch.Size([1, 500])
feedback_attn torch.Size([1, 500])
sentence_pool_mask torch.Size([1, 503])
feedback_pool_mask torch.Size([1, 500])
answer_phrases_pool_mask torch.Size([1, 3, 503])
feedback_set torch.Size([1, 5, 500])
feedback_attn_set torch.Size([1, 5, 500])
feedback_pool_mask_set torch.Size([1, 5, 500])


In [220]:
from transformers import BartForConditionalGeneration

device = 'cuda:0'

model = AutoModel.from_pretrained(bert_chkpt).to(device)

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    se = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return F.normalize(se, p=2, dim=1)

j = 0

with torch.no_grad():
    for b in test_DL:
        se = mean_pooling( model(input_ids = b['sentence'].to(device),attention_mask=b['sentence_attn'].to(device)), b['sentence_pool_mask'].to(device))
        fe = mean_pooling(model(input_ids = b['feedback_set'][0].to(device),attention_mask=b['feedback_attn_set'][0].to(device)), b['feedback_pool_mask_set'][0].to(device))
        pmo = model(input_ids = b['sentence'].to(device),attention_mask=b['sentence_attn'].to(device))
        print(pmo[0].shape,b['answer_phrases_pool_mask'].shape)
        pe = [mean_pooling(pmo,b['answer_phrases_pool_mask'][0][i].to(device) ) for i in range(b['answer_phrases_pool_mask'][0].shape[0])]
        pe = torch.stack(pe).squeeze(1)
        cos_sim = F.cosine_similarity(se,fe,dim=1)
        cos_phrase_sim = torch.matmul(pe,fe.transpose(1,0))
        print(fe.shape,se.shape,pe.shape,cos_sim,cos_phrase_sim.mean(0))
        
        sent_probs = F.softmax(cos_sim,dim=-1)
        phrase_probs = F.softmax(cos_phrase_sim,dim=-1)
        
        print('\nInput: ',tokenizer.decode(b['sentence'][0],skip_special_tokens=True),'\n')
        print('Feedback: ',tokenizer.decode(b['feedback'][0],skip_special_tokens=True),'\n')
        for i in range(b['answer_phrases_pool_mask'][0].shape[0]):
            relevance = phrase_probs[i][0] - sent_probs[0]
            
            phrase_tok = torch.mul(b['sentence'][0],b['answer_phrases_pool_mask'][0][i])
            print(f"Phrase {i}:",tokenizer.decode(phrase_tok,skip_special_tokens=True))
            print(f"Relevance of phrase {i} is {relevance}",'\n')
        
#         print('softmax: ',F.softmax(cos_sim),F.softmax(cos_phrase_sim,dim=-1))
        
#         tgt_tensor = torch.zeros(b['feedback_set'].shape[1] , device=device)
#         tgt_tensor[0] = 1.0
#         print('CE Loss: ', F.cross_entropy(cos_sim,target=tgt_tensor), F.cross_entropy(cos_phrase_sim.mean(0),target=torch.tensor([1.0,0,0,0,0]).to(device)))
        print('----------------------------')
        j+=1
        if j>5:
            break

del model

torch.Size([1, 503, 768]) torch.Size([1, 2, 503])
torch.Size([5, 768]) torch.Size([1, 768]) torch.Size([2, 768]) tensor([0.1847, 0.1714, 0.1046, 0.1936, 0.2543], device='cuda:0') tensor([0.1812, 0.1683, 0.1025, 0.1908, 0.2488], device='cuda:0')

Input:  what are my options if i can not support myself on a whm visa? frequently asked questions covid - 19 pandemic - australian government endorsed event ( agee ) stream of the temporary activity ( subclass 408 ) visa frequently asked questions when can i apply for the covid - 19 pandemic event visa? you should only apply for this visa is you are unable to depart australia, your temporary visa expires in less than 28 days ( or did not expire more than 28 days ago ) and you have no other visa options available to you. 

Feedback:  this only talks about visa application, it fails to talk about the topic 

Phrase 0: frequently asked questions covid - 19 pandemic - australian government endorsed event ( agee ) stream of the temporary activity ( 

In [130]:
t = torch.tensor([[[1,2,3,4,5],[6,7,8,9,0]]])
t.repeat(2,1,1)

tensor([[[1, 2, 3, 4, 5],
         [6, 7, 8, 9, 0]],

        [[1, 2, 3, 4, 5],
         [6, 7, 8, 9, 0]]])

In [221]:
class discriminator(nn.Module):
    def __init__(self, model_chkpt, device='cuda:0'):
        super().__init__()
        
        self.model = AutoModel.from_pretrained(model_chkpt).to(device)
        self.device = device
        
    def mean_pooling(self,model_output,attention_mask):
        token_embeddings = model_output[0] #First element of model_output contains all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        se = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return F.normalize(se, p=2, dim=1)
        
    def forward(self, b):
        sent_model_out = self.model(input_ids = b['sentence'].to(self.device),attention_mask=b['sentence_attn'].to(self.device))
        feedback_model_out = self.model(input_ids = b['feedback_set'][0].to(self.device),attention_mask=b['feedback_attn_set'][0].to(self.device))
        
        sent_emb = self.mean_pooling( sent_model_out, b['sentence_pool_mask'].to(self.device))
        feedback_emb = self.mean_pooling( feedback_model_out, b['feedback_pool_mask_set'][0].to(self.device))
        
        # print(pmo[0].shape,b['answer_phrases_pool_mask'].shape)
        phrase_emb = [ self.mean_pooling( sent_model_out, b['answer_phrases_pool_mask'][0][i].to(self.device) ) for i in range(b['answer_phrases_pool_mask'][0].shape[0])]
        phrase_emb = torch.stack(phrase_emb).squeeze(1)
        cos_sim = F.cosine_similarity(sent_emb,feedback_emb,dim=1)
        cos_phrase_sim = torch.matmul(phrase_emb,feedback_emb.transpose(1,0))
        
        tgt_tensor = torch.zeros(b['feedback_set'].shape[1] , device=self.device)
        tgt_tensor[0] = 1.0 #the relevant feedback is always present at index 0
        
        return_dict = {'sent_ce_loss': F.cross_entropy(cos_sim,target=tgt_tensor),
                       'avg_phrase_ce_loss': F.cross_entropy(cos_phrase_sim.mean(0),target=tgt_tensor),
                       'sent_probs': F.softmax(cos_sim,dim=-1),
                       'phrase_probs': F.softmax(cos_phrase_sim,dim=-1)}
        
        return return_dict
        
        

In [222]:
def train(discriminator,train_dl,valid_dl,epochs,batch_size,optimizer,PATIENCE=20,save_dir=None):

    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    
    discriminator.train()
    
    loss_acc = 0
    num_batches = 0
    total_steps = 0
    best_valid_loss = np.inf
    patience = PATIENCE
    
    train_loss_arr,valid_loss_arr = [],[]
    
    optimizer.zero_grad()
    discriminator.zero_grad()
    
    for E in range(epochs):
        
        valid_loss = validate(discriminator,valid_dl)
        valid_loss_arr.append(valid_loss/len(valid_dl))
        
        num_samples = 0
        
        for b in train_dl:
            
            y = discriminator(b)
                          # decoder_input_ids=b['feedback'].squeeze(1)[:,:-1].to(device),
                          # decoder_attention_mask=b['feedback_attn'].squeeze(1)[:,:-1].to(device))
            loss = y['sent_ce_loss'] + y['avg_phrase_ce_loss'] #F.cross_entropy(y.logits.permute(0,2,1), b['feedback'].squeeze(1)[:,1:].to(device), ignore_index=tokenizer.pad_token_id)
            
            num_samples+=1
            
            loss.backward()
            loss_acc += loss.item()
            
            if num_samples%batch_size==0:
                optimizer.step()

                num_batches += 1
                total_steps += 1
            
                train_loss_arr.append(loss_acc/num_batches)
                
                optimizer.zero_grad()
            
                if total_steps%100==0 and total_steps!=0:
                    print("Epoch:",E,"\t","Steps taken:",total_steps,"\tLoss:",loss_acc/num_batches)
            
        #print("Epoch:",E,"\t","Steps taken:",total_steps,"\tLoss:",loss_acc/num_batches)
        
        torch.save({'model_state':discriminator.state_dict(),
                    'optimizer':optimizer.state_dict(),
                    'epoch':E},
                    f"{save_dir}/Epoch_{E}_model_chkpt.pth.tar")
        
        if valid_loss<best_valid_loss:
            best_valid_loss = valid_loss
            patience = PATIENCE
            
            torch.save({'model_state':discriminator.state_dict(),
                        'optimizer':optimizer.state_dict(),
                        'epoch':E},
                        f"{save_dir}/best_model_chkpt.pth.tar")
        else:
            patience -= 1
            print(f"REDUCING PATIENCE...{patience}")

        if patience<=0:
            print("RUNNING OUT OF PATIENCE... TERMINATING")
            break
    
    
    return train_loss_arr,valid_loss_arr
                

In [223]:
def validate(discriminator,valid_dl):
    
    discriminator.eval()
    valid_loss = 0
    with torch.no_grad():
        for b in valid_dl:
            y = discriminator(b)
                          # decoder_input_ids=b['feedback'].squeeze(1)[:,:-1].to(device),
                          # decoder_attention_mask=b['feedback_attn'].squeeze(1)[:,:-1].to(device))
            loss = y['sent_ce_loss'] + y['avg_phrase_ce_loss'] #F.cross_entropy(y.logits.permute(0,2,1), b['feedback'].squeeze(1)[:,1:].to(device), ignore_index=tokenizer.pad_token_id)
            valid_loss += loss.item()
            
    print("Validation Loss:",valid_loss)
    return valid_loss

In [224]:
import os

from transformers import AutoModel

EPOCHS = 50
BATCH_SIZE = 16

device = 'cuda:0'

# MPNet = AutoModel.from_pretrained(bert_chkpt).to(device)
discriminator_model = discriminator(bert_chkpt,device=device)

optimizer = torch.optim.AdamW(discriminator_model.parameters(),lr=1e-5)

save_dir = 'Detect_Span_FB_MPNET_chkpts_1'
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

train_loss,valid_loss = train(discriminator_model,
                              train_DL,
                              valid_DL,
                              EPOCHS,
                              BATCH_SIZE,
                              optimizer,
                              PATIENCE=5,
                              save_dir=save_dir)



Validation Loss: 3615.1683789491653
Epoch: 0 	 Steps taken: 100 	Loss: 37.71666325211525
Epoch: 0 	 Steps taken: 200 	Loss: 36.52932067990303
Epoch: 0 	 Steps taken: 300 	Loss: 36.01980754494667
Validation Loss: 2872.763494491577
Epoch: 1 	 Steps taken: 400 	Loss: 35.59265251427889
Epoch: 1 	 Steps taken: 500 	Loss: 35.165306582212445
Epoch: 1 	 Steps taken: 600 	Loss: 34.886897427837056
Validation Loss: 2844.0738703012466
Epoch: 2 	 Steps taken: 700 	Loss: 34.64229964988572
Epoch: 2 	 Steps taken: 800 	Loss: 34.32756020218134
Epoch: 2 	 Steps taken: 900 	Loss: 34.09135734041532
Validation Loss: 2844.4556016921997
Epoch: 3 	 Steps taken: 1000 	Loss: 33.94833424413204
Epoch: 3 	 Steps taken: 1100 	Loss: 33.71300002867525
Epoch: 3 	 Steps taken: 1200 	Loss: 33.50649184077978
Epoch: 3 	 Steps taken: 1300 	Loss: 33.35283208085941
REDUCING PATIENCE...4
Validation Loss: 2829.7463079690933
Epoch: 4 	 Steps taken: 1400 	Loss: 33.189559691463195
Epoch: 4 	 Steps taken: 1500 	Loss: 33.0169333608

In [None]:
import json

with open('train_loss.json','w') as f:
    json.dump(train_loss,f)

with open('valid_loss.json','w') as f:
    json.dump(valid_loss,f)

In [None]:
train_loss_ds = np.array(train_loss)[np.round(np.linspace(0, len(train_loss) - 1, len(valid_loss))).astype(int)]
loss_df = pd.DataFrame({'train_loss':train_loss_ds , 'valid_loss':valid_loss})

In [None]:
from plotly import express as px
px.line(loss_df,y=['train_loss','valid_loss'])

In [None]:
discriminator.load_state_dict(torch.load('GenFB_BART_chkpts_1/Epoch_0_model_chkpt.pth.tar')['model_state'])

In [None]:
i = 0
for b in train_DL:
    out = discriminator.generate(inputs=b['input'][0:1,0].to(device),top_p=0.5)
    print(tokenizer.decode(b['input'][0:1,0][0],skip_special_tokens=True))
    print(tokenizer.decode(b['feedback'][0:1,0][0],skip_special_tokens=True))
    print(tokenizer.decode(out[0]))
    print("--------------------------------------------------------")
    i+=1
    if i>10:
        break