In [31]:
import nltk
from nltk.corpus import wordnet as wn
from sense2vec import Sense2Vec
from collections import OrderedDict
from similarity.normalized_levenshtein import NormalizedLevenshtein
import string
from string import punctuation
import random
import re
import nltk
import itertools
import pke
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import traceback
from nltk.tokenize import sent_tokenize
from flashtext import KeywordProcessor
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
pl.seed_everything(42)
from tqdm import tqdm
import copy
from transformers import AdamW,T5ForConditionalGeneration,T5Tokenizer,get_linear_schedule_with_warmup

Global seed set to 42


### Distractor Generate

In [2]:
s2v = Sense2Vec().from_disk('s2v_old')

In [3]:
class Distractors:
    def __init__(self,word,s2v):
        self.output = []
        self.word = word.lower()
        self.word = word.replace(" ", "_")
        sense = s2v.get_best_sense(self.word)
        self.most_similar = s2v.most_similar(sense, n=20)
        self.threshold = 0.7
    
    def __noise__(self):
        "All edits that are one edit away from `word`."
        self.letters    = 'abcdefghijklmnopqrstuvwxyz '+string.punctuation
        self.splits     = [(self.word[:i], self.word[i:])    for i in range(len(self.word) + 1)]
        self.deletes    = [L + R[1:]               for L, R in self.splits if R]
        self.transposes = [L + R[1] + R[0] + R[2:] for L, R in self.splits if len(R)>1]
        self.replaces   = [L + c + R[1:]           for L, R in self.splits if R for c in self.letters]
        self.inserts    = [L + c + R               for L, R in self.splits for c in self.letters]
        return set(self.deletes + self.transposes + self.replaces + self.inserts)
    
    def __generate_distractors__(self):
        for each_word in self.most_similar:
            self.append_word = each_word[0].split("|")[0].replace("_", " ").lower()
            if len(self.append_word.split())>1:
                self.append_word=' '.join([self.append_word.title().split()[0],self.append_word.title().split()[-1]])
            if self.append_word.lower() != self.word and self.append_word.title() not in self.output:
                self.output.append(self.append_word.title())
        return list(OrderedDict.fromkeys(self.output))
        
        
    def find_all_distractors(self):
        normalized_levenshtein = NormalizedLevenshtein()
        self.all_edits = self.__noise__()
        self.out=self.__generate_distractors__()
        self.filtered_distractors_edit_distance = [x for x in self.out if x.lower() not in self.all_edits]
        self.filtered_distractors_edit_distance_and_levenshtein_distance =[x for x in self.filtered_distractors_edit_distance if normalized_levenshtein.distance(x.lower(),self.word.lower())>self.threshold]
        #self.output = self.filtered_distractors_edit_distance_and_levenshtein_distance[0][:4]+[self.word]
        #return random.sample(self.output,len(self.output))
        return self.filtered_distractors_edit_distance_and_levenshtein_distance[:4]

In [4]:
word='India'
distractors=Distractors(word,s2v)
options=distractors.find_all_distractors()

### Fill in the Blanks Generate

In [5]:
class FIB:
    
    def __init__(self,path):
        
        self._noun_adj_verb_=[]
        self.pos = {'VERB', 'ADJ', 'NOUN'}
        self.doc=open(path,mode='r').readlines()
        self.doc=[re.sub('\n','',i.strip()) for i in self.doc]
        self.text=''.join(self.doc)
        self.sentences = sent_tokenize(open(path,mode='r').read())
        self.keyword_sentences = {}
        self.stoplist = list(string.punctuation)
        self.stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        self.stoplist += stopwords.words('english')
        self.blank_sentences = []
        self.processed = []
        self.keys=[]
        self.df=pd.DataFrame()
        
    def get_noun_adj_verb(self):
        try:
            extractor = pke.unsupervised.MultipartiteRank()
            extractor.load_document(input=self.text)
            extractor.candidate_selection(pos=self.pos, stoplist=self.stoplist)
            extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
            self.keyphrases = extractor.get_n_best(n=30)
            for val in self.keyphrases:
                self._noun_adj_verb_.append(val[0])
        except:
            self._noun_adj_verb_ = []
            traceback.print_exc()
        return self._noun_adj_verb_
    
    def get_sentences_for_keyword(self):
        self.keywords=self.get_noun_adj_verb()
        keyword_processor = KeywordProcessor()
        for word in self.keywords:
            self.keyword_sentences[word] = []
            keyword_processor.add_keyword(word)
        for sentence in self.sentences:
            self.keywords_found = keyword_processor.extract_keywords(sentence)
            for key in self.keywords_found:
                self.keyword_sentences[key].append(sentence)

        for key in self.keyword_sentences.keys():
            values = self.keyword_sentences[key]
            values = sorted(values, key=len, reverse=True)
            self.keyword_sentences[key] = values
        return self.keyword_sentences
    
    def get_fill_in_the_blanks(self):
        self.sentence_mapping=self.get_sentences_for_keyword()
        for key in self.sentence_mapping:
            if len(self.sentence_mapping[key])>0:
                sent = self.sentence_mapping[key][0]
                # Compile a regular expression pattern into a regular expression object, which can be used for matching and other methods
                insensitive_sent = re.compile(re.escape(key), re.IGNORECASE)
                no_of_replacements =  len(re.findall(re.escape(key),sent,re.IGNORECASE))
                line = insensitive_sent.sub(' _________ ', sent)
                if (self.sentence_mapping[key][0] not in self.processed) and no_of_replacements<2:
                    self.blank_sentences.append(line)
                    self.processed.append(self.sentence_mapping[key][0])
                    self.keys.append(key)
        self.df["Fill in the Blank"]=self.blank_sentences
        self.df["Answers"]=self.keys
        return self.df

In [6]:
listy=FIB('worksheet1.txt').get_fill_in_the_blanks()
listy

Unnamed: 0,Fill in the Blank,Answers
0,"Soon Sunderlal Bahuguna, another social activi...",villagers
1,The success of the movement led to a fifteen y...,felling
2,Villagers in Chamoli district of Uttar Pradesh...,protested
3,"In 1974, in a village called Reni, the governm...",women led
4,"In many ways, the Chipko Movement is also cons...",cut
5,The Chipko _________ from the Bishnoi commun...,movement draws inspiration
6,This gesture of villagers gave the _________ ...,name
7,"In the 18th century, this group of _________ ...",people
8,The people of the _________ understood that ...,community
9,"In 1973, a similar movement took place in Indi...",different purpose


### Fill in the blanks with options

In [7]:
class FIBMCQ:
    
    def __init__(self,path):
        
        self._noun_adj_verb_=[]
        self.pos = {'VERB', 'ADJ', 'NOUN'}
        self.doc=open(path,mode='r').readlines()
        self.doc=[re.sub('\n','',i.strip()) for i in self.doc]
        self.text=''.join(self.doc)
        self.sentences = sent_tokenize(open(path,mode='r').read())
        self.keyword_sentences = {}
        self.stoplist = list(string.punctuation)
        self.stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        self.stoplist += stopwords.words('english')
        self.blank_sentences = []
        self.processed = []
        self.keys=[]
        self.option_1=[]
        self.option_2=[]
        self.option_3=[]
        self.option_4=[]
        self.df=pd.DataFrame()
        
    def get_noun_adj_verb(self):
        for i in range(len(self.sentences)):
            for j in nltk.pos_tag(nltk.word_tokenize(self.sentences[i])):
                if j[1][:2]=='NN':
                    self._noun_adj_verb_.append(j[0])
        try:
            extractor = pke.unsupervised.MultipartiteRank()
            extractor.load_document(input=self.text)
            extractor.candidate_selection(pos=self.pos, stoplist=self.stoplist)
            extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
            self.keyphrases = extractor.get_n_best(n=30)
            for val in self.keyphrases:
                if val[0] not in self._noun_adj_verb_:
                    self._noun_adj_verb_.append(val[0])
        except:
            self._noun_adj_verb_ = []
            traceback.print_exc()
        return self._noun_adj_verb_
    
    def get_sentences_for_keyword(self):
        self.keywords=self.get_noun_adj_verb()
        keyword_processor = KeywordProcessor()
        for word in self.keywords:
            self.keyword_sentences[word] = []
            keyword_processor.add_keyword(word)
        for sentence in self.sentences:
            self.keywords_found = keyword_processor.extract_keywords(sentence)
            for key in self.keywords_found:
                self.keyword_sentences[key].append(sentence)

        for key in self.keyword_sentences.keys():
            values = self.keyword_sentences[key]
            values = sorted(values, key=len, reverse=True)
            self.keyword_sentences[key] = values
        return self.keyword_sentences
    
    def get_fill_in_the_blanks(self):
        self.sentence_mapping=self.get_sentences_for_keyword()
        for key in self.sentence_mapping:
            try:
                distractors=Distractors(key,s2v)
                self.options=distractors.find_all_distractors()
                if len(self.options)==0 or len(self.options)<4:
                    raise
            except:
                continue
            if len(self.sentence_mapping[key])>0:
                sent = self.sentence_mapping[key][0]
                # Compile a regular expression pattern into a regular expression object, which can be used for matching and other methods
                insensitive_sent = re.compile(re.escape(key), re.IGNORECASE)
                no_of_replacements =  len(re.findall(re.escape(key),sent,re.IGNORECASE))
                line = insensitive_sent.sub(' _________ ', sent)
                if (self.sentence_mapping[key][0] not in self.processed) and no_of_replacements<2:
                    self.blank_sentences.append(line)
                    self.processed.append(self.sentence_mapping[key][0])
                    self.keys.append(key)
                    self.option_1.append(self.options[0])
                    self.option_2.append(self.options[1])
                    self.option_3.append(self.options[2])
                    self.option_4.append(self.options[3])
        self.df["Fill in the Blank"]=self.blank_sentences
        self.df["Answers"]=self.keys
        self.df["Option_1"]=self.option_1
        self.df["Option_2"]=self.option_2
        self.df["Option_3"]=self.option_3
        self.df["Option_4"]=self.option_4
        return self.df

In [8]:
df=FIBMCQ('worksheet1.txt').get_fill_in_the_blanks()
df

Unnamed: 0,Fill in the Blank,Answers,Option_1,Option_2,Option_3,Option_4
0,_________ led India to freedom using non-vio...,Gandhiji,Ambedkar,Modi,Kejriwal,Ikeda
1,"In 1973, a similar movement took place in ___...",India,Bangladesh,Malaysia,Brunei,China
2,Villagers in Chamoli _________ of Uttar Prad...,district,County,School Board,Superintendent,Charter School
3,The success of the movement led to a fifteen y...,Uttarakhand,Bihar,Kerala,Himachal Pradesh,West Bengal
4,The _________ chipko in Hindi means to embra...,word,Phrase,Actual Meaning,Different Word,Literal Meaning
5,This _________ of villagers gave the name Ch...,gesture,Handshake,Hand Shake,Polite Smile,Smile
6,"Soon Sunderlal Bahuguna, another social activi...",villagers,Iron Golems,Townsfolk,Townspeople,Pigmen
7,It was a non-violent method of protesting agai...,mass,Gravitational Force,Gravity,Gravitational Potential,Potential Energy
8,"In the 18th _________ , this group of people,...",century,Millennium,Millenium,100 Years,150 Years
9,"In 1974, in a village called Reni, the governm...",Devi,Tutte,Dico,Piuttosto,Nella


### True or False generate

In [9]:
class TrueFalse:
    
    def __init__(self,path):
        
        self._noun_adj_verb_=[]
        self.pos = {'VERB', 'ADJ', 'NOUN'}
        self.doc=open(path,mode='r').readlines()
        self.doc=[re.sub('\n','',i.strip()) for i in self.doc]
        self.text=''.join(self.doc)
        self.sentences = sent_tokenize(open(path,mode='r').read())
        self.keyword_sentences = {}
        self.stoplist = list(string.punctuation)
        self.stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        self.stoplist += stopwords.words('english')
        self.blank_sentences = []
        self.keys=[]
        self.df=pd.DataFrame()
        
    def get_noun_adj_verb(self):
        for i in range(len(self.sentences)):
            for j in nltk.pos_tag(nltk.word_tokenize(self.sentences[i])):
                if j[1][:2]=='NN':
                    self._noun_adj_verb_.append(j[0])
        try:
            extractor = pke.unsupervised.MultipartiteRank()
            extractor.load_document(input=self.text)
            extractor.candidate_selection(pos=self.pos, stoplist=self.stoplist)
            extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')
            self.keyphrases = extractor.get_n_best(n=30)
            for val in self.keyphrases:
                if val[0] not in self._noun_adj_verb_:
                    self._noun_adj_verb_.append(val[0])
        except:
            self._noun_adj_verb_ = []
            traceback.print_exc()
        return self._noun_adj_verb_
    
    def get_sentences_for_keyword(self):
        self.keywords=self.get_noun_adj_verb()
        keyword_processor = KeywordProcessor()
        for word in self.keywords:
            self.keyword_sentences[word] = []
            keyword_processor.add_keyword(word)
        for sentence in self.sentences:
            self.keywords_found = keyword_processor.extract_keywords(sentence)
            for key in self.keywords_found:
                self.keyword_sentences[key].append(sentence)

        for key in self.keyword_sentences.keys():
            values = self.keyword_sentences[key]
            values = sorted(values, key=len, reverse=True)
            self.keyword_sentences[key] = values
        return self.keyword_sentences
    
    def get_True_False(self):
        self.sentence_mapping=self.get_sentences_for_keyword()
        for key in self.sentence_mapping:
            try:
                distractors=Distractors(key,s2v)
                self.options=distractors.find_all_distractors()
                if len(self.options)==0:
                    raise
            except:
                continue
            for stats in self.sentence_mapping[key]:
                false=random.randint(0,1)
                if false==1:
                    insensitive_sent = re.compile(re.escape(key), re.IGNORECASE)
                    no_of_replacements =  len(re.findall(re.escape(key),stats,re.IGNORECASE))
                    line = insensitive_sent.sub(random.choice(self.options), stats)
                    self.blank_sentences.append(line)
                    self.keys.append('False')
                else:
                    self.blank_sentences.append(stats)
                    self.keys.append('True')
        self.df["Statements"]=self.blank_sentences
        self.df["Answers"]=self.keys
        return self.df
        

In [10]:
df=TrueFalse('worksheet1.txt').get_True_False()
df

Unnamed: 0,Statements,Answers
0,Gandhiji led India to freedom using non-violen...,True
1,"In 1973, a similar movement took place in Indi...",True
2,Gandhiji led Malaysia to freedom using non-vio...,False
3,Gandhiji led India to freedom using non-violen...,True
4,The word chipko in Hindi means to embrace or c...,True
...,...,...
97,"Soon Sunderlal Bahuguna, another social activi...",True
98,"In 1974, in a village called Reni, the governm...",True
99,"In 1974, in a village called Reni, the governm...",False
100,"The environmentalist, Chandi Prasad Bhatt, fou...",False


### Fine Tine T5 Model to generate question

#### Prep Dataset

In [13]:
from datasets import load_dataset
train_dataset = load_dataset('squad', split='train')
valid_dataset = load_dataset('squad', split='validation')

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to C:\Users\shaik\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to C:\Users\shaik\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


Reusing dataset squad (C:\Users\shaik\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [14]:
from pprint import pprint 
sample_validation_dataset = next(iter(valid_dataset))
pprint (sample_validation_dataset)

{'answers': {'answer_start': [177, 177, 177],
             'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']},
 'context': 'Super Bowl 50 was an American football game to determine the '
            'champion of the National Football League (NFL) for the 2015 '
            'season. The American Football Conference (AFC) champion Denver '
            'Broncos defeated the National Football Conference (NFC) champion '
            'Carolina Panthers 24–10 to earn their third Super Bowl title. The '
            "game was played on February 7, 2016, at Levi's Stadium in the San "
            'Francisco Bay Area at Santa Clara, California. As this was the '
            '50th Super Bowl, the league emphasized the "golden anniversary" '
            'with various gold-themed initiatives, as well as temporarily '
            'suspending the tradition of naming each Super Bowl game with '
            'Roman numerals (under which the game would have been known as '
            '"Super

In [15]:
context = sample_validation_dataset['context']
question = sample_validation_dataset['question']
answer = sample_validation_dataset['answers']['text'][0]

print ("context: ",context)
print ("question: ",question)
print ("answer: ",answer)

context:  Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.
question:  Which NFL team represented the AFC at Super Bowl 50?
answer:  Denver Broncos


In [33]:
pd.set_option("display.max_colwidth", -1)
df_train = pd.DataFrame( columns = ['context', 'answer','question'])
df_validation = pd.DataFrame( columns = ['context', 'answer','question'])

  pd.set_option("display.max_colwidth", -1)


In [34]:
for index,val in enumerate(tqdm(train_dataset)):
    passage = val['context']
    question = val['question']
    answer = val['answers']['text'][0]
    no_of_words = len(answer.split())
    values=[passage,answer,question]
    if no_of_words < 7:
        row = pd.Series(values, index=df_train.columns)
        df_train=df_train.append(row,ignore_index=True,) 

100%|███████████████████████████████████████████████████████████████████████████| 87599/87599 [05:40<00:00, 256.97it/s]


In [36]:
for index,val in enumerate(tqdm(valid_dataset)):
    passage = val['context']
    question = val['question']
    answer = val['answers']['text'][0]
    no_of_words = len(answer.split())
    values=[passage,answer,question]
    if no_of_words < 7:
        row = pd.Series(values, index=df_train.columns)
        df_validation=df_validation.append(row,ignore_index=True,) 

100%|███████████████████████████████████████████████████████████████████████████| 10570/10570 [00:13<00:00, 785.12it/s]


In [37]:
df_train = df_train.sample(frac=1)
df_validation = df_validation.sample(frac=1)

In [38]:
df_train.head()

Unnamed: 0,context,answer,question
59709,"Leucothea was daughter of Orchamus and sister of Clytia. She fell in love with Apollo who disguised himself as Leucothea's mother to gain entrance to her chambers. Clytia, jealous of her sister because she wanted Apollo for herself, told Orchamus the truth, betraying her sister's trust and confidence in her. Enraged, Orchamus ordered Leucothea to be buried alive. Apollo refused to forgive Clytia for betraying his beloved, and a grieving Clytia wilted and slowly died. Apollo changed her into an incense plant, either heliotrope or sunflower, which follows the sun every day.",Orchamus,Who ordered Leucothea to be buried alive?
58088,"The climate of Western Alaska is determined in large part by the Bering Sea and the Gulf of Alaska. It is a subarctic oceanic climate in the southwest and a continental subarctic climate farther north. The temperature is somewhat moderate considering how far north the area is. This region has a tremendous amount of variety in precipitation. An area stretching from the northern side of the Seward Peninsula to the Kobuk River valley (i. e., the region around Kotzebue Sound) is technically a desert, with portions receiving less than 10 in (25 cm) of precipitation annually. On the other extreme, some locations between Dillingham and Bethel average around 100 in (250 cm) of precipitation.",the region around Kotzebue Sound,Which area of Western Alaska is techincally a desert?
12495,"The Soviet regime first came to power on November 7, 1917, immediately after the Russian Provisional Government, which governed the Russian Republic, was overthrown in the October Revolution. The state it governed, which did not have an official name, would be unrecognized by neighboring countries for another five months.",five,"In months, for how long was the new Soviet state unrecognized?"
12825,"Alexander Bell was born in Edinburgh, Scotland, on March 3, 1847. The family home was at 16 South Charlotte Street, and has a stone inscription marking it as Alexander Graham Bell's birthplace. He had two brothers: Melville James Bell (1845–70) and Edward Charles Bell (1848–67), both of whom would die of tuberculosis. His father was Professor Alexander Melville Bell, a phonetician, and his mother was Eliza Grace (née Symonds). Born as just ""Alexander Bell"", at age 10 he made a plea to his father to have a middle name like his two brothers.[N 6] For his 11th birthday, his father acquiesced and allowed him to adopt the name ""Graham"", chosen out of respect for Alexander Graham, a Canadian being treated by his father who had become a family friend. To close relatives and friends he remained ""Aleck"".",Edinburgh,What city was Bell born in?
3289,"Initially, officials were unable to contact the Wolong National Nature Reserve, home to around 280 giant pandas. However, the Foreign Ministry later said that a group of 31 British tourists visiting the Wolong Panda Reserve in the quake-hit area returned safe and uninjured to Chengdu. Nonetheless, the well-being of an even greater number of pandas in the neighbouring panda reserves remained unknown. Five security guards at the reserve were killed by the earthquake. Six pandas escaped after their enclosures were damaged. By May 20, two pandas at the reserve were found to be injured, while the search continued for another two adult pandas that went missing after the quake. By May 28, 2008, one panda was still missing. The missing panda was later found dead under the rubble of an enclosure. Nine-year-old Mao Mao, a mother of five at the breeding center, was discovered on Monday, her body crushed by a wall in her enclosure. Panda keepers and other workers placed her remains in a small wooden crate and buried her outside the breeding centre.",31,How many British visitors to the Reserve left unharmed?


In [39]:
df_validation.head()

Unnamed: 0,context,answer,question
6584,"Private schools generally prefer to be called independent schools, because of their freedom to operate outside of government and local government control. Some of these are also known as public schools. Preparatory schools in the UK prepare pupils aged up to 13 years old to enter public schools. The name ""public school"" is based on the fact that the schools were open to pupils from anywhere, and not merely to those from a certain locality, and of any religion or occupation. According to The Good Schools Guide approximately 9 per cent of children being educated in the UK are doing so at fee-paying schools at GSCE level and 13 per cent at A-level.[citation needed] Many independent schools are single-sex (though this is becoming less common). Fees range from under £3,000 to £21,000 and above per year for day pupils, rising to £27,000+ per year for boarders. For details in Scotland, see ""Meeting the Cost"".",public,What schools do preparatory schools prepare British children to attend?
6495,"Private schools, also known as independent schools, non-governmental, or nonstate schools, are not administered by local, state or national governments; thus, they retain the right to select their students and are funded in whole or in part by charging their students tuition, rather than relying on mandatory taxation through public (government) funding; at some private schools students may be able to get a scholarship, which makes the cost cheaper, depending on a talent the student may have (e.g. sport scholarship, art scholarship, academic scholarship), financial need, or tax credit scholarships that might be available.",independent,"Along with non-governmental and nonstate schools, what is another name for private schools?"
4933,"The Pilgrim Street building was refurbished between November 2006 and May 2008; during the refurbishment works, the cinema relocated to the Old Town Hall, Gateshead. In May 2008 the Tyneside Cinema reopened in the restored and refurbished original building. The site currently houses three cinemas, including the restored Classic —the United Kingdom's last surviving news cinema still in full-time operation—alongside two new screens, a roof extension containing the Tyneside Bar, and dedicated education and teaching suites.",three,How many cinemas are currently housed at one site?
1966,"Chris Keates, the general secretary of National Association of Schoolmasters Union of Women Teachers, said that teachers who have sex with pupils over the age of consent should not be placed on the sex offenders register and that prosecution for statutory rape ""is a real anomaly in the law that we are concerned about."" This has led to outrage from child protection and parental rights groups. Fears of being labelled a pedophile or hebephile has led to several men who enjoy teaching avoiding the profession. This has in some jurisdictions reportedly led to a shortage of male teachers.",child protection and parental rights groups,A statement made by Chris Keates caused issues with whom?
3072,"In Berlin, the Huguenots created two new neighbourhoods: Dorotheenstadt and Friedrichstadt. By 1700, one-fifth of the city's population was French speaking. The Berlin Huguenots preserved the French language in their church services for nearly a century. They ultimately decided to switch to German in protest against the occupation of Prussia by Napoleon in 1806-07. Many of their descendents rose to positions of prominence. Several congregations were founded, such as those of Fredericia (Denmark), Berlin, Stockholm, Hamburg, Frankfurt, Helsinki, and Emden.",1806-07,What years did this occupation take place?


In [40]:
df_train.to_csv('t5/dataset/squad_t5_train.csv', index = False)
df_validation.to_csv('t5/dataset/squad_t5_val.csv', index = False)

In [41]:
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [42]:
import copy

class QuestionGenerationDataset(Dataset):
    def __init__(self, tokenizer, filepath, max_len_inp=512,max_len_out=96):
        self.path = filepath

        self.passage_column = "context"
        self.answer = "answer"
        self.question = "question"

        # self.data = pd.read_csv(self.path)
        self.data = pd.read_csv(self.path,nrows=1000)

        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self.skippedcount =0
        self._build()

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        source_ids = self.inputs[index]["input_ids"].squeeze()
        target_ids = self.targets[index]["input_ids"].squeeze()

        src_mask = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
        target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

        labels = copy.deepcopy(target_ids)
        labels [labels==0] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        for idx in tqdm(range(len(self.data))):
            passage,answer,target = self.data.loc[idx, self.passage_column],self.data.loc[idx, self.answer], self.data.loc[idx, self.question]

            input_ = "context: %s  answer: %s </s>" % (passage, answer)
            target = "question: %s </s>" % (str(target))

            # get encoding length of input. If it is greater than self.max_len skip it
            test_input_encoding = self.tokenizer.encode_plus(input_,
                                        truncation=False,
                                        return_tensors="pt")
            
            length_of_input_encoding = len(test_input_encoding['input_ids'][0])


            if length_of_input_encoding > self.max_len_input:
              self.skippedcount = self.skippedcount + 1
              continue

            # tokenize inputs
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len_input, pad_to_max_length=True, return_tensors="pt"
            )
            # tokenize targets
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len_output, pad_to_max_length=True,return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [43]:
train_dataset = QuestionGenerationDataset(t5_tokenizer,'t5/dataset/squad_t5_train.csv')

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
 47%|████████████████████████████████████▊                                         | 472/1000 [00:00<00:00, 617.29it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (530 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 577.59it/s]


In [50]:
validation_dataset = QuestionGenerationDataset(t5_tokenizer,'t5/dataset/squad_t5_val.csv')

100%|█████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 591.44it/s]


#### Train Model

In [51]:
class T5FineTuner(pl.LightningModule):
    def __init__(self,hparams, t5model, t5tokenizer):
        super(T5FineTuner, self).__init__()
        self.hparams = hparams
        self.model = t5model
        self.tokenizer = t5tokenizer


    def forward( self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None):
         outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )
         
         return outputs


    def training_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_input_ids = batch["target_ids"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )

        loss = outputs[0]
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self.forward(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_input_ids = batch["target_ids"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )

        loss = outputs[0]
        self.log("val_loss",loss)
        return loss

    def train_dataloader(self):
        return DataLoader(train_dataset, batch_size=self.hparams.batch_size,num_workers=4)

    def val_dataloader(self):
        return DataLoader(validation_dataset, batch_size=self.hparams.batch_size,num_workers=4)



    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=3e-4, eps=1e-8)
        return optimizer

In [53]:
import argparse
args_dict = dict(
    batch_size=4,
)

args = argparse.Namespace(**args_dict)


model = T5FineTuner(args,t5_model,t5_tokenizer)

trainer = pl.Trainer(max_epochs = 1, gpus=1,progress_bar_refresh_rate=30)

trainer.fit(model)

print ("Saving model")
save_path_model = 't5/model/'
save_path_tokenizer = 't5/tokenizer/'
model.model.save_pretrained(save_path_model)
t5_tokenizer.save_pretrained(save_path_tokenizer)

AttributeError: can't set attribute