In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install SentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
from transformers import XLNetTokenizer, XLNetModel
import torch.nn as nn
from torch.optim import Adam
import pandas as pd
import torch
import os
import random
import math
import nltk
from nltk.corpus import stopwords
from torch.nn.functional import normalize
import plotly.graph_objects as go
from torch.autograd.grad_mode import no_grad

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
sw_nltk = stopwords.words('english')

In [6]:
def optimizer_to(optim, device):
    for param in optim.state.values():
        # Not sure there are any global tensors in the state dict
        if isinstance(param, torch.Tensor):
            param.data = param.data.to(device)
            if param._grad is not None:
                param._grad.data = param._grad.data.to(device)
        elif isinstance(param, dict):
            for subparam in param.values():
                if isinstance(subparam, torch.Tensor):
                    subparam.data = subparam.data.to(device)
                    if subparam._grad is not None:
                        subparam._grad.data = subparam._grad.data.to(device)

In [7]:
class SentenceDetectionModel(nn.Module):
    def __init__(self, num_sentences, train_data_file):
        super(SentenceDetectionModel, self).__init__()
        self.num_sentences = num_sentences
        self.embedding_size = 768
        
        self.tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased", do_lower_case=True)
        self.xlnetModel = XLNetModel.from_pretrained('xlnet-base-cased').to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True)
        #self.multi_head_attention_sentences = nn.ModuleList([nn.MultiheadAttention(self.embedding_size, 8).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True) for i in range(10)])
        #self.multi_head_attention_inter_sentence = nn.MultiheadAttention(self.embedding_size, 16).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True)
        
        self.hidden = nn.Linear((self.num_sentences) * self.embedding_size, 4000, bias=True).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True)
        self.activation = nn.Tanh().to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True);
        self.dropout = nn.Dropout(0.5).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True);
        
        self.hidden1 = nn.Linear(4000, 2000, bias=True).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True)
        self.activation1 = nn.Tanh().to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True);
        self.dropout1 = nn.Dropout(0.1).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True);

        self.hidden2 = nn.Linear(2000, 1000, bias=True).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True)
        self.activation2 = nn.Tanh().to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True);
        self.dropout2 = nn.Dropout(0.1).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True);

        self.hidden3 = nn.Linear(1000, 100, bias=True).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True)
        self.activation3 = nn.Tanh().to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True);
        self.dropout3 = nn.Dropout(0.1).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True);

        self.out = nn.Linear(100, self.num_sentences, bias=True).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True)
        self.softmax = nn.Softmax(-1).to(torch.device("cuda:0"), dtype=torch.float, non_blocking=True)

        #Freeze the parameters of the model.
        #for param in self.xlnetModel.base_model.parameters():
            #param.requires_grad = False

        data = self.extract_data_from_file(train_data_file)
        random.shuffle(data)
        train_size = math.floor(0.8*len(data))
        val_size = math.floor(0.1*len(data))
        train, val ,test = data[:train_size], data[train_size:train_size+val_size], data[train_size+val_size:]
        self.train_dataset = train
        self.test_dataset = test
        self.validation_dataset = val
        self.validation_error = []
        self.train_error = []
             
    def get_train_dataset(self):
      return self.train_dataset
    def get_test_dataset(self):
      return self.test_dataset

    def forward(self, input):
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        input = input.to(device)
        model_outputs = self.xlnetModel(**input)

        sentence_section_output = model_outputs.last_hidden_state[:, 0:480, :]
        sentence_embeddings = torch.zeros((1, self.num_sentences, self.embedding_size))
        
        #cls_tensor = model_outputs.last_hidden_state[:,514,:]
        #cls_tensor = cls_tensor.to('cuda:0')
        
        for i in range(self.num_sentences):
            sentence_start = (i*48)
            sentence_end = ((i+1)*48)
            sentence = sentence_section_output[:,  sentence_start:sentence_end  , :];
            #sentence = sentence.half()
            sentence = sentence.to('cuda:0').float()
            #sentence_attention_tensor = normalize(sentence + self.multi_head_attention_sentences[i](sentence,sentence,sentence)[0], p=2)
            sentence_tensor = torch.mean(sentence, 1)
            sentence_embeddings[0][i] = sentence_tensor
      
        
        
        sentence_embeddings = sentence_embeddings.to('cuda:0').float()
        #concat_sentence_cls = torch.cat((sentence_embeddings, 
        #                                      cls_tensor.reshape(1, cls_tensor.shape[0], 
        #                                                             cls_tensor.shape[1])), dim=1)
        
        #print(question_tensor.reshape(1, question_tensor.shape[0],question_tensor.shape[1]).shape)
        #print(sentence_embeddings.shape)
        #sentence_embeddings = sentence_embeddings.to(torch.float16)
        #concat_sentence_attention = normalize(sentence_embeddings + self.multi_head_attention_inter_sentence(sentence_embeddings, sentence_embeddings,sentence_embeddings)[0], p=2)
        
        hidden = self.dropout(self.activation(self.hidden(
            sentence_embeddings.reshape(1, (self.num_sentences) * self.embedding_size))))
        hidden1 = self.dropout1(self.activation1(self.hidden1(hidden)))
        hidden2 = self.dropout2(self.activation2(self.hidden2(hidden1)))
        hidden3 = self.dropout3(self.activation3(self.hidden3(hidden2)))
        #hidden1 = self.dropout1(self.activation1(self.hidden1(hidden)))
        
        output = self.out(hidden3)
        classification = self.softmax(output)
        return classification
        
    def train_model(self, epochs):
        
        #some code borrowed from https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
        train_data = self.train_dataset
        loss_function = nn.BCELoss(reduction='sum')
        optimiser = Adam(self.parameters(), lr=2e-5)
        optimizer_to(optimiser, "cuda:0")
        
        for i in range(epochs):
            running_loss = 0
            print("starting epoch " + str(i))
            random.shuffle(train_data)
            
            self.train()
            net_train_loss = 0;
            for index, ((question, sentences), (data, label)) in enumerate(train_data):
                optimiser.zero_grad()
                
                out = self(data)
                label = label.to("cuda:0")
                loss = loss_function(out, label)
                #print(out)
                loss.backward()
                optimiser.step()
                #print(self.output.parameters)
                
                running_loss += loss.item()
                net_train_loss += loss.item()
                if index % 10 == 9:    # print every 10 mini-batches
                    print(f'[{index + 1}, {index + 1:5d}] loss: {running_loss / 10:.3f}')
                    running_loss = 0.0
            
            self.train_error.append(net_train_loss)
            self.eval()
            with no_grad():
              net_loss = 0;
              for index, ((question, sentences), (data, label)) in enumerate(self.validation_dataset):
                output = self(data)
                label = label.to("cuda:0")
                loss = loss_function(out, label)
                net_loss = net_loss + loss.item()
              self.validation_error.append(net_loss)
              print("Validation : " + str(net_loss))
    
    def tokenize_sentences(self, sentences, question):
        tokenized_question = self.tokenizer(question, max_length = 32, padding = 'max_length', truncation = True, add_special_tokens=False, return_tensors='pt')
        tokenized_sentences = self.tokenizer(sentences, max_length = 48, padding = 'max_length', truncation = True, add_special_tokens=False, return_tensors='pt')
        tokenized_sentences['input_ids'] = tokenized_sentences.input_ids.reshape(1, tokenized_sentences.input_ids.shape[0] * tokenized_sentences.input_ids.shape[1])
        tokenized_sentences['token_type_ids'] = tokenized_sentences.token_type_ids.reshape(1, tokenized_sentences.token_type_ids.shape[0] * tokenized_sentences.token_type_ids.shape[1])
        tokenized_sentences['attention_mask'] = tokenized_sentences.attention_mask.reshape(1, tokenized_sentences.attention_mask.shape[0] * tokenized_sentences.attention_mask.shape[1])
        tokenized_sentences['input_ids'] = torch.cat((tokenized_sentences['input_ids'], self.tokenizer("", return_tensors='pt').input_ids[:,0].reshape(1,1)), dim = 1)
        tokenized_sentences['token_type_ids'] = torch.cat((tokenized_sentences['token_type_ids'], torch.tensor([[0]])), dim = 1)
        tokenized_sentences['attention_mask'] = torch.cat((tokenized_sentences['attention_mask'], torch.tensor([[1]])), dim = 1)
        tokenized_sentences['input_ids'] = torch.cat((tokenized_sentences['input_ids'], tokenized_question.input_ids), dim=1)
        tokenized_sentences['attention_mask'] = torch.cat((tokenized_sentences['attention_mask'], tokenized_question.attention_mask), dim=1)
        tokenized_question.token_type_ids[tokenized_question.token_type_ids == 0] = 1
        tokenized_sentences['token_type_ids'] = torch.cat((tokenized_sentences['token_type_ids'], tokenized_question.token_type_ids), dim=1)
        tokenized_sentences['input_ids'] = torch.cat((tokenized_sentences['input_ids'], self.tokenizer("", return_tensors='pt').input_ids), dim = 1)
        tokenized_sentences['token_type_ids'] = torch.cat((tokenized_sentences['token_type_ids'], torch.tensor([[1,2]])), dim = 1)
        tokenized_sentences['attention_mask'] = torch.cat((tokenized_sentences['attention_mask'], self.tokenizer("", return_tensors='pt').attention_mask), dim = 1)
        return tokenized_sentences

          
    def extract_data_from_file(self, file_path):
        data = pd.read_excel(file_path)
        train_data = []
        for row in data.iloc:
            labels = row["Valid Sentences"]
            label_tensor = torch.zeros(10)
            for label in str(labels).split(","):
                parsed_label = int(label.strip())
                label_tensor[parsed_label] = 1
            label_tensor = label_tensor.reshape(1, 10) 

            question = row["question"]
            #question = " ".join([word for word in question.split() if word.lower() not in sw_nltk])

            sentences = []
            for sent in str(row["Passage (10 sentences)"]).split('\n'):
              #processed_words = [word for word in sent.split() if word.lower() not in sw_nltk]
              #sent = " ".join(processed_words)
              sentences.append(sent)
                
            if(len(sentences) != 10):
                print(sentences)
                raise("Invalid number of sentences")
            
            #sentence_label_list = [(sentences[0:5], label_tensor[: , 0:5]), (sentences[5:], label_tensor[:, 5:])]
          

            #for sentence, label in sentence_label_list:
            #  if(1 not in label):
            #    continue;

            datapoint = ((question, sentences), (self.tokenize_sentences(sentences, question), label_tensor))
            train_data.append(datapoint)
        return train_data

In [8]:
sentenceDetection = SentenceDetectionModel(10, "/content/Book3.xlsx")

#sentenceDetection.load_state_dict(torch.load("/content/Model_plain_3_hidden_layer_5_epoch.pt"))

#data = data.to(device)


Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
sentenceDetection.validation_dataset[10][0]

('In which script is Urdu written?',
 ['Hindi is also spoken by a large population of Madheshis (people having roots in north-India but having migrated to Nepal over hundreds of years) of Nepal.',
  'Apart from this, Hindi is spoken by the large Indian diaspora which hails from, or has its origin from the "Hindi Belt" of India.',
  'A substantially large North Indian diaspora lives in countries like the United States of America, the United Kingdom, the United Arab Emirates, Trinidad and Tobago, Guyana, Suriname, South Africa, Fiji and Mauritius, where it is natively spoken at home and among their own Hindustani-speaking communities.',
  'Outside India, Hindi speakers are 8 million in Nepal; 863,077 in United States of America; 450,170 in Mauritius; 380,000 in Fiji; 250,292 in South Africa; 150,000 in Suriname; 100,000 in Uganda; 45,800 in United Kingdom; 20,000 in New Zealand; 20,000 in Germany; 26,000 in Trinidad and Tobago; 3,000 in Singapore.',
  '==. Comparison with Modern Standard

In [90]:
sentenceDetection.train_model(1)

starting epoch 0
[10,    10] loss: 0.419
[20,    20] loss: 1.540
[30,    30] loss: 0.952
[40,    40] loss: 0.090
[50,    50] loss: 1.859
[60,    60] loss: 0.996
[70,    70] loss: 0.760
[80,    80] loss: 0.356
[90,    90] loss: 0.375
[100,   100] loss: 0.627
[110,   110] loss: 0.600
[120,   120] loss: 0.546
[130,   130] loss: 0.839
[140,   140] loss: 0.950
[150,   150] loss: 0.433
[160,   160] loss: 0.427
[170,   170] loss: 0.421
[180,   180] loss: 0.680
[190,   190] loss: 0.071
[200,   200] loss: 0.409
[210,   210] loss: 0.668
[220,   220] loss: 0.696
[230,   230] loss: 1.671
[240,   240] loss: 0.074
Validation : 325.12775526195765


In [91]:
import numpy as np;

fig = go.Figure()

fig.add_trace(
    go.Scatter(
    x=np.arange(len(sentenceDetection.validation_error)),
    y=sentenceDetection.validation_error,
    name="validation_error"
    )
    )

fig.add_trace(
    go.Scatter(
    x=np.arange(len(sentenceDetection.validation_error)),
    y=sentenceDetection.train_error,
    name="train_error"
    )
    )

fig.update_layout(
    xaxis_title="Epoch",
    yaxis_title="Error",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)
fig.show()

In [60]:
sentenceDetection.eval()
for index, ((question, sentences), (data, label)) in enumerate(sentenceDetection.train_dataset[40:50]):
    train_out = sentenceDetection(data)
    print(question, sentences)
    print(label)
    print(train_out)
    print((train_out[0] > 0.1).nonzero().reshape(1, (train_out[0] > 0.1).nonzero().shape[0]))
    

How many million light-years away is NGC 520, a pair of colliding galaxies, located? ['Star is the closest-known white dwarf to us, 12.35 magnitude.', 'Due to the dimness of these stars, the constellation is essentially invisible in or near any major city due to light pollution.', '=== Deep-sky objects ===', 'M74 is a loosely wound (type Sc) spiral galaxy in Pisces, found at a distance of 30 million light years (redshift 0.0022).', 'It has many clusters of young stars and the associated nebulae, showing extensive regions of star formation.', 'It was discovered by Pierre Méchain, a French astronomer, in 1780.', 'A type II-P supernova was discovered in the outer regions of M74 by Robert Evans in June 2003; the star that underwent the supernova was later identified as a red supergiant with a mass of 8 solar masses.', 'NGC 488 is an isolated face-on prototypical spiral galaxy.', 'NGC 520 is a pair of colliding galaxies located 90 million lightyears away.', 'CL 0024+1654 is a massive galaxy

In [92]:
#sentenceDetection.eval()
for index, ((question, sentences), (data, label)) in enumerate(sentenceDetection.test_dataset):
    test_out = sentenceDetection(data)
    print(question, sentences)
    print(label)
    print(test_out)
    print((test_out[0] > 0.1).nonzero().reshape(1, (test_out[0] > 0.1).nonzero().shape[0]))
    

To which territory was Nagpur annexed in 1853? ['Every inch that we recede will be occupied by them.', 'The British East India Company gained Mumbai in the early 17th century, and became one of their major trading ports.', 'The Company slowly expanded areas under its rule during the 18th century.', 'The British governed western Maharashtra as part of the Bombay Presidency, which spanned an area from Karachi in Pakistan to northern Deccan.', 'A number of the Maratha states persisted as princely states, retaining autonomy in return for acknowledging British suzerainty.', 'The largest princely states in the territory were Nagpur, Satara and Kolhapur; Satara was annexed to the Bombay Presidency in 1848, and Nagpur was annexed in 1853 to become Nagpur Province, later part of the Central Provinces.', "Berar, which had been part of the Nizam of Hyderabad's kingdom, was occupied by the British in 1853 and annexed to the Central Provinces in 1903.", "However, a large region called Marathwada re

In [57]:
PATH = "Model_plain_3_hidden_layer_9_epochs_dict_Try1.pt"
torch.save(sentenceDetection.state_dict(), PATH)


In [43]:
test_data_1 = ("What is the world's most influential language?", ["Given the demographic prospects of the French-speaking nations of Africa, researcher Pascal-Emmanuel Gobry wrote in 2014 that French \"could be the language of the future\".",
"Significant as a judicial language, French is one of the official languages of such major international and regional courts, tribunals, and dispute-settlement bodies as the African Court on Human and Peoples' Rights, the Caribbean Court of Justice, the Court of Justice for the Economic Community of West African States, the Inter-American Court of Human Rights, the International Court of Justice, the International Criminal Tribunal for the former Yugoslavia, International Criminal Tribunal for Rwanda, the International Tribunal for the Law of the Sea the International Criminal Court and the World Trade Organization Appellate Body.",
"It is the sole internal working language of the Court of Justice of the European Union, and makes with English the European Court of Human Rights's two working languages.",
"In 1997, George Weber published, in Language Today, a comprehensive academic study entitled \"The World's 10 most influential languages\".",
"In the article, Weber ranked French as, after English, the second most influential language of the world, ahead of Spanish.",
"His criteria were the numbers of native speakers, the number of secondary speakers (especially high for French among fellow world languages), the number of countries using the language and their respective populations, the economic power of the countries using the language, the number of major areas in which the language is used, and the linguistic prestige associated with the mastery of the language (Weber highlighted that French in particular enjoys considerable linguistic prestige).",
"In a 2008 reassessment of his article, Weber concluded that his findings were still correct since \"the situation among the top ten remains unchanged.",
"\"Knowledge of French is often considered to be a useful skill by business owners in the United Kingdom; a 2014 study found that 50% of British managers considered French to be a valuable asset for their business, thus ranking French as the most sought-after foreign language there, ahead of German (49%) and Spanish (44%).",
"MIT economist Albert Saiz calculated a 2.3% premium for those who have French as a foreign language in the workplace.",
"In English-speaking Canada, the United Kingdom, and the Republic of Ireland, French is the first foreign language taught and in number of pupils is far ahead of other languages."])

In [44]:
test_data_2 = (sentenceDetection.tokenize_sentences(test_data_1[1], test_data_1[0]), torch.tensor([[0,0,0,0,0,0,0,0,1,0]]))
test_data_single = (test_data_1, test_data_2)


In [93]:
sentenceDetection.eval()
t = sentenceDetection(test_data_single[1][0])
print(t)
print((t[0] > 0.1).nonzero().reshape(1, (t[0] > 0.1).nonzero().shape[0]))

tensor([[0.0048, 0.0024, 0.0030, 0.9652, 0.0053, 0.0034, 0.0040, 0.0037, 0.0065,
         0.0019]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[3]], device='cuda:0')
