In [32]:
# Install Hugging Face transformers into the *kernel's* environment.
# `%pip` (unlike `!pip`) is guaranteed to target the interpreter this notebook runs on.
# TODO: pin the version that was used to train the checkpoint (e.g. transformers==X.Y.Z).
%pip install -q transformers



In [33]:
#importing all necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR

from sklearn.preprocessing import LabelEncoder
from itertools import chain # for flatting the list
#from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


from transformers import BertModel
from transformers import BertTokenizer
from transformers import AutoModel, AutoTokenizer

###
###from indic_transliteration import sanscript
###from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate

import re
import unicodedata
import codecs
import random

In [34]:
# For reproducibility: seed every random number generator this notebook draws from.
# Python's global `random` module is used by ATIS.__getitem__ (random.random / random.shuffle),
# so it has to be seeded as well as NumPy and PyTorch.
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [35]:
# Scratch check: draws one float from Python's global RNG (this also advances its state).
random.random()

0.24562119808717897

In [75]:
# Dataset class that turns text queries into fixed-length BERT inputs
class ATIS(Dataset):
  """Map-style dataset yielding ``(token_ids, attention_mask)`` tensors.

  The text is read from the first column of ``df``, normalised, optionally
  word-shuffled, tokenised with the multilingual cased BERT tokenizer, and
  padded/truncated to exactly ``max_token_len`` tokens.

  Args:
    df: DataFrame whose first column holds the query strings.
    translit_prob: stored on the instance; not used by any method of this class.
    shuffle_prob: probability of shuffling the words of a query
      (only applied to queries with fewer than 10 words).
    max_token_len: length of every returned token sequence.
  """

  def __init__(self, df, translit_prob=0, shuffle_prob=0, max_token_len=100):
    super().__init__()

    self.df = df
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    self.max_token_len = max_token_len
    self.translit_prob = translit_prob
    self.shuffle_prob = shuffle_prob

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.df)

  def __getitem__(self, index):
    """Return ``(token_id_tensor, attention_mask_tensor)`` for row ``index``."""
    text = self.query_preprocessing(self.df.iloc[index, 0])

    # One draw from the global RNG per item, whether or not shuffling happens.
    if random.random() < self.shuffle_prob:
      words = text.split()
      if len(words) < 10:
        random.shuffle(words)
        text = " ".join(words)

    pieces = ['[CLS]', *self.tokenizer.tokenize(text), '[SEP]']

    limit = self.max_token_len
    if len(pieces) >= limit:
      # Too long (or exactly full): cut and make sure the sequence still ends with [SEP].
      pieces = pieces[:limit - 1] + ['[SEP]']
    else:
      pieces = pieces + ['[PAD]'] * (limit - len(pieces))

    token_id_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(pieces))
    # Every id other than 0 is attended to, i.e. [PAD] is assumed to map to id 0.
    attention_mask_tensor = (token_id_tensor != 0).long()

    return token_id_tensor, attention_mask_tensor

  def unicodeToAscii(self, s):
    """Strip combining marks (Unicode category ``Mn``) after NFD decomposition."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

  def normalizeString(self, s):
    """Lower-case, strip accents, remove ``.``/``!``/``?`` and collapse whitespace."""
    cleaned = self.unicodeToAscii(s.lower().strip())
    cleaned = re.sub(r"([.!?])", r"", cleaned)
    return re.sub(r"\s+", r" ", cleaned).strip()

  def query_preprocessing(self, query_text):
    """Preprocessing hook applied to every query; currently just normalisation."""
    return self.normalizeString(query_text)

In [76]:
# Input data can come either from a CSV file (path below) or from a list of
# sentences converted into a DataFrame (done in a later cell).
# NOTE(review): absolute Kaggle input path; adjust when running elsewhere.
final_test_file='/kaggle/input/tanglishbertmodel/tamil_test_s.csv'

In [77]:
# header=None: no row is treated as a header, so columns get integer labels 0, 1, ...
df=pd.read_csv(final_test_file,header=None)

In [78]:
# Drop the first CSV column. The file was read with header=None, so that column's label is the integer 0.
# Dropping by label with errors='ignore' keeps this cell idempotent: re-running it is a no-op
# instead of silently removing the next (text) column as positional dropping would.
df = df.drop(columns=[0], errors='ignore')

In [79]:
# Preview only the first rows rather than dumping the whole frame into the notebook output.
df.head(10)

Unnamed: 0,1
0,yarayellam fdfs ppga ippove ready agitinga
1,ennada viswasam mersal sarkar madhri time la l...
2,yuvan vera level ya valuable script sk in action
3,all the best annatelugu makkal selvan fans
4,verithanama iruku nu solravanga like podunga
5,mokka ya tha iruku trailer antha level ku peru...
6,wait and watch ennu neraya neraya neraya erika...
7,tamizha na la mudiyatha thu ethuvume illasanka...
8,padu mokkai ean thalayai kooni kondu nikkuthu ...
9,kannane kanne milion views


In [101]:
# Sentences to classify, wrapped in a single-column DataFrame (column 'Sentences').
sentences = [
    'yarayellam fdfs ppga ippove ready agitinga',
]
df2 = pd.DataFrame(sentences, columns=['Sentences'])

In [102]:
# Display the sentence DataFrame that is fed to the dataset class.
df2

Unnamed: 0,Sentences
0,yarayellam fdfs ppga ippove ready agitinga


In [103]:
# Create an instance of the dataset class over the sentence DataFrame:
# no word shuffling, every sequence padded/truncated to 120 tokens.
final_test_set = ATIS(
    df2,
    translit_prob=0,
    shuffle_prob=0,
    max_token_len=120,
)

In [104]:

# Sanity check: number of samples in the dataset (one per DataFrame row).
print(len(final_test_set))

1


In [106]:
# Inspect the first sample: a (token_id_tensor, attention_mask_tensor) pair.
final_test_set[0]

(tensor([  101, 10549, 23432, 14058, 10147,   174, 10162, 25743, 11309, 10483,
           177, 45565, 10612, 42374, 20735, 24906,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [107]:

# Batch the dataset for inference; order is preserved (shuffle=False) so that
# predictions line up with the input sentences.
test_loader = DataLoader(
    final_test_set,
    batch_size=64,
    shuffle=False,
    num_workers=4,
)

In [108]:
# Peek at the first batch only, to confirm the shapes of the two tensors it carries.
for i, data in enumerate(test_loader):
  token_batch, mask_batch = data[0], data[1]
  print(token_batch.shape)
  print(mask_batch.shape)
  break

torch.Size([1, 120])
torch.Size([1, 120])


In [109]:
class INTENT_CLASSIFIER(nn.Module):
  """Multilingual BERT encoder followed by a three-layer linear head (768 -> 300 -> 8 -> 2).

  Attribute names and layer sizes must stay as they are: the notebook loads a
  pickled model from disk rather than building one from this class.

  Args:
    freeze_bert: when true, gradients are disabled for every BERT parameter,
      leaving only the linear head trainable.
  """

  def __init__(self, freeze_bert=True):
    super().__init__()

    # return_dict=False makes the encoder return a plain tuple.
    self.bert_layers = BertModel.from_pretrained('bert-base-multilingual-cased', return_dict=False)
    self.linear1 = nn.Linear(768, 300)
    self.linear11 = nn.Linear(300, 8)
    self.linear2 = nn.Linear(8, 2)
    self.dropout = nn.Dropout(0.5)

    if freeze_bert:
      for weight in self.bert_layers.parameters():
        weight.requires_grad_(False)

  def forward(self, token_ids, atten_mask):
    """Return the head's output for a batch.

    Both arguments are of shape: batch_size, max_seq_len.
    """
    # The encoder returns a 2-tuple; only its second element feeds the head.
    _first_output, pooled = self.bert_layers(token_ids, attention_mask=atten_mask)

    hidden = self.linear1(pooled)
    hidden = self.dropout(hidden)
    hidden = self.linear11(hidden)
    hidden = self.dropout(hidden)

    return self.linear2(hidden)

In [110]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

cpu


In [111]:
# Load the trained classifier onto the CPU first; it is moved to `device` in the next cell.
# The checkpoint is presumably a fully pickled model object (not just a state_dict), which means:
#   * INTENT_CLASSIFIER must already be defined in this kernel so unpickling can find the class;
#   * weights_only=False is required on PyTorch >= 2.6, where the default became True and
#     rejects arbitrary pickled objects.
# SECURITY: unpickling executes arbitrary code -- only load checkpoints from a trusted source.
model = torch.load(
    '/kaggle/input/tanglishbertmodel/best_model.pth',
    map_location=torch.device('cpu'),
    weights_only=False,
)

In [112]:
# Move the loaded model to the selected device (nn.Module.to returns the module itself).
model = model.to(device)
print(f"model loaded to {device}")

model loaded to cpu


In [113]:
# Getting the test accuracy
def test():
  correct = 0
  total = 0
  model.eval()
  y_test_prediction = []
  y_test_true = []

  with torch.no_grad():
      for data in test_loader:
          tokens, masks= data

          tokens = tokens.to(device)
          masks = masks.to(device)
          

          outputs = model(tokens, masks)
          _, predicted = torch.max(outputs.data, 1)
          

          y_test_prediction += predicted.detach().cpu().numpy().tolist()


  return y_test_prediction

In [114]:
y_test_prediction = test()
# test() returns a list with one predicted class index per sample in test_loader.

In [115]:
# Predicted class index per input sentence (0 or 1; the label meaning is not defined in this notebook).
y_test_prediction

[1]