In [1]:
#importing all necessary libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import MultiStepLR

from sklearn.preprocessing import LabelEncoder
from itertools import chain # for flatting the list
#from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


from transformers import BertModel
from transformers import BertTokenizer
from transformers import AutoModel, AutoTokenizer

###
###from indic_transliteration import sanscript
###from indic_transliteration.sanscript import SchemeMap, SCHEMES, transliterate

import re
import unicodedata
import codecs
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reproducibility: seed every random number generator this notebook draws from.
SEED = 0
random.seed(SEED)        # Python's `random` drives the shuffle branch in ATIS.__getitem__
np.random.seed(SEED)
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels and turn off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
random.random()

0.8711168786507556

In [5]:
# Dataset class that turns rows of raw text into fixed-length BERT inputs.
# (It keeps the historical name "ATIS"; it reads the first column of any DataFrame.)
class ATIS(Dataset):
  """Map each row of a text DataFrame to ``(token_ids, attention_mask)`` tensors.

  The text is read from the first column of ``df``, normalised (lower-cased,
  combining marks removed, ``.``/``!``/``?`` removed, whitespace collapsed),
  tokenised, wrapped in ``[CLS]`` / ``[SEP]`` and padded or truncated to
  exactly ``max_token_len`` tokens.

  Args:
    df: DataFrame whose first column holds the text.
    translit_prob: stored on the instance; not used anywhere in this class.
    shuffle_prob: probability of shuffling the words of a query that has
      fewer than 10 words.
    max_token_len: length of every returned tensor.
    tokenizer: optional object exposing ``tokenize`` and
      ``convert_tokens_to_ids``. When omitted, one multilingual BERT
      tokenizer is loaded lazily and shared by all instances, so building
      many small datasets does not reload the vocabulary each time.
  """

  TOKENIZER_NAME = 'bert-base-multilingual-cased'
  _shared_tokenizer = None  # filled on first use, then reused by every instance

  def __init__(self, df, translit_prob=0, shuffle_prob=0, max_token_len=100, tokenizer=None):
    super().__init__()

    if tokenizer is None:
      if ATIS._shared_tokenizer is None:
        ATIS._shared_tokenizer = BertTokenizer.from_pretrained(ATIS.TOKENIZER_NAME)
      tokenizer = ATIS._shared_tokenizer

    self.df = df
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len
    self.translit_prob = translit_prob
    self.shuffle_prob = shuffle_prob

  def __len__(self):
    """Number of rows in the wrapped DataFrame."""
    return len(self.df)

  def __getitem__(self, index):
    """Return ``(token_ids, attention_mask)`` as two int64 tensors for row ``index``."""
    query = self.query_preprocessing(self.df.iloc[index, 0])

    # Optional augmentation: shuffle the word order of short queries.
    # random.random() is drawn on every call, even when shuffle_prob is 0.
    if random.random() < self.shuffle_prob:
      words = query.split()
      if len(words) < 10:
        random.shuffle(words)
        query = " ".join(words)

    tokens = ['[CLS]'] + self.tokenizer.tokenize(query) + ['[SEP]']
    if len(tokens) > self.max_token_len:
      # Truncate but keep a closing [SEP] as the last token.
      tokens = tokens[:self.max_token_len - 1] + ['[SEP]']

    real_len = len(tokens)
    pad_len = self.max_token_len - real_len
    tokens = tokens + ['[PAD]'] * pad_len

    token_id_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))
    # Build the mask from the number of real tokens instead of comparing ids
    # with 0, so it does not depend on which id the tokenizer gives to [PAD].
    attention_mask_tensor = torch.tensor([1] * real_len + [0] * pad_len, dtype=torch.long)

    return token_id_tensor, attention_mask_tensor

  def unicodeToAscii(self, s):
    """Drop combining marks (Unicode category ``Mn``) after NFD decomposition.

    Despite the name, other non-ASCII characters are kept unchanged.
    """
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

  def normalizeString(self, s):
    """Lower-case and trim ``s``, strip accents, remove ``. ! ?`` and collapse whitespace."""
    s = self.unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r"", s)
    s = re.sub(r"\s+", r" ", s).strip()
    return s

  def query_preprocessing(self, query_text):
    """Normalise one raw cell value into the string handed to the tokenizer.

    Missing values (``None`` or a float NaN, e.g. an empty CSV field) become
    the empty string instead of raising ``AttributeError``; any other
    non-string value is converted with ``str`` first.
    """
    is_nan = isinstance(query_text, float) and query_text != query_text
    if query_text is None or is_nan:
      return ""
    return self.normalizeString(str(query_text))

In [59]:
#either read data from a csv file or convert a list of sentences into a dataframe
final_test_file='tamil_dev.csv'

In [60]:
df=pd.read_csv(final_test_file,header=None)

In [61]:
# Drop the CSV's leading index column (it gets label 0 because the file is read
# with header=None). Dropping by label with errors="ignore" keeps this cell
# idempotent: re-running it can no longer remove the text column, which a
# positional `df.columns[0]` drop would do on the second run.
df = df.drop(columns=[0], errors="ignore")

In [62]:
df

Unnamed: 0,1
0,daily likes views pakka vanthavaga ellarukum ...
1,k dislikes ethuku da intha trailerku poi apdi...
2,it looks like hindi movie amitab bachan
3,thalaivarukku nejamavey vayasaagiduchu sivaji ...
4,thala nu sollu thala nemirinthu nillu
...,...
1017,earphone la you bgm kekum pothu vera level
1018,sappypathy vijayamma bigilu ummpi get more dis...
1019,stylea erukana ha ha naturally
1020,style la irukana hahaha mass dialogue


In [63]:
# Create an instance of the dataset class

final_test_set = ATIS(df,max_token_len=120,translit_prob=0,shuffle_prob=0)

In [64]:

print(len(final_test_set))

1022


In [65]:
final_test_set[0]

(tensor([  101, 27636, 11850, 10107, 33396, 17284, 10371, 23266, 79313, 19357,
         15121, 40310, 10465, 10145, 10710, 46750,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [66]:

# Feed the model fixed-size mini-batches instead of one batch holding the whole
# dataset, so peak memory no longer grows with the number of rows.
# shuffle=False keeps the predictions aligned with the DataFrame row order.
test_loader = DataLoader(final_test_set, batch_size=64, shuffle=False)

In [5]:
class INTENT_CLASSIFIER(nn.Module):
  """Multilingual BERT encoder followed by a small feed-forward head.

  Head layout: 768 -> 300 -> 8 -> 2, with dropout (p=0.5) applied after each
  of the first two linear layers. The final layer emits 2 logits per example.
  """

  def __init__(self, freeze_bert=True):
    """Create the encoder and the head.

    Args:
      freeze_bert: when true, ``requires_grad`` is switched off for every
        encoder parameter, leaving only the linear head trainable.
    """
    super().__init__()

    self.bert_layers = BertModel.from_pretrained(
        'bert-base-multilingual-cased',
        return_dict=False,
    )
    self.linear1 = nn.Linear(768, 300)
    self.linear11 = nn.Linear(300, 8)
    self.linear2 = nn.Linear(8, 2)
    self.dropout = nn.Dropout(0.5)

    if not freeze_bert:
      return
    for encoder_param in self.bert_layers.parameters():
      encoder_param.requires_grad = False

  def forward(self, token_ids, atten_mask):
    """Compute logits; both arguments have shape (batch_size, max_seq_len).

    The encoder is built with ``return_dict=False`` and is unpacked as a
    2-tuple; only the second element feeds the head (per the Hugging Face
    docs this is the pooled output — confirm for the installed version).
    """
    _sequence_out, pooled = self.bert_layers(token_ids, attention_mask=atten_mask)

    hidden = self.linear1(pooled)
    hidden = self.dropout(hidden)
    hidden = self.linear11(hidden)
    hidden = self.dropout(hidden)

    return self.linear2(hidden)

In [6]:
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
# The loaded object is used directly as a module further down (model.to,
# model.eval, model(tokens, masks)), so the file holds a whole pickled module
# rather than a state_dict. Presumably the INTENT_CLASSIFIER class must already
# be defined in the kernel for unpickling to succeed — TODO confirm.
# map_location places the stored tensors on the CPU.
model = torch.load('best_model.pth',map_location=torch.device('cpu'))

In [9]:
torch.save(model, "bert_model.pt")

In [69]:
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

cpu


In [70]:
model.to(device)
print(f"model loaded to {device}")

model loaded to cpu


In [71]:
# Getting the test accuracy
def test():
  correct = 0
  total = 0
  model.eval()
  y_test_prediction = []
  y_test_true = []

  with torch.no_grad():
      for data in test_loader:
          tokens, masks= data
          
          tokens = tokens.to(device)
          masks = masks.to(device)
          

          outputs = model(tokens, masks)
          
          _, predicted = torch.max(outputs.data, 1)
          
          y_test_prediction += predicted.detach().cpu().numpy().tolist()
          


  return y_test_prediction

In [72]:
y_test_prediction = test()
#y_train_true, y_train_prediction= train_acc()

In [75]:
np.unique(y_test_prediction, return_counts=True)

(array([0, 1]), array([ 53, 969]))

In [93]:
df.iloc[29]

1    enna da trailer kannapundai madiri iruku
Name: 29, dtype: object

In [84]:
np.argwhere(np.array(y_test_prediction) == 0)

array([[  2],
       [ 29],
       [ 30],
       [ 68],
       [ 72],
       [ 79],
       [ 96],
       [101],
       [108],
       [116],
       [128],
       [135],
       [140],
       [151],
       [218],
       [221],
       [228],
       [230],
       [235],
       [249],
       [283],
       [307],
       [356],
       [359],
       [403],
       [423],
       [457],
       [465],
       [469],
       [477],
       [502],
       [517],
       [552],
       [572],
       [593],
       [610],
       [633],
       [634],
       [679],
       [681],
       [682],
       [737],
       [746],
       [760],
       [772],
       [831],
       [853],
       [888],
       [894],
       [927],
       [961],
       [971],
       [973]])

In [89]:
df

Unnamed: 0,1
0,daily likes views pakka vanthavaga ellarukum ...
1,k dislikes ethuku da intha trailerku poi apdi...
2,it looks like hindi movie amitab bachan
3,thalaivarukku nejamavey vayasaagiduchu sivaji ...
4,thala nu sollu thala nemirinthu nillu
...,...
1017,earphone la you bgm kekum pothu vera level
1018,sappypathy vijayamma bigilu ummpi get more dis...
1019,stylea erukana ha ha naturally
1020,style la irukana hahaha mass dialogue


In [43]:
df2 = pd.DataFrame(df.loc[0, :])

In [44]:
df2

Unnamed: 0,0
1,yarayellam fdfs ppga ippove ready agitinga


In [45]:
type(df2)

pandas.core.frame.DataFrame

In [46]:
# Create an instance of the dataset class

final_test_set2 = ATIS(df2,max_token_len=120,translit_prob=0,shuffle_prob=0)

In [47]:

test_loader2 =DataLoader(final_test_set2, batch_size = len(final_test_set2), shuffle=False)

In [48]:
def test():
  correct = 0
  total = 0
  model.eval()
  y_test_prediction = []
  y_test_true = []

  with torch.no_grad():
      for data in test_loader2:
          tokens, masks= data
          
          tokens = tokens.to(device)
          masks = masks.to(device)
          

          outputs = model(tokens, masks)
          
          _, predicted = torch.max(outputs.data, 1)
          
          y_test_prediction += predicted.detach().cpu().numpy().tolist()
          


  return y_test_prediction

In [97]:
def sa_bert(sentences):
    """Classify one sentence, or several, with the notebook-level model.

    Args:
        sentences: a single string, or an iterable (list, tuple, Series ...)
            of strings.

    Returns:
        list[int]: predicted class index per input sentence, in input order.
        A single string yields a one-element list; empty input yields ``[]``.

    Relies on the notebook globals ``ATIS``, ``model`` and ``device``.
    """
    # A bare string is one sentence; anything else is treated as a collection.
    if isinstance(sentences, str):
        texts = [sentences]
    else:
        texts = list(sentences)
    if not texts:
        return []  # DataLoader does not accept batch_size=0

    sent_df = pd.DataFrame({"Sentences": texts})
    # creating instance of dataset class
    test_set = ATIS(sent_df, max_token_len=120, translit_prob=0, shuffle_prob=0)
    # Cap the batch size so a long list is not pushed through the model at once.
    t_loader = DataLoader(test_set, batch_size=min(len(test_set), 64), shuffle=False)

    model.eval()
    y_test_prediction = []
    with torch.no_grad():
        for tokens, masks in t_loader:
            outputs = model(tokens.to(device), masks.to(device))
            _, predicted = torch.max(outputs, dim=1)
            y_test_prediction.extend(predicted.tolist())
    return y_test_prediction

In [100]:
sa_bert("enna da ithu hindi padam")

[0]