In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from transformers import AutoModel
from transformers import AutoTokenizer

In [2]:
def get_max_len(model, data_path):
    """Report the longest tokenized training example for a tokenizer/dataset pair.

    Reads ``data_path + "_train.tsv"`` (tab-separated), tokenizes every
    non-missing entry of its ``cleaned_text`` column with the tokenizer loaded
    for ``model``, prints a one-line summary and returns the maximum length.

    Args:
        model: Name or path handed to ``AutoTokenizer.from_pretrained``.
        data_path: Path prefix of the dataset; ``"_train.tsv"`` is appended.

    Returns:
        The largest token count observed, special tokens included. Sequences
        are truncated at 512 tokens, so the result never exceeds 512. Returns
        0 when the file holds no usable text.
    """
    df = pd.read_csv(data_path + "_train.tsv", sep='\t')
    tokenizer = AutoTokenizer.from_pretrained(model)

    # Empty cells are read back as NaN, which the tokenizer cannot encode;
    # skip them instead of failing part-way through the scan.
    texts = df["cleaned_text"].dropna().astype(str)

    # Only the length is needed, so request plain Python lists: no tensors,
    # no attention mask, no padding.
    token_counts = [
        len(
            tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                max_length=512,
                return_token_type_ids=False,
                return_attention_mask=False,
                padding="do_not_pad",
                truncation=True,
            )["input_ids"]
        )
        for text in texts
    ]

    # default=0 keeps an empty dataset from raising ValueError inside max().
    longest = max(token_counts, default=0)

    # Dataset tag = last path component up to its first underscore.
    dataset = data_path.split('/')[-1].split('_')[0]
    print(f"len for {model} --- {dataset} => max = {longest}")
    return longest

In [3]:
# Report the longest tokenized training example for each candidate checkpoint.
# The Malayalam HASOC run is currently disabled.
for checkpoint in ('distilbert-base-cased', 'ai4bharat/indic-bert'):
    get_max_len(checkpoint, './data/tamil_offensive')
    # get_max_len(checkpoint, './data/malayalam_hasoc')

len for distilbert-base-cased --- tamil => max = 187
len for ai4bharat/indic-bert --- tamil => max = 276


In [2]:
# Training split used by the forward-pass check further down.
# NOTE(review): this file name differs from the './data/tamil_offensive' prefix
# passed to get_max_len earlier -- confirm which dataset is intended.
df = pd.read_csv("./data/tamil_codemix_offensive_train.tsv", sep="\t")

# Module-level tokenizer that convert() relies on.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

def convert(text, max_length=512):
    """Tokenize one text into fixed-length PyTorch tensors.

    Uses the module-level ``tokenizer``. Special tokens are added, the
    sequence is padded or truncated to exactly ``max_length`` tokens, and
    token type ids are omitted.

    Args:
        text: The string to encode.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 512, the value that used to be hard-coded.

    Returns:
        The result of ``tokenizer.encode_plus`` for these settings; callers
        in this notebook read its ``'input_ids'`` and ``'attention_mask'``
        entries.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        return_token_type_ids=False,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="pt",  # to get a torch.Tensor
        truncation=True,
    )

In [11]:
class TransformerClassifier(nn.Module):
    """Pretrained transformer encoder followed by a two-layer classification head.

    The encoder's token embeddings are mean-pooled over the non-padding
    positions and passed through Linear -> ReLU -> Dropout -> Linear.
    """

    def __init__(self, model_name, hidden_states=256, dropout=0.5, n_classes=2):
        """Build the encoder and the classification head.

        Args:
            model_name: Name or path handed to ``AutoModel.from_pretrained``.
            hidden_states: Width of the intermediate linear layer.
            dropout: Dropout probability applied after the ReLU.
            n_classes: Number of output scores per example.
        """
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        self.lin0 = nn.Linear(self.bert.config.hidden_size, hidden_states)
        self.lin1 = nn.Linear(hidden_states, n_classes)
        self.reLU = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Score a batch of tokenized texts.

        Args:
            input_ids: Token ids of shape (batch, seq_len).
            attention_mask: Tensor of the same shape; non-zero marks real
                tokens and zero marks padding.

        Returns:
            Tensor of shape (batch, n_classes) holding the unnormalized
            class scores.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden = encoded.last_hidden_state

        # Padded positions still carry non-zero hidden states. They must be
        # zeroed before summing; otherwise the "mean" adds padding vectors to
        # the numerator while dividing by the real-token count only.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)

        # clamp avoids a division by zero for a row that is entirely padding.
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = summed / token_counts

        y = self.lin0(pooled)
        y = self.reLU(y)
        y = self.dropout(y)
        return self.lin1(y)


# DistilBERT encoder with a 768-wide head layer and 30% dropout.
model = TransformerClassifier(
    model_name='distilbert-base-cased',
    hidden_states=768,
    dropout=0.3,
)

In [12]:
# Encode the first two training texts and run them through the model as one
# batch of two.
data1, data2 = (convert(df.cleaned_text[row]) for row in (0, 1))

# Stack the per-example tensors along the batch dimension.
in_ = torch.cat((data1['input_ids'], data2['input_ids']), dim=0)
out_ = torch.cat((data1['attention_mask'], data2['attention_mask']), dim=0)

outputs = model(input_ids=in_, attention_mask=out_)

In [13]:
# Show the classifier scores: one row per input text, one column per class.
outputs

tensor([[ 1.5818, -0.7787],
        [ 0.4718,  0.1129]], grad_fn=<AddmmBackward>)