In [13]:
import glob
import os

import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoTokenizer


In [3]:
# Class index -> label name for the six labels used in this notebook.
# List position is the class index.
# NOTE(review): 'Offensive_Untargetede' is reproduced exactly as written; it
# presumably mirrors the spelling in the raw data -- confirm before changing it.
INT_TO_LABEL_MAPPING = dict(enumerate([
    'Not_offensive',
    'Offensive_Untargetede',
    'Offensive_Targeted_Insult_Individual',
    'Offensive_Targeted_Insult_Group',
    'not-Tamil',
    'Offensive_Targeted_Insult_Other',
]))

In [4]:
# Reverse lookup: label name -> class index.
LABEL_TO_INT_MAPPING = {name: idx for idx, name in INT_TO_LABEL_MAPPING.items()}

In [5]:
# Display the inverted mapping to check it.
LABEL_TO_INT_MAPPING

{'Not_offensive': 0,
 'Offensive_Untargetede': 1,
 'Offensive_Targeted_Insult_Individual': 2,
 'Offensive_Targeted_Insult_Group': 3,
 'not-Tamil': 4,
 'Offensive_Targeted_Insult_Other': 5}

In [6]:
# Root directory holding the per-language data folders. Defaults to the
# original location; set the EACL_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DATA_DIR = os.environ.get('EACL_DATA_DIR', '/home/temp/data/eacl')
# Language sub-folder under DATA_DIR; every path below is built from both.
LANG = 'ta'

In [None]:
# Load the training CSV from DATA_DIR/LANG.
train = pd.read_csv(f"{DATA_DIR}/{LANG}/tamil_offensive_full_train_transliterated.csv")

In [None]:
# Encode string labels as class indices. The encoded frame is written back over
# the same CSV it was read from, so on a re-run the labels may already be
# integers; skip in that case instead of failing with a KeyError on an int key.
# Unknown string labels still raise KeyError, as before.
if not pd.api.types.is_integer_dtype(train['label']):
    train['label'] = train['label'].apply(lambda x: LABEL_TO_INT_MAPPING[x])

In [36]:
# NOTE(review): this writes to the same path the training frame was read from,
# replacing the input file with the label-encoded version.
train.to_csv(f"{DATA_DIR}/{LANG}/tamil_offensive_full_train_transliterated.csv", index=False)

In [20]:
# Load the validation (dev) CSV from DATA_DIR/LANG.
valid = pd.read_csv(f"{DATA_DIR}/{LANG}/tamil_offensive_full_dev_transliterated.csv")

In [21]:
# Display the validation frame.
valid

Unnamed: 0,sentence,label
0,Handsome hunk keri vaa thalaivaa,0
1,thenkaachi maavattam naataar chamuthaayam chaa...,0
2,je vous aime bravo pour clip de merde que j √©c...,4
3,chirappu melum ithu poonra pataippukal mika av...,0
4,Vera level BGM ..semma trailer. ü§û,0
...,...,...
4383,mishkin -chinimaavin chaliththu poona vattath...,0
4384,Sivaji -Bhajii Sapdu Petta -Sweet Sapdu Seri...,0
4385,8k dislike sure all vijay fans,5
4386,Lady super star Manju warrier Fans Hit like,0


In [39]:
# Encode string labels as class indices. The encoded frame is written back over
# the same CSV it was read from, so on a re-run the labels may already be
# integers; skip in that case instead of failing with a KeyError on an int key.
# Unknown string labels still raise KeyError, as before.
if not pd.api.types.is_integer_dtype(valid['label']):
    valid['label'] = valid['label'].apply(lambda x: LABEL_TO_INT_MAPPING[x])

In [41]:
# NOTE(review): this writes to the same path the validation frame was read
# from, replacing the input file with the label-encoded version.
valid.to_csv(f"{DATA_DIR}/{LANG}/tamil_offensive_full_dev_transliterated.csv", index=False)

In [8]:
# Local directory passed to from_pretrained for both the tokenizer and the model.
model_checkpoint = f"{DATA_DIR}/{LANG}/datasets/robert_ta_classification"

In [9]:
# Load the tokenizer from the checkpoint directory, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [24]:
len(tokenizer("‡ÆÖ‡ÆÆ‡ØÜ‡Æ∞‡Æø‡Æï‡Øç‡Æï‡Ææ ‡Æá‡Æµ‡Øç‡Æµ‡Æ≥‡Æµ‡ØÅ ‡ÆÆ‡ØÅ‡ÆÆ‡Øç‡ÆÆ‡ØÅ‡Æ∞‡ÆÆ‡Ææ‡Æï ‡Æµ‡Æ∞‡Øç‡Æ§‡Øç‡Æ§‡Æï ‡Æ™‡Øã‡Æ∞‡Æø‡Æ≤‡Øç ‡Æà‡Æü‡ØÅ‡Æ™‡Æü‡Øç‡Æü‡ØÅ ‡Æö‡Æü‡Øç‡Æü‡ØÜ‡Æ©‡Øç‡Æ±‡ØÅ ‡Æ™‡Æø‡Æ©‡Øç‡Æµ‡Ææ‡Æô‡Øç‡Æï‡Æø‡ÆØ ‡Æ™‡Øã‡Æµ‡Æ§‡Øá ‡Æö‡Æ®‡Øç‡Æ§‡Øá‡Æï‡ÆÆ‡Øç ‡Æé‡Æ¥‡ØÅ‡Æ®‡Øç‡Æ§‡Æ§‡ØÅ ‡ÆÖ‡ÆÆ‡ØÜ‡Æ∞‡Æø‡Æï‡Øç‡Æï‡Ææ‡Æï‡Ææ‡Æ∞‡Æ©‡Øç ‡Æµ‡Æ∞‡Øç‡Æ§‡Øç‡Æ§‡Æï ‡Æ™‡Øã‡Æ∞‡Æø‡Æ©‡Ææ‡Æ≤‡Øç ‡Æö‡ØÜ‡ÆØ‡Øç‡ÆØ ‡ÆÆ‡ØÅ‡Æü‡Æø‡ÆØ‡Ææ‡Æ§ ‡Æµ‡Æø‡Æ∑‡ÆØ‡Æ§‡Øç‡Æ§‡Øà ‡Æö‡ÆÆ‡Ææ‡Æ§‡Ææ‡Æ©‡ÆÆ‡Øç ‡Æö‡ØÜ‡ÆØ‡Øç‡Æ§‡ØÅ‡Æµ‡Æø‡Æü‡Øç‡Æü‡ØÅ ‡Æ§‡Æô‡Øç‡Æï‡Æ≥‡Øç ‡ÆÆ‡ØÄ‡Æ§‡ØÅ ‡Æö‡Æ®‡Øç‡Æ§‡Øá‡Æï‡ÆÆ‡Øç ‡Æµ‡Æ∞‡Ææ‡ÆÆ‡Æ≤‡Øç bio war ‡ÆÆ‡ØÇ‡Æ≤‡ÆÆ‡Øç ‡Æá‡Æ§‡Øà ‡Æö‡ØÜ‡ÆØ‡Øç‡Æ§‡Æø‡Æ∞‡ØÅ‡Æï‡Øç‡Æï‡Æï‡Øç‡Æï‡ØÇ‡Æü‡ØÅ‡ÆÆ‡Øç ‡ÆÖ‡Æ§‡Æ©‡Ææ‡Æ≤‡Øç‡Æ§‡Ææ‡Æ©‡Øç ‡Æé‡Æ©‡Øç ‡ÆÖ‡Æ™‡Øç‡Æ™‡Æ©‡Øç ‡Æï‡ØÅ‡Æ§‡Æø‡Æ∞‡ØÅ‡Æï‡Øç‡Æï‡ØÅ‡Æ≥‡Øç ‡Æá‡Æ≤‡Øç‡Æ≤‡Øà ‡Æé‡Æ©‡Øç‡Æ™‡Æ§‡ØÅ ‡Æ™‡Øã‡Æ≤ ‡Æá‡Æ∏‡Øç‡Æ∞‡Øá‡Æ≤‡Øç ‡Æ§‡Ææ‡Æ©‡Ææ‡Æï ‡ÆÆ‡ØÅ‡Æ©‡Øç‡Æµ‡Æ®‡Øç‡Æ§‡ØÅ ‡Æá‡Æ§‡Æ±‡Øç‡Æï‡ØÅ ‡Æí‡Æ∞‡ØÅ ‡Æï‡Ææ‡Æ∞‡Æ£‡ÆÆ‡Øç ‡Æï‡Æ±‡Øç‡Æ™‡Æø‡Æï‡Øç‡Æï‡Æø‡Æ±‡Æ§‡ØÅ ‡Æá‡Æ®‡Øç‡Æ§ ‡Æµ‡Øà‡Æ∞‡Æ∏‡Øç ‡Æ™‡Ææ‡ÆÆ‡Øç‡Æ™‡ØÅ ‡Æ™‡Æ≥‡Øç‡Æ≥‡Æø ‡Æá‡Æ§‡ØÅ ‡Æ™‡Øã‡Æ©‡Øç‡Æ± ‡Æâ‡ÆØ‡Æø‡Æ∞‡Æø‡Æ©‡Æô‡Øç‡Æï‡Æ≥‡Øà ‡Æâ‡Æ£‡Øç‡Æ™‡Æµ‡Æ∞‡Øç‡Æï‡Æ≥‡ØÅ‡Æï‡Øç‡Æï‡ØÅ ‡ÆÖ‡Æ§‡Æø‡Æï‡ÆÆ‡Ææ‡Æ© ‡Æ™‡Ææ‡Æ§‡Æø‡Æ™‡Øç‡Æ™‡ØÅ‡Æï‡Æ≥‡Øà ‡Æè‡Æ±‡Øç‡Æ™‡Æü‡ØÅ‡Æ§‡Øç‡Æ§ ‡Æï‡ØÇ‡Æü‡Æø‡ÆØ ‡Æµ‡Æï‡Øà‡ÆØ‡Æø‡Æ≤‡Øç ‡Æâ‡Æ∞‡ØÅ‡Æµ‡Ææ‡Æï‡Øç‡Æï‡Æø ‡Æá‡Æ∞‡ØÅ‡Æï‡Øç‡Æï‡Æï‡Øç‡Æï‡ØÇ‡Æü‡ØÅ‡ÆÆ‡Øç ‡Æá‡Æ©‡Øç‡Æ±‡Øà‡ÆØ ‡Æö‡ØÇ‡Æ¥‡Øç‡Æ®‡Æø‡Æ≤‡Øà‡Æï‡Øç‡Æï‡ØÅ ‡Æé‡Æ®‡Øç‡Æ§ ‡Æµ‡Øà‡Æ∞‡Æ∏‡ØÅ‡ÆÆ‡Øç ‡Æ§‡Ææ‡Æ©‡Ææ‡Æï ‡Æâ‡Æ∞‡ØÅ‡Æµ‡Ææ‡Æµ‡Æ§‡Æø‡Æ≤‡Øç‡Æ≤‡Øà ‡ÆÖ‡Æ©‡Øà‡Æ§‡Øç‡Æ§‡ØÅ‡ÆÆ‡Øç ‡Æâ‡Æ∞‡ØÅ‡Æµ‡Ææ‡Æï‡Øç‡Æï‡Æ™‡Øç‡Æ™‡Æü‡ØÅ‡Æ™‡Æµ‡Øà ‡Æè‡Æ¥‡Ææ‡ÆÆ‡Øç ‡ÆÖ‡Æ±‡Æø‡Æµ‡ØÅ ‡Æ™‡Æü‡Æ§‡Øç‡Æ§‡Æø‡Æ©‡Øç ‡Æï‡Æ§‡Øà‡ÆØ‡Øà ‡ÆÖ‡ÆÆ‡ØÜ‡Æ∞‡Æø‡Æï‡Øç‡Æï‡Ææ ‡Æö‡ØÄ‡Æ©‡Ææ‡Æµ‡Æø‡Æ≤‡Øç ‡Æ®‡Æü‡Øà‡ÆÆ‡ØÅ‡Æ±‡Øà ‡Æ™‡Æü‡ØÅ‡Æ§‡Øç‡Æ§‡Æø ‡Æµ‡Æø‡Æü‡Øç‡Æü‡Æ§‡ØÅ ‡Æ™‡Øã‡Æ≤‡ØÅ‡ÆÆ‡Øç ‡Æï‡Æ£‡Øç‡Æü‡Æø‡Æ™‡Øç‡Æ™‡Ææ‡Æï ‡Æá‡Æ§‡Æ±‡Øç‡Æï‡Ææ‡Æ© ‡ÆÆ‡Æ∞‡ØÅ‡Æ®‡Øç‡Æ§‡ØÅ ‡ÆÖ‡ÆÆ‡ØÜ‡Æ∞‡Æø‡Æï‡Øç‡Æï‡Ææ‡Æµ‡Æø‡Æ©‡Øç ‡Æ®‡Æü‡Øç‡Æ™‡ØÅ ‡Æ®‡Ææ‡Æü‡Ææ‡Æ© ‡Æè‡Æ§‡Øã ‡Æí‡Æ∞‡ØÅ ‡Æ®‡Ææ‡Æü‡Øç‡Æü‡Æø‡Æ©‡Øç ‡ÆÆ‡ØÇ‡Æ≤‡ÆÆ‡Øá ‡Æµ‡ØÜ‡Æ≥‡Æø‡Æµ‡Æ∞‡ØÅ‡ÆÆ‡Øç 
‡Æí‡Æ∞‡Øá ‡Æï‡Æ≤‡Øç‡Æ≤‡Æø‡Æ≤‡Øç ‡Æá‡Æ∞‡Æ£‡Øç‡Æü‡ØÅ ‡ÆÆ‡Ææ‡Æô‡Øç‡Æï‡Ææ‡ÆØ‡Øç ‡Æé‡Æ©‡Øç‡Æ™‡Æ§‡ØÅ ‡Æ™‡Øã‡Æ≤ ‡Æá‡Æ∏‡Øç‡Æ∞‡Øá‡Æ≤‡Øç ‡Æö‡ØÄ‡Æ©‡Ææ‡Æµ‡Æø‡Æ©‡Øç ‡Æ™‡ÆØ‡Ææ‡Æµ‡Ææ‡Æ∞‡Øç ‡ÆÜ‡Æ∞‡Ææ‡ÆØ‡Øç‡Æö‡Øç‡Æö‡Æø‡ÆØ‡Æø‡Æ©‡Øç ‡ÆÆ‡ØÄ‡Æ§‡ØÅ ‡Æ™‡Æ¥‡Æø ‡Æö‡ØÅ‡ÆÆ‡Æ§‡Øç‡Æ§‡Æ™‡Øç‡Æ™‡Æü‡Øç‡Æü‡ØÅ‡Æ≥‡Øç‡Æ≥‡Æ§‡ØÅ")['attention_mask'])


Token indices sequence length is longer than the specified maximum sequence length for this model (2267 > 512). Running this sequence through the model will result in indexing errors


2267

In [10]:
# Load the classifier. The number of output classes is derived from the label
# mapping (currently 6 entries) so the two cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=len(INT_TO_LABEL_MAPPING)
)

In [11]:
# Put the model in evaluation mode; the cell output is the returned module's repr.
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(10000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [27]:
def get_robert_preds(model, sentences, tok=None, max_length=512):
    """Classify each sentence and return predicted labels plus class probabilities.

    Sentences are processed one at a time (batch size 1).

    Args:
        model: Sequence-classification model. It is called as
            ``model(**inputs)`` and must return a mapping with a ``'logits'``
            entry of shape ``(1, num_classes)`` with at least six classes.
        sentences: Iterable of input strings.
        tok: Optional tokenizer callable. Defaults to the notebook-level
            ``tokenizer`` so existing two-argument calls behave as before.
        max_length: Truncation length passed to the tokenizer. Defaults to
            512, the value that was previously hard-coded.

    Returns:
        A 7-tuple ``(labels, prob0, prob1, prob2, prob3, prob4, prob5)`` of
        equal-length lists: the argmax class index per sentence, followed by
        one list of probabilities per class.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer defined in an earlier cell

    model.eval()
    num_classes = 6  # fixed by the 7-tuple return contract
    labels = []
    class_probs = [[] for _ in range(num_classes)]

    # Inference only: without no_grad every forward pass records an autograd
    # graph, which wastes memory and time.
    with torch.no_grad():
        for sentence in sentences:
            inputs = tok(sentence, return_tensors="pt", max_length=max_length, truncation=True)
            outputs = model(**inputs)
            # Explicit dim: calling softmax without it is deprecated and
            # produced the warning visible in the cell outputs.
            prob = F.softmax(outputs['logits'], dim=-1)

            labels.append(prob.argmax().item())

            prob_arr = prob.detach().cpu().numpy()[0]
            for class_idx, column in enumerate(class_probs):
                column.append(prob_arr[class_idx])

    return (labels, *class_probs)

### Getting Predictions on validation

In [28]:
# Run the classifier over every validation sentence and store the predicted
# label and the six per-class probabilities as new columns.
valid['pred_label'], valid['prob0'], valid['prob1'], valid['prob2'], valid['prob3'], valid['prob4'], valid['prob5'] = get_robert_preds(model, valid['sentence'].values)


  prob = F.softmax(outputs['logits'])


In [30]:
# Count of each value in the validation 'label' column.
valid['label'].value_counts()

0    3193
1     356
2     307
3     295
4     172
5      65
Name: label, dtype: int64

In [29]:
# Display the index -> label-name mapping for reference when reading the counts.
INT_TO_LABEL_MAPPING

{0: 'Not_offensive',
 1: 'Offensive_Untargetede',
 2: 'Offensive_Targeted_Insult_Individual',
 3: 'Offensive_Targeted_Insult_Group',
 4: 'not-Tamil',
 5: 'Offensive_Targeted_Insult_Other'}

In [37]:
# Per-class count of misclassified validation rows, next to the per-class totals.
valid.loc[valid['label'].ne(valid['pred_label']), 'label'].value_counts(), valid['label'].value_counts()


(0    273
 3    221
 1    211
 2    210
 5     65
 4     37
 Name: label, dtype: int64,
 0    3193
 1     356
 2     307
 3     295
 4     172
 5      65
 Name: label, dtype: int64)

In [36]:
# Validation rows whose predicted label differs from the true label.
valid.loc[valid['label'].ne(valid['pred_label'])]

Unnamed: 0,sentence,label,pred_label,prob0,prob1,prob2,prob3,prob4,prob5
17,dei YENDA ungalukku inthe illatha Vella Surya ...,2,1,0.178183,0.572586,0.124952,0.092578,0.001544,0.030158
21,thala innum ipdi full white evvalavu naal nadi...,2,0,0.988744,0.006968,0.001753,0.000818,0.000555,0.001163
36,Comment la en da picha edukuringa... pichakara...,3,1,0.048038,0.588367,0.020289,0.266699,0.001578,0.075030
41,Verithanan semma mass trailer.... theatre la k...,0,4,0.336287,0.016323,0.054608,0.009774,0.576574,0.006434
44,Enna Style u.. Enna Screen Presence u.. Pesama...,2,0,0.843837,0.057799,0.060578,0.027245,0.001842,0.008698
...,...,...,...,...,...,...,...,...,...
4361,hey hi all Tamil idiot people ungala Mari peop...,1,3,0.094424,0.057748,0.197697,0.596027,0.021999,0.032105
4369,Ivarukkku eppodhum thalaivar kalaigner lightaa...,2,4,0.076376,0.056359,0.059041,0.014705,0.782467,0.011052
4372,Trailer Nala irukanu oru than comment pandranu...,1,0,0.744816,0.149109,0.042122,0.025749,0.023841,0.014364
4378,Wigpathy Visay na Padam Flop than ithula Kabal...,2,0,0.332910,0.227047,0.189520,0.215943,0.000502,0.034079


In [38]:
# Save the validation frame, including prediction columns, to a results CSV.
valid.to_csv(f"{DATA_DIR}/{LANG}/tamil_valid_results_robert.csv", index=False)

In [48]:
# Display the validation frame with the prediction columns attached.
valid

Unnamed: 0,sentence,label,pred_label,prob0,prob1,prob2,prob3,prob4,prob5
0,Handsome hunk keri vaa thalaivaa,0,0,0.880710,0.005286,0.006391,0.005176,0.090608,0.011829
1,thenkaachi maavattam naataar chamuthaayam chaa...,0,0,0.996509,0.001597,0.000421,0.000549,0.000660,0.000263
2,je vous aime bravo pour clip de merde que j √©c...,4,5,0.018960,0.004502,0.011228,0.002337,0.005905,0.957067
3,chirappu melum ithu poonra pataippukal mika av...,0,0,0.992613,0.002417,0.001253,0.000846,0.002630,0.000242
4,Vera level BGM ..semma trailer. ü§û,0,0,0.996036,0.000653,0.000412,0.000499,0.001740,0.000660
...,...,...,...,...,...,...,...,...,...
4383,mishkin -chinimaavin chaliththu poona vattath...,0,0,0.444480,0.101870,0.301202,0.029649,0.120794,0.002004
4384,Sivaji -Bhajii Sapdu Petta -Sweet Sapdu Seri...,0,0,0.891564,0.015096,0.074603,0.003599,0.012186,0.002953
4385,8k dislike sure all vijay fans,5,4,0.319985,0.192327,0.096509,0.033262,0.355819,0.002098
4386,Lady super star Manju warrier Fans Hit like,0,0,0.996193,0.000986,0.000585,0.000454,0.000566,0.001215


In [32]:
# Fraction of validation rows where the predicted label equals the true label.
accuracy_score(y_true=valid['label'], y_pred=valid['pred_label'])

0.7682315405651777

In [33]:
# Weighted-average precision, recall and F-score on the validation split
# (the support slot of the result is None when an average is requested).
precision_recall_fscore_support(
    y_true=valid['label'],
    y_pred=valid['pred_label'],
    average='weighted',
)

  _warn_prf(average, modifier, msg_start, len(result))


(0.7409166349443926, 0.7682315405651777, 0.7521580831394454, None)

In [39]:
# Label distribution of the validation split followed by the training split.
tuple(frame['label'].value_counts() for frame in (valid, train))

(0    3193
 1     356
 2     307
 3     295
 4     172
 5      65
 Name: label, dtype: int64,
 0    25425
 1     2906
 3     2557
 2     2343
 4     1454
 5      454
 Name: label, dtype: int64)

### Getting Predictions on test

In [41]:
# Load the test CSV from DATA_DIR/LANG and display it.
test = pd.read_csv(f"{DATA_DIR}/{LANG}/tamil_offensive_full_test_transliterated.csv")
test

Unnamed: 0,sentence
0,14.12.2018epo trailer pathutu irken ...Semay...
1,Paka thana poro movie la Enna irukunu
2,‚ÄúU kena tunggu lebih lama lagi untuk tahu saya...
3,Suriya anna vera level anna mass
4,suma kaththaatha da sound over a pooda kudaath...
...,...
4387,mannu ponnu rentume onnu athula evan kaiya vac...
4388,Babu mele ko ye song sunke kuch yesa feel hua ...
4389,asuran= aadukalam+pudupettai+ wada chennai..ye...
4390,Vijay's all movies look like same.


In [42]:
# Run the classifier over every test sentence and store the predicted label
# and the six per-class probabilities as new columns.
test['pred_label'], test['prob0'], test['prob1'], test['prob2'], test['prob3'], test['prob4'], test['prob5'] = get_robert_preds(model, test['sentence'].values)


  prob = F.softmax(outputs['logits'])


In [45]:
# Display the test frame with the prediction columns attached.
test

Unnamed: 0,sentence,pred_label,prob0,prob1,prob2,prob3,prob4,prob5
0,14.12.2018epo trailer pathutu irken ...Semay...,0,0.631904,0.192155,0.145426,0.014910,0.004915,0.010689
1,Paka thana poro movie la Enna irukunu,0,0.681908,0.242631,0.024117,0.024993,0.000566,0.025784
2,‚ÄúU kena tunggu lebih lama lagi untuk tahu saya...,4,0.021094,0.002091,0.002920,0.001125,0.971724,0.001045
3,Suriya anna vera level anna mass,0,0.997296,0.000871,0.000296,0.000596,0.000379,0.000562
4,suma kaththaatha da sound over a pooda kudaath...,3,0.049638,0.187159,0.266210,0.451914,0.000599,0.044481
...,...,...,...,...,...,...,...,...
4387,mannu ponnu rentume onnu athula evan kaiya vac...,0,0.839950,0.002526,0.010315,0.139060,0.000590,0.007559
4388,Babu mele ko ye song sunke kuch yesa feel hua ...,4,0.004683,0.001244,0.003200,0.001922,0.987880,0.001071
4389,asuran= aadukalam+pudupettai+ wada chennai..ye...,0,0.338746,0.322753,0.150264,0.138287,0.019870,0.030080
4390,Vijay's all movies look like same.,0,0.635425,0.026070,0.135930,0.183290,0.000935,0.018350


In [44]:
# Count of each predicted label on the test split.
test['pred_label'].value_counts()

0    3433
1     368
2     260
3     179
4     152
Name: pred_label, dtype: int64

In [46]:
# Save the test frame, including prediction columns, to a results CSV.
test.to_csv(f"{DATA_DIR}/{LANG}/tamil_test_results_robert.csv", index=False)