In [1]:
from fastai.text import *
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import sentencepiece as spm
import re
import pdb
import random
from fastai.imports import *
import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, matthews_corrcoef, f1_score


In [2]:
import fastai, torch, fastprogress
fastai.__version__ , torch.__version__, fastprogress.__version__

('1.0.60', '1.7.1', '0.2.7')

In [5]:
torch.cuda.set_device(0)

In [6]:
def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value)  
    torch.manual_seed(seed_value)  
    random.seed(seed_value)
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

In [7]:
random_seed(42, True)

In [9]:
path = Path('./')

In [None]:
BASE_DIR = "home/temp/data/eacl/ta/"

In [None]:
df_train = pd.read_csv(f"{BASE_DIR}/tamil_offensive_full_train_transliterated.csv")
df_train.dropna(inplace=True)
# df_train.head()

In [11]:
df_train.shape

(35139, 2)

In [12]:
Counter(df_train['label'])

Counter({0: 25425, 4: 1454, 5: 454, 3: 2557, 1: 2906, 2: 2343})

In [13]:
df_valid = pd.read_csv(f"{BASE_DIR}/temp/data/eacl/ta/tamil_offensive_full_dev_transliterated.csv")
df_valid.dropna(inplace=True)
df_valid.head()

Unnamed: 0,sentence,label
0,Handsome hunk keri vaa thalaivaa,0
1,thenkaachi maavattam naataar chamuthaayam chaa...,0
2,je vous aime bravo pour clip de merde que j √©c...,4
3,chirappu melum ithu poonra pataippukal mika av...,0
4,Vera level BGM ..semma trailer. ü§û,0


In [15]:
df_test = pd.read_csv(f"{BASE_DIR}/temp/data/eacl/ta/tamil_offensive_full_test_transliterated.csv")
df_test.dropna(inplace=True)
df_test.head()

Unnamed: 0,sentence
0,14.12.2018epo trailer pathutu irken ...Semay...
1,Paka thana poro movie la Enna irukunu
2,‚ÄúU kena tunggu lebih lama lagi untuk tahu saya...
3,Suriya anna vera level anna mass
4,suma kaththaatha da sound over a pooda kudaath...


In [31]:
df_train.shape, df_valid.shape, df_test.shape

((35139, 2), (4388, 2), (4392, 1))

In [32]:
Counter(df_train['label']), Counter(df_valid['label'])

(Counter({0: 25425, 4: 1454, 5: 454, 3: 2557, 1: 2906, 2: 2343}),
 Counter({0: 3193, 4: 172, 3: 295, 2: 307, 1: 356, 5: 65}))

In [14]:
label_cols = ['label']
text_cols = ['sentence']

### Tokenization + Data Cleaning

In [15]:
def handle_all_caps(t: str) -> str:
    tokens = t.split()
    tokens = replace_all_caps(tokens)
    return ' '.join(tokens)

def handle_upper_case_first_letter(t: str) -> str:
    tokens = t.split()
    tokens = deal_caps(tokens)
    return ' '.join(tokens)

def lower_case_everything(t: str) -> str:
    return t.lower().replace('@user', '').replace('#tag ', '').replace('rt ', '').strip()

In [16]:
class CodeMixedTamilTokenizer(BaseTokenizer):
    def __init__(self, lang:str):
        self.lang = lang
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(path/"../models/taen_spm.model"))
        
    def tokenizer(self, t:str) -> List[str]:
        return self.sp.EncodeAsPieces(t)

In [17]:
sp = spm.SentencePieceProcessor()
sp.Load(str(path/"../models/taen_spm.model"))
itos = [sp.IdToPiece(int(i)) for i in range(8000)]

In [23]:
itos[:20]

['xxunk',
 'xxbos',
 'xxeos',
 'xxpad',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 '.',
 ',',
 '‚ñÅ',
 's',
 'a',
 '="',
 'in',
 'doc',
 't',
 'il',
 'i']

In [24]:
# 8,000 is the vocab size that we chose in sentencepiece
taen_vocab = Vocab(itos)


In [19]:
tokenizer = Tokenizer(lang='taen', tok_func=CodeMixedTamilTokenizer)

In [20]:
tokenizer.pre_rules.append(lower_case_everything)
tokenizer.pre_rules.append(handle_all_caps)
tokenizer.pre_rules.append(handle_upper_case_first_letter)

In [21]:
tokenizer.special_cases, tokenizer.pre_rules, tokenizer.post_rules

(['xxunk',
  'xxpad',
  'xxbos',
  'xxeos',
  'xxfld',
  'xxmaj',
  'xxup',
  'xxrep',
  'xxwrep'],
 [<function fastai.text.transform.fix_html(x: str) -> str>,
  <function fastai.text.transform.replace_rep(t: str) -> str>,
  <function fastai.text.transform.replace_wrep(t: str) -> str>,
  <function fastai.text.transform.spec_add_spaces(t: str) -> str>,
  <function fastai.text.transform.rm_useless_spaces(t: str) -> str>,
  <function __main__.lower_case_everything(t: str) -> str>,
  <function __main__.handle_all_caps(t: str) -> str>,
  <function __main__.handle_upper_case_first_letter(t: str) -> str>],
 [<function fastai.text.transform.replace_all_caps(x: Collection[str]) -> Collection[str]>,
  <function fastai.text.transform.deal_caps(x: Collection[str]) -> Collection[str]>])

In [22]:
tokens = tokenizer.process_all(['epo trailer pathutu irken '])
''.join(tokens[0]), tokens

('‚ñÅepo‚ñÅtrailer‚ñÅpathutu‚ñÅirken',
 [['‚ñÅe', 'po', '‚ñÅtra', 'il', 'er', '‚ñÅpat', 'hu', 'tu', '‚ñÅir', 'ken']])

### Training

In [67]:
data_lm = TextLMDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=taen_vocab, label_cols=['label'], text_cols=['sentence'])

  return np.array(a, dtype=dtype, **kwargs)


In [68]:
data_lm.show_batch()

idx,text
0,‚ñÅsu bbu raj ‚ñÅann e ‚ñÅ xxrep ‚ñÅ4 ‚ñÅ . ‚ñÅin tha ‚ñÅpadam ‚ñÅve tri ‚ñÅada ya ‚ñÅun a gal ukku ‚ñÅen n uda ya ‚ñÅval th ukk al . . . ‚ñÅx x bo s ‚ñÅkav un tar ‚ñÅthe var . cha ar pa ak a ‚ñÅver ri ‚ñÅper a ‚ñÅva a z h th th ukk al ‚ñÅ xxunk ‚ñÅx x bo s ‚ñÅip po ‚ñÅin tha ‚ñÅtra
1,‚ñÅirukkum ‚ñÅunmai ya ana ‚ñÅnaatak ak ‚ñÅka a thal ‚ñÅvish ay ath th ai ‚ñÅcho n na al ‚ñÅat hu ‚ñÅcha ath ip ‚ñÅpat am ‚ñÅav lo o tha an ‚ñÅcha ar ‚ñÅpoor a al e es ‚ñÅx x bo s ‚ñÅtamil ‚ñÅcinema ‚ñÅmon ster ‚ñÅsurya ‚ñÅ xxrep ‚ñÅ4 ‚ñÅ . ‚ñÅx x bo s ‚ñÅb g m ‚ñÅking ‚ñÅyu van ‚ñÅshankar ‚ñÅraja ‚ñÅx x bo s ‚ñÅpa kka ‚ñÅmas
2,? ‚ñÅtha lai var : s we et ‚ñÅsa ap ida ‚ñÅpor om . . . ‚ñÅver a ‚ñÅle vel . . . ‚ñÅx x bo s ‚ñÅat li ‚ñÅmath i ri ‚ñÅmutt al u kal ‚ñÅkan du ‚ñÅpadi kkat tum ‚ñÅe ppa di ‚ñÅpadam ‚ñÅpud ikk anam ‚ñÅen tra th ‚ñÅx x bo s ‚ñÅvay as ukku ‚ñÅtha gu ndha ‚ñÅkel vi ‚ñÅya ‚ñÅke kka ‚ñÅsol lu nga ‚ñÅda
3,"‚ñÅx x bo s ‚ñÅkon g ku ‚ñÅcham u tha ay ath th inar ‚ñÅcha ar pa ak a ‚ñÅpat am ‚ñÅver ri ‚ñÅper a ‚ñÅva a z h th th ukk al ‚ñÅx x bo s ‚ñÅka agam ‚ñÅkarai ndhu ‚ñÅko odi ‚ñÅunn um , ‚ñÅmani dham ‚ñÅen num ‚ñÅmood ar ‚ñÅko o dam ‚ñÅko odi ‚ñÅser dhu ‚ñÅpa ga i mai ‚ñÅkollum . . . ‚ñÅid il ‚ñÅyaar"
4,"‚ñÅnadi ku ran u ‚ñÅka ko os ‚ñÅar my ‚ñÅko ov un anu nga ‚ñÅ xxunk ‚ñÅx x bo s ‚ñÅ ng k ‚ñÅwas ‚ñÅin s pi red ‚ñÅto ‚ñÅ nt k ‚ñÅi ‚ñÅthi n k . ‚ñÅse e man ‚ñÅillamal ‚ñÅaras iyal um ‚ñÅilla i ‚ñÅ , arasi yal ‚ñÅpadam um ‚ñÅilla i ‚ñÅnu ‚ñÅpuri u dhu . . . ‚ñÅx x bo s ‚ñÅparv aiye e ‚ñÅver a"


In [69]:
learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.3, pretrained=False)

In [70]:
f'{path}/../models/models/best_model'

'./../models/models/best_model'

In [71]:
learn.load(f"{BASE_DIR}/temp/models/best_model", with_opt=True)

LanguageLearner(data=TextLMDataBunch;

Train: LabelList (35139 items)
x: LMTextList
‚ñÅx x bo s ‚ñÅmo vi e ‚ñÅvar a ‚ñÅle vel ‚ñÅla ‚ñÅeri ka ‚ñÅpo gu th u,‚ñÅx x bo s ‚ñÅi ‚ñÅlo ve ‚ñÅaj ith ‚ñÅkumar ‚ñÅviv e gam ‚ñÅmo vi e ‚ñÅin ki ‚ñÅm j y ‚ñÅ bh t ‚ñÅachi ‚ñÅl gi,‚ñÅx x bo s ‚ñÅpadam ‚ñÅnall a ‚ñÅcom ed y ‚ñÅpadam a ‚ñÅiru kum ‚ñÅpol aye . .,‚ñÅx x bo s ‚ñÅkar thi ck ‚ñÅsu bbu raj ‚ñÅann e ‚ñÅ xxrep ‚ñÅ4 ‚ñÅ . ‚ñÅin tha ‚ñÅpadam ‚ñÅve tri ‚ñÅada ya ‚ñÅun a gal ukku ‚ñÅen n uda ya ‚ñÅval th ukk al . . .,‚ñÅx x bo s ‚ñÅkav un tar ‚ñÅthe var . cha ar pa ak a ‚ñÅver ri ‚ñÅper a ‚ñÅva a z h th th ukk al ‚ñÅ xxunk
y: LMLabelList
,,,,
Path: .;

Valid: LabelList (4388 items)
x: LMTextList
‚ñÅx x bo s ‚ñÅhan ds ome ‚ñÅhu n k ‚ñÅke ri ‚ñÅva a ‚ñÅtha lai va a,‚ñÅx x bo s ‚ñÅthe n ka a chi ‚ñÅma av attam ‚ñÅna a ta ar ‚ñÅcham u tha ayam ‚ñÅcha ar pa ak a ‚ñÅva a z h th th ukk al,‚ñÅx x bo s ‚ñÅje ‚ñÅ vo us ‚ñÅai me ‚ñÅbr avo ‚ñÅpo ur ‚ñÅcli p ‚ñÅde ‚ñÅmer de ‚ñÅqu e ‚ñÅj ‚ñÅ xxunk co u te ‚ñÅa

In [72]:
learn.freeze()

In [73]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,4.044744,3.922262,0.32673,00:19


In [74]:
learn.save('fit_head', with_opt=True)

In [75]:
learn.load('fit_head', with_opt=True);

In [76]:
learn.unfreeze()

In [77]:
learn.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.737872,3.653384,0.361356,00:20
1,3.440274,3.375876,0.398717,00:20
2,3.251985,3.262846,0.412751,00:20
3,3.197981,3.221164,0.418171,00:20
4,3.154185,3.21518,0.419078,00:20


In [78]:
learn.save('fine_tuned', with_opt=True)

In [79]:
learn.load('fine_tuned', with_opt=True);

In [80]:
learn.predict('Paka thana',n_words=10)

'Paka thana ‚ñÅit hu ‚ñÅnaan ‚ñÅa ka ‚ñÅmatt er ha ‚ñÅchinna ‚ñÅra'

In [81]:
learn.save_encoder('fine_tuned_enc')

In [25]:
data_clas = TextClasDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=taen_vocab, bs=16, label_cols=['label'], text_cols=['sentence'])

  return np.array(a, dtype=dtype, **kwargs)


  return array(a, dtype, copy=False, order=order)


In [83]:
data_clas.show_batch()

text,target
‚ñÅx x bo s ‚ñÅthi ro u pathi ‚ñÅ xxrep ‚ñÅ10 ‚ñÅ= ‚ñÅmann ai y um ‚ñÅpenn ai y um ‚ñÅka ak ka ‚ñÅvant ha ‚ñÅna aya ki ‚ñÅnaatak a ‚ñÅka ath alai ‚ñÅtho ol ur ikkum ‚ñÅnav een a ‚ñÅthi ro u pathi ‚ñÅpin j chil ‚ñÅpa z hu th tha a ‚ñÅve m pi ‚ñÅtha an ‚ñÅva atum ‚ñÅnan j ch ai ‚ñÅchu vai th tha al ‚ñÅmaranam,3
‚ñÅx x bo s ‚ñÅen ta a ‚ñÅe ena ‚ñÅpir avi kala a ‚ñÅavan ‚ñÅavan ‚ñÅ path th u ‚ñÅma ach am ‚ñÅchu man th u ‚ñÅkash tap pattu ‚ñÅvalar ath th u ‚ñÅtha an ‚ñÅaa ch ai ‚ñÅpatta th tha ‚ñÅanup avi kka atti y um ‚ñÅthan ‚ñÅku z han th ai ‚ñÅ kku ‚ñÅthe va iya ana tha ‚ñÅva ang ki ‚ñÅku tu th th u ‚ñÅaa ch,3
‚ñÅx x bo s ‚ñÅanai th th u ‚ñÅcham ook a ‚ñÅ mak kalin ‚ñÅka aman ts ‚ñÅ ith ai ‚ñÅpa ar kkum ‚ñÅpo z hu th u ‚ñÅon ru ‚ñÅthe li va ak a ‚ñÅpuri kir ath u ‚ñÅoru ‚ñÅkuripp it ta ‚ñÅcham ook ath tha al ‚ñÅmar ra ‚ñÅanai th th u ‚ñÅcham ook amum ‚ñÅen tha ‚ñÅalav irku ‚ñÅpa ath ippu ‚ñÅat aint hu ‚ñÅirukki rat hu,0
‚ñÅx x bo s ‚ñÅ ith ula ‚ñÅenna ‚ñÅka am e tin na a ‚ñÅoru ‚ñÅ pakk am ‚ñÅna ang ka ‚ñÅaan ta ‚ñÅparam parai ‚ñÅpen ta ‚ñÅparam para in nu ‚ñÅmee ch aiya i ‚ñÅmu rukk ur avan um ‚ñÅinno ru ‚ñÅ pakk am ‚ñÅat ang ka ‚ñÅmaru ‚ñÅ ath th u ‚ñÅmee ru n nu ‚ñÅkam pu ‚ñÅchu th th ur avan um ‚ñÅ50 varu sha ma a,3
‚ñÅx x bo s ‚ñÅvanni ya ‚ñÅkul a ‚ñÅk sh ath thi ri yar ‚ñÅ( ak ni ‚ñÅkul a ‚ñÅk sh ath thi ri yar ‚ñÅma kkal ‚ñÅku tum path thil ‚ñÅull a ‚ñÅvevveru ‚ñÅpattam ‚ñÅkont a ‚ñÅvanni yar kal ‚ñÅ xxunk van n iya ‚ñÅkav un tar ‚ñÅ xxunk van n iya ‚ñÅpat aiya atchi ‚ñÅ xxunk van n iya ‚ñÅna aya kar ‚ñÅ xxunk van n iya ‚ñÅva,0


In [84]:
data_clas.sanity_check()

  return array(a, dtype, copy=False, order=order)


In [26]:
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.5)

In [86]:
learn.load_encoder('fine_tuned_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (35139 items)
x: TextList
‚ñÅx x bo s ‚ñÅmo vi e ‚ñÅvar a ‚ñÅle vel ‚ñÅla ‚ñÅeri ka ‚ñÅpo gu th u,‚ñÅx x bo s ‚ñÅi ‚ñÅlo ve ‚ñÅaj ith ‚ñÅkumar ‚ñÅviv e gam ‚ñÅmo vi e ‚ñÅin ki ‚ñÅm j y ‚ñÅ bh t ‚ñÅachi ‚ñÅl gi,‚ñÅx x bo s ‚ñÅpadam ‚ñÅnall a ‚ñÅcom ed y ‚ñÅpadam a ‚ñÅiru kum ‚ñÅpol aye . .,‚ñÅx x bo s ‚ñÅkar thi ck ‚ñÅsu bbu raj ‚ñÅann e ‚ñÅ xxrep ‚ñÅ4 ‚ñÅ . ‚ñÅin tha ‚ñÅpadam ‚ñÅve tri ‚ñÅada ya ‚ñÅun a gal ukku ‚ñÅen n uda ya ‚ñÅval th ukk al . . .,‚ñÅx x bo s ‚ñÅkav un tar ‚ñÅthe var . cha ar pa ak a ‚ñÅver ri ‚ñÅper a ‚ñÅva a z h th th ukk al ‚ñÅ xxunk
y: CategoryList
0,4,0,0,0
Path: .;

Valid: LabelList (4388 items)
x: TextList
‚ñÅx x bo s ‚ñÅhan ds ome ‚ñÅhu n k ‚ñÅke ri ‚ñÅva a ‚ñÅtha lai va a,‚ñÅx x bo s ‚ñÅthe n ka a chi ‚ñÅma av attam ‚ñÅna a ta ar ‚ñÅcham u tha ayam ‚ñÅcha ar pa ak a ‚ñÅva a z h th th ukk al,‚ñÅx x bo s ‚ñÅje ‚ñÅ vo us ‚ñÅai me ‚ñÅbr avo ‚ñÅpo ur ‚ñÅcli p ‚ñÅde ‚ñÅmer de ‚ñÅqu e ‚ñÅj ‚ñÅ xxunk co u te ‚ñÅau

In [87]:
learn.freeze()

In [88]:
learn.loss_func.func

CrossEntropyLoss()

In [89]:
mcc = MatthewsCorreff()

In [90]:
learn.metrics = [mcc, accuracy]

In [91]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.84127,0.751917,0.263278,0.749772,00:56


In [92]:
learn.save('first-full')

In [93]:
learn.load('first-full');

In [94]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.750675,0.682011,0.381225,0.770283,00:55


  return array(a, dtype, copy=False, order=order)


In [95]:
learn.save('second-full')

In [96]:
learn.unfreeze()
learn.fit_one_cycle(5, 1e-3, callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='final')])

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.658494,0.677764,0.412541,0.774385,01:00
1,0.657779,0.670823,0.418215,0.778943,00:55
2,0.656074,0.658323,0.427029,0.778259,00:55
3,0.61571,0.652915,0.439312,0.778259,00:56
4,0.521762,0.662419,0.452026,0.773473,01:02


  return array(a, dtype, copy=False, order=order)


Better model found at epoch 0 with accuracy value: 0.7743846774101257.


  return array(a, dtype, copy=False, order=order)


Better model found at epoch 1 with accuracy value: 0.7789425849914551.


  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


In [27]:
learn.load('final')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (35139 items)
x: TextList
‚ñÅx x bo s ‚ñÅmo vi e ‚ñÅvar a ‚ñÅle vel ‚ñÅla ‚ñÅeri ka ‚ñÅpo gu th u,‚ñÅx x bo s ‚ñÅi ‚ñÅlo ve ‚ñÅaj ith ‚ñÅkumar ‚ñÅviv e gam ‚ñÅmo vi e ‚ñÅin ki ‚ñÅm j y ‚ñÅ bh t ‚ñÅachi ‚ñÅl gi,‚ñÅx x bo s ‚ñÅpadam ‚ñÅnall a ‚ñÅcom ed y ‚ñÅpadam a ‚ñÅiru kum ‚ñÅpol aye . .,‚ñÅx x bo s ‚ñÅkar thi ck ‚ñÅsu bbu raj ‚ñÅann e ‚ñÅ xxrep ‚ñÅ4 ‚ñÅ . ‚ñÅin tha ‚ñÅpadam ‚ñÅve tri ‚ñÅada ya ‚ñÅun a gal ukku ‚ñÅen n uda ya ‚ñÅval th ukk al . . .,‚ñÅx x bo s ‚ñÅkav un tar ‚ñÅthe var . cha ar pa ak a ‚ñÅver ri ‚ñÅper a ‚ñÅva a z h th th ukk al ‚ñÅ xxunk
y: CategoryList
0,4,0,0,0
Path: .;

Valid: LabelList (4388 items)
x: TextList
‚ñÅx x bo s ‚ñÅhan ds ome ‚ñÅhu n k ‚ñÅke ri ‚ñÅva a ‚ñÅtha lai va a,‚ñÅx x bo s ‚ñÅthe n ka a chi ‚ñÅma av attam ‚ñÅna a ta ar ‚ñÅcham u tha ayam ‚ñÅcha ar pa ak a ‚ñÅva a z h th th ukk al,‚ñÅx x bo s ‚ñÅje ‚ñÅ vo us ‚ñÅai me ‚ñÅbr avo ‚ñÅpo ur ‚ñÅcli p ‚ñÅde ‚ñÅmer de ‚ñÅqu e ‚ñÅj ‚ñÅ xxunk co u te ‚ñÅau

### Inference

In [28]:
df_test = df_valid.copy()
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'query': list(df_test['sentence']), 'actual_label': list(df_test['label']), 'predicted_label': ['']*df_test.shape[0]}
all_nodes = list(set(df_valid['label']))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
preds = learn.get_preds(ds_type=DatasetType.Valid, ordered=True)
nodes = []
predicted_labels = []
for index, row in df_result.iterrows():
    for node in all_nodes:
        df_result.loc[index, node] = preds[0][index][learn.data.c2i[node]].item()
    df_result.loc[index, 'predicted_label'] = i2c[np.argmax(preds[0][index]).data.item()]

df_result.head()

Unnamed: 0,query,actual_label,predicted_label,0,1,2,3,4,5
0,Handsome hunk keri vaa thalaivaa,0,0,0.954507,0.029217,0.006231,0.005025,0.002808,0.002211
1,thenkaachi maavattam naataar chamuthaayam chaa...,0,0,0.994352,0.001518,0.00134,0.002317,9.3e-05,0.000379
2,je vous aime bravo pour clip de merde que j √©c...,4,4,0.3691,0.059537,0.077506,0.04461,0.438532,0.010714
3,chirappu melum ithu poonra pataippukal mika av...,0,0,0.955425,0.006989,0.008612,0.027626,4.2e-05,0.001306
4,Vera level BGM ..semma trailer. ü§û,0,0,0.932819,0.041869,0.00331,0.016839,0.00296,0.002203


In [29]:
import numpy
df_result['predicted_label'] = df_result['predicted_label'].astype(int)
df_result['actual_label'] = df_result['actual_label'].astype(int)

In [105]:
type(df_result['actual_label'].values[0]), type(df_result['predicted_label'].values[0])

(numpy.int64, numpy.int64)

In [30]:
accuracy_score(df_result['actual_label'], df_result['predicted_label'])

0.7782588878760255

In [31]:
matthews_corrcoef(df_result['actual_label'], df_result['predicted_label'])


0.4163069605014294

In [32]:
f1_score(df_result['actual_label'], df_result['predicted_label'], average='weighted')

0.7335403148302653

In [33]:
precision_recall_fscore_support(df_result['actual_label'], df_result['predicted_label'], average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


(0.7269861245612425, 0.7782588878760255, 0.7335403148302653, None)

In [109]:
df_result['status'] = df_result['actual_label'] == df_result['predicted_label']
df_result[df_result['status'] == False]

Unnamed: 0,query,actual_label,predicted_label,0,1,2,3,4,5,status
17,dei YENDA ungalukku inthe illatha Vella Surya ...,2,0,0.354947,0.263264,0.204207,0.132686,0.008453,0.036443,False
21,thala innum ipdi full white evvalavu naal nadi...,2,0,0.887194,0.032376,0.036545,0.037426,0.000109,0.006349,False
27,Deii ponga da tamil naatla rape crime increase...,3,0,0.467927,0.080725,0.038836,0.393410,0.000704,0.018398,False
36,Comment la en da picha edukuringa... pichakara...,3,1,0.049569,0.548068,0.007857,0.331396,0.001707,0.061404,False
40,Karthi fanslike Kaithi waiting but bigil maran...,2,0,0.395608,0.056239,0.394572,0.133157,0.003005,0.017420,False
...,...,...,...,...,...,...,...,...,...,...
4369,Ivarukkku eppodhum thalaivar kalaigner lightaa...,2,0,0.369844,0.208622,0.288573,0.035840,0.075511,0.021610,False
4372,Trailer Nala irukanu oru than comment pandranu...,1,0,0.812631,0.129006,0.018768,0.032908,0.000238,0.006449,False
4378,Wigpathy Visay na Padam Flop than ithula Kabal...,2,3,0.035182,0.140930,0.071759,0.697300,0.003162,0.051667,False
4380,Vikram ella styleum set aaguthu.. Namba moonji...,2,0,0.426659,0.214986,0.234326,0.094319,0.004637,0.025073,False


In [None]:
df_result.to_csv(f"{BASE_DIR}/tamil_valid_results_umlfit.csv", index=False)

In [115]:
df_test = pd.read_csv(f"{BASE_DIR}/tamil_offensive_full_test_transliterated.csv")
print(df_test.shape)
df_test.dropna(inplace=True)
print(df_test.shape)

(4392, 1)
(4392, 1)


In [116]:
df_test

Unnamed: 0,sentence
0,14.12.2018epo trailer pathutu irken ...Semay...
1,Paka thana poro movie la Enna irukunu
2,‚ÄúU kena tunggu lebih lama lagi untuk tahu saya...
3,Suriya anna vera level anna mass
4,suma kaththaatha da sound over a pooda kudaath...
...,...
4387,mannu ponnu rentume onnu athula evan kaiya vac...
4388,Babu mele ko ye song sunke kuch yesa feel hua ...
4389,asuran= aadukalam+pudupettai+ wada chennai..ye...
4390,Vijay's all movies look like same.


In [138]:
df_dict = {'sentence': list(df_test['sentence']), 'label': ['']*df_test.shape[0]}
all_nodes = list(set(df_train['label']))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key

df_result = pd.DataFrame(df_dict)


In [139]:
df_result

Unnamed: 0,sentence,label,0,1,2,3,4,5
0,14.12.2018epo trailer pathutu irken ...Semay...,,,,,,,
1,Paka thana poro movie la Enna irukunu,,,,,,,
2,‚ÄúU kena tunggu lebih lama lagi untuk tahu saya...,,,,,,,
3,Suriya anna vera level anna mass,,,,,,,
4,suma kaththaatha da sound over a pooda kudaath...,,,,,,,
...,...,...,...,...,...,...,...,...
4387,mannu ponnu rentume onnu athula evan kaiya vac...,,,,,,,
4388,Babu mele ko ye song sunke kuch yesa feel hua ...,,,,,,,
4389,asuran= aadukalam+pudupettai+ wada chennai..ye...,,,,,,,
4390,Vijay's all movies look like same.,,,,,,,


In [142]:
for index, row in df_result.iterrows():
    result = learn.predict(row['sentence'])
    prob =  result[2]
    for node in all_nodes:
        df_result.loc[index, node] = prob[learn.data.c2i[node]].item()
    df_result.loc[index,'label'] = i2c[np.argmax(prob).data.item()]
    df_result.loc[index, 'label'] = result[1].item()
df_result.head()

Unnamed: 0,sentence,label,0,1,2,3,4,5
0,14.12.2018epo trailer pathutu irken ...Semay...,0,0.737698,0.207027,0.018546,0.02793,0.001813,0.006987
1,Paka thana poro movie la Enna irukunu,0,0.596777,0.27567,0.031983,0.070101,0.000985,0.024484
2,‚ÄúU kena tunggu lebih lama lagi untuk tahu saya...,4,0.005738,0.000338,0.000252,0.000648,0.992902,0.000122
3,Suriya anna vera level anna mass,0,0.995216,0.000994,0.000411,0.002299,0.000804,0.000275
4,suma kaththaatha da sound over a pooda kudaath...,2,0.07967,0.115144,0.590439,0.189508,0.001885,0.023356


In [143]:
df_result['label'].value_counts()

0    3812
1     181
4     158
2     146
3      95
Name: label, dtype: int64

In [46]:
learn.data.c2i, i2c

({0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}, {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5})

In [144]:
df_result.to_csv(f"{BASE_DIR}/tamil_test_results_ulmfit.csv", index=False)

In [145]:
df_result.shape

(4392, 8)