In [1]:
from fastai.text import *
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import sentencepiece as spm
import re
import pdb

In [2]:
import fastai, torch
fastai.__version__ , torch.__version__

('1.0.57', '1.0.0')

In [3]:
torch.cuda.set_device(0)

In [4]:
def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value)  
    torch.manual_seed(seed_value)  
    random.seed(seed_value)
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

In [5]:
random_seed(42, True)

In [6]:
!pwd

/data/home/ubuntu/gaurav/in/fire/code-mixed-enta/classification


In [7]:
path = Path('./')

In [8]:
df = pd.read_excel(path/'../hasoc_task_2/Tamil-Codemixed_offensive_data_Training-Tweet-HL.xlsx', header=None)
df.dropna(inplace=True)
df.replace(to_replace='not', value='NOT', inplace=True)
df.replace(to_replace='OFf', value='OFF', inplace=True)
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,0,1,2
0,TA_TW630,@USER Avaru romba varshan munnadi eh retire aa...,NOT
1,TA_TW3041,@USER Ungotha yaru unaku therium ungoppan munj...,OFF
2,TA_TW6151,@USER Athu thaan avan thannoda instagram id la...,NOT
3,TA_TW3291,RT @USER Nee Onnu pannu....Vijay poolu ah Oomb...,OFF
4,TA_TW2722,@USER hater of Ajith nu hater of Ajith tha da ...,OFF


In [9]:
df.shape

(4000, 3)

In [10]:
Counter(df[2])

Counter({'NOT': 2020, 'OFF': 1980})

In [11]:
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]
df_train.head()

Unnamed: 0,0,1,2
0,TA_TW630,@USER Avaru romba varshan munnadi eh retire aa...,NOT
1,TA_TW3041,@USER Ungotha yaru unaku therium ungoppan munj...,OFF
2,TA_TW6151,@USER Athu thaan avan thannoda instagram id la...,NOT
3,TA_TW3291,RT @USER Nee Onnu pannu....Vijay poolu ah Oomb...,OFF
4,TA_TW2722,@USER hater of Ajith nu hater of Ajith tha da ...,OFF


In [12]:
df_valid = df.iloc[cutoff:]
df_valid.head()

Unnamed: 0,0,1,2
3200,TA_TW2000,@USER Semaya irukku D chellam plz dm va 7.9.in...,OFF
3201,TA_TW648,@USER Enaku unnoda Feeling puriyuthu abi Try p...,NOT
3202,TA_HL481,itha losliya nu solitingale,NOT
3203,TA_TW1950,@USER nee yaruda komali avana sona unaku kovam...,OFF
3204,TA_HL186,ama evalunga panra setaikku nama answer pannan...,NOT


In [13]:
df_test = pd.read_csv(path/'../hasoc_task_2/Tamil_hasoc_tanglish_test_without_labels.tsv', sep='\t', header=None)
df_test.dropna(inplace=True)
df_test.head()

Unnamed: 0,0,1
0,TA_TW15946,Take it this thevidiya Kandipa indha page admi...
1,TA_TW10175,enga veetla itha nadakum Athum oru varushama t...
2,TA_TW15947,"Indha Sallli Punda, Dummy Pundalam Orama Iruka..."
3,TA_TW15174,Juriya poola tier 1 la umburan tha kulla punda...
4,TA_TW15182,Kullans lam umba therila Loosu kuthi maari umb...


In [14]:
df_train.shape, df_valid.shape, df_test.shape

((3200, 3), (800, 3), (940, 2))

In [15]:
Counter(df_train[2])

Counter({'NOT': 1633, 'OFF': 1567})

In [16]:
Counter(df_valid[2])

Counter({'NOT': 387, 'OFF': 413})

In [17]:
# df_train = pd.concat([df_train, df_valid])
# df_train.shape

In [18]:
label_cols = [2]
text_cols = [1]

In [19]:
def handle_all_caps(t: str) -> str:
    tokens = t.split()
    tokens = replace_all_caps(tokens)
    return ' '.join(tokens)

def handle_upper_case_first_letter(t: str) -> str:
    tokens = t.split()
    tokens = deal_caps(tokens)
    return ' '.join(tokens)

def lower_case_everything(t: str) -> str:
    return t.lower().replace('@user', '').replace('#tag ', '').replace('rt ', '').strip()

In [20]:
class CodeMixedTamilTokenizer(BaseTokenizer):
    def __init__(self, lang:str):
        self.lang = lang
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(path/"../tokenizer/taen_spm.model"))
        
    def tokenizer(self, t:str) -> List[str]:
        return self.sp.EncodeAsPieces(t)

In [21]:
sp = spm.SentencePieceProcessor()
sp.Load(str(path/"../tokenizer/taen_spm.model"))
itos = [sp.IdToPiece(int(i)) for i in range(8000)]

In [22]:
itos[:20]

['xxunk',
 'xxbos',
 'xxeos',
 'xxpad',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 '.',
 ',',
 '▁',
 's',
 'a',
 '="',
 'in',
 'doc',
 't',
 'il',
 'i']

In [23]:
# 8,000 is the vocab size that we chose in sentencepiece
taen_vocab = Vocab(itos)

In [24]:
tokenizer = Tokenizer(lang='taen', tok_func=CodeMixedTamilTokenizer)

In [25]:
tokenizer.pre_rules.append(lower_case_everything)
tokenizer.pre_rules.append(handle_all_caps)
tokenizer.pre_rules.append(handle_upper_case_first_letter)

In [26]:
tokenizer.special_cases, tokenizer.pre_rules, tokenizer.post_rules

(['xxunk',
  'xxpad',
  'xxbos',
  'xxeos',
  'xxfld',
  'xxmaj',
  'xxup',
  'xxrep',
  'xxwrep'],
 [<function fastai.text.transform.fix_html>,
  <function fastai.text.transform.replace_rep>,
  <function fastai.text.transform.replace_wrep>,
  <function fastai.text.transform.spec_add_spaces>,
  <function fastai.text.transform.rm_useless_spaces>,
  <function __main__.lower_case_everything>,
  <function __main__.handle_all_caps>,
  <function __main__.handle_upper_case_first_letter>],
 [<function fastai.text.transform.replace_all_caps>,
  <function fastai.text.transform.deal_caps>])

In [27]:
tokens = tokenizer.process_all(['Tell me about TOUR self, mujhe jaanna hai'])
''.join(tokens[0])

'▁tell▁me▁about▁tour▁self,▁mujhe▁jaanna▁hai'

In [28]:
data_lm = TextLMDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=taen_vocab, label_cols=label_cols, text_cols=text_cols)

In [29]:
data_lm.show_batch()

idx,text
0,▁eri ku ra ▁u h ▁da ▁the vid iya ▁pa iya le ▁x x bo s ▁at hu ▁tha an ▁avan ▁than n oda ▁in s ta gram ▁id ▁la ye ▁po du ▁iru kan e ▁vijay ▁do u ble ▁bod y ▁endu ▁un aku ▁it ha ▁vid a ▁ke val ama ▁ena ▁ven um ▁aa ▁tan e ▁ne nga ▁parthu ▁ra ▁x x bo s ▁ne e ▁on nu
1,▁un ▁ dr ess ▁ um ▁ path ave ▁ pathi tu ▁varu th u . ▁x x bo s ▁nan um ▁un ga ▁f r nd ▁li st ▁la ▁iru ku ra tha ▁nin a chu ▁ro mba ▁pro ud ▁ ah ▁f e el ▁pan ren ▁x x bo s ▁br o ▁avan ▁oru ▁du m my ▁br o ▁fi rs t ▁on nu ▁sol lu van ▁
2,x bo s ▁ ippa ▁mattu n ▁enna ▁pann adhu ▁theri yam aya ▁po edu chu ▁mutt al ugal a ▁x x bo s ▁de i ▁sang i ▁pun da ▁ma van e ▁ne e ▁co w ▁ ook ura ▁ xxunk ▁t ag ▁ uku ▁sun n iya ▁so op ura ▁w or k ▁mat um ▁pa aru ▁x x bo s ▁aam een ▁aam een ▁ya ▁ra bbu
3,bo s ▁un may ai na ▁sam ba vam ▁dha ▁men tal ▁ko o thi . ▁po i ▁goo gle ▁panni ▁pa aru ▁ney veli ▁is su e ▁va ▁x x bo s ▁dai ▁un ga ▁tha la ▁ bike ▁ra c er ▁ ah ▁iru tu ▁man go thal aye ▁doo p ▁po ta ▁pun da ▁thay oli ▁dhan a ▁x x bo s ▁pa it iyam ▁mari ▁iru
4,ap avin gl a . . . ▁it hu ▁enna da ▁surya ku ▁van dha ▁so than ai ▁ xxrep ▁4 ▁ . ▁up dat e ▁app ▁to ▁vi e w ▁x x bo s ▁sonna ▁pun da ▁ne e ▁katha ra tha ▁pa akal aam ▁nu ▁vant ha ▁da ▁mun da ▁ko o thi ▁ xxunk ▁t ag ▁ xxunk ▁t ag ▁ xxunk ▁t ag ▁x x bo


In [30]:
learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.3, pretrained=False)

In [31]:
# Loading the pretrained language model on tamil wikipedia
learn.load('../../dataset_preparation/models/best_model', with_opt=True)

LanguageLearner(data=TextLMDataBunch;

Train: LabelList (3200 items)
x: LMTextList
▁x x bo s ▁avar u ▁ro mba ▁var shan ▁munn adi ▁e h ▁re ti re ▁aa y itar u ▁nu ▁ne nai chen ▁ xxrep ▁4 ▁ . ▁for m ▁la ▁irukk a ▁app o ve ▁aa ganum ▁sol l itu ▁iru par u ▁ xxunk ▁t ag,▁x x bo s ▁un go tha ▁yar u ▁un aku ▁theri um ▁un go ppan ▁mun ji ▁pat hu ▁eri ku ra ▁u h ▁da ▁the vid iya ▁pa iya le,▁x x bo s ▁at hu ▁tha an ▁avan ▁than n oda ▁in s ta gram ▁id ▁la ye ▁po du ▁iru kan e ▁vijay ▁do u ble ▁bod y ▁endu ▁un aku ▁it ha ▁vid a ▁ke val ama ▁ena ▁ven um ▁aa ▁tan e ▁ne nga ▁parthu ▁ra,▁x x bo s ▁ne e ▁on nu ▁pann u ▁ xxrep ▁4 ▁ . ▁vijay ▁poo lu ▁ ah ▁o om bur ath a ▁niru th ittu ▁app uku tt y ▁ oda ▁poo lu ▁ ah ▁o o mba ▁sta pan nu . . . y enna ▁avan ▁dhan a ▁periya ▁a ctor ▁un ga ▁aj ith ▁ ah ▁vid a . . . ▁awar d ▁iru k ur avan ▁poo lu ▁ ah ▁dhan ▁ ish tam ▁ ah ▁o o mbu va ▁ne e ▁ xxrep ▁4 ▁ . ▁po ▁ national ▁awar d ▁dhan ▁periya ▁awar d ▁ xxrep ▁4 ▁ . ▁po i ▁app u ▁kutt y ▁ ah ▁o o mbu,▁x x bo s ▁h

In [32]:
learn.freeze()

In [33]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,5.714156,5.276985,0.15129,00:02


In [34]:
learn.save('fit_head', with_opt=True)

In [35]:
learn.load('fit_head', with_opt=True);

In [36]:
learn.unfreeze()

In [37]:
learn.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,5.178426,4.931022,0.197321,00:02
1,4.87066,4.528532,0.247321,00:02
2,4.605754,4.343621,0.269048,00:02
3,4.408315,4.273314,0.27619,00:02
4,4.294514,4.26307,0.277207,00:02


In [38]:
learn.save('fine_tuned', with_opt=True)

In [39]:
learn.load('fine_tuned', with_opt=True);

In [40]:
learn.predict('en payar',n_words=10)

'en payar ▁pol ▁dayaaricapadu anu . ing hu ▁ro mba ▁veli ana'

In [41]:
learn.save_encoder('fine_tuned_enc')

In [42]:
data_clas = TextClasDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=taen_vocab, bs=16, label_cols=label_cols, text_cols=text_cols)

In [43]:
data_clas.show_batch()

text,target
▁x x bo s ▁o os uku ▁on num ▁pan la ▁illa . . . ▁sambal am ▁ki mbal am ▁vang arin gal a . . . na atu ka ga ▁on num ▁ne eng a ▁tho o ki ▁ni ppatu la ▁illa ▁ xxrep ▁4 ▁ . ▁ne nga ▁ar my ▁off ic ers ▁vid a ▁na atuku ▁it hu ▁pan ri ngal a ▁ xxrep ▁4 ▁ .,OFF
"▁x x bo s ▁tamil ▁pon nu ▁nu ▁tho ki ▁va chu ▁a adu ni ka ▁ xxrep ▁4 ▁ . ▁iv al ▁on na ▁no ▁fa ke ▁ xxrep ▁8 ▁ . ▁ bb ▁ho me ▁l ▁ull a ▁a angal ▁all or um ▁good . . . pen n kal ▁tha an ▁the ava ▁illa ma ▁ , , c re at ▁pann uth u ka ▁ xxrep",OFF
"▁x x bo s ▁iv un galukum ▁sam e ▁than dan ai ▁kudu kan um ▁a thum ▁nadu ▁ro ad ▁la iv val a vu ▁na al ▁vel a ▁pa kka ma ▁kaal ▁mel a ▁kaal ▁potu ▁sum ma ▁irundhu ▁sambal am ▁va angi tu , ▁cor on a ▁la ▁vel a ▁pa kka ▁sonna ▁an iya ayam a ▁ip di ▁kolai ▁panni tu ▁adh uku ▁sa ppa ▁sa aku",OFF
"▁x x bo s ▁na ▁7 ' th ▁padi kum ▁pot hu ▁3 s ir ▁vant hang a ▁avan ga ▁nam e ▁pan di ▁ , ven kat , sar avan an ▁ ah th ula ▁venkat ▁sir ▁son nga ▁na ▁ th ri sha ▁ki t ta ▁pe asu re ▁nu ▁sol lli ▁ yen ni um ▁pe as a ▁vach anga ▁a pro ▁na ▁ph one ▁pe asu",OFF
▁x x bo s ▁ne e ▁on nu ▁pann u ▁ xxrep ▁4 ▁ . ▁vijay ▁poo lu ▁ ah ▁o om bur ath a ▁niru th ittu ▁app uku tt y ▁ oda ▁poo lu ▁ ah ▁o o mba ▁sta pan nu . . . y enna ▁avan ▁dhan a ▁periya ▁a ctor ▁un ga ▁aj ith ▁ ah ▁vid a . . . ▁awar d ▁iru k,OFF


In [44]:
data_clas.sanity_check()

In [77]:
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.5)

In [78]:
learn.load_encoder('fine_tuned_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (3200 items)
x: TextList
▁x x bo s ▁avar u ▁ro mba ▁var shan ▁munn adi ▁e h ▁re ti re ▁aa y itar u ▁nu ▁ne nai chen ▁ xxrep ▁4 ▁ . ▁for m ▁la ▁irukk a ▁app o ve ▁aa ganum ▁sol l itu ▁iru par u ▁ xxunk ▁t ag,▁x x bo s ▁un go tha ▁yar u ▁un aku ▁theri um ▁un go ppan ▁mun ji ▁pat hu ▁eri ku ra ▁u h ▁da ▁the vid iya ▁pa iya le,▁x x bo s ▁at hu ▁tha an ▁avan ▁than n oda ▁in s ta gram ▁id ▁la ye ▁po du ▁iru kan e ▁vijay ▁do u ble ▁bod y ▁endu ▁un aku ▁it ha ▁vid a ▁ke val ama ▁ena ▁ven um ▁aa ▁tan e ▁ne nga ▁parthu ▁ra,▁x x bo s ▁ne e ▁on nu ▁pann u ▁ xxrep ▁4 ▁ . ▁vijay ▁poo lu ▁ ah ▁o om bur ath a ▁niru th ittu ▁app uku tt y ▁ oda ▁poo lu ▁ ah ▁o o mba ▁sta pan nu . . . y enna ▁avan ▁dhan a ▁periya ▁a ctor ▁un ga ▁aj ith ▁ ah ▁vid a . . . ▁awar d ▁iru k ur avan ▁poo lu ▁ ah ▁dhan ▁ ish tam ▁ ah ▁o o mbu va ▁ne e ▁ xxrep ▁4 ▁ . ▁po ▁ national ▁awar d ▁dhan ▁periya ▁awar d ▁ xxrep ▁4 ▁ . ▁po i ▁app u ▁kutt y ▁ ah ▁o o mbu,▁x x bo s ▁ha ter

In [79]:
learn.freeze()

In [80]:
learn.loss_func.func

CrossEntropyLoss()

In [81]:
mcc = MatthewsCorreff()

In [82]:
learn.metrics = [mcc, accuracy]

In [83]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.56055,0.4327,0.631044,0.8125,00:03


In [84]:
learn.save('first-full')

In [85]:
learn.load('first-full');

In [86]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.460979,0.36874,0.681285,0.8375,00:04


In [87]:
learn.save('second-full')

In [88]:
learn.unfreeze()
learn.fit_one_cycle(5, 1e-3, callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='final')])

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.365061,0.330873,0.747256,0.87375,00:07
1,0.291775,0.330849,0.724713,0.8625,00:07
2,0.216459,0.407518,0.700695,0.84625,00:07
3,0.127917,0.371191,0.724657,0.8625,00:07
4,0.112401,0.379323,0.724656,0.8625,00:07


Better model found at epoch 0 with accuracy value: 0.8737499713897705.


In [57]:
learn.load('final')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (3200 items)
x: TextList
▁x x bo s ▁avar u ▁ro mba ▁var shan ▁munn adi ▁e h ▁re ti re ▁aa y itar u ▁nu ▁ne nai chen ▁ xxrep ▁4 ▁ . ▁for m ▁la ▁irukk a ▁app o ve ▁aa ganum ▁sol l itu ▁iru par u ▁ xxunk ▁t ag,▁x x bo s ▁un go tha ▁yar u ▁un aku ▁theri um ▁un go ppan ▁mun ji ▁pat hu ▁eri ku ra ▁u h ▁da ▁the vid iya ▁pa iya le,▁x x bo s ▁at hu ▁tha an ▁avan ▁than n oda ▁in s ta gram ▁id ▁la ye ▁po du ▁iru kan e ▁vijay ▁do u ble ▁bod y ▁endu ▁un aku ▁it ha ▁vid a ▁ke val ama ▁ena ▁ven um ▁aa ▁tan e ▁ne nga ▁parthu ▁ra,▁x x bo s ▁ne e ▁on nu ▁pann u ▁ xxrep ▁4 ▁ . ▁vijay ▁poo lu ▁ ah ▁o om bur ath a ▁niru th ittu ▁app uku tt y ▁ oda ▁poo lu ▁ ah ▁o o mba ▁sta pan nu . . . y enna ▁avan ▁dhan a ▁periya ▁a ctor ▁un ga ▁aj ith ▁ ah ▁vid a . . . ▁awar d ▁iru k ur avan ▁poo lu ▁ ah ▁dhan ▁ ish tam ▁ ah ▁o o mbu va ▁ne e ▁ xxrep ▁4 ▁ . ▁po ▁ national ▁awar d ▁dhan ▁periya ▁awar d ▁ xxrep ▁4 ▁ . ▁po i ▁app u ▁kutt y ▁ ah ▁o o mbu,▁x x bo s ▁ha ter

In [58]:
df_test = df_valid.copy()
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'query': list(df_test[1]), 'actual_label': list(df_test[2]), 'predicted_label': ['']*df_test.shape[0]}
all_nodes = list(set(df_train[2]))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
preds = learn.get_preds(ds_type=DatasetType.Valid, ordered=True)
for index, row in df_result.iterrows():
    for node in all_nodes:
        row[node] = preds[0][index][learn.data.c2i[node]].item()
    row['predicted_label'] = i2c[np.argmax(preds[0][index]).data.item()]
df_result.head()

Unnamed: 0,query,actual_label,predicted_label,OFF,NOT
0,@USER Semaya irukku D chellam plz dm va 7.9.in...,OFF,NOT,0.247348,0.752652
1,@USER Enaku unnoda Feeling puriyuthu abi Try p...,NOT,NOT,0.304944,0.695055
2,itha losliya nu solitingale,NOT,OFF,0.540392,0.459608
3,@USER nee yaruda komali avana sona unaku kovam...,OFF,OFF,0.82265,0.17735
4,ama evalunga panra setaikku nama answer pannan...,NOT,NOT,0.17148,0.82852


In [59]:
accuracy_score(df_result['actual_label'], df_result['predicted_label'])

0.86875

In [60]:
matthews_corrcoef(df_result['actual_label'], df_result['predicted_label'])

0.7389336833200368

In [61]:
f1_score(df_result['actual_label'], df_result['predicted_label'], labels=['NOT ', 'OFF'], pos_label='OFF')

0.8692403486924035

In [62]:
df_result['status'] = df_result['actual_label']==df_result['predicted_label']
df_result[df_result['status']==False]

Unnamed: 0,query,actual_label,predicted_label,NOT,OFF,status
7,Dai pavam Pannita kandipa unnaku punishment ka...,NOT,OFF,0.451207,0.548793,False
220,"@USER Appa, antha SivaKarthikeyan vanthu unaku...",NOT,OFF,0.399875,0.600125,False
222,@USER Unaku erinjinu iruka naala dhana ketunu ...,OFF,NOT,0.660396,0.339604,False
313,"@USER Indha trackers ellam ippo vandhavanga , ...",NOT,OFF,0.434618,0.565382,False
316,intha mari video eduthu pullaigala koothadi aa...,NOT,OFF,0.354108,0.645892,False
369,@USER Enga Yarum Dhanush ahh Pathi Romba Muttu...,OFF,NOT,0.593903,0.406097,False
392,Kithan vaai da unaku...Vadai suttu suttaey val...,OFF,NOT,0.627966,0.372034,False
503,@USER Pappan kasu kudutha ena vena seivan Ithu...,OFF,NOT,0.831375,0.168625,False
683,RT @USER : Idhula edhachum thappa sollirukaen ...,OFF,NOT,0.765768,0.234232,False
697,ipalam ivala pathalea irritate aagudhu,NOT,OFF,0.182997,0.817003,False


In [62]:
df_result.iloc[13]['query']

"@USER Naam tamilarai pesurathum tamilargalai onu illa. Apram nan yaaru kaasula padikka vanthen nu unaku epdi raja theriyum Velakku ethum pudichaya. And en sex life ah pathi pesa thaguthi Because you don't have any life, bastard."

In [63]:
learn.predict("Naam tamilarai pesurathum tamilargalai onu illa. Apram nan yaaru kaasula padikka vanthen nu unaku epdi raja theriyum Velakku ethum pudichaya. And en sex life ah pathi pesa thaguthi Because you don't have any life, bastard.".lower())

(Category NOT, tensor(0), tensor([0.9041, 0.0959]))

In [65]:
df_test = pd.read_csv(path/'../hasoc_task_2/Tamil_hasoc_tanglish_test_without_labels.tsv', sep='\t', header=None)
df_test.dropna(inplace=True)
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'id': list(df_test[0]), 'text': list(df_test[1]), 'label': ['']*df_test.shape[0]}
all_nodes = list(set(df_train[2]))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
preds = learn.get_preds(ds_type=DatasetType.Test, ordered=True)
for index, row in df_result.iterrows():
    for node in all_nodes:
        row[node] = preds[0][index][learn.data.c2i[node]].item()
    row['label'] = i2c[np.argmax(preds[0][index]).data.item()]
df_result.head()

Unnamed: 0,id,text,label,NOT,OFF
0,TA_TW15946,Take it this thevidiya Kandipa indha page admi...,OFF,0.0140025,0.985997
1,TA_TW10175,enga veetla itha nadakum Athum oru varushama t...,NOT,0.971736,0.0282636
2,TA_TW15947,"Indha Sallli Punda, Dummy Pundalam Orama Iruka...",OFF,0.00695224,0.993048
3,TA_TW15174,Juriya poola tier 1 la umburan tha kulla punda...,OFF,0.00167357,0.998326
4,TA_TW15182,Kullans lam umba therila Loosu kuthi maari umb...,OFF,0.00564466,0.994355


In [66]:
df_result[df_result['label']=='NOT'].shape

(477, 5)

In [67]:
df_result.to_csv('test_res2.csv', index=False)

In [64]:
learn.predict('Dawali-ah irukara Kanagavel pathi mulusa therinjika indha padam paarunga. tonight at 7 PM')

(Category OFF, tensor(1), tensor([0.4558, 0.5442]))