In [1]:
from fastai.text import *
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import sentencepiece as spm
import re
import pdb

In [2]:
import fastai, torch
fastai.__version__ , torch.__version__

('1.0.57', '1.0.0')

In [3]:
torch.cuda.set_device(0)

In [4]:
def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value)  
    torch.manual_seed(seed_value)  
    random.seed(seed_value)
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

In [5]:
random_seed(42, True)

In [6]:
!pwd

/data/home/ubuntu/gaurav/in/fire/code-mixed-enma-2/classification_task_2


In [7]:
path = Path('./')

In [8]:
df = pd.read_excel(path/'../hasoc_task_2/Malayalam_offensive_data_Training-YT.xlsx')
df = df[['Tweets', 'Labels']]
df.head()

Unnamed: 0,Tweets,Labels
0,Thaankal enthaan cheyyarullath?üòõ,NOT
1,Ee theetam WCC feminichigalude news aarkk vena...,OFF
2,fukru nem tiktok oolakale vilich charcha nadat...,OFF
3,Aashiq abu produce cheytharunnel ee problems u...,NOT
4,Pennungal oru team aayal ath moonjum ennu epoo...,OFF


In [9]:
df.shape

(4000, 2)

In [10]:
Counter(df['Labels'])

Counter({'NOT': 2047, 'OFF': 1953})

In [11]:
cutoff = int(0.8*len(df))
df_train = df[:cutoff]
df_train.dropna(inplace=True)
df_train.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Tweets,Labels
0,Thaankal enthaan cheyyarullath?üòõ,NOT
1,Ee theetam WCC feminichigalude news aarkk vena...,OFF
2,fukru nem tiktok oolakale vilich charcha nadat...,OFF
3,Aashiq abu produce cheytharunnel ee problems u...,NOT
4,Pennungal oru team aayal ath moonjum ennu epoo...,OFF


In [12]:
df_valid = df[cutoff:]
df_valid.head()

Unnamed: 0,Tweets,Labels
3200,Aareyum rakshapedaan anuvathikkaruthu? But Aar...,OFF
3201,kammikal motham udaippanu. Mukhyante nidhiyile...,OFF
3202,mohan lal paadi thudagiyappol √Ç¬†janam stadium ...,OFF
3203,Abhinayathinte kaaryam thott kalichalindalla m...,OFF
3204,Loka tholvi...maanam kettavane,OFF


In [13]:
Counter(df_train['Labels'])

Counter({'NOT': 1520, 'OFF': 1679})

In [14]:
Counter(df_valid['Labels'])

Counter({'NOT': 527, 'OFF': 273})

In [15]:
df_test = pd.read_csv(path/'../hasoc_task_2/malayalam_hasoc_tanglish_test_without_labels.tsv', sep='\t', header=None)
df_test.head()

Unnamed: 0,0,1
0,MA_YT5000,Chenkol vendath thanne aayirunnu....
1,MA_YT5001,Sundardasinte bhakshnam vakkukal ano?
2,MA_YT5002,Akasha dooth oru copy adi movie anu 'Who will ...
3,MA_YT5003,Purath onnum pondade... oru pennum payyanum on...
4,MA_YT5004,Avasanam Fahad oru Oscar medikkumbazhum lalett...


In [16]:
df_train.shape, df_valid.shape, df_test.shape

((3199, 2), (800, 2), (951, 2))

In [17]:
df_train[df_train['Tweets'].isnull()]

Unnamed: 0,Tweets,Labels


In [18]:
df_train = pd.concat([df_train, df_valid])
df_train.shape

(3999, 2)

In [19]:
label_cols = ['Labels']
text_cols = [0]

In [20]:
def handle_all_caps(t: str) -> str:
    tokens = t.split()
    tokens = replace_all_caps(tokens)
    return ' '.join(tokens)

def handle_upper_case_first_letter(t: str) -> str:
    tokens = t.split()
    tokens = deal_caps(tokens)
    return ' '.join(tokens)

def lower_case_everything(t: str) -> str:
    return t.lower()

In [21]:
class CodeMixedMalayalamTokenizer(BaseTokenizer):
    def __init__(self, lang:str):
        self.lang = lang
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(path/"../tokenizer/mlen_spm.model"))
        
    def tokenizer(self, t:str) -> List[str]:
        return self.sp.EncodeAsPieces(t)

In [22]:
sp = spm.SentencePieceProcessor()
sp.Load(str(path/"../tokenizer/mlen_spm.model"))
itos = [sp.IdToPiece(int(i)) for i in range(15000)]

In [23]:
itos[:20]

['xxunk',
 'xxbos',
 'xxeos',
 'xxpad',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 '.',
 '‚ñÅthe',
 ',',
 '‡µº',
 '‡µΩ',
 's',
 '‡µª',
 '‚ñÅof',
 '‚ñÅ',
 '‡µæ',
 '‚ñÅin']

In [24]:
# 15,000 is the vocab size that we chose in sentencepiece
mlen_vocab = Vocab(itos)

In [25]:
tokenizer = Tokenizer(lang='mlen', tok_func=CodeMixedMalayalamTokenizer)

In [26]:
tokenizer.pre_rules.append(lower_case_everything)
tokenizer.pre_rules.append(handle_all_caps)
tokenizer.pre_rules.append(handle_upper_case_first_letter)

In [27]:
tokenizer.special_cases, tokenizer.pre_rules, tokenizer.post_rules

(['xxunk',
  'xxpad',
  'xxbos',
  'xxeos',
  'xxfld',
  'xxmaj',
  'xxup',
  'xxrep',
  'xxwrep'],
 [<function fastai.text.transform.fix_html>,
  <function fastai.text.transform.replace_rep>,
  <function fastai.text.transform.replace_wrep>,
  <function fastai.text.transform.spec_add_spaces>,
  <function fastai.text.transform.rm_useless_spaces>,
  <function __main__.lower_case_everything>,
  <function __main__.handle_all_caps>,
  <function __main__.handle_upper_case_first_letter>],
 [<function fastai.text.transform.replace_all_caps>,
  <function fastai.text.transform.deal_caps>])

In [28]:
tokens = tokenizer.process_all(['Tell me about TOUR self, mujhe jaanna hai'])
''.join(tokens[0])

'‚ñÅtell‚ñÅme‚ñÅabout‚ñÅtour‚ñÅself,‚ñÅmujhe‚ñÅjaanna‚ñÅhai'

In [29]:
data_lm = TextLMDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=mlen_vocab, label_cols=label_cols, text_cols=text_cols)

In [30]:
data_lm.show_batch()

idx,text
0,kka . . be dham ‚ñÅaan h ‚ñÅx x bo s ‚ñÅaa shi q ‚ñÅabu ‚ñÅproduce ‚ñÅche y tha runn el ‚ñÅe e ‚ñÅproblems ‚ñÅ undaki lla runnu ‚ñÅ xxrep ‚ñÅ5 ‚ñÅ. ‚ñÅx x bo s ‚ñÅpe nnu ngal ‚ñÅoru ‚ñÅteam ‚ñÅa ayal ‚ñÅa th ‚ñÅmoon ju m ‚ñÅennu ‚ñÅe po o ‚ñÅman si laya llo ‚ñÅx x bo s ‚ñÅ xxunk us er ‚ñÅpo ‚ñÅoru ‚ñÅraj i tha kku
1,‚ñÅ xxunk us er ‚ñÅgive ‚ñÅrespect ‚ñÅtake ‚ñÅrespect ‚ñÅenna lle ‚ñÅ xxrep ‚ñÅ4 ‚ñÅ. ‚ñÅabhinaya maan ‚ñÅaa swa a dhana m ‚ñÅ xxrep ‚ñÅ4 ‚ñÅ. ‚ñÅkala ye ‚ñÅmaa th ram ‚ñÅsneh i kku ‚ñÅ xxrep ‚ñÅ5 ‚ñÅ. ‚ñÅka zhi vu lla var ‚ñÅmath re ‚ñÅval ar nnu ‚ñÅvann ittull u . . . val ar thi y ittull u ‚ñÅ xxrep ‚ñÅ4 ‚ñÅ. ‚ñÅa thu ‚ñÅmarakka and irik aa .
2,s ‚ñÅmi tta yi ‚ñÅvangi y ku nath ‚ñÅpole a ‚ñÅala ‚ñÅ nj n ‚ñÅfilm ‚ñÅselect ‚ñÅcheyu nath . . ‚ñÅtha ‚ñÅi th . . ‚ñÅi th xxunk . . ‚ñÅbhasi st ‚ñÅ xxrep ‚ñÅ4 ‚ñÅ xxunk ‚ñÅx x bo s ‚ñÅa the ‚ñÅko o de ‚ñÅni kkum ‚ñÅval lo ‚ñÅnte ‚ñÅkarya thi l ‚ñÅe da pe dan ‚ñÅoru y ran um ‚ñÅava ka sha mill a ‚ñÅx x bo
3,‚ñÅpra an tha lla ‚ñÅsuhrut he ‚ñÅ. . ma tte the lu m ‚ñÅreligion ‚ñÅin gan e ‚ñÅpoint ‚ñÅfinger ‚ñÅcheyy u van ‚ñÅa ar enkil um ‚ñÅsamm adi ku mo . . pin ne ‚ñÅella ‚ñÅmath ath eyu m ‚ñÅswe e gar ikunn a ‚ñÅor e ‚ñÅoru ‚ñÅsam s kara m ‚ñÅsan a than dar ma m ‚ñÅaan u ‚ñÅx x bo s ‚ñÅvalare ‚ñÅshe riya nu ‚ñÅ xxrep ‚ñÅ4
4,"‚ñÅa thi lla tha vare ‚ñÅari y ichu ‚ñÅnama le ‚ñÅmand anmaru kku ba no ‚ñÅund he shich e ‚ñÅx x bo s ‚ñÅiva le ‚ñÅkettu na van ‚ñÅend ha yal um ‚ñÅoru ‚ñÅko zhi ‚ñÅa ayirik kum ‚ñÅx x bo s ‚ñÅe th ‚ñÅge thi ‚ñÅke tta ‚ñÅcinema kkarana vo , ‚ñÅiva le yo ke , ‚ñÅvilich e kka ne , ‚ñÅmalaya la ‚ñÅcinema yil ‚ñÅit hra ‚ñÅkshama mano"


In [31]:
learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.3, pretrained=False)

In [32]:
learn.load('../../dataset_preparation/models/best_model', with_opt=True)

LanguageLearner(data=TextLMDataBunch;

Train: LabelList (3999 items)
x: LMTextList
‚ñÅx x bo s ‚ñÅtha an kal ‚ñÅen tha an ‚ñÅcheyy arullat h ? xxunk,‚ñÅx x bo s ‚ñÅe e ‚ñÅthe e tam ‚ñÅw cc ‚ñÅfe mini chi ga lu de ‚ñÅnews ‚ñÅa ar kk ‚ñÅven am . . . kondu po de . . .,‚ñÅx x bo s ‚ñÅfu k ru ‚ñÅne m ‚ñÅtik to k ‚ñÅo ola kale ‚ñÅvilich ‚ñÅchar cha ‚ñÅna da thi ye ne kka . . be dham ‚ñÅaan h,‚ñÅx x bo s ‚ñÅaa shi q ‚ñÅabu ‚ñÅproduce ‚ñÅche y tha runn el ‚ñÅe e ‚ñÅproblems ‚ñÅ undaki lla runnu ‚ñÅ xxrep ‚ñÅ5 ‚ñÅ.,‚ñÅx x bo s ‚ñÅpe nnu ngal ‚ñÅoru ‚ñÅteam ‚ñÅa ayal ‚ñÅa th ‚ñÅmoon ju m ‚ñÅennu ‚ñÅe po o ‚ñÅman si laya llo
y: LMLabelList
,,,,
Path: .;

Valid: LabelList (800 items)
x: LMTextList
‚ñÅx x bo s ‚ñÅaa reyu m ‚ñÅraksha pe da an ‚ñÅanu va thi kkaru thu ? ‚ñÅbut ‚ñÅaaru m ‚ñÅmind i yi lla ‚ñÅennu ‚ñÅparayunnu ‚ñÅthan kal . ‚ñÅi thu ‚ñÅbjp ‚ñÅya anu ‚ñÅcha i tha th enkil ‚ñÅthan kal ‚ñÅmind ill a ‚ñÅennu ‚ñÅella avar kum ‚ñÅa riya am .,‚ñÅx x bo s ‚ñÅka mmika l ‚ñÅmo tham ‚ñÅuda i ppan u

In [33]:
learn.freeze()

In [34]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,5.844162,5.354818,0.195714,00:02


In [35]:
learn.save('fit_head', with_opt=True)

In [36]:
learn.load('fit_head', with_opt=True);

In [37]:
learn.unfreeze()

In [38]:
learn.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,5.335639,5.132363,0.21375,00:02
1,5.128068,4.781209,0.24875,00:02
2,4.92865,4.587447,0.267009,00:02
3,4.783125,4.508292,0.274554,00:02
4,4.689357,4.495005,0.274955,00:02


In [39]:
learn.save('fine_tuned', with_opt=True)

In [40]:
learn.load('fine_tuned', with_opt=True);

In [41]:
learn.predict('Evideo oru Hollywood story',n_words=10)

'Evideo oru Hollywood story ino lo re m . ‚ñÅleonardo ‚ñÅeli n anu ‚ñÅa'

In [42]:
learn.save_encoder('fine_tuned_enc')

In [43]:
data_clas = TextClasDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=mlen_vocab, bs=16, label_cols=label_cols, text_cols=text_cols)

In [44]:
data_clas.show_batch()

text,target
‚ñÅx x bo s ‚ñÅ xxunk us er ‚ñÅne e ‚ñÅent je lu m ‚ñÅpara y ‚ñÅmy re . . . nin de ‚ñÅthan ta ‚ñÅannu ‚ñÅclose t ‚ñÅvana m ‚ñÅvit irunnenkil ‚ñÅne e ‚ñÅi po o ‚ñÅavi di run ‚ñÅche la che ne ‚ñÅ xxrep ‚ñÅ4 ‚ñÅ. ‚ñÅni nd e ‚ñÅtha lla yu m ‚ñÅvedi ‚ñÅn no ke ‚ñÅparayunn ‚ñÅa ‚ñÅpe nnu ngal ‚ñÅa var ‚ñÅcho o shan am,OFF
‚ñÅx x bo s ‚ñÅi ' m ‚ñÅchristian . . . nj ng l de ‚ñÅc l g l ‚ñÅe e ‚ñÅjesus ‚ñÅyouth inte ‚ñÅpa rupa di ‚ñÅvaru m . . . nj ng l de ‚ñÅc l g il ‚ñÅella ‚ñÅmath avu m ‚ñÅon d . . . ‚ñÅi th inte ‚ñÅal kar ‚ñÅvann it ‚ñÅo tta ‚ñÅmath am e ‚ñÅo llu ‚ñÅjesus ‚ñÅis ‚ñÅthe ‚ñÅonly ‚ñÅgod . .,NOT
‚ñÅx x bo s ‚ñÅni nte ‚ñÅve e til ‚ñÅninnu ‚ñÅlo an ‚ñÅe du thi ttu ‚ñÅall a ‚ñÅmy re ‚ñÅ nja ngal ‚ñÅjeev i kunnat hu . . ‚ñÅ nja ngal ‚ñÅni ng alu de ‚ñÅliber ty e ‚ñÅhu r t ‚ñÅcheyu ni lla . ‚ñÅ nja ngal ‚ñÅend o gam y ‚ñÅfollow ‚ñÅcheyu nnu . . ‚ñÅa thu ‚ñÅver e ‚ñÅaaru m ‚ñÅaffect ‚ñÅcheyu ni lla . ‚ñÅe,OFF
"‚ñÅx x bo s ‚ñÅamm u ‚ñÅche chi , ‚ñÅni ngal ‚ñÅlad ies ‚ñÅellavaru m ‚ñÅor e ‚ñÅswabhava m ‚ñÅava nam ‚ñÅennu ‚ñÅvasi ‚ñÅ url ‚ñÅpatti lla llo . ‚ñÅ nja ngal ‚ñÅaan u ngal um ‚ñÅellavaru m ‚ñÅoru pole ‚ñÅull avar ‚ñÅall a . ‚ñÅellavaru de yu m ‚ñÅull il ‚ñÅgod ‚ñÅ um ‚ñÅde mon ‚ñÅ um ‚ñÅund . ‚ñÅaa ‚ñÅs th re eyu de ‚ñÅmana s ika ‚ñÅnila",OFF
‚ñÅx x bo s ‚ñÅmaa ma ‚ñÅmadhyama ngal ‚ñÅva ar tha kal ‚ñÅpala thu m ‚ñÅmu kku kaya nu . ‚ñÅsa jan e ‚ñÅpo leyulla ‚ñÅchil a ‚ñÅnall a ‚ñÅmaa dhya ma ‚ñÅprav ar tha kar ‚ñÅmu kki ya ‚ñÅva ar tha kal ‚ñÅpo kki ‚ñÅe du thu ‚ñÅjanang a le ‚ñÅari yi ckunnu ‚ñÅenna thu ‚ñÅaa swa sam ‚ñÅtha runn a ‚ñÅkaryam anennu ‚ñÅpara ya the ‚ñÅvayy a .,NOT


In [45]:
data_clas.sanity_check()

In [46]:
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.7)

In [47]:
learn.load_encoder('fine_tuned_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (3999 items)
x: TextList
‚ñÅx x bo s ‚ñÅtha an kal ‚ñÅen tha an ‚ñÅcheyy arullat h ? xxunk,‚ñÅx x bo s ‚ñÅe e ‚ñÅthe e tam ‚ñÅw cc ‚ñÅfe mini chi ga lu de ‚ñÅnews ‚ñÅa ar kk ‚ñÅven am . . . kondu po de . . .,‚ñÅx x bo s ‚ñÅfu k ru ‚ñÅne m ‚ñÅtik to k ‚ñÅo ola kale ‚ñÅvilich ‚ñÅchar cha ‚ñÅna da thi ye ne kka . . be dham ‚ñÅaan h,‚ñÅx x bo s ‚ñÅaa shi q ‚ñÅabu ‚ñÅproduce ‚ñÅche y tha runn el ‚ñÅe e ‚ñÅproblems ‚ñÅ undaki lla runnu ‚ñÅ xxrep ‚ñÅ5 ‚ñÅ.,‚ñÅx x bo s ‚ñÅpe nnu ngal ‚ñÅoru ‚ñÅteam ‚ñÅa ayal ‚ñÅa th ‚ñÅmoon ju m ‚ñÅennu ‚ñÅe po o ‚ñÅman si laya llo
y: CategoryList
NOT,OFF,OFF,NOT,OFF
Path: .;

Valid: LabelList (800 items)
x: TextList
‚ñÅx x bo s ‚ñÅaa reyu m ‚ñÅraksha pe da an ‚ñÅanu va thi kkaru thu ? ‚ñÅbut ‚ñÅaaru m ‚ñÅmind i yi lla ‚ñÅennu ‚ñÅparayunnu ‚ñÅthan kal . ‚ñÅi thu ‚ñÅbjp ‚ñÅya anu ‚ñÅcha i tha th enkil ‚ñÅthan kal ‚ñÅmind ill a ‚ñÅennu ‚ñÅella avar kum ‚ñÅa riya am .,‚ñÅx x bo s ‚ñÅka mmika l ‚ñÅmo tham ‚ñÅuda

In [48]:
learn.freeze()

In [49]:
learn.loss_func.func

CrossEntropyLoss()

In [50]:
mcc = MatthewsCorreff()

In [51]:
learn.metrics = [mcc, accuracy]

In [52]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.666853,0.632036,0.286977,0.65625,00:03


In [53]:
learn.save('first-full')

In [54]:
learn.load('first-full');

In [55]:
learn.freeze_to(-2)
learn.fit_one_cycle(3, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.650479,0.5602,0.400943,0.72375,00:04
1,0.578573,0.466235,0.619371,0.815,00:04
2,0.516534,0.363603,0.709423,0.86875,00:04


In [56]:
learn.save('second-full')

In [57]:
learn.unfreeze()
learn.fit_one_cycle(5, 1e-3, callbacks=[callbacks.SaveModelCallback(learn, every='improvement', monitor='accuracy', name='final')])

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.454529,0.354819,0.73258,0.87,00:07
1,0.468962,0.285419,0.802657,0.90875,00:07
2,0.403044,0.236197,0.82854,0.91625,00:07
3,0.355547,0.190148,0.865033,0.93625,00:07
4,0.342986,0.172694,0.888658,0.94875,00:07


Better model found at epoch 0 with accuracy value: 0.8700000047683716.
Better model found at epoch 1 with accuracy value: 0.9087499976158142.
Better model found at epoch 2 with accuracy value: 0.9162499904632568.
Better model found at epoch 3 with accuracy value: 0.9362499713897705.
Better model found at epoch 4 with accuracy value: 0.9487500190734863.


In [58]:
learn.load('final')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (3999 items)
x: TextList
‚ñÅx x bo s ‚ñÅtha an kal ‚ñÅen tha an ‚ñÅcheyy arullat h ? xxunk,‚ñÅx x bo s ‚ñÅe e ‚ñÅthe e tam ‚ñÅw cc ‚ñÅfe mini chi ga lu de ‚ñÅnews ‚ñÅa ar kk ‚ñÅven am . . . kondu po de . . .,‚ñÅx x bo s ‚ñÅfu k ru ‚ñÅne m ‚ñÅtik to k ‚ñÅo ola kale ‚ñÅvilich ‚ñÅchar cha ‚ñÅna da thi ye ne kka . . be dham ‚ñÅaan h,‚ñÅx x bo s ‚ñÅaa shi q ‚ñÅabu ‚ñÅproduce ‚ñÅche y tha runn el ‚ñÅe e ‚ñÅproblems ‚ñÅ undaki lla runnu ‚ñÅ xxrep ‚ñÅ5 ‚ñÅ.,‚ñÅx x bo s ‚ñÅpe nnu ngal ‚ñÅoru ‚ñÅteam ‚ñÅa ayal ‚ñÅa th ‚ñÅmoon ju m ‚ñÅennu ‚ñÅe po o ‚ñÅman si laya llo
y: CategoryList
NOT,OFF,OFF,NOT,OFF
Path: .;

Valid: LabelList (800 items)
x: TextList
‚ñÅx x bo s ‚ñÅaa reyu m ‚ñÅraksha pe da an ‚ñÅanu va thi kkaru thu ? ‚ñÅbut ‚ñÅaaru m ‚ñÅmind i yi lla ‚ñÅennu ‚ñÅparayunnu ‚ñÅthan kal . ‚ñÅi thu ‚ñÅbjp ‚ñÅya anu ‚ñÅcha i tha th enkil ‚ñÅthan kal ‚ñÅmind ill a ‚ñÅennu ‚ñÅella avar kum ‚ñÅa riya am .,‚ñÅx x bo s ‚ñÅka mmika l ‚ñÅmo tham ‚ñÅuda

In [59]:
df_test = df_valid.copy()
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'query': list(df_test['Tweets']), 'actual_label': list(df_test['Labels']), 'predicted_label': ['']*df_test.shape[0]}
all_nodes = list(set(df_train['Labels']))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
# preds = learn.get_preds(ds_type=DatasetType.Valid, ordered=True)
for index, row in df_result.iterrows():
    pred = learn.predict(row['query'])
    for node in all_nodes:
        row[node] = pred[2][learn.data.c2i[node]].item()
    row['predicted_label'] = i2c[pred[1].data.item()]
df_result.head()

Unnamed: 0,query,actual_label,predicted_label,NOT,OFF
0,Aareyum rakshapedaan anuvathikkaruthu? But Aar...,OFF,OFF,0.340666,0.659334
1,kammikal motham udaippanu. Mukhyante nidhiyile...,OFF,OFF,0.0832142,0.916786
2,mohan lal paadi thudagiyappol √Ç¬†janam stadium ...,OFF,OFF,0.199501,0.800499
3,Abhinayathinte kaaryam thott kalichalindalla m...,OFF,NOT,0.620587,0.379413
4,Loka tholvi...maanam kettavane,OFF,OFF,0.146504,0.853496


In [60]:
accuracy_score(df_result['actual_label'], df_result['predicted_label'])

0.9475

In [61]:
matthews_corrcoef(df_result['actual_label'], df_result['predicted_label'])

0.8861309760207781

In [62]:
f1_score(df_result['actual_label'], df_result['predicted_label'], labels=['OFF ', 'NOT'], pos_label='OFF')

0.9255319148936171

In [63]:
df_result['status'] = df_result['actual_label']==df_result['predicted_label'] 
df_result[df_result['status']==False]

Unnamed: 0,query,actual_label,predicted_label,NOT,OFF,status
3,Abhinayathinte kaaryam thott kalichalindalla m...,OFF,NOT,0.620587,0.379413,False
11,LALAPPAN.... iyalk ithinte valla karyavumundo,OFF,NOT,0.793384,0.206616,False
19,namuk enthina ingane nariya oru kodathy,OFF,NOT,0.554235,0.445765,False
49,amma mole interview cheythu athre ollu..a nere...,OFF,NOT,0.621288,0.378712,False
67,veena arya enth cheythalum kozaURL,NOT,OFF,0.332914,0.667086,False
74,iyalk kurach cynide koduthoode,OFF,NOT,0.626035,0.373965,False
78,ivan evde paripadi avathariURL ithanavastha,NOT,OFF,0.436869,0.563131,False
99,eppoyum shakeelayude vicharame ollu kochu kallan,NOT,OFF,0.362424,0.637576,False
112,nadiye evdnu irakki aval ethanu,NOT,OFF,0.190991,0.809009,False
128,santhosh pandit nu bhudhi alpam kurava ath mat...,OFF,NOT,0.534747,0.465253,False


In [64]:
df_result.to_csv('val_res_enml.csv', index=False)

In [64]:
df_test = pd.read_csv(path/'../hasoc_task_2/malayalam_hasoc_tanglish_test_without_labels.tsv', sep='\t', header=None)
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'id': list(df_test[0]), 'text': list(df_test[1]), 'label': ['']*df_test.shape[0]}
all_nodes = list(set(df_train['Labels']))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
for index, row in df_result.iterrows():
    pred = learn.predict(row['text'])
    for node in all_nodes:
        row[node] = pred[2][learn.data.c2i[node]].item()
    row['label'] = i2c[pred[1].data.item()]
df_result.head()

Unnamed: 0,id,text,label,NOT,OFF
0,MA_YT5000,Chenkol vendath thanne aayirunnu....,NOT,0.974851,0.0251488
1,MA_YT5001,Sundardasinte bhakshnam vakkukal ano?,NOT,0.9364,0.0635997
2,MA_YT5002,Akasha dooth oru copy adi movie anu 'Who will ...,OFF,0.441635,0.558365
3,MA_YT5003,Purath onnum pondade... oru pennum payyanum on...,NOT,0.75176,0.24824
4,MA_YT5004,Avasanam Fahad oru Oscar medikkumbazhum lalett...,NOT,0.934347,0.0656529


In [65]:
df_result[df_result['label']=='NOT'].shape

(505, 5)

In [66]:
df_result.shape

(951, 5)

In [67]:
df_result.to_csv('test_res_enml_full.csv', index=False)