In [1]:
from fastai.text import *
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
import sentencepiece as spm
import re
import pdb

In [2]:
import fastai, torch
fastai.__version__ , torch.__version__

('1.0.57', '1.0.0')

In [3]:
torch.cuda.set_device(0)

In [4]:
def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value)  
    torch.manual_seed(seed_value)  
    random.seed(seed_value)
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

In [5]:
random_seed(42, True)

In [6]:
!pwd

/data/home/ubuntu/gaurav/in/fire/code-mixed-enta/classification


In [7]:
path = Path('./')

In [8]:
df_train = pd.read_csv(path/'../dc_fire/tamil_train.tsv', sep='\t')
df_train.head()

Unnamed: 0,text,category
0,Trailer late ah parthavanga like podunga,Positive
1,Move pathutu vanthu trailer pakurvnga yaru,Positive
2,Puthupetai dhanush ah yarellam pathinga,Positive
3,"Dhanush oda character ,puthu sa erukay , mass ta",Positive
4,vera level ippa pesungada mokka nu thalaivaaaaaa,Positive


In [9]:
Counter(df_train['category'])

Counter({'Mixed_feelings ': 1283,
         'Negative ': 1448,
         'Positive ': 7627,
         'not-Tamil ': 368,
         'unknown_state ': 609})

In [10]:
# # # In an attempt to solve class-imbalance
# df_mf = df_train[df_train['category']=='Mixed_feelings ']
# df_neg = df_train[df_train['category']=='Negative ']
# df_nott = df_train[df_train['category']=='not-Tamil ']
# df_us = df_train[df_train['category']=='unknown_state ']
# print(df_mf.shape, df_neg.shape, df_nott.shape, df_us.shape)
# df_train = pd.concat([df_train, df_mf, df_neg, df_nott, df_us, df_mf, df_neg, df_nott, df_us])

In [11]:
Counter(df_train['category'])

Counter({'Mixed_feelings ': 1283,
         'Negative ': 1448,
         'Positive ': 7627,
         'not-Tamil ': 368,
         'unknown_state ': 609})

In [12]:
df_valid = pd.read_csv(path/'../dc_fire/tamil_dev.tsv', sep='\t')
df_valid.head()

Unnamed: 0,text,category
0,Daily likes & views pakka vanthavaga ellaruku...,Positive
1,25 k dislikes ethuku da intha trailerku poi a...,Negative
2,#Lyca unna nenacha pavama iruku ya,Mixed_feelings
3,It looks like Hindi movie amitab bachan,Positive
4,Thalaivarukku nejamavey vayasaagiduchu... siv...,Positive


In [13]:
# In an attempt to create two stage classifier
# orig_class = list(df_valid['category'])
# orig_class = [cls if cls == 'Positive ' else 'Other' for cls in orig_class]
# df_valid['category'] = orig_class

In [14]:
Counter(df_valid['category'])

Counter({'Mixed_feelings ': 141,
         'Negative ': 165,
         'Positive ': 857,
         'not-Tamil ': 29,
         'unknown_state ': 68})

In [15]:
df_test = pd.read_csv(path/'../dc_fire/tamil_test.tsv', sep='\t')
df_test.head()

Unnamed: 0,id,text
0,ta_sent_1,Yarayellam FDFS ppga ippove ready agitinga
1,ta_sent_2,Ennada viswasam mersal sarkar madhri time la l...
2,ta_sent_3,yuvan vera level ya .... valuable script. SK i...
3,ta_sent_4,70 vayasulayum thanoda rasigargala sandhosapad...
4,ta_sent_5,all the best anna...Telugu makkal selvan fans


In [16]:
df_train.shape, df_valid.shape, df_test.shape

((11335, 2), (1260, 2), (3149, 2))

In [17]:
df_train = pd.concat([df_train, df_valid])
df_train.shape

(12595, 2)

In [18]:
label_cols = ['category']
text_cols = ['text']

In [19]:
def handle_all_caps(t: str) -> str:
    tokens = t.split()
    tokens = replace_all_caps(tokens)
    return ' '.join(tokens)

def handle_upper_case_first_letter(t: str) -> str:
    tokens = t.split()
    tokens = deal_caps(tokens)
    return ' '.join(tokens)

def lower_case_everything(t: str) -> str:
    return t.lower().replace('@user', '').replace('#tag ', '').replace('rt ', '').strip()

In [20]:
class CodeMixedTamilTokenizer(BaseTokenizer):
    def __init__(self, lang:str):
        self.lang = lang
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(path/"../tokenizer/taen_spm.model"))
        
    def tokenizer(self, t:str) -> List[str]:
        return self.sp.EncodeAsPieces(t)

In [21]:
sp = spm.SentencePieceProcessor()
sp.Load(str(path/"../tokenizer/taen_spm.model"))
itos = [sp.IdToPiece(int(i)) for i in range(8000)]

In [22]:
itos[:20]

['xxunk',
 'xxbos',
 'xxeos',
 'xxpad',
 'xxfld',
 'xxmaj',
 'xxup',
 'xxrep',
 'xxwrep',
 '.',
 ',',
 '▁',
 's',
 'a',
 '="',
 'in',
 'doc',
 't',
 'il',
 'i']

In [23]:
# 8,000 is the vocab size that we chose in sentencepiece
taen_vocab = Vocab(itos)

In [24]:
tokenizer = Tokenizer(lang='taen', tok_func=CodeMixedTamilTokenizer)

In [25]:
tokenizer.pre_rules.append(lower_case_everything)
tokenizer.pre_rules.append(handle_all_caps)
tokenizer.pre_rules.append(handle_upper_case_first_letter)

In [26]:
tokenizer.special_cases, tokenizer.pre_rules, tokenizer.post_rules

(['xxunk',
  'xxpad',
  'xxbos',
  'xxeos',
  'xxfld',
  'xxmaj',
  'xxup',
  'xxrep',
  'xxwrep'],
 [<function fastai.text.transform.fix_html>,
  <function fastai.text.transform.replace_rep>,
  <function fastai.text.transform.replace_wrep>,
  <function fastai.text.transform.spec_add_spaces>,
  <function fastai.text.transform.rm_useless_spaces>,
  <function __main__.lower_case_everything>,
  <function __main__.handle_all_caps>,
  <function __main__.handle_upper_case_first_letter>],
 [<function fastai.text.transform.replace_all_caps>,
  <function fastai.text.transform.deal_caps>])

In [27]:
tokens = tokenizer.process_all(['Tell me about TOUR self, mujhe jaanna hai'])
''.join(tokens[0])

'▁tell▁me▁about▁tour▁self,▁mujhe▁jaanna▁hai'

In [28]:
data_lm = TextLMDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=taen_vocab, label_cols=label_cols, text_cols=text_cols)

In [29]:
data_lm.show_batch()

idx,text
0,", put hu ▁sa ▁eru kay ▁ , ▁mas s ▁ ta ▁x x bo s ▁ver a ▁le vel ▁ ippa ▁pes ung ada ▁mo kka ▁nu ▁tha lai v ▁ xxrep ▁6 ▁a ▁x x bo s ▁tha la ▁mas s ▁ . ▁u 1 ▁b g m . ▁ver a ▁le vel ▁x x bo s ▁ivar a ▁ path ta ▁de ath ▁ vadi ▁madi ri"
1,▁ odi ▁po iru ▁kol a ▁kand ula ▁irukk a en ▁padam ▁pa ak ama ▁vid ama ta en ▁x x bo s ▁el am ▁ gir ls ▁ um ▁tra il er ▁ ah ▁pat ha ▁ma ari ▁mo vi e ▁ya ▁kan di pa a ▁paru nga a ▁ xxrep ▁4 ▁ . ▁x x bo s ▁3 ena ▁da ▁300 ▁sp ar tan ▁s ▁b g m
2,"ru ▁va kk il ▁rang a raj ▁pan de y ▁dhan a ▁avar u ▁x x bo s ▁ini ▁je n m ath h uku ▁mee than e , ▁ hydro car bon ▁ path hi ▁ne na chi ▁ko oda ▁pa ak a ▁mudiya adhu ▁cor por ates . . ▁el a ▁padam um ▁ini ▁adh a pathi ▁da an . . ▁x x bo s ▁tha la ▁se"
3,▁ xxunk ▁chu m ma ▁a thiru th u ▁x x bo s ▁e ▁ka ▁ba wasi r ▁ban a ▁di ye ▁ho ▁x x bo s ▁are ▁yo u ▁vir gin ▁sc ene ▁iru ka ▁x x bo s ▁maran a ▁ wai ting ▁for ▁n k p ▁f d f s ▁ku ▁x x bo s ▁ivaruku ▁vay asu ▁agal a ▁vay asu ▁kor an chu kitu ▁iru
4,"s ▁he y , ▁dis like ▁pann a vell aam ▁ap di ye ▁ odi ▁po idu . . ▁kol a ▁ka andu la ▁iru ken . . ▁x x bo s ▁ xxunk ▁1 ▁on ▁tre nd ing ▁in ▁sri ▁ lan ka . . . ▁ne e ▁va a ▁tha la . . . ▁x x bo s ▁v j s ▁n na ▁di al o gu e"


In [30]:
learn = language_model_learner(data_lm, arch=AWD_LSTM, drop_mult=0.5)

In [31]:
learn.load('../../dataset_preparation/models/best_model', with_opt=True)

LanguageLearner(data=TextLMDataBunch;

Train: LabelList (12595 items)
x: LMTextList
▁x x bo s ▁tra il er ▁la te ▁ ah ▁parth avan ga ▁ like ▁po du nga,▁x x bo s ▁mo ve ▁pat hu tu ▁vant hu ▁tra il er ▁pa k ur v nga ▁yar u,▁x x bo s ▁put hu pe tai ▁dhan u sh ▁ ah ▁yar ellam ▁pathin ga,▁x x bo s ▁dhan u sh ▁ oda ▁chara c ter ▁ , put hu ▁sa ▁eru kay ▁ , ▁mas s ▁ ta,▁x x bo s ▁ver a ▁le vel ▁ ippa ▁pes ung ada ▁mo kka ▁nu ▁tha lai v ▁ xxrep ▁6 ▁a
y: LMLabelList
,,,,
Path: .;

Valid: LabelList (1260 items)
x: LMTextList
▁x x bo s ▁da il y ▁li kes ▁ xxunk ▁vi e ws ▁pa kka ▁van th ava ga ▁ella ru kum ▁van ak kam,▁x x bo s ▁25 ▁k ▁dis like s ▁eth uku ▁da ▁in tha ▁tra il er ku ▁po i ▁ap di ▁un gal ukku ▁en tha ▁tra il er ▁tha ▁pi di kum,▁x x bo s ▁ xxunk ▁ ly ca ▁unna ▁ne na cha ▁pa vam a ▁iru ku ▁ya,▁x x bo s ▁it ▁l ook s ▁ like ▁hi ndi ▁mo vi e ▁ami ta b ▁ba chan,▁x x bo s ▁tha lai var ukku ▁ne jam ave y ▁vay as a ag idu chu . . . ▁siva ji ▁than ▁tha lai var oda ▁pe ak xxunk
y: LMLabelList
,,,,

In [32]:
learn.freeze()

In [33]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,4.602516,4.084794,0.304241,00:04


In [34]:
learn.save('fit_head', with_opt=True)

In [35]:
learn.load('fit_head', with_opt=True);

In [36]:
learn.unfreeze()

In [37]:
learn.fit_one_cycle(3, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,3.878182,3.585127,0.375474,00:06
1,3.548992,3.315666,0.413728,00:06
2,3.383756,3.271243,0.41928,00:06


In [38]:
learn.save('fine_tuned', with_opt=True)

In [39]:
learn.load('fine_tuned', with_opt=True);

In [40]:
learn.predict('en payar',n_words=10)

'en payar ▁rutte ▁chan t p ▁pol ▁cinema ▁pot ta ▁da kka'

In [41]:
learn.save_encoder('fine_tuned_enc')

In [42]:
data_clas = TextClasDataBunch.from_df(path=path, train_df=df_train, valid_df=df_valid, test_df=df_test, tokenizer=tokenizer, vocab=taen_vocab, bs=128, label_cols=label_cols, text_cols=text_cols)

In [43]:
data_clas.show_batch()

text,target
▁x x bo s ▁ver a a a ▁le v ▁ xxrep ▁6 ▁l ▁tha lai va a a ▁ xxrep ▁7 ▁ . ▁av lo o o ▁yo ung ▁ ah h h ▁ka at irkan g ▁ xxrep ▁6 ▁a ▁ xxrep ▁6 ▁ . ▁ant ha ▁s ty le ▁kor aya ave ee ▁illa a ▁ xxrep ▁5 ▁ . ▁pe ee ▁ xxrep ▁4 ▁t ▁aa,Positive
▁x x bo s ▁su per ▁star . . na a . ▁su per ▁star . . tha a a . . ▁maran a ▁wat ing ▁ xxrep ▁9 ▁ . ▁d ▁ xxrep ▁7 ▁a ▁ . . . vai nga . . da a . ▁ . mar on am . . mas s . . mar anam . . thal ai var . var an um .,Positive
▁x x bo s ▁ye van uk ava th u ▁pon ta ti ▁kol a ndha ▁kutt y ▁sent i ment ▁iru ndha ▁ap di ye ▁ odi ▁po idu . . . kol ag andu la ▁ir ken . . . koll ama ▁vid ama ten ▁ xxrep ▁4 ▁ . ▁tha lai var ▁mas s ▁ xxrep ▁4 ▁ . ▁i ▁am ▁e a ger ly ▁ wai,unknown_state
▁x x bo s ▁tha lai v ▁ xxrep ▁6 ▁a ▁ne e ga a ▁e ap av um ▁mas s ▁ xxrep ▁6 ▁ . ▁se ma ▁s ty le ▁ xxrep ▁4 ▁ . ▁i am ▁ad di c ted ▁ xxrep ▁4 ▁ . ▁wat ch ing ▁aga in ▁and ▁aga in ▁ xxrep ▁6 ▁ . ▁f d f s ▁kol a a ▁mas s ▁aga,Positive
▁x x bo s ▁ini me ▁ho ll y wood ▁f li m ▁ lam ▁kidai yat hu ▁ xxunk ▁2 po in t 0 ▁koll y wood ▁math i ri ▁than ▁ xxunk shan kar shan mu gh ▁tha ram ana ▁w or k ▁ xxunk ▁tha lai va a _ un nal a mudiyath ath u _ in num _ enna _ ir uku ▁ xxunk raj ini,Positive


In [44]:
data_clas.sanity_check()

In [45]:
learn = text_classifier_learner(data_clas, arch=AWD_LSTM, drop_mult=0.5, pretrained=True)

In [46]:
learn.load_encoder('fine_tuned_enc')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (12595 items)
x: TextList
▁x x bo s ▁tra il er ▁la te ▁ ah ▁parth avan ga ▁ like ▁po du nga,▁x x bo s ▁mo ve ▁pat hu tu ▁vant hu ▁tra il er ▁pa k ur v nga ▁yar u,▁x x bo s ▁put hu pe tai ▁dhan u sh ▁ ah ▁yar ellam ▁pathin ga,▁x x bo s ▁dhan u sh ▁ oda ▁chara c ter ▁ , put hu ▁sa ▁eru kay ▁ , ▁mas s ▁ ta,▁x x bo s ▁ver a ▁le vel ▁ ippa ▁pes ung ada ▁mo kka ▁nu ▁tha lai v ▁ xxrep ▁6 ▁a
y: CategoryList
Positive ,Positive ,Positive ,Positive ,Positive 
Path: .;

Valid: LabelList (1260 items)
x: TextList
▁x x bo s ▁da il y ▁li kes ▁ xxunk ▁vi e ws ▁pa kka ▁van th ava ga ▁ella ru kum ▁van ak kam,▁x x bo s ▁25 ▁k ▁dis like s ▁eth uku ▁da ▁in tha ▁tra il er ku ▁po i ▁ap di ▁un gal ukku ▁en tha ▁tra il er ▁tha ▁pi di kum,▁x x bo s ▁ xxunk ▁ ly ca ▁unna ▁ne na cha ▁pa vam a ▁iru ku ▁ya,▁x x bo s ▁it ▁l ook s ▁ like ▁hi ndi ▁mo vi e ▁ami ta b ▁ba chan,▁x x bo s ▁tha lai var ukku ▁ne jam ave y ▁vay as a ag idu chu . . . ▁siva ji ▁than ▁tha lai v

In [47]:
learn.freeze()

In [48]:
learn.loss_func.func

CrossEntropyLoss()

In [49]:
mcc = MatthewsCorreff()

In [50]:
learn.metrics = [mcc, accuracy]

In [51]:
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,1.032375,0.886263,0.184757,0.692857,00:03


In [52]:
learn.save('first-full')

In [53]:
learn.load('first-full');

In [54]:
learn.freeze_to(-2)
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.935355,0.818849,0.243696,0.703968,00:04


In [55]:
learn.save('second-full')

In [56]:
learn.load('second-full')

RNNLearner(data=TextClasDataBunch;

Train: LabelList (12595 items)
x: TextList
▁x x bo s ▁tra il er ▁la te ▁ ah ▁parth avan ga ▁ like ▁po du nga,▁x x bo s ▁mo ve ▁pat hu tu ▁vant hu ▁tra il er ▁pa k ur v nga ▁yar u,▁x x bo s ▁put hu pe tai ▁dhan u sh ▁ ah ▁yar ellam ▁pathin ga,▁x x bo s ▁dhan u sh ▁ oda ▁chara c ter ▁ , put hu ▁sa ▁eru kay ▁ , ▁mas s ▁ ta,▁x x bo s ▁ver a ▁le vel ▁ ippa ▁pes ung ada ▁mo kka ▁nu ▁tha lai v ▁ xxrep ▁6 ▁a
y: CategoryList
Positive ,Positive ,Positive ,Positive ,Positive 
Path: .;

Valid: LabelList (1260 items)
x: TextList
▁x x bo s ▁da il y ▁li kes ▁ xxunk ▁vi e ws ▁pa kka ▁van th ava ga ▁ella ru kum ▁van ak kam,▁x x bo s ▁25 ▁k ▁dis like s ▁eth uku ▁da ▁in tha ▁tra il er ku ▁po i ▁ap di ▁un gal ukku ▁en tha ▁tra il er ▁tha ▁pi di kum,▁x x bo s ▁ xxunk ▁ ly ca ▁unna ▁ne na cha ▁pa vam a ▁iru ku ▁ya,▁x x bo s ▁it ▁l ook s ▁ like ▁hi ndi ▁mo vi e ▁ami ta b ▁ba chan,▁x x bo s ▁tha lai var ukku ▁ne jam ave y ▁vay as a ag idu chu . . . ▁siva ji ▁than ▁tha lai v

In [57]:
learn.unfreeze()
learn.fit_one_cycle(10, 1e-2)

epoch,train_loss,valid_loss,matthews_correff,accuracy,time
0,0.850083,0.746708,0.365137,0.733333,00:07
1,0.839479,0.702771,0.38402,0.738889,00:06
2,0.893236,0.972616,0.093121,0.683333,00:07
3,0.95827,0.882259,0.187138,0.692857,00:07
4,0.933478,0.824397,0.259833,0.707143,00:07
5,0.896298,0.792845,0.339776,0.724603,00:07
6,0.863896,0.755098,0.400766,0.743651,00:07
7,0.836132,0.722058,0.422963,0.750794,00:07
8,0.811084,0.709317,0.417659,0.748413,00:07
9,0.792862,0.706518,0.42595,0.750794,00:06


In [58]:
learn.save('final')

In [59]:
interp = TextClassificationInterpretation.from_learner(learn)

In [60]:
interp.confusion_matrix()

array([[  2,  12, 120,   4,   3],
       [  0,  71,  88,   2,   4],
       [  2,  13, 833,   3,   6],
       [  0,   0,   3,  26,   0],
       [  0,   5,  49,   0,  14]])

In [61]:
learn.data.c2i

{'Mixed_feelings ': 0,
 'Negative ': 1,
 'Positive ': 2,
 'not-Tamil ': 3,
 'unknown_state ': 4}

In [62]:
df_test = df_valid.copy()
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'query': list(df_test['text']), 'actual_label': list(df_test['category']), 'predicted_label': ['']*df_test.shape[0]}
all_nodes = list(set(df_train['category']))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
preds = learn.get_preds(ds_type=DatasetType.Valid, ordered=True)
for index, row in df_result.iterrows():
    for node in all_nodes:
        row[node] = preds[0][index][learn.data.c2i[node]].item()
    row['predicted_label'] = i2c[np.argmax(preds[0][index]).data.item()]
df_result.head()

Unnamed: 0,query,actual_label,predicted_label,unknown_state,Positive,not-Tamil,Mixed_feelings,Negative
0,Daily likes & views pakka vanthavaga ellaruku...,Positive,Positive,0.0902639,0.748242,0.00104036,0.0956726,0.064781
1,25 k dislikes ethuku da intha trailerku poi a...,Negative,Negative,0.0468466,0.307581,0.00121287,0.156031,0.488329
2,#Lyca unna nenacha pavama iruku ya,Mixed_feelings,Positive,0.0140259,0.674023,0.00366425,0.13598,0.172307
3,It looks like Hindi movie amitab bachan,Positive,Positive,0.0965051,0.377914,0.123414,0.155451,0.246716
4,Thalaivarukku nejamavey vayasaagiduchu... siv...,Positive,Positive,0.0146662,0.791409,0.000865942,0.104964,0.0880947


In [63]:
accuracy_score(df_result['actual_label'], df_result['predicted_label'])

0.7507936507936508

In [64]:
matthews_corrcoef(df_result['actual_label'], df_result['predicted_label'])

0.42595047453467916

In [65]:
f1_score(df_result['actual_label'], df_result['predicted_label'], average='weighted')

0.688700482720738

In [66]:
df_result['status'] = df_result['actual_label']==df_result['predicted_label']
df_result[df_result['status']==False]

Unnamed: 0,query,actual_label,predicted_label,unknown_state,Positive,not-Tamil,Mixed_feelings,Negative,status
2,#Lyca unna nenacha pavama iruku ya,Mixed_feelings,Positive,0.0140259,0.674023,0.00366425,0.13598,0.172307,False
7,Vera levellllllllllllll all the best shankar ...,Negative,Positive,0.0101191,0.899275,0.00083147,0.0586485,0.0311261,False
13,HELLO YOUTUBE TRENT ON1 BUTTON YOUR SOFTWARE ...,Negative,Positive,0.106255,0.624435,0.0382924,0.0996062,0.131412,False
20,Superstar Rajnikant Fans Hit Like 2K Likes ...,Negative,Positive,0.0384124,0.878994,0.00105333,0.0585051,0.0230351,False
26,Vaa thaliva vaa thalaiva Marana mass thailaiv...,Mixed_feelings,Positive,0.0111512,0.845636,0.000533368,0.092808,0.0498719,False
...,...,...,...,...,...,...,...,...,...
1247,life time viewae 11m thaandathu pola. kodumai.,Negative,Positive,0.119291,0.518008,0.00137219,0.169559,0.191769,False
1251,Yenakku mattum than iru mugan movie Maari th...,Positive,Negative,0.0509016,0.32762,0.0270263,0.230979,0.363473,False
1253,ವಿಂಟೆಜ್ ರಜ್ನೀ... Karnataka rajnj fans hit. Like,not-Tamil,Positive,0.19143,0.606134,0.0738548,0.0803482,0.0482327,False
1257,Style la irukana hahaha mass dialogue,Negative,Positive,0.0273858,0.786842,0.0403217,0.0918596,0.0535909,False


In [67]:
df_result[(df_result['status'] == False) & (df_result['predicted_label'] == 'Positive ') & (df_result['Positive '] < 0.7)].shape

(162, 9)

In [68]:
df_test = pd.read_csv(path/'../dc_fire/tamil_test.tsv', sep='\t')
from sklearn.metrics import accuracy_score, matthews_corrcoef, f1_score
df_dict = {'id': list(df_test['id']), 'text': list(df_test['text']), 'category': ['']*df_test.shape[0]}
all_nodes = list(set(df_train['category']))
for node in all_nodes:
    df_dict[node] = ['']*df_test.shape[0]
    
i2c = {}
for key, value in learn.data.c2i.items():
    i2c[value] = key
    
df_result = pd.DataFrame(df_dict)
preds = learn.get_preds(ds_type=DatasetType.Test, ordered=True)
for index, row in df_result.iterrows():
    for node in all_nodes:
        row[node] = preds[0][index][learn.data.c2i[node]].item()
    row['category'] = i2c[np.argmax(preds[0][index]).data.item()]
df_result.head()

Unnamed: 0,id,text,category,unknown_state,Positive,not-Tamil,Mixed_feelings,Negative
0,ta_sent_1,Yarayellam FDFS ppga ippove ready agitinga,Positive,0.120828,0.684897,0.00102912,0.0983007,0.0949456
1,ta_sent_2,Ennada viswasam mersal sarkar madhri time la l...,Negative,0.0659945,0.271631,0.00269051,0.24773,0.411954
2,ta_sent_3,yuvan vera level ya .... valuable script. SK i...,Positive,0.00169345,0.942566,0.000100132,0.0397426,0.0158974
3,ta_sent_4,70 vayasulayum thanoda rasigargala sandhosapad...,Positive,0.0177472,0.548892,0.00193056,0.157654,0.273776
4,ta_sent_5,all the best anna...Telugu makkal selvan fans,Positive,0.099657,0.562273,0.0903013,0.11538,0.132389


In [69]:
df_result[df_result['category']=='Positive '].shape

(2735, 8)

In [70]:
df_result.to_csv('test_res_dc_fire_full.csv', index=False)