In [2]:
import pandas as pd
import os
import shutil
import random
import subprocess
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification,BertAdam
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
import re
random.seed(42)
def p(local_path, root="/home/jupyter/mt-dnn"):
    """Resolve ``local_path`` against the mt-dnn checkout directory.

    Parameters
    ----------
    local_path : str
        Path relative to ``root``. An absolute path comes back unchanged,
        because ``os.path.join`` discards earlier components when a later
        one is absolute.
    root : str, optional
        Base directory to join against. Defaults to the directory this
        notebook was written for; pass another value to run elsewhere.

    Returns
    -------
    str
        ``root`` joined with ``local_path``.
    """
    return os.path.join(root, local_path)

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [3]:
src = pd.read_csv("~/mt-dnn/toxic_data/train.csv")

In [4]:
# random_state pins the shuffle so the split is the same on every run.
# Seeding the stdlib `random` module does not affect scikit-learn, which
# draws from NumPy's generator when random_state is left as None.
train, test = train_test_split(src, test_size=0.05, random_state=42)

In [5]:
train0 = train.iloc[0:140000]

In [6]:
train1 = train.iloc[140000:140000*2]

In [16]:
# Evaluation slice: 8000 rows of `train` at positions 280000-287999, i.e. the
# rows right after the two 140000-row training slices. Despite the name, these
# rows come from `train`, not from the `test` half of the split.
test0 = train.iloc[140000*2:140000*2+8000]

In [8]:
pd.options.mode.chained_assignment = None  # default='warn'

In [10]:
os.path.join('test','t')

'test/t'

In [18]:
MAX_LENGTH = 220
def full_tokenizer(raw_string):
    """Convert one comment into BERT token ids.

    ``raw_string`` is indexed with ``[0]`` to obtain the text, so it is
    expected to be a one-element row (as produced by
    ``DataFrame.apply(..., axis=1)`` on a single-column frame) rather than a
    bare string.

    At most ``MAX_LENGTH`` word pieces are kept; ``[CLS]`` and ``[SEP]`` are
    then added around them, so the returned list has up to
    ``MAX_LENGTH + 2`` ids.
    """
    text = raw_string[0]
    # Slicing is a no-op when the comment is already short enough.
    pieces = tokenizer.tokenize(text)[:MAX_LENGTH]
    framed = ['[CLS]', *pieces, '[SEP]']
    return tokenizer.convert_tokens_to_ids(framed)

In [19]:
def toxic_prep(sample,path_to_prepared=''):
    """Tokenize a slice of the toxic-comment data and write it as JSON lines.

    Parameters
    ----------
    sample : pandas.DataFrame
        Must contain the columns ``id``, ``target`` and ``comment_text``.
        The caller's frame is not modified.
    path_to_prepared : str
        Destination passed to ``DataFrame.to_json``.

    Each output record has the fields ``uid`` (from ``id``), ``label``
    (1 when ``target >= 0.5``, else 0), ``token_id`` (ids produced by
    ``full_tokenizer``) and ``type_id`` (a list of zeros of the same length
    as ``token_id``).
    """
    # Work on an explicit copy: adding columns to a column-subset of the
    # caller's slice is what triggers pandas' SettingWithCopyWarning.
    prepared = sample[['id', 'target', 'comment_text']].copy()
    # full_tokenizer indexes its argument with [0], so it must be applied
    # row-wise to a single-column frame rather than to the Series.
    prepared['token_id'] = prepared[['comment_text']].apply(full_tokenizer, axis=1)
    prepared['type_id'] = prepared['token_id'].apply(lambda ids: [0] * len(ids))
    prepared = (
        prepared
        .drop(columns=['comment_text'])
        .rename(columns={'id': 'uid', 'target': 'label'})
    )
    # Vectorized threshold instead of calling int() on a one-element Series.
    prepared['label'] = (prepared['label'] >= 0.5).astype(int)
    prepared.to_json(path_to_prepared, orient='records', lines=True)

In [None]:
toxic_prep(train0,"/home/jupyter/mt-dnn/data/mt_dnn/sst_train.json")

In [20]:
toxic_prep(test0,"/home/jupyter/mt-dnn/data/mt_dnn/sst_test.json")

In [90]:
# Writes to the same sst_train.json path that the train0 call in this notebook
# uses, so whichever of the two runs last determines the file's contents.
toxic_prep(train1,"/home/jupyter/mt-dnn/data/mt_dnn/sst_train.json")

In [8]:
def smaller_train_set(file_in,file_out,batch_size=4000,batch_number=0):
    """Copy one batch of lines from ``file_in`` into ``file_out``.

    The batch is lines ``batch_size * batch_number`` (inclusive) up to
    ``batch_size * (batch_number + 1)`` (exclusive), counted from zero.
    ``file_out`` is overwritten. A batch that starts past the end of the
    input produces an empty output file.

    Only the lines up to the end of the requested batch are read, instead of
    loading the whole input into memory. The input is closed before the output
    is opened, so ``file_in`` and ``file_out`` may be the same path.
    """
    from itertools import islice

    first = batch_size * batch_number
    last = batch_size * (batch_number + 1)
    with open(file_in) as source:
        if first < 0 or last < 0:
            # islice rejects negative bounds; keep plain list-slice semantics
            # for that unusual case.
            batch = source.readlines()[first:last]
        else:
            batch = list(islice(source, first, last))
    with open(file_out, "w") as out:
        out.writelines(batch)

In [9]:
# First call uses the defaults (batch_size=4000, batch_number=0): the first
# 4000 lines of toxic_test.json are written to sst_test.json.
# Second call: the first 140000 lines of toxic_train.json are written to
# sst_train.json.
smaller_train_set(p("data/mt_dnn/toxic_test.json"),p("data/mt_dnn/sst_test.json"))
smaller_train_set(p("data/mt_dnn/toxic_train.json"),p("data/mt_dnn/sst_train.json"),batch_size=140000)

In [5]:
def file_len(fname):
    """Return the number of lines in the text file ``fname``.

    An empty file yields 0. A final line without a trailing newline still
    counts as a line.
    """
    with open(fname) as f:
        # Counting with sum() avoids relying on a loop variable that is
        # never bound when the file has no lines.
        return sum(1 for _ in f)

In [5]:
file_len("/home/jupyter/mt-dnn/data/mt_dnn/toxic_test.json")

90244

In [7]:
file_len("/home/jupyter/mt-dnn/data/mt_dnn/toxic_train.json")

1714630

In [6]:
file_len("/home/jupyter/mt-dnn/data/mt_dnn/sst_test.json")

1821

In [7]:
file_len("/home/jupyter/mt-dnn/data/mt_dnn/sst_train.json")

67349

In [9]:
def cut_file(file,max_length):
    """Truncate ``file`` in place so that it keeps only its leading lines.

    Parameters
    ----------
    file : str
        Path of the text file to shorten.
    max_length : int or None
        Number of leading lines to keep. ``None`` keeps everything; a
        negative value drops that many lines from the end (list-slice
        semantics).

    The kept lines are written to a temporary file in the same directory as
    ``file``, which is then swapped in with ``os.replace``. The original is
    therefore never removed before its replacement exists, and no fixed
    scratch path is shared between calls.
    """
    import tempfile
    from itertools import islice

    target_dir = os.path.dirname(os.path.abspath(file))
    fd, tmp_path = tempfile.mkstemp(dir=target_dir, suffix=".tmp")
    try:
        with os.fdopen(fd, "w") as out, open(file) as f:
            if max_length is not None and max_length < 0:
                # islice cannot take a negative stop; fall back to slicing.
                kept = f.readlines()[:max_length]
            else:
                kept = islice(f, max_length)
            out.writelines(kept)
        # mkstemp creates the file owner-only; carry over the original mode.
        shutil.copymode(file, tmp_path)
        os.replace(tmp_path, file)
    except BaseException:
        # Do not leave the scratch file behind when anything goes wrong.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

In [10]:
# Keep only the first 2000 lines of toxic_test.json, rewriting the file in place.
# NOTE(review): this step was originally described as trying a shorter *train*
# set for more reasonable training time, but the path is the test file —
# confirm which file was intended.
cut_file('/home/jupyter/mt-dnn/data/mt_dnn/toxic_test.json',2000)

In [9]:
def test_accuracy(result,truth):
    result['reality'] = result["index"].apply(lambda x : int(truth[truth.id == x].target.item() > 0.5))
    result['accurate'] = result['reality'] == result['prediction']
    result['truepositive'] = (result['reality'] == result['prediction']) & (result['prediction'] == 1)
    result['truenegative'] = (result['reality'] == result['prediction']) & (result['prediction'] == 0)
    total = result.shape[0]
    accurate = result['accurate'].sum()
    accuracy = accurate/total
    tp = result['truepositive'].sum()
    tn = result['truenegative'].sum()
    totalpos = result['prediction'].sum()
    totalneg = total - totalpos
    tpratio = tp / totalpos
    tnratio = tn / totalneg
    print(f"accuracy : {accuracy} | tpratio : {tpratio} | tnratio : {tnratio}")

In [11]:
def eval_accuracy(folder_path,truth):
    """Run ``test_accuracy`` on every ``sst_test_scores_<n>.tsv`` in a folder.

    Parameters
    ----------
    folder_path : str
        Folder to scan, resolved through ``p``.
    truth : pandas.DataFrame
        Ground truth forwarded unchanged to ``test_accuracy``.

    Files are processed in ascending numeric order of ``<n>``. For each one
    the file name is printed, followed by the line ``test_accuracy`` prints.
    """
    fp = p(folder_path)
    # Raw string so that \d is a regex class rather than a string escape;
    # the dot is escaped so that it only matches a literal ".".
    pattern = re.compile(r'sst_test_scores_(\d+)\.tsv')
    numbered = []
    for name in os.listdir(fp):
        found = pattern.search(name)
        if found:
            # Compare as integers: as strings, "10" would sort before "2".
            numbered.append((int(found.group(1)), name))
    for _, name in sorted(numbered):
        print(name)
        scores = pd.read_csv(os.path.join(fp, name), sep='\t')
        test_accuracy(scores, truth)

In [None]:
eval_accuracy('checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-06T0621/',src)

In [52]:
eval_accuracy('checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T1529',src)

sst_test_scores_0.tsv
accuracy : 0.9585 | tpratio : 0.5485074626865671 | tnratio : 0.9879421221864951
sst_test_scores_1.tsv
accuracy : 0.9715 | tpratio : 0.6695652173913044 | tnratio : 0.9899204244031831
sst_test_scores_2.tsv
accuracy : 0.9735 | tpratio : 0.6592592592592592 | tnratio : 0.9962466487935657
sst_test_scores_3.tsv
accuracy : 0.97575 | tpratio : 0.6752767527675276 | tnratio : 0.997586484312148


In [54]:
eval_accuracy('checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-06T2307',src)

sst_test_scores_0.tsv
accuracy : 0.97575 | tpratio : 0.6752767527675276 | tnratio : 0.997586484312148


In [84]:
test_accuracy(res0,test0)

accuracy : 0.9585 | tpratio : 0.5485074626865671 | tnratio : 0.9879421221864951


In [88]:
test_accuracy(res1,test0)
test_accuracy(res2,test0)
test_accuracy(res3,test0)

accuracy : 0.9715 | tpratio : 0.6695652173913044 | tnratio : 0.9899204244031831
accuracy : 0.9735 | tpratio : 0.6592592592592592 | tnratio : 0.9962466487935657
accuracy : 0.97575 | tpratio : 0.6752767527675276 | tnratio : 0.997586484312148


In [77]:
res0.loc[0:100,:]

Unnamed: 0,index,prediction,reality,accurate,truepositive,truenegative
0,240862,0,0,True,False,True
1,242282,1,0,False,False,False
2,243665,0,0,True,False,True
3,243919,0,0,True,False,True
4,245506,0,0,True,False,True
5,245953,0,0,True,False,True
6,246151,0,0,True,False,True
7,247013,0,0,True,False,True
8,247338,0,0,True,False,True
9,247460,0,0,True,False,True


In [91]:
!python scripts/strip_model.py --checkpoint /home/jupyter/mt-dnn/checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T1529/model_3.pt --fout /home/jupyter/mt-dnn/model001.pt 




In [10]:
# Paths of the four per-epoch score files written by the 2019-06-06T0621 run.
run_2_dir = 'checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-06T0621/'
result2_0, result2_1, result2_2, result2_3 = (
    p(os.path.join(run_2_dir, 'sst_test_scores_{}.tsv'.format(epoch)))
    for epoch in range(4)
)

In [11]:
res2_0,res2_1, res2_2, res2_3 = [pd.read_csv(x, sep='\t') for x in [result2_0, result2_1, result2_2, result2_3]]

In [20]:
test_accuracy(res2_0,test0)
test_accuracy(res2_1,test0)
test_accuracy(res2_2,test0)
test_accuracy(res2_3,test0)

ValueError: can only convert an array of size 1 to a Python scalar

In [4]:
# NOTE(review): the captured output of this cell starts with
# "scripts/inference.sh: 2: ... [[: not found", i.e. the `sh` used here does
# not understand the `[[ ... ]]` test on line 2 of the script, so that test
# cannot run as written. Invoking the script with bash would likely restore
# it — confirm first that the test accepts the arguments passed here.
!sh scripts/inference.sh 4 0

scripts/inference.sh: 2: scripts/inference.sh: [[: not found
export CUDA_VISIBLE_DEVICES=0
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Namespace(answer_att_hidden_size=128, answer_att_type='bilinear', answer_dropout_p=0.1, answer_mem_drop_p=0.1, answer_mem_type=1, answer_merge_opt=1, answer_num_turn=5, answer_opt=0, answer_rnn_type='gru', answer_sum_att_type='bilinear', answer_weight_norm_on=False, batch_size=4, batch_size_eval=8, bert_dropout_p=0.1, bert_l2norm=0.0, cuda=True, data_dir='data/mt_dnn', data_sort_on=False, dropout_p=0.1, dropout_w=0.0, dump_state_on=False, ema_gamma=0.995, ema_opt=0, embedding_opt=0, epochs=1, freeze_layers=-1, global_grad_clipping=1.0, grad_clipping=0.0, have_lr_scheduler=True, inference=1, init_checkpoint='model001.pt', init_ratio=1, label_size='3', learning_rate=5e-05, log_file='checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-06T2307/log.log', log_per_updates=500, lr_gamma=0.5, max_seq_len=512, 

In [None]:
!python scripts/strip_model.py --checkpoint mt-dnn/checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-06T2307 --fout modela.pt




In [60]:
import mt-dnn.scripts.strip_model

SyntaxError: invalid syntax (<ipython-input-60-79c057f5d559>, line 1)

In [8]:
from strip_modelETN import strip

In [None]:
strip(p("checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-06T0621/model_0.pt"),p("modelA.pt"))

In [None]:
# Strip the four per-epoch checkpoints of each run into model1.pt .. model8.pt:
# model1-4 come from the 2019-06-05T1529 run, model5-8 from 2019-06-06T0621.
strip_runs = (
    "checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T1529",
    "checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-06T0621",
)
strip_sources = [
    f"{run}/model_{epoch}.pt" for run in strip_runs for epoch in range(4)
]
for model_number, checkpoint in enumerate(strip_sources, start=1):
    strip(p(checkpoint), p(f"model{model_number}.pt"))

In [23]:
dirr = os.listdir(p('checkpoints'))
dirr.sort()
dirr

['mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-04T2309',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-04T2326',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T0134',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T0135',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T0208',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T0256',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T0418',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T0644',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T0659',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-05T1529',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-06T0606',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-06T0621',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-06T2254',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0204',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0211',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0214',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0217

In [24]:
mmodels = dirr[-8:]
mmodels

['mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0316',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0318',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0320',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0322',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0325',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0327',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0329',
 'mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0331']

In [26]:
# Path of the first score file inside each selected checkpoint folder.
checkpoints_root = p("checkpoints")
x = [
    os.path.join(checkpoints_root, model_dir, 'sst_test_scores_0.tsv')
    for model_dir in mmodels
]
x

['/home/jupyter/mt-dnn/checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0316/sst_test_scores_0.tsv',
 '/home/jupyter/mt-dnn/checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0318/sst_test_scores_0.tsv',
 '/home/jupyter/mt-dnn/checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0320/sst_test_scores_0.tsv',
 '/home/jupyter/mt-dnn/checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0322/sst_test_scores_0.tsv',
 '/home/jupyter/mt-dnn/checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0325/sst_test_scores_0.tsv',
 '/home/jupyter/mt-dnn/checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0327/sst_test_scores_0.tsv',
 '/home/jupyter/mt-dnn/checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0329/sst_test_scores_0.tsv',
 '/home/jupyter/mt-dnn/checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0331/sst_test_scores_0.tsv']

In [28]:
# Walk over whatever `x` contains rather than a hard-coded count of 8, so the
# loop keeps working when the number of selected checkpoints changes.
for file_number, scores_path in enumerate(x, start=1):
    print("file", file_number)
    test_accuracy(pd.read_csv(scores_path, sep='\t'), src)
    

file 1
accuracy : 0.95275 | tpratio : 0.5879310344827586 | tnratio : 0.981266846361186
file 2
accuracy : 0.9605 | tpratio : 0.6767241379310345 | tnratio : 0.9779723991507431
file 3
accuracy : 0.954625 | tpratio : 0.5996592844974447 | tnratio : 0.9827330365573992
file 4
accuracy : 0.9585 | tpratio : 0.6594827586206896 | tnratio : 0.9769108280254777
file 5
accuracy : 0.96125 | tpratio : 0.7103960396039604 | tnratio : 0.9745918904686677
file 6
accuracy : 0.962875 | tpratio : 0.7194244604316546 | tnratio : 0.9762626928656205
file 7
accuracy : 0.95175 | tpratio : 0.5799319727891157 | tnratio : 0.9812466270912035
file 8
accuracy : 0.9555 | tpratio : 0.6076388888888888 | tnratio : 0.982489224137931


In [15]:
sh scripts/inference.sh 4 0 model1.pt
sh scripts/inference.sh 4 0 model2.pt
sh scripts/inference.sh 4 0 model3.pt
sh scripts/inference.sh 4 0 model4.pt
sh scripts/inference.sh 4 0 model5.pt
sh scripts/inference.sh 4 0 model6.pt
sh scripts/inference.sh 4 0 model7.pt
sh scripts/inference.sh 4 0 model8.pt

-1

In [21]:
!sh scripts/inference.sh 4 0 model1.pt
!sh scripts/inference.sh 4 0 model2.pt
!sh scripts/inference.sh 4 0 model3.pt
!sh scripts/inference.sh 4 0 model4.pt
!sh scripts/inference.sh 4 0 model5.pt
!sh scripts/inference.sh 4 0 model6.pt
!sh scripts/inference.sh 4 0 model7.pt
!sh scripts/inference.sh 4 0 model8.pt

scripts/inference.sh: 2: scripts/inference.sh: [[: not found
export CUDA_VISIBLE_DEVICES=0
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Namespace(answer_att_hidden_size=128, answer_att_type='bilinear', answer_dropout_p=0.1, answer_mem_drop_p=0.1, answer_mem_type=1, answer_merge_opt=1, answer_num_turn=5, answer_opt=0, answer_rnn_type='gru', answer_sum_att_type='bilinear', answer_weight_norm_on=False, batch_size=4, batch_size_eval=8, bert_dropout_p=0.1, bert_l2norm=0.0, cuda=True, data_dir='data/mt_dnn', data_sort_on=False, dropout_p=0.1, dropout_w=0.0, dump_state_on=False, ema_gamma=0.995, ema_opt=0, embedding_opt=0, epochs=1, freeze_layers=-1, global_grad_clipping=1.0, grad_clipping=0.0, have_lr_scheduler=True, inference=1, init_checkpoint='model1.pt', init_ratio=1, label_size='3', learning_rate=5e-05, log_file='checkpoints/mt-dnn-sst_adamax_answer_opt0_gc0_ggc1_2019-06-07T0316/log.log', log_per_updates=500, lr_gamma=0.5, max_seq_len=512, me

In [15]:
# eval_accuracy resolves its folder argument through p() itself, so pass the
# relative path like the other calls in this notebook. Wrapping it in p() a
# second time only worked because os.path.join discards the root when given
# an absolute path.
eval_accuracy('checkpoints/testing',src)

sst_test_scores_1.tsv
accuracy : 0.97575 | tpratio : 0.6752767527675276 | tnratio : 0.997586484312148
sst_test_scores_2.tsv
accuracy : 0.963 | tpratio : 0.6826722338204593 | tnratio : 0.9808536098923015
sst_test_scores_3.tsv
accuracy : 0.95725 | tpratio : 0.6087689713322091 | tnratio : 0.9851491832050763
sst_test_scores_4.tsv
accuracy : 0.9625 | tpratio : 0.6727272727272727 | tnratio : 0.9816122584943371
sst_test_scores_5.tsv
accuracy : 0.9625 | tpratio : 0.7011764705882353 | tnratio : 0.9771617161716172
sst_test_scores_6.tsv
accuracy : 0.9665 | tpratio : 0.7399527186761229 | tnratio : 0.979147419823149
sst_test_scores_7.tsv
accuracy : 0.95725 | tpratio : 0.6076794657762938 | tnratio : 0.9855424942575328
sst_test_scores_8.tsv
accuracy : 0.95825 | tpratio : 0.6183074265975821 | tnratio : 0.9847729416520684
