In [82]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

import bucc_proc as bp
Path = bp.Path

In [3]:
## May also try: 
# 'bert-base-multilingual-uncased' vs 'bert-base-multilingual-cased'

#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-multilingual-cased')

#model = model_class.from_pretrained(pretrained_weights)

In [4]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-multilingual-cased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [20]:
model.eval()

text_en = "I love bubble tea."
text_zh = '我爱泡泡茶。'

text_en = "I live in Singapore"
#text_zh = '我住在新加坡'

## Two ways of doing the same thing ##
# Refer to Jalamar for padding later on
# https://github.com/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb
# https://www.kaggle.com/funtowiczmo/hugging-face-transformers-get-started/notebook#Frameworks-interoperability

tokens = tokenizer.encode(text_en, add_special_tokens=True)
print(tokens)
tokens_pt = torch.tensor([tokens])
print(tokens_pt)
with torch.no_grad():
    outputs_en = model(tokens_pt)
print(outputs_en[0].shape)
print(outputs_en[0][0][0].shape)  # CLS token (first column)
mean_en = outputs_en[0].mean(1)   # find mean of all columns (axis=1)
print(mean_en.shape)  


tokens = tokenizer.tokenize(text_zh)
print("Tokens: {}".format(tokens))
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt))
outputs_zh = model(tokens_pt)
print(outputs_zh[0].shape)
print("Tokenw   ise output: {}".format(outputs_zh[0][0][0].shape))

mean_zh = outputs_zh[0].mean(1)
print(mean_zh.shape)

# Shape is 
#   <--  768 -->
#  ^
#  n
#  v

def cos_sim(v1,v2):
    v1 = np.asarray(np.squeeze(v1))
    v2 = np.asarray(np.squeeze(v2))    
    return bp.cosine_similarity(v1,v2)


print(cos_sim(outputs_en[0][0][0].detach(), outputs_zh[0][0][0].detach()), cos_sim(mean_en.detach(),mean_zh.detach()))

[101, 177, 12962, 10106, 21253, 102161, 10246, 102]
tensor([[   101,    177,  12962,  10106,  21253, 102161,  10246,    102]])
torch.Size([1, 8, 768])
torch.Size([768])
torch.Size([1, 768])
Tokens: ['我', '爱', '泡', '泡', '茶', '。']
Tokens id: [3976, 5383, 4945, 4945, 6739, 1882]
Tokens PyTorch: tensor([[ 101, 3976, 5383, 4945, 4945, 6739, 1882,  102]])
torch.Size([1, 8, 768])
Tokenw   ise output: torch.Size([768])
torch.Size([1, 768])
0.9101923 0.48165843


In [22]:
sum=0
for i in range(8):
    sum+= outputs_zh[0][0][i][0]
sum/8

tensor(0.0639, grad_fn=<DivBackward0>)

In [23]:
mean_zh[0][0]

#proof that it is mean across row

#     <--768 -->            <--768-->
# ^                     ^
# n               --->  1
# v                     v


tensor(0.0639, grad_fn=<SelectBackward>)

In [84]:
new_df = bp.get_merge()
en_proc = bp.en_proc
zh_proc = bp.zh_proc
cosine_similarity = bp.cosine_similarity
new_df.count

Merged file exists, reading...


<bound method DataFrame.count of              ID_zh         ID_en                             Sentence_zh  \
0     zh-000000033  en-000005983               1989年以前，全球经济包含大约8亿到10亿人口。   
1     zh-000000231  en-000047360        今日全球面临的威胁是超民族的，因此也必须采取超民族的方式来应对。   
2     zh-000000272  en-000027140                   欧盟移民政策的硬伤还有一个不太显著的方面。   
3     zh-000000438  en-000065621           只有让民粹主义服务于自由主义改革，政府才能取得长久的利益。   
4     zh-000000639  en-000005169       但社会民主派必须理解为何示威的发展会独立于现有的有组织中左翼政治。   
...            ...           ...                                     ...   
1848  zh-000094590  en-000013258         事件发生后当局在尚未进行调查的情况下就匆匆掩埋了出事列车残骸。   
1849  zh-000094593  en-000061419             北方拥有丰富的自然资源，就连电力也是从北方输送到南方。   
1850  zh-000094607  en-000039373                如果利率为3%，那么年税收额必须增加15亿美元。   
1851  zh-000094611  en-000003807           五年前，叙利亚北部边陲城镇享受着土耳其高速经济增长的红利。   
1852  zh-000094633  en-000083972  在过去的一个世纪中，我们的世界发生了翻天覆地的变化——技术是其中的重要原因。   

                                            Sentence_e

In [69]:
def get_vector(sentence,vtype="cls",proc=None):
    if proc:
        if proc == "en":
            s = en_proc(sentence)
        elif proc == "zh":
            s = zh_proc(sentence)
        else:
            print('No proccessing method for this language.')
            
        tokens_ids = tokenizer.convert_tokens_to_ids(s)
        tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)
        tokens_pt = torch.tensor([tokens_ids])
        with torch.no_grad():
            outputs = model(tokens_pt)
        
    else:
        s = sentence
        
        tokens = tokenizer.encode(s, add_special_tokens=True)
        tokens_pt = torch.tensor([tokens])
        with torch.no_grad():
            outputs = model(tokens_pt)
            
            
    if vtype == "cls":
        return outputs[0][0][0]
    elif vtype == "mean":
        return outputs[0].mean(1)
    
    
    


In [64]:
cosine_similarity(get_vector('The weather tomorrow should be sunny.'),get_vector('我喜欢下雨天!'))

[101, 10105, 35660, 19132, 10667, 21454, 14819, 10347, 42230, 10756, 119, 102]
tensor([[  101, 10105, 35660, 19132, 10667, 21454, 14819, 10347, 42230, 10756,
           119,   102]])
[101, 3976, 2921, 4774, 2079, 8289, 3198, 106, 102]
tensor([[ 101, 3976, 2921, 4774, 2079, 8289, 3198,  106,  102]])


0.77698934

In [70]:
def make_sim_feat(x,y, vtype="cls", proc=None):
    proc_en = None
    proc_zh = None
    if proc ==True:
        proc_en = "en"
        proc_zh = "zh"
    return cosine_similarity( get_vector(new_df.iloc[x]['Sentence_en'], vtype, proc_en), get_vector(new_df.iloc[y]['Sentence_zh'], vtype, proc_zh) )

In [71]:
c_df = pd.DataFrame()
c_df['Line_en'] = new_df.index
c_df['Line_zh'] = new_df.index
c_df['cls_none'] = [make_sim_feat(i,i) for i in range(1853)]
c_df['cls_proc'] = [make_sim_feat(i,i, "cls", True) for i in range(1853)]
c_df['mean_none'] = [make_sim_feat(i,i, "mean") for i in range(1853)]
c_df['mean_proc'] = [make_sim_feat(i,i, "mean", True) for i in range(1853)]
c_df

Unnamed: 0,Line_en,Line_zh,cls_none,cls_proc,mean_none,mean_proc
0,0,0,0.932438,0.944367,0.718354,0.768387
1,1,1,0.885249,0.826390,0.628691,0.704907
2,2,2,0.911883,0.924598,0.621186,0.853088
3,3,3,0.925288,0.785227,0.692905,0.647528
4,4,4,0.856016,0.873681,0.679328,0.695893
...,...,...,...,...,...,...
1848,1848,1848,0.900661,0.965716,0.642806,0.962498
1849,1849,1849,0.878535,0.844786,0.651582,0.801471
1850,1850,1850,0.946697,0.878529,0.735276,0.697647
1851,1851,1851,0.937502,0.890416,0.683345,0.813110


In [86]:
inc_file = Path("zh-en.training.incorrect")
if inc_file.is_file():
    print('INC File exists, reading...')
    inc_df = pd.read_csv(inc_file, header=0, sep='\t')

else:
    print('No inc file Error')
    
inc_df = inc_df[['Line_en','Line_zh']]

cls_none=[]
cls_proc=[]
mean_none=[]
mean_proc=[]

for i in range(1853):
    x = int(inc_df.iloc[i]['Line_en'])
    y = int(inc_df.iloc[i]['Line_zh'])
    
    
    cls_none.append(make_sim_feat(x,y))
    cls_proc.append(make_sim_feat(x,y, "cls", True))
    mean_none.append(make_sim_feat(x,y, "mean"))
    mean_proc.append(make_sim_feat(x,y, "mean", True))
    
    
inc_df['cls_none'] = cls_none
inc_df['cls_proc'] = cls_proc
inc_df['mean_none'] = mean_none
inc_df['mean_proc'] = mean_proc

inc_df


INC File exists, reading...


Unnamed: 0,Line_en,Line_zh,cls_none,cls_proc,mean_none,mean_proc
0,647,430,0.905566,0.869491,0.589745,0.752345
1,1835,1362,0.876352,0.864598,0.396474,0.741474
2,365,15,0.910112,0.773795,0.529881,0.665662
3,1373,701,0.911939,0.819845,0.616756,0.751962
4,197,987,0.903395,0.810019,0.444720,0.623145
...,...,...,...,...,...,...
1848,1308,962,0.911192,0.875608,0.479526,0.612884
1849,801,602,0.893925,0.900721,0.495979,0.759477
1850,1778,1359,0.914331,0.766015,0.536847,0.567795
1851,1782,1097,0.854141,0.784104,0.451884,0.657362


In [101]:
def _f_score(x, y, cut_point):
    p,n = (0,0)
    
    if x[y] > cut_point:
        p += 1
    else:
        n +=1
    return p, n

def f_score(y, cut_point):
    
    tp = c_df.apply(lambda x: _f_score(x, y, cut_point), axis=1).value_counts().get((1,0))
    fn = c_df.apply(lambda x: _f_score(x, y, cut_point), axis=1).value_counts().get((0,1))
    fp = inc_df.apply(lambda x: _f_score(x, y, cut_point), axis=1).value_counts().get((1,0))
    tn = inc_df.apply(lambda x: _f_score(x, y, cut_point), axis=1).value_counts().get((0,1))
    
    tp = tp if tp else 0
    fn = fn if fn else 0
    fp = fp if fp else 0
    tn = tn if tn else 0
    
    precision = tp/(tp+fp) if (tp+fp)!=0 else 0
    recall = tp/(tp+fn) if (tp+fn)!=0 else 0
    f1 = 2*(precision*recall/(precision+recall))
    
    return precision*100, recall*100, f1*100

In [92]:
start_c = 0.1
step_c = 0.1
num_c = 9

f_df = pd.DataFrame(columns=['Criteria','Precision(%)','Recall(%)','F1_Score(%)'])

n=0 
while n<num_c:
    criteria = start_c
    
    p,r,f = f_score("cls_proc", criteria)
    
    f_df.loc[len(f_df)] = [criteria, p, r, f]
    
    start_c += step_c
    n+=1
    
f_df

Unnamed: 0,Criteria,Precision(%),Recall(%),F1_Score(%)
0,0.1,50.0,100.0,66.666667
1,0.2,50.0,100.0,66.666667
2,0.3,50.0,100.0,66.666667
3,0.4,50.0,100.0,66.666667
4,0.5,50.013495,100.0,66.678661
5,0.6,49.986483,99.784134,66.606628
6,0.7,50.096445,98.111171,66.326158
7,0.8,51.318102,78.791149,62.154108
8,0.9,58.876117,24.878575,34.977238


In [97]:
start_c = 0.1
step_c = 0.1
num_c = 9

f_df = pd.DataFrame(columns=['Criteria','Precision(%)','Recall(%)','F1_Score(%)'])

n=0 
while n<num_c:
    criteria = start_c
    
    p,r,f = f_score("mean_proc", criteria)
    
    f_df.loc[len(f_df)] = [criteria, p, r, f]
    
    start_c += step_c
    n+=1
    
f_df

Unnamed: 0,Criteria,Precision(%),Recall(%),F1_Score(%)
0,0.1,50.0,100.0,66.666667
1,0.2,50.0,100.0,66.666667
2,0.3,50.0,99.568268,66.570449
3,0.4,49.945415,98.75877,66.340402
4,0.5,50.297703,95.736643,65.947955
5,0.6,50.730623,86.184566,63.867227
6,0.7,52.086078,64.004317,57.433414
7,0.8,53.772582,27.30707,36.220472
8,0.9,56.666667,4.587156,8.487269


In [98]:
start_c = 0.5
step_c = 0.01
num_c = 20

f_df = pd.DataFrame(columns=['Criteria','Precision(%)','Recall(%)','F1_Score(%)'])

n=0 
while n<num_c:
    criteria = start_c
    
    p,r,f = f_score("mean_none", criteria)
    
    f_df.loc[len(f_df)] = [criteria, p, r, f]
    
    start_c += step_c
    n+=1
    
f_df

Unnamed: 0,Criteria,Precision(%),Recall(%),F1_Score(%)
0,0.5,65.941512,99.784134,79.407344
1,0.51,68.409344,99.568268,81.098901
2,0.52,71.992188,99.460335,83.525946
3,0.53,75.143091,99.190502,85.508258
4,0.54,78.43389,98.920669,87.494033
5,0.55,81.622591,98.273071,89.177277
6,0.56,84.640676,97.247706,90.507283
7,0.57,87.950617,96.114409,91.85147
8,0.58,90.272774,94.657312,92.413066
9,0.59,92.980562,92.930383,92.955466


In [103]:
start_c = 0.85
step_c = 0.005
num_c = 20

f_df = pd.DataFrame(columns=['Criteria','Precision(%)','Recall(%)','F1_Score(%)'])

n=0 
while n<num_c:
    criteria = start_c
    
    p,r,f = f_score("cls_none", criteria)
    
    f_df.loc[len(f_df)] = [criteria, p, r, f]
    
    start_c += step_c
    n+=1
    
f_df

Unnamed: 0,Criteria,Precision(%),Recall(%),F1_Score(%)
0,0.85,52.448566,97.679439,68.250377
1,0.855,53.125,97.247706,68.71306
2,0.86,54.018127,96.492175,69.262057
3,0.865,55.150015,96.222342,70.114039
4,0.87,55.978433,95.250944,70.515382
5,0.875,57.576758,94.117647,71.446129
6,0.88,59.307958,92.498651,72.274931
7,0.885,61.113132,90.663788,73.011734
8,0.89,63.010502,87.425796,73.23689
9,0.895,65.993266,84.619536,74.154646


In [None]:
## Conclusion: 
# 92.9% F1 Score if using mean, no text processing. 
# Mean gives better results than cls (text classification tag) and no processing gives better results because model trained on individual Chinese characters rather than jieba tokens