# Aligning Parallel Monolingual Corpora



Explored methods of aligning sentences by
* Length ([Gale & Church 1993](https://www.aclweb.org/anthology/J93-1004.pdf))
* Parts of Speech (Using Stanza, concept from [Chen & Chen 1993](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.102.4921&rep=rep1&type=pdf))
* Embeddings from BiLSTM (Using LASER, [Artetxe & Schwenk 2018](https://arxiv.org/pdf/1812.10464.pdf))
* Embeddings from BERT Transformers (Using BERT, [Devlin et al. 2018](https://arxiv.org/abs/1810.04805v2))

on English and Chinese.

Future exploration
* Chunking ([Sun et al. 2000](https://www.aclweb.org/anthology/W00-1314/))
* Topic mapping ([Sabbah & Akker 2018](http://lrec-conf.org/workshops/lrec2018/W8/summaries/9_W8.html))
* Other languages besides Chinese

In [1]:
import csv
import logging
import re
import warnings
from pathlib import Path
from random import randrange
from string import punctuation

import jieba
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import stanza
import torch
import transformers as ppb
from laserembeddings import Laser
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from pylab import polyfit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, train_test_split

from gachalign import length_cost

logging.getLogger().setLevel(logging.CRITICAL)


warnings.filterwarnings('ignore')
nltk.download('punkt')
nltk.download('stopwords')
stanza.download('en')
stanza.download('zh')

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resou

Universally-used functions:

In [2]:
en_stopwords=stopwords.words("english")+["'s"]  #chinese de is stopword
stemmer=PorterStemmer()
punctuation = punctuation +'–’“”'

def en_proc(sentence):
    ''' 1. Tokenize Sentence -> Words
        2. Remove punctuation and stopwords
        3. Stemming Words'''
    word_list = word_tokenize(sentence)
    bow_list = [stemmer.stem(w.lower()) for w in word_list if w.lower() not in en_stopwords and w not in punctuation]
    
    return bow_list

with open('zh_stopwords.txt','r', encoding='utf-8') as file:
    zh_stopwords = file.read()
zh_stopwords = re.sub('[ A-Za-z]+\n', ',', zh_stopwords)
zh_stopwords = zh_stopwords.translate(str.maketrans('', '', '\n')).split(',') 
zh_stopwords = list(filter(None, zh_stopwords))
punctuation = punctuation + '，「」。！？《》【】、'


def zh_proc(sentence):
    ''' 1. Segmentation
        2. Remove punctuation and stopwords'''
    bow_list = [w for w in jieba.cut(sentence) if w not in zh_stopwords and w not in punctuation]
    return bow_list


def cosine_similarity(v1, v2):
    '''cosine_similarity(transformed_docs[2], transformed_docs[2])'''
    # np.squeeze() allows both v1 dot v1 and v1 dot v2
    # np.toarray() converts to array
    v1 = np.squeeze(v1)
    v2 = np.squeeze(v2)
    return np.dot(v1,v2) / ( np.sqrt(np.dot(v1,v1)) * np.sqrt(np.dot(v2,v2)) )

In [3]:
def euclidean_distance(v1, v2):
    '''euclidean_distance([0,0,0,0], [1,0,1,0])'''
    return np.sum(np.subtract(v1, v2) ** 2) ** 0.5

In [10]:
def save_df(df, path, have_string=0, index=False, sep='\t'):
    if have_string==1:
        quoting=csv.QUOTE_NONE
        escapechar="\\"
    else:
        quoting=None
        escapechar=None
        
    path = Path(path)
    df.to_csv(path, index=index, sep=sep, quoting=quoting, escapechar=escapechar)
    return
    
def read_df(path, names=None, have_string=0, header=0, sep='\t'):
    if have_string==1:
        quoting=csv.QUOTE_NONE
        escapechar="\\"
    else:
        quoting=0
        escapechar=None
    
    path = Path(path)
    if path.is_file():
        print('File exists, reading...')
        new_df = pd.read_csv(path, names=names, header=header, sep=sep, quoting=quoting, escapechar=escapechar)
        return new_df
     
    else:
        print('File does not exist.')
        return None
    
    
def apply_to_df(df, path, method, reoutput=0, index=False, header=0, sep='\t', **kwargs):
    
    callbacks = {
        'gc' : make_gachalign_feats,
        'pos' : make_pos_feats,
        'laser' : make_laser_feats,
        'bert' : make_bert_feats,
        'concat' : pd.concat,
    }
    
    
    if reoutput == 1:
        df = callbacks[method](df, **kwargs)
        save_df(df, path,  index=index, sep=sep)
        print('File overwritten.')
        
        return df
    
    _df = read_df(path, header=header, sep=sep)
    
    if _df is None:
        df = callbacks[method](df, **kwargs)
        save_df(df, path,  index=index, sep=sep)
        return df
    else:
        return _df



In [5]:
def test_to_df(df, funct):
    test_df = df[:10]
    df = eval(funct)
    return df

In [6]:
def get_bucc_data(reoutput=False):
    
    data_file = Path("bucc_data/zh-en.training.pairs")
    
    if data_file.is_file() and not reoutput:
        print('Data file exists, reading...')
        new_df = read_df(data_file, have_string=1)
    
    else:
        print('Data file does not exist, creating...') if not reoutput else print('Data file to be overwritten.')
    
        zh_file = Path("bucc_data/zh-en.training.zh")
        en_file = Path("bucc_data/zh-en.training.en")
        pair_file = Path("bucc_data/zh-en.training.gold")
    
        zh_df = read_df(zh_file, names=['ID_zh','Sentence_zh'], header=None, have_string=1)
        en_df = read_df(en_file, names=['ID_en','Sentence_en'], header=None, have_string=1)
        pair_df = read_df(pair_file, names=['ID_zh','ID_en'], header=None)
        
        new_df = pair_df.merge(zh_df, 'inner', 'ID_zh')
        new_df = new_df.merge(en_df, 'inner', 'ID_en')
        save_df(new_df, data_file)
        
    return new_df

In [7]:
def get_bucc_non_pair_data(reoutput=False):
    
    data_file = Path("bucc_data/zh-en.training.nonpairs")
    
    if data_file.is_file() and not reoutput:
        print('Data file exists, reading...')
        new_df = read_df(data_file, have_string=1)
    
    else:
        print('Data file does not exist, creating...') if not reoutput else print('Data file to be overwritten.')
        
        new_df = pd.DataFrame(columns=['ID_zh', 'ID_en'])
    
        bucc_file = "bucc_data/zh-en.training.pairs"
        
        bucc_df = read_df(bucc_file, have_string=1)
        
        n = 0
        
        while n < 1899:
            x = randrange(1899)
            y = randrange(1899)
            if x != y:
                zh_id = bucc_df.iloc[y]['ID_zh']
                en_id = bucc_df.iloc[x]['ID_en']
                new_df.loc[n] = [zh_id, en_id]
                n+=1
                
        zh_file = "bucc_data/zh-en.training.zh"
        en_file = "bucc_data/zh-en.training.en"
        
        zh_df = read_df(zh_file, names=['ID_zh','Sentence_zh'], header=None, have_string=1)
        en_df = read_df(en_file, names=['ID_en','Sentence_en'], header=None, have_string=1)
        
        new_df = new_df.merge(zh_df, 'inner', 'ID_zh')
        new_df = new_df.merge(en_df, 'inner', 'ID_en')
        save_df(new_df, data_file)
        
    return new_df

In [13]:
bucc_df = get_bucc_data()
bucc_df

Data file exists, reading...
File exists, reading...


Unnamed: 0,ID_zh,ID_en,Sentence_zh,Sentence_en
0,zh-000000033,en-000005983,1989年以前，全球经济包含大约8亿到10亿人口。,"Until 1989, the global market encompassed betw..."
1,zh-000000231,en-000047360,今日全球面临的威胁是超民族的，因此也必须采取超民族的方式来应对。,The threats facing the world today are suprana...
2,zh-000000272,en-000027140,欧盟移民政策的硬伤还有一个不太显著的方面。,"There is another, less obvious, reason why the..."
3,zh-000000438,en-000065621,只有让民粹主义服务于自由主义改革，政府才能取得长久的利益。,Only if populism is put at the service of libe...
4,zh-000000639,en-000005169,但社会民主派必须理解为何示威的发展会独立于现有的有组织中左翼政治。,But social democrats must understand why the p...
...,...,...,...,...
1894,zh-000094590,en-000013258,事件发生后当局在尚未进行调查的情况下就匆匆掩埋了出事列车残骸。,The wrecked body of the ruined train was burie...
1895,zh-000094593,en-000061419,北方拥有丰富的自然资源，就连电力也是从北方输送到南方。,"Natural resources were abundant in the North, ..."
1896,zh-000094607,en-000039373,如果利率为3%，那么年税收额必须增加15亿美元。,"If it is 3%, the required increase in annual t..."
1897,zh-000094611,en-000003807,五年前，叙利亚北部边陲城镇享受着土耳其高速经济增长的红利。,"Five years ago, Syria’s northern border towns ..."


In [14]:
bucc_non_pair_df = get_bucc_non_pair_data()
bucc_non_pair_df

Data file exists, reading...
File exists, reading...


Unnamed: 0,ID_zh,ID_en,Sentence_zh,Sentence_en
0,zh-000030343,en-000044219,这个国家的这种做法是对抗击气候变化事业的巨大损害。,"When Russia faced its worst crisis, aid was gi..."
1,zh-000077588,en-000044219,和任何民主国家一样，对土耳其政府政策公共批评是正常的健康现象。,"When Russia faced its worst crisis, aid was gi..."
2,zh-000030343,en-000036472,这个国家的这种做法是对抗击气候变化事业的巨大损害。,"Investments in infrastructure, education, and ..."
3,zh-000094584,en-000036472,东南方则是已有400，000难民涌入土耳其的战火纷飞的叙利亚。,"Investments in infrastructure, education, and ..."
4,zh-000075027,en-000036472,据“世界粮食计划”统计，当地70%的人口缺少食品安全。,"Investments in infrastructure, education, and ..."
...,...,...,...,...
1894,zh-000078635,en-000079186,在阿根廷和智利南部美丽的巴塔哥尼亚地区，旅游业大有可为。,More debt relief – encompassing more countries...
1895,zh-000053016,en-000033080,布什政府不但没有延续上届政府推行的禁止湿地开发的政策，反而将其彻底推翻。,People in rich countries are undoubtedly famil...
1896,zh-000009129,en-000012334,从20世纪早期开始，墨西哥人就感觉到了美国实力的威胁。,Any country with a sensible development strate...
1897,zh-000052489,en-000002928,四十年前我初入政界时是如此，四十年后的今天也是如此。,"As recently as 2008, Turkey’s highest court co..."


## Gale & Church

Using [GaChalign](https://github.com/alvations/gachalign) to easily modify mean and variance variables and investigate effects on length distance/cost. 

Concept from [Wu 1994](https://www.aclweb.org/anthology/P94-1012/) to count each Chinese character as having length 2, and each English or punctuation character as having length 1.

3 types of counting explored: by character, by word tokenization, by special counting method from above paper. 

In [15]:
def get_length(sentence, groupby="char", lang ="en"):
    if groupby == "char":
        n = len(sentence)
        
    elif groupby == "word":
        if lang == "en":
            n = len(word_tokenize(sentence))
        elif lang == "zh":
            n = len(jieba.lcut(sentence))
            
    elif groupby == "special":
        # count each Chinese character as having length 2, and each English or punctuation character as having length 1
        if lang == "zh":
            n=0
            for c in sentence:
                if c in punctuation:
                    n+=1
                else:
                    n+=2
        else:
            n = len(sentence)
            
    return n

In [16]:
def make_gachalign_feats(df, c=1, s2=6.8):
    # Calculate length cost given 2 sentence. Lower cost = higher prob.
    # c is mean
    # s2 is variance
    
    gc_df = pd.DataFrame()
    
    gc_df['char_zh'] = df['Sentence_zh'].map(lambda x: get_length(x, "char", "zh"))
    gc_df['char_en'] = df['Sentence_en'].map(lambda x: get_length(x, "char", "en"))
    gc_df['char_cost'] = gc_df.apply(lambda x: length_cost([x['char_zh']], [x['char_en']], c, s2) if length_cost([x['char_zh']], [x['char_en']], c, s2) >= 0 else 0, axis=1)
    
    gc_df['word_zh'] = df['Sentence_zh'].map(lambda x: get_length(x, "word", "zh"))
    gc_df['word_en'] = df['Sentence_en'].map(lambda x: get_length(x, "word", "en"))
    gc_df['word_cost'] = gc_df.apply(lambda x: length_cost([x['word_zh']], [x['word_en']], c, s2) if length_cost([x['word_zh']], [x['word_en']], c, s2) >= 0 else 0, axis=1)
    
    gc_df['special_zh'] = df['Sentence_zh'].map(lambda x: get_length(x, "special", "zh"))
    gc_df['special_en'] = df['Sentence_en'].map(lambda x: get_length(x, "special", "en"))
    gc_df['special_cost'] = gc_df.apply(lambda x: length_cost([x['special_zh']], [x['special_en']], c, s2) if length_cost([x['special_zh']], [x['special_en']], c, s2) >=0 else 0, axis=1)
    
    return gc_df
    


## Parts Of Speech

Stanza is slower than NLTK POS tagger and jieba, but gives better results. Benefit of multilingual POS tagger, shares same tagging legend. 

Count NOUN, VERB, ADJ, NUM. Did not count quotation marks unlike in [Chen & Chen 1993](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.102.4921&rep=rep1&type=pdf). 

In [17]:
en_postagger = stanza.Pipeline('en')

2020-07-01 08:57:59 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-07-01 08:57:59 INFO: Use device: cpu
2020-07-01 08:57:59 INFO: Loading: tokenize
2020-07-01 08:58:00 INFO: Loading: pos
2020-07-01 08:58:07 INFO: Loading: lemma
2020-07-01 08:58:07 INFO: Loading: depparse
2020-07-01 08:58:10 INFO: Loading: ner
2020-07-01 08:58:18 INFO: Done loading processors!


In [18]:
zh_postagger = stanza.Pipeline('zh')

2020-07-01 08:58:18 INFO: "zh" is an alias for "zh-hans"
2020-07-01 08:58:18 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package   |
-------------------------
| tokenize  | gsdsimp   |
| pos       | gsdsimp   |
| lemma     | gsdsimp   |
| depparse  | gsdsimp   |
| ner       | ontonotes |

2020-07-01 08:58:18 INFO: Use device: cpu
2020-07-01 08:58:18 INFO: Loading: tokenize
2020-07-01 08:58:18 INFO: Loading: pos
2020-07-01 08:58:37 INFO: Loading: lemma
2020-07-01 08:58:38 INFO: Loading: depparse
2020-07-01 08:58:59 INFO: Loading: ner
2020-07-01 08:59:08 INFO: Done loading processors!


In [19]:
zh_postagger_pretoken = stanza.Pipeline(lang='zh', tokenize_pretokenized=True)

# words split by space, sentences by newline

2020-07-01 08:59:08 INFO: "zh" is an alias for "zh-hans"
2020-07-01 08:59:08 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package   |
-------------------------
| tokenize  | gsdsimp   |
| pos       | gsdsimp   |
| lemma     | gsdsimp   |
| depparse  | gsdsimp   |
| ner       | ontonotes |

2020-07-01 08:59:08 INFO: Use device: cpu
2020-07-01 08:59:08 INFO: Loading: tokenize
2020-07-01 08:59:08 INFO: Loading: pos
2020-07-01 08:59:44 INFO: Loading: lemma
2020-07-01 08:59:50 INFO: Loading: depparse
2020-07-01 09:00:18 INFO: Loading: ner
2020-07-01 09:00:29 INFO: Done loading processors!


In [20]:
def get_pos_counts(sentence, attr = "upos", lang ="en"):
    
    tag_count_dict = {}
    
    if lang == "en":
        pos_list = [getattr(word, attr) for s in en_postagger(sentence).sentences for word in s.words]
        
    elif lang == "zh":
        pos_list = [getattr(word, attr) for s in zh_postagger(sentence).sentences for word in s.words]
        
    elif lang == "zh_jieba":
        pos_list = [getattr(word, attr) for s in zh_postagger_pretoken(' '.join(jieba.lcut(sentence))).sentences for word in s.words]
        
    for tag in pos_list:
        if tag not in tag_count_dict:
            tag_count_dict[tag]=1
        else: 
            tag_count_dict[tag]+=1
    
    return tag_count_dict

def make_pos_vect(pos_string, filters=['NOUN', 'VERB', 'ADJ', 'NUM']):
    
    # pos_dict = eval(pos_string) # if converted to str before saving in df
    pos_dict = pos_string
    
    filtered_dict = {}
    
    for f in filters:
        if f not in pos_dict:
            filtered_dict[f] = 0
        else:
            filtered_dict[f] = pos_dict[f]
            
    ordered_list = [v for k, v in sorted(filtered_dict.items())]
            
    vector = np.asarray(ordered_list)
    
    return vector

In [21]:
def make_pos_feats(df, filters=['NOUN', 'VERB', 'ADJ', 'NUM']):
    
    pos_df = pd.DataFrame()
    
    pos_df['dict_zh'] = df['Sentence_zh'].map(lambda x: get_pos_counts(x, lang="zh"))
    pos_df['dict_en'] = df['Sentence_en'].map(lambda x: get_pos_counts(x, lang="en"))
    
    pos_df['vect_zh'] = pos_df['dict_zh'].map(lambda x: make_pos_vect(x, filters))
    pos_df['vect_en'] = pos_df['dict_en'].map(lambda x: make_pos_vect(x, filters))
    
    # pos_df['cos_sim'] = pos_df.apply(lambda x: cosine_similarity(x['vect_zh'], x['vect_en']), axis=1) # Edit: Use Euclidean Distance to deal with zero vectors
    pos_df['euclid_dist'] = pos_df.apply(lambda x: euclidean_distance(x['vect_zh'], x['vect_en']), axis=1)
    
    return pos_df



## LASER Embeddings

Language-Agnostic SEntence Representations ([LASER](https://github.com/facebookresearch/LASER)) has multiple languages encoded by the same BiLSTM encoder. Supports code-switching. Specify language for tokenization only.

In [22]:
laser = Laser()

In [23]:
def get_laser_vect(sentence, proc=True, lang="en"):
    if proc==True:
        if lang == "en":
            s = ' '.join(en_proc(sentence))
        elif lang == "zh":
            s = ' '.join(zh_proc(sentence))
        else:
            print('No proccessing method for this language.')
    else:
        s = sentence
    return laser.embed_sentences(s, lang=[lang])

In [24]:
def make_laser_feats(df):
    
    laser_df = pd.DataFrame()
    
    laser_df['vect_proc_zh'] = df['Sentence_zh'].map(lambda x: get_laser_vect(x, lang="zh"))
    laser_df['vect_proc_en'] = df['Sentence_en'].map(lambda x: get_laser_vect(x, lang="en"))
    laser_df['cos_sim_proc'] = laser_df.apply(lambda x: cosine_similarity(x['vect_proc_zh'], x['vect_proc_en']), axis=1)
    
    laser_df['vect_noproc_zh'] = df['Sentence_zh'].map(lambda x: get_laser_vect(x, False, lang="zh"))
    laser_df['vect_noproc_en'] = df['Sentence_en'].map(lambda x: get_laser_vect(x, False, lang="en"))
    laser_df['cos_sim_noproc'] = laser_df.apply(lambda x: cosine_similarity(x['vect_noproc_zh'], x['vect_noproc_en']), axis=1)
    
    return laser_df


## BERT Embeddings

Using [Distilled Multilingual Bert](https://github.com/huggingface/transformers/tree/master/examples/distillation) from Huggingface Transformers

In [25]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-multilingual-cased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
model.eval()

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0): TransformerBlock(
        (dropout): Dropout(p=0.1, inplace=False)
        (attention): MultiHeadSelfAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_fe

In [26]:
def get_bert_vect(sentence, vtype="cls", proc=None):
    if proc:
        if proc == "en":
            s = en_proc(sentence)
        elif proc == "zh":
            s = zh_proc(sentence)
        else:
            print('No proccessing method for this language.')
            
        tokens_ids = tokenizer.convert_tokens_to_ids(s)
        tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)
        tokens_pt = torch.tensor([tokens_ids])
        with torch.no_grad():
            outputs = model(tokens_pt)
        
    else:
        s = sentence
        
        tokens = tokenizer.encode(s, add_special_tokens=True)
        tokens_pt = torch.tensor([tokens])
        with torch.no_grad():
            outputs = model(tokens_pt)
            
            
    if vtype == "cls":
        return outputs[0][0][0]
    elif vtype == "mean":
        return outputs[0].mean(1)
    
    

In [27]:
def make_bert_feats(df):
    
    bert_df = pd.DataFrame()
    
    bert_df['vect_cls_proc_zh'] = df['Sentence_zh'].map(lambda x: get_bert_vect(x, "cls", "zh"))
    bert_df['vect_cls_proc_en'] = df['Sentence_en'].map(lambda x: get_bert_vect(x, "cls", "en"))
    bert_df['sim_cls_proc'] = bert_df.apply(lambda x: cosine_similarity(x['vect_cls_proc_zh'], x['vect_cls_proc_en']), axis=1)
    
    bert_df['vect_cls_noproc_zh'] = df['Sentence_zh'].map(lambda x: get_bert_vect(x, "cls"))
    bert_df['vect_cls_noproc_en'] = df['Sentence_en'].map(lambda x: get_bert_vect(x, "cls"))
    bert_df['sim_cls_noproc'] = bert_df.apply(lambda x: cosine_similarity(x['vect_cls_noproc_zh'], x['vect_cls_noproc_en']), axis=1)
    
    bert_df['vect_mean_proc_zh'] = df['Sentence_zh'].map(lambda x: get_bert_vect(x, "mean", "zh"))
    bert_df['vect_mean_proc_en'] = df['Sentence_en'].map(lambda x: get_bert_vect(x, "mean", "en"))
    bert_df['sim_mean_proc'] = bert_df.apply(lambda x: cosine_similarity(x['vect_mean_proc_zh'], x['vect_mean_proc_en']), axis=1)
    
    bert_df['vect_mean_noproc_zh'] = df['Sentence_zh'].map(lambda x: get_bert_vect(x, "mean"))
    bert_df['vect_mean_noproc_en'] = df['Sentence_en'].map(lambda x: get_bert_vect(x, "mean"))
    bert_df['sim_mean_noproc'] = bert_df.apply(lambda x: cosine_similarity(x['vect_mean_noproc_zh'], x['vect_mean_noproc_en']), axis=1)
    
    return bert_df


## Ensemble


In [35]:
def ensemble(df, path_prefix, num, pair, reoutput=0, deep_reoutput=0):
    
    pair=int(pair)
    _pair = "pair" if pair==1 else "nonpair"
    
    _gc_df = apply_to_df(df, path_prefix+".gachalign", "gc", reoutput=deep_reoutput)
    _pos_df = apply_to_df(df, path_prefix+".pos", "pos", reoutput=deep_reoutput)
    _laser_df = apply_to_df(df, path_prefix+".laser", "laser", reoutput=deep_reoutput)
    _bert_df = apply_to_df(df, path_prefix+".bert", "bert", reoutput=deep_reoutput)
    
    _feats_list = [
        pd.Series(pair, name='pair', index=range(num)), 
        _gc_df[['char_cost','word_cost','special_cost']], 
        _pos_df['euclid_dist'].rename('pos'), 
        _laser_df[['cos_sim_proc', 'cos_sim_noproc']].rename(columns=lambda x: 'laser_'+x[8:]), 
        _bert_df[['sim_cls_proc','sim_cls_noproc', 'sim_mean_proc','sim_mean_noproc']].rename(columns=lambda x: 'bert_'+x[4:])
    ]
    
    new_df = apply_to_df(_feats_list, path_prefix+'.'+_pair, "concat", reoutput=reoutput, axis=1)
    
    return new_df

    

## Actual Code

In [39]:
gc_df = apply_to_df(bucc_df, "bucc_data/zh-en.output.gachalign", "gc")
gc_df

File exists, reading...


Unnamed: 0,char_zh,char_en,char_cost,word_zh,word_en,word_cost,special_zh,special_en,special_cost
0,25,85,625.600212,14,15,8.361790,48,85,250.264773
1,32,103,703.551801,21,18,23.009397,62,103,248.353301
2,21,82,685.467548,12,18,59.338722,41,82,310.167001
3,29,102,756.902252,15,18,25.230475,56,102,305.377916
4,33,129,1005.265643,20,19,7.171378,65,129,436.581072
...,...,...,...,...,...,...,...,...,...
1894,31,93,597.886734,19,16,24.422795,61,93,182.031339
1895,27,100,777.412323,15,18,25.230475,52,100,336.003998
1896,24,87,679.130552,14,21,65.186118,45,87,304.874844
1897,29,106,808.117073,16,21,42.197643,56,106,340.720506


In [40]:
pos_df = apply_to_df(bucc_df, "bucc_data/zh-en.output.pos", "pos")
pos_df

File exists, reading...


Unnamed: 0,dict_zh,dict_en,vect_zh,vect_en,euclid_dist
0,"{'NUM': 3, 'NOUN': 4, 'ADP': 1, 'PUNCT': 2, 'V...","{'ADP': 2, 'NUM': 5, 'PUNCT': 2, 'DET': 1, 'AD...",[0 4 3 1],[1 2 5 1],3.000000
1,"{'NOUN': 6, 'VERB': 3, 'PART': 5, 'AUX': 2, 'P...","{'DET': 2, 'NOUN': 3, 'VERB': 2, 'AUX': 3, 'AD...",[0 6 0 3],[2 3 0 2],3.741657
2,"{'PROPN': 1, 'NOUN': 5, 'PART': 2, 'ADV': 3, '...","{'PRON': 1, 'VERB': 1, 'DET': 2, 'PUNCT': 3, '...",[1 5 1 1],[2 3 0 1],2.449490
3,"{'ADP': 1, 'VERB': 4, 'NOUN': 7, 'PUNCT': 2, '...","{'ADV': 1, 'SCONJ': 1, 'NOUN': 5, 'AUX': 2, 'V...",[1 7 0 4],[2 5 0 2],3.000000
4,"{'ADV': 1, 'NOUN': 6, 'PART': 3, 'AUX': 2, 'VE...","{'CCONJ': 1, 'ADJ': 2, 'NOUN': 4, 'AUX': 1, 'V...",[0 6 0 6],[2 4 0 4],3.464102
...,...,...,...,...,...
1894,"{'NOUN': 6, 'VERB': 5, 'ADP': 3, 'ADV': 4, 'PA...","{'DET': 3, 'VERB': 3, 'NOUN': 3, 'ADP': 2, 'AU...",[0 6 0 5],[0 3 0 3],3.605551
1895,"{'NOUN': 6, 'VERB': 4, 'ADJ': 1, 'PART': 1, 'P...","{'ADJ': 2, 'NOUN': 2, 'AUX': 2, 'ADP': 3, 'DET...",[1 6 0 4],[2 2 0 1],5.099020
1896,"{'ADP': 1, 'NOUN': 4, 'AUX': 2, 'NUM': 2, 'PUN...","{'SCONJ': 1, 'PRON': 1, 'AUX': 1, 'NUM': 3, 'S...",[0 4 2 1],[1 4 3 2],1.732051
1897,"{'NUM': 1, 'NOUN': 6, 'ADP': 1, 'PUNCT': 2, 'P...","{'NUM': 1, 'NOUN': 6, 'ADV': 1, 'PUNCT': 3, 'P...",[1 6 1 2],[2 6 1 1],1.414214


In [41]:
# test_to_df(bucc_df, "make_laser_feats(test_df)")
laser_df = apply_to_df(bucc_df, "bucc_data/zh-en.output.laser", "laser")
laser_df

File exists, reading...


Unnamed: 0,vect_proc_zh,vect_proc_en,cos_sim_proc,vect_noproc_zh,vect_noproc_en,cos_sim_noproc
0,[[ 0.00645304 0.01016895 0.0112541 ... 0.0...,[[0.00347378 0.00540323 0.00875655 ... 0.02086...,0.837227,[[ 0.00392936 0.01487431 0.01211387 ... 0.0...,[[0.00515043 0.02721935 0.00645456 ... 0.01680...,0.899076
1,[[0.01141793 0.00508447 0.0129709 ... 0.04458...,[[ 1.4517792e-02 -2.0507550e-05 2.4949501e-03...,0.701900,[[0.01262616 0.00193994 0.00612493 ... 0.04488...,[[ 1.5549253e-02 -6.9306821e-05 -1.1087634e-04...,0.854534
2,[[ 0.00363402 -0.00035689 -0.00126717 ... 0.0...,[[ 1.4998055e-02 -9.0022750e-06 7.4755466e-03...,0.605472,[[ 0.00837048 -0.00038536 -0.00178816 ... 0.0...,[[ 4.2293437e-02 -4.6420514e-04 -1.7852148e-04...,0.744089
3,[[-0.00071763 0.00012011 0.00200634 ... 0.0...,[[ 0.01445586 -0.00014476 0.00294003 ... 0.0...,0.755492,[[3.2259028e-05 8.7529244e-03 1.3469356e-03 .....,[[-7.7551391e-05 2.4979616e-02 3.8849344e-03...,0.871500
4,[[ 0.00596821 -0.00014701 0.00115364 ... 0.0...,[[ 0.0082258 -0.00044157 -0.00094935 ... 0.0...,0.818175,[[0.01313915 0.00910229 0.0044931 ... 0.06837...,[[ 0.00675407 0.01356385 -0.00238059 ... 0.0...,0.879257
...,...,...,...,...,...,...
1894,[[ 1.9390594e-03 -5.6734676e-05 -1.4707656e-03...,[[ 0.01836784 0.00051258 -0.001157 ... 0.0...,0.582360,[[0.01405967 0.0111012 0.00022158 ... 0.02790...,[[ 4.0991213e-03 -1.6492419e-05 -3.1652416e-03...,0.756503
1895,[[ 0.00034669 0.00011753 -0.00030322 ... 0.0...,[[ 0.01621959 0.0003056 -0.00237891 ... 0.0...,0.707579,[[ 0.00266796 0.00774885 0.00021995 ... 0.0...,[[ 5.1623415e-03 4.5243862e-05 -1.0195789e-03...,0.886560
1896,[[ 0.00203523 -0.00086154 0.0016085 ... 0.0...,[[ 0.01299874 -0.00041894 0.00354722 ... 0.0...,0.840512,[[0.00070778 0.03335044 0.0027578 ... 0.00442...,[[0.00178883 0.07145008 0.00521136 ... 0.00900...,0.865018
1897,[[0.01515337 0.00470427 0.00074662 ... 0.06852...,[[ 1.0927872e-02 1.4790997e-02 -4.9027418e-05...,0.888686,[[0.01581433 0.00860143 0.00266306 ... 0.06571...,[[0.00521134 0.01128286 0.00996406 ... 0.06455...,0.911643


In [42]:
# test_to_df(bucc_df, "make_bert_feats(test_df)")
bert_df = apply_to_df(bucc_df, "bucc_data/zh-en.output.bert", "bert")
bert_df

File exists, reading...


Unnamed: 0,vect_cls_proc_zh,vect_cls_proc_en,sim_cls_proc,vect_cls_noproc_zh,vect_cls_noproc_en,sim_cls_noproc,vect_mean_proc_zh,vect_mean_proc_en,sim_mean_proc,vect_mean_noproc_zh,vect_mean_noproc_en,sim_mean_noproc
0,"tensor([-1.7088e-01, 1.0762e-01, -1.3487e-01,...","tensor([-4.8489e-01, 8.8018e-02, -4.3963e-02,...",0.944367,"tensor([-2.5697e-01, -1.8738e-01, 4.5120e-02,...","tensor([-3.3425e-01, -2.6686e-01, 2.3955e-02,...",0.932438,"tensor([[-2.8351e-01, 7.9982e-02, -1.5785e-01...","tensor([[-6.0150e-01, 1.1672e-02, 6.6766e-02...",0.768387,"tensor([[-4.6078e-01, -5.4302e-01, 3.6953e-01...","tensor([[-6.0650e-01, -5.2651e-01, 1.4976e-01...",0.718354
1,"tensor([ 1.0295e-04, 3.2729e-01, -1.5472e-02,...","tensor([-1.2937e-01, 1.1730e-01, -2.5667e-01,...",0.826390,"tensor([-2.7488e-01, -1.0204e-01, 8.2408e-02,...","tensor([-2.7188e-01, -1.2896e-01, -3.4097e-01,...",0.885249,"tensor([[-2.0262e-01, 2.7153e-01, 2.3812e-02...","tensor([[-3.2220e-01, 2.3857e-01, -3.7317e-02...",0.704907,"tensor([[-5.7265e-01, -2.7035e-01, 5.9032e-01...","tensor([[-5.8185e-01, -2.9130e-01, -3.3270e-01...",0.628691
2,"tensor([-2.1172e-02, 3.3266e-01, -2.0031e-01,...","tensor([-1.7163e-01, 3.5121e-01, -2.2221e-01,...",0.924598,"tensor([-1.5105e-01, -7.8253e-02, 2.3573e-01,...","tensor([-3.1621e-01, -7.9063e-02, 1.3702e-01,...",0.911883,"tensor([[-1.9326e-01, 2.5108e-01, -2.4815e-01...","tensor([[-3.3650e-01, 3.3788e-01, -2.3007e-01...",0.853088,"tensor([[-1.6103e-01, -2.6882e-01, 7.9026e-01...","tensor([[-5.5742e-01, -8.0466e-02, 2.3267e-01...",0.621186
3,"tensor([-2.1172e-02, 3.3266e-01, -2.0031e-01,...","tensor([-1.8954e-01, 1.6436e-01, 4.4577e-01,...",0.785227,"tensor([-1.3524e-01, 6.8776e-02, 7.4715e-02,...","tensor([-3.6524e-01, -2.5968e-02, 2.4890e-02,...",0.925288,"tensor([[-1.9326e-01, 2.5108e-01, -2.4815e-01...","tensor([[-1.9562e-01, 1.5004e-01, 3.1491e-01...",0.647528,"tensor([[-1.6080e-01, 6.4559e-02, 5.8238e-01...","tensor([[-5.9252e-01, -2.6286e-02, 9.6520e-02...",0.692905
4,"tensor([-1.0712e-01, 4.3301e-01, -1.2629e-01,...","tensor([-1.3753e-01, 6.5209e-02, 1.5559e-01,...",0.873681,"tensor([-1.8522e-01, 5.3373e-02, 1.1788e-02,...","tensor([-4.0475e-01, -9.1276e-02, -3.5700e-01,...",0.856016,"tensor([[-3.5143e-01, 3.9773e-01, -2.5038e-01...","tensor([[-3.8218e-01, 1.6849e-03, 1.7885e-01...",0.695893,"tensor([[-1.8980e-01, 7.8458e-02, 5.3974e-01...","tensor([[-5.7922e-01, -9.4828e-02, -5.9200e-02...",0.679328
...,...,...,...,...,...,...,...,...,...,...,...,...
1894,"tensor([-4.2401e-02, 3.3193e-01, -2.5014e-01,...","tensor([-2.9637e-02, 4.2298e-01, -2.5473e-01,...",0.965716,"tensor([-8.7130e-02, -1.0812e-01, 1.3057e-02,...","tensor([-2.5755e-02, -1.5073e-01, -2.0023e-01,...",0.900661,"tensor([[-2.3588e-01, 2.4285e-01, -3.2926e-01...","tensor([[-1.8602e-01, 4.1861e-01, -3.2512e-01...",0.962498,"tensor([[-1.4315e-01, -2.6759e-01, 1.2490e-01...","tensor([[-1.3977e-01, -2.9867e-01, -3.5045e-01...",0.642806
1895,"tensor([-2.5168e-02, 3.3407e-01, -2.0796e-01,...","tensor([ 1.7340e-02, 3.0686e-01, 1.2504e-01,...",0.844786,"tensor([-1.5894e-01, -4.2638e-02, 1.7518e-01,...","tensor([-2.7662e-01, -1.9584e-01, -1.3521e-01,...",0.878535,"tensor([[-2.0540e-01, 2.4748e-01, -2.6483e-01...","tensor([[-2.1405e-01, 2.4954e-01, 1.1501e-01...",0.801471,"tensor([[-3.5640e-01, -2.1323e-01, 6.3719e-01...","tensor([[-6.8853e-01, -3.1353e-01, -9.9416e-02...",0.651582
1896,"tensor([ 2.2724e-02, 1.4694e-01, -1.5951e-01,...","tensor([-1.5070e-01, -7.4269e-02, 5.0753e-01,...",0.878529,"tensor([-4.5775e-03, -4.7035e-02, 8.0545e-02,...","tensor([-7.3277e-02, -8.2576e-02, 1.3520e-01,...",0.946697,"tensor([[-3.7834e-02, 3.1467e-02, -8.3374e-02...","tensor([[-2.5218e-01, -1.4371e-01, 6.3781e-01...",0.697647,"tensor([[-1.8460e-02, -2.4383e-01, 5.4320e-01...","tensor([[-3.2126e-01, -3.1776e-01, 5.4247e-01...",0.735276
1897,"tensor([ 4.2968e-03, 3.1439e-01, -2.4757e-01,...","tensor([-1.5165e-01, 1.3551e-01, -3.1547e-02,...",0.890416,"tensor([-8.7320e-02, -1.7427e-01, 1.7993e-01,...","tensor([-1.6905e-01, -1.5648e-01, -5.5539e-03,...",0.937502,"tensor([[-1.7464e-01, 2.1160e-01, -3.6906e-01...","tensor([[-2.8356e-01, 8.4986e-02, 1.5096e-01...",0.813110,"tensor([[-3.2496e-01, -4.2663e-01, 6.5345e-01...","tensor([[-3.9661e-01, -2.2053e-01, 2.2335e-01...",0.683345


In [38]:
bucc_output = ensemble(bucc_df, "bucc_data/zh-en.output", 1899, 1)
bucc_output

File exists, reading...
File exists, reading...
File exists, reading...
File exists, reading...
File exists, reading...


Unnamed: 0,pair,char_cost,word_cost,special_cost,pos,laser_proc,laser_noproc,bert_cls_proc,bert_cls_noproc,bert_mean_proc,bert_mean_noproc
0,1,625.600212,8.361790,250.264773,3.000000,0.837227,0.899076,0.944367,0.932438,0.768387,0.718354
1,1,703.551801,23.009397,248.353301,3.741657,0.701900,0.854534,0.826390,0.885249,0.704907,0.628691
2,1,685.467548,59.338722,310.167001,2.449490,0.605472,0.744089,0.924598,0.911883,0.853088,0.621186
3,1,756.902252,25.230475,305.377916,3.000000,0.755492,0.871500,0.785227,0.925288,0.647528,0.692905
4,1,1005.265643,7.171378,436.581072,3.464102,0.818175,0.879257,0.873681,0.856016,0.695893,0.679328
...,...,...,...,...,...,...,...,...,...,...,...
1894,1,597.886734,24.422795,182.031339,3.605551,0.582360,0.756503,0.965716,0.900661,0.962498,0.642806
1895,1,777.412323,25.230475,336.003998,5.099020,0.707579,0.886560,0.844786,0.878535,0.801471,0.651582
1896,1,679.130552,65.186118,304.874844,1.732051,0.840512,0.865018,0.878529,0.946697,0.697647,0.735276
1897,1,808.117073,42.197643,340.720506,1.414214,0.888686,0.911643,0.890416,0.937502,0.813110,0.683345


In [43]:
bucc_non_pair_output = ensemble(bucc_non_pair_df, "bucc_data/non_pairs/zh-en.output", 1899, 0, 1, 1)
bucc_non_pair_output

File overwritten.
File overwritten.
File overwritten.
File overwritten.
File overwritten.


Unnamed: 0,pair,char_cost,word_cost,special_cost,pos,laser_proc,laser_noproc,bert_cls_proc,bert_cls_noproc,bert_mean_proc,bert_mean_noproc
0,0,567.050117,17.076216,197.462321,4.123106,0.562159,0.565235,0.922061,0.905559,0.806044,0.525250
1,0,448.422605,7.819304,102.359587,4.582576,0.577667,0.575151,0.918370,0.902466,0.803569,0.559394
2,0,625.600212,8.678403,238.943655,1.000000,0.544537,0.528156,0.771636,0.876779,0.773878,0.529556
3,0,502.889048,17.722915,134.634470,5.000000,0.543380,0.517663,0.790675,0.838892,0.777257,0.412686
4,0,582.365457,17.722915,238.943655,1.732051,0.544718,0.496453,0.771636,0.848218,0.773878,0.422117
...,...,...,...,...,...,...,...,...,...,...,...
1894,0,749.132896,57.068192,311.449982,3.605551,0.527975,0.484224,0.816490,0.906329,0.708461,0.467330
1895,0,598.950573,32.171100,161.664429,6.782330,0.598243,0.539671,0.756894,0.888381,0.614425,0.461016
1896,0,1068.567153,42.197643,552.077177,3.316625,0.519382,0.511526,0.838038,0.890924,0.757976,0.545457
1897,0,874.507913,23.685628,419.746350,2.645751,0.462040,0.502066,0.877623,0.862899,0.742209,0.469648


In [44]:
bucc_combined = apply_to_df([bucc_output, bucc_non_pair_output], "bucc_data/zh-en.training.combined", "concat", 1)
bucc_combined

File overwritten.


Unnamed: 0,pair,char_cost,word_cost,special_cost,pos,laser_proc,laser_noproc,bert_cls_proc,bert_cls_noproc,bert_mean_proc,bert_mean_noproc
0,1,625.600212,8.361790,250.264773,3.000000,0.837227,0.899076,0.944367,0.932438,0.768387,0.718354
1,1,703.551801,23.009397,248.353301,3.741657,0.701900,0.854534,0.826390,0.885249,0.704907,0.628691
2,1,685.467548,59.338722,310.167001,2.449490,0.605472,0.744089,0.924598,0.911883,0.853088,0.621186
3,1,756.902252,25.230475,305.377916,3.000000,0.755492,0.871500,0.785227,0.925288,0.647528,0.692905
4,1,1005.265643,7.171378,436.581072,3.464102,0.818175,0.879257,0.873681,0.856016,0.695893,0.679328
...,...,...,...,...,...,...,...,...,...,...,...
1894,0,749.132896,57.068192,311.449982,3.605551,0.527975,0.484224,0.816490,0.906329,0.708461,0.467330
1895,0,598.950573,32.171100,161.664429,6.782330,0.598243,0.539671,0.756894,0.888381,0.614425,0.461016
1896,0,1068.567153,42.197643,552.077177,3.316625,0.519382,0.511526,0.838038,0.890924,0.757976,0.545457
1897,0,874.507913,23.685628,419.746350,2.645751,0.462040,0.502066,0.877623,0.862899,0.742209,0.469648


In [51]:
def get_sample_data(reoutput=False):
    
    data_file = Path("bucc_data/zh-en.sample.pairs")
    
    if data_file.is_file() and not reoutput:
        print('Data file exists, reading...')
        new_df = read_df(data_file, have_string=1)
    
    else:
        print('Data file does not exist, creating...') if not reoutput else print('Data file to be overwritten.')
    
        zh_file = Path("bucc_data/zh-en.sample.zh")
        en_file = Path("bucc_data/zh-en.sample.en")
        pair_file = Path("bucc_data/zh-en.sample.gold")
    
        zh_df = read_df(zh_file, names=['ID_zh','Sentence_zh'], header=None, have_string=1)
        en_df = read_df(en_file, names=['ID_en','Sentence_en'], header=None, have_string=1)
        pair_df = read_df(pair_file, names=['ID_zh','ID_en'], header=None)
        
        new_df = pair_df.merge(zh_df, 'inner', 'ID_zh')
        new_df = new_df.merge(en_df, 'inner', 'ID_en')
        save_df(new_df, data_file)
        
    return new_df



In [52]:
def get_sample_non_pair_data(reoutput=False):
    
    data_file = Path("bucc_data/zh-en.sample.nonpairs")
    
    if data_file.is_file() and not reoutput:
        print('Data file exists, reading...')
        new_df = read_df(data_file, have_string=1)
    
    else:
        print('Data file does not exist, creating...') if not reoutput else print('Data file to be overwritten.')
        
        new_df = pd.DataFrame(columns=['ID_zh', 'ID_en'])
    
        bucc_file = "bucc_data/zh-en.sample.pairs"
        
        bucc_df = read_df(bucc_file, have_string=1)
        
        n = 0
        
        while n < 257:
            x = randrange(257)
            y = randrange(257)
            if x != y:
                zh_id = bucc_df.iloc[y]['ID_zh']
                en_id = bucc_df.iloc[x]['ID_en']
                new_df.loc[n] = [zh_id, en_id]
                n+=1
                
        zh_file = "bucc_data/zh-en.sample.zh"
        en_file = "bucc_data/zh-en.sample.en"
        
        zh_df = read_df(zh_file, names=['ID_zh','Sentence_zh'], header=None, have_string=1)
        en_df = read_df(en_file, names=['ID_en','Sentence_en'], header=None, have_string=1)
        
        new_df = new_df.merge(zh_df, 'inner', 'ID_zh')
        new_df = new_df.merge(en_df, 'inner', 'ID_en')
        save_df(new_df, data_file)
        
    return new_df

In [54]:
sample_df = get_sample_data()
sample_df

Data file exists, reading...
File exists, reading...


Unnamed: 0,ID_zh,ID_en,Sentence_zh,Sentence_en
0,zh-000000057,en-000008530,在突尼斯的1000万人口中，该国最大报纸的发行量是大约5万份。,"In Tunisia, the largest newspaper has a circul..."
1,zh-000000137,en-000003060,一些大城市已经开始了示威活动，要求遏制政府官员中的腐败行为。,"Demonstrations begin in major cities, calling ..."
2,zh-000000181,en-000006518,事实上，这就是Urkrise——让20世纪变得面目狰狞的事件。,"Indeed, this was the Urkrise – the event that ..."
3,zh-000000183,en-000007440,卡特里娜飓风的近期效果是把有关伊拉克的新闻报导赶下了电视屏幕和报纸头条。,The short-term effect of Katrina was to drive ...
4,zh-000000197,en-000006775,1979年之后，对于增长极限和原子能的恐惧消退了。,"After 1979, fears about limits to growth and n..."
...,...,...,...,...
252,zh-000008560,en-000005652,从更广义的角度讲，非洲国家需要规范自己的政治和经济秩序。,"More broadly, African countries need to put th..."
253,zh-000008566,en-000011992,发展地区弹道导弹防御等联合项目也提高了同盟的可信度。,Credibility is also enhanced by joint projects...
254,zh-000008575,en-000003572,类似地，世界卫生组织也大力呼吁加大卫生发展援助力度。,"Likewise, the World Health Organization issued..."
255,zh-000008581,en-000012939,在中国的外汇储备中，有大约8000亿美元投资于欧元资产。,About $800 billion of China’s foreign-exchange...


In [55]:
sample_non_pair_df = get_sample_non_pair_data(1)
sample_non_pair_df

Data file to be overwritten.
File exists, reading...
File exists, reading...
File exists, reading...


Unnamed: 0,ID_zh,ID_en,Sentence_zh,Sentence_en
0,zh-000007549,en-000012077,莫拉莱斯随即以绝对优势在2005年的总统选举中获胜。,"As the efficacy of military power is reduced, ..."
1,zh-000008073,en-000012077,选举不会是完全自由的，但政府也不能完全操纵它。,"As the efficacy of military power is reduced, ..."
2,zh-000002494,en-000012077,这些补助基于竞争标准发放，资金量取决于是否实现可测量的目标。,"As the efficacy of military power is reduced, ..."
3,zh-000007764,en-000002453,奥巴桑乔和执政的人民党于2003年5月在充满争议的情况下当选。,"Its unemployment rate, at just below 5%, is ha..."
4,zh-000007764,en-000001135,奥巴桑乔和执政的人民党于2003年5月在充满争议的情况下当选。,With the liberalization of global financial ma...
...,...,...,...,...
252,zh-000008441,en-000000418,福利开支花费巨大，削减这一开支将是痛苦的。,Government plays a central role in financing t...
253,zh-000001580,en-000011892,实际上，2008年来自化石燃料的二氧化碳排放已经比1990年增长了近40%。,"But, most importantly, the invention of coins ..."
254,zh-000005549,en-000011663,卫生援助在拯救生命、改善生活方面是有用的，而且很有用。,Conservatives held a majority of the seats in ...
255,zh-000004000,en-000004248,毕竟法国是欧元区排名第二、世界排名第五的经济大国。,There are no policies to reduce risks in shado...


In [57]:
sample_output = ensemble(sample_df, "bucc_data/sample/zh-en.output", 257, 1)
sample_output

File exists, reading...
File does not exist.
File does not exist.
File does not exist.
File does not exist.


Unnamed: 0,pair,char_cost,word_cost,special_cost,pos,laser_proc,laser_noproc,bert_cls_proc,bert_cls_noproc,bert_mean_proc,bert_mean_noproc
0,1,737.580423,15.043894,277.395265,1.000000,0.756168,0.839682,0.942113,0.934795,0.912724,0.741520
1,1,622.485195,7.819304,210.027831,1.414214,0.713394,0.859108,0.885409,0.890294,0.798097,0.691382
2,1,502.889048,16.493954,134.634470,3.605551,0.700916,0.788735,0.895154,0.911382,0.797245,0.683166
3,1,791.115981,15.043894,267.592490,1.414214,0.789793,0.819363,0.847412,0.915910,0.754864,0.600001
4,1,380.692784,8.678403,95.249682,1.414214,0.750441,0.875089,0.937944,0.920843,0.671142,0.661791
...,...,...,...,...,...,...,...,...,...,...,...
252,1,615.189105,7.819304,218.530493,2.828427,0.667896,0.854644,0.887325,0.927912,0.840848,0.712080
253,1,920.148205,26.120477,435.112015,5.744563,0.761274,0.825573,0.883596,0.898876,0.787054,0.649296
254,1,874.507913,46.801859,419.746350,3.000000,0.712739,0.874674,0.695167,0.937445,0.429903,0.705683
255,1,698.330681,8.077092,276.597159,2.449490,0.794179,0.881399,0.942517,0.943011,0.895734,0.678615


In [58]:
sample_non_pair_output = ensemble(sample_non_pair_df, "bucc_data/sample/non_pairs/zh-en.output", 257, 0)
sample_non_pair_output

File does not exist.
File does not exist.
File does not exist.
File does not exist.
File does not exist.


Unnamed: 0,pair,char_cost,word_cost,special_cost,pos,laser_proc,laser_noproc,bert_cls_proc,bert_cls_noproc,bert_mean_proc,bert_mean_noproc
0,0,641.468736,26.120477,242.183320,1.732051,0.504368,0.477733,0.848468,0.858453,0.685114,0.439511
1,0,716.068798,16.493954,327.528873,2.449490,0.517501,0.503216,0.799139,0.906307,0.677112,0.573194
2,0,557.703778,7.819304,172.517183,4.582576,0.603498,0.597677,0.786354,0.894400,0.671513,0.499250
3,0,336.901020,15.966182,40.080511,3.872983,0.494124,0.455999,0.861551,0.875606,0.742763,0.416285
4,0,949.390537,31.252034,414.360425,4.582576,0.606292,0.556441,0.912340,0.861666,0.711010,0.480268
...,...,...,...,...,...,...,...,...,...,...,...
252,0,1019.091467,57.068192,590.445529,4.690416,0.507537,0.541698,0.766315,0.892111,0.626709,0.483626
253,0,407.374407,24.422795,60.085454,5.385165,0.434843,0.556170,0.820043,0.859236,0.821027,0.444615
254,0,500.790517,8.077092,162.293225,3.316625,0.440960,0.433448,0.804777,0.872671,0.636243,0.470573
255,0,720.648389,59.338722,312.988416,3.162278,0.551633,0.507879,0.804550,0.884800,0.732837,0.470202


In [60]:
sample_combined = apply_to_df([sample_output, sample_non_pair_output], "bucc_data/zh-en.sample.combined", "concat")
sample_combined

File overwritten.


Unnamed: 0,pair,char_cost,word_cost,special_cost,pos,laser_proc,laser_noproc,bert_cls_proc,bert_cls_noproc,bert_mean_proc,bert_mean_noproc
0,1,737.580423,15.043894,277.395265,1.000000,0.756168,0.839682,0.942113,0.934795,0.912724,0.741520
1,1,622.485195,7.819304,210.027831,1.414214,0.713394,0.859108,0.885409,0.890294,0.798097,0.691382
2,1,502.889048,16.493954,134.634470,3.605551,0.700916,0.788735,0.895154,0.911382,0.797245,0.683166
3,1,791.115981,15.043894,267.592490,1.414214,0.789793,0.819363,0.847412,0.915910,0.754864,0.600001
4,1,380.692784,8.678403,95.249682,1.414214,0.750441,0.875089,0.937944,0.920843,0.671142,0.661791
...,...,...,...,...,...,...,...,...,...,...,...
252,0,1019.091467,57.068192,590.445529,4.690416,0.507537,0.541698,0.766315,0.892111,0.626709,0.483626
253,0,407.374407,24.422795,60.085454,5.385165,0.434843,0.556170,0.820043,0.859236,0.821027,0.444615
254,0,500.790517,8.077092,162.293225,3.316625,0.440960,0.433448,0.804777,0.872671,0.636243,0.470573
255,0,720.648389,59.338722,312.988416,3.162278,0.551633,0.507879,0.804550,0.884800,0.732837,0.470202


## Random Forest

Train on training data (1899 rows), Test on sample data (267 rows).

In [45]:
# Check for NaN values# check for NaN values

bucc_combined[bucc_combined.isnull().any(axis=1)]
np.where(np.isnan(bucc_combined))  

(array([], dtype=int64), array([], dtype=int64))

### Indivdually

In [125]:
def RandomForest(a, b):
    x_train = bucc_combined.iloc[:, a:b]
    y_train = bucc_combined.iloc[:, 0]

    x_test = sample_combined.iloc[:, a:b]
    y_test = sample_combined.iloc[:, 0]
    
    for col_name in x_train.columns:
        print(col_name)

    clf = RandomForestClassifier(n_estimators = 100, random_state = 42)

    scores = cross_val_score(clf, x_train, y_train, cv=5, scoring = "f1")*100

    print("Cross-Validation F1 Score: {:.2f}% (+/- {:.2f})".format(scores.mean(), scores.std() * 2))

    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)

    f1 = f1_score(y_test, y_pred)

    print("Sample Data F1 Score: {:.2f}%".format(f1*100))
    
    
    new_df = pd.concat([x_test,y_test == y_pred], axis=1)
    new_df= new_df[~new_df['pair']]
    
    return new_df
    

- Length

In [86]:
RandomForest(1,4)

char_cost
word_cost
special_cost
Cross-Validation F1 Score: 56.38% (+/- 5.30)
Sample Data F1 Score: 79.87%


- POS

In [90]:
RandomForest(4,5)

pos
Cross-Validation F1 Score: 59.74% (+/- 4.43)
Sample Data F1 Score: 62.79%


- LASER

In [91]:
RandomForest(5,7)

laser_proc
laser_noproc
Cross-Validation F1 Score: 99.50% (+/- 0.35)
Sample Data F1 Score: 100.00%


- BERT

In [126]:
RandomForest(7,11)

bert_cls_proc
bert_cls_noproc
bert_mean_proc
bert_mean_noproc
Cross-Validation F1 Score: 92.33% (+/- 1.02)
Sample Data F1 Score: 95.34%


Unnamed: 0,bert_cls_proc,bert_cls_noproc,bert_mean_proc,bert_mean_noproc,pair
220,0.885304,0.909413,0.826566,0.515286,False
5,0.954386,0.910722,0.814996,0.579358,False
8,0.831211,0.930241,0.788583,0.604848,False
18,0.769579,0.937763,0.385171,0.595545,False
24,0.815299,0.894325,0.86536,0.614066,False
30,0.891827,0.926146,0.719151,0.577512,False
33,0.860346,0.935513,0.740887,0.601121,False
34,0.893284,0.941624,0.711507,0.603914,False
38,0.914611,0.92816,0.842773,0.580464,False
74,0.914134,0.894631,0.708763,0.590125,False


### Ensemble (only pairs training data)

In [70]:
x_train = bucc_output.iloc[:, 1:]
y_train = bucc_output.iloc[:, 0]

x_test = sample_combined.iloc[:, 1:]
y_test = sample_combined.iloc[:, 0]

clf = RandomForestClassifier(n_estimators = 100, random_state = 42)

scores = cross_val_score(clf, x_train, y_train, cv=5, scoring = "f1")*100

print("Cross-Validation F1 Score: {:.2f}% (+/- {:.2f})".format(scores.mean(), scores.std() * 2))

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

f1 = f1_score(y_test, y_pred)

print("Sample Data F1 Score: {:.2f}%".format(f1*100))

Cross-Validation F1 Score: 100.00% (+/- 0.00)
Sample Data F1 Score: 66.67%


### Ensemble (combined)

In [103]:
x_train = bucc_combined.iloc[:, 1:]
y_train = bucc_combined.iloc[:, 0]

x_test = sample_combined.iloc[:, 1:]
y_test = sample_combined.iloc[:, 0]

clf = RandomForestClassifier(n_estimators = 100, random_state = 42)

scores = cross_val_score(clf, x_train, y_train, cv=5, scoring = "f1")*100

print("Cross-Validation F1 Score: {:.2f}% (+/- {:.2f})".format(scores.mean(), scores.std() * 2))

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

f1 = f1_score(y_test, y_pred)

print("Sample Data F1 Score: {:.2f}%".format(f1*100))

Cross-Validation F1 Score: 99.47% (+/- 0.53)
Sample Data F1 Score: 100.00%


In [110]:
x = bucc_combined.iloc[:, 1:]
y = bucc_combined.iloc[:, 0]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)

clf = RandomForestClassifier(n_estimators = 100, random_state = 42)

scores = cross_val_score(clf, x_train, y_train, cv=5, scoring = "f1")*100

print("Cross-Validation F1 Score: {:.2f}% (+/- {:.2f})".format(scores.mean(), scores.std() * 2))

clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

f1 = f1_score(y_test, y_pred)

print("Train-Test-Split F1 Score: {:.2f}%".format(f1*100))

Cross-Validation F1 Score: 99.53% (+/- 0.78)
Train-Test-Split F1 Score: 99.49%


In [111]:
x_test['pred']=[y_test == y_pred]

ValueError: Length of values does not match length of index

[1070    True
 354     True
 881     True
 196     True
 1619    True
         ... 
 419     True
 322     True
 978     True
 1610    True
 1418    True
 Name: pair, Length: 950, dtype: bool]

In [124]:
new_pd = pd.concat([x_test,y_test, pd.Series(y_pred, name="pred")], axis=1)
new_pd


ValueError: Shape of passed values is (1658, 12), indices imply (1470, 12)

In [122]:
pd.concat([x_test,y_test == y_pred], axis=1)
new_pd[~new_pd['pair']]

Unnamed: 0,char_cost,word_cost,special_cost,pos,laser_proc,laser_noproc,bert_cls_proc,bert_cls_noproc,bert_mean_proc,bert_mean_noproc,pair
1113,693.721982,14.637722,231.888405,5.744563,0.698821,0.646563,0.902548,0.881269,0.825414,0.620809,False
1427,934.377036,75.583699,425.617604,3.464102,0.615988,0.604749,0.858124,0.940161,0.691959,0.546335,False
712,653.195083,8.678403,261.926892,3.0,0.563388,0.573315,0.846855,0.916289,0.688008,0.579986,False
1410,774.726146,57.068192,329.301192,2.645751,0.628975,0.660858,0.787891,0.928913,0.621074,0.654007,False
105,1063.187621,43.587028,541.817874,4.123106,0.647291,0.68985,0.979321,0.922198,0.964613,0.612956,False
