<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preprocessing" data-toc-modified-id="Preprocessing-1">Preprocessing</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-2">Model</a></span></li></ul></div>

# Preprocessing

In [13]:
import spacy
import torch
import torchtext

# Embeddings: GloVe vectors from the '6B' set, 100 dimensions per word.
glove = torchtext.vocab.GloVe(name='6B', dim=100)

# Tokenizer model: spaCy pipeline loaded under the name 'en300'.
# NOTE(review): presumably a locally linked English model — confirm it is installed.
nlp_en = spacy.load('en300')

In [2]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Word vectors stored in binary word2vec format; `get_sentence_vector_zh` looks
# tokens up in this object.
wv_from_bin = KeyedVectors.load_word2vec_format("data/model.bin", binary=True) 

In [15]:
import string
import jieba
import gensim 
import spacy
import numpy as np

# Chinese stop words, one per line of the file. The file is read inside a `with`
# block so the handle is closed, and the words are kept in a set so the per-token
# membership test in `processing_zh` is O(1) (same container as `stop_words_en`).
with open('data/chinese_stop_words.txt', "r", encoding="utf-8") as stop_words_file:
    stop_words = {line.rstrip() for line in stop_words_file}

def get_sentence_vector_zh(line, mean, dim=100):
    """Build a sentence representation from the word vectors in `wv_from_bin`.

    Args:
        line: iterable of tokens to look up in `wv_from_bin`.
        mean: if truthy, reduce all components of all found word vectors to a
            single scalar (`np.mean` without an axis); otherwise average the
            word vectors along axis 0, giving one vector per sentence.
        dim: length of the zero vector returned in vector mode when no token
            is found. Defaults to 100.

    Returns:
        A scalar when `mean` is truthy, otherwise a 1-D numpy array. When no
        token is in the vocabulary the result is zero in the same form (scalar
        zero or `np.zeros(dim)`), so a list of results is never ragged.
    """
    vectors = []
    for w in line:
        try:
            vectors.append(wv_from_bin[w])
        except KeyError:
            # Out-of-vocabulary token: skip it. Any other error is a real
            # problem and is allowed to propagate.
            continue

    if not vectors:
        # Keep the fallback in the same form as the requested reduction.
        return np.float64(0.0) if mean else np.zeros(dim)

    stacked = np.array(vectors)
    return np.mean(stacked) if mean else np.mean(stacked, axis=0)


def processing_zh(sentence):
    """Segment a Chinese sentence and keep only its content tokens.

    The sentence is cut with jieba in full mode (`cut_all=True`). Tokens found
    in `stop_words` and tokens that are not purely alphanumeric are discarded.

    Returns:
        The list of remaining tokens, in segmentation order.
    """
    return [
        token
        for token in jieba.lcut(sentence, cut_all=True)
        if token not in stop_words and token.isalnum()
    ]


def get_sentence_embeddings_zh(f, mean=True):
    """Compute one sentence representation per line of a UTF-8 text file.

    Each line is tokenized with `processing_zh` and reduced with
    `get_sentence_vector_zh`.

    Args:
        f: path of the text file, one sentence per line.
        mean: forwarded to `get_sentence_vector_zh`.

    Returns:
        A list with exactly one entry per line of the file, in file order.
        `get_sentence_vector_zh` always returns a value, so no line is skipped
        and the result stays aligned with any per-line labels.
    """
    # The context manager closes the file even if processing a line raises.
    with open(f, encoding="utf8") as file:
        return [
            get_sentence_vector_zh(processing_zh(l), mean=mean)
            for l in file
        ]

In [16]:
from nltk.corpus import stopwords
from tqdm import tqdm 

# English stop words from NLTK, kept as a set for the membership test in `preprocess`.
stop_words_en = set(stopwords.words('english'))

def preprocess(sentence,nlp):
    """Lower-case, tokenize and filter a sentence.

    Only `nlp.tokenizer` is applied to the lower-cased text. Each token's
    `lemma_` is kept unless it is in `stop_words_en` or is not purely
    alphabetic.
    NOTE(review): whether `lemma_` is populated by the tokenizer alone depends
    on the spaCy version — confirm.

    Returns:
        The list of remaining lemma strings, in token order.
    """
    tokens = nlp.tokenizer(sentence.lower())
    return [
        lemma
        for lemma in (token.lemma_ for token in tokens)
        if lemma not in stop_words_en and lemma.isalpha()
    ]

def get_word_vector(embeddings, word):
    """Look up the vector of `word` in `embeddings`.

    The word is mapped to a row index through `embeddings.stoi`, and that row
    of `embeddings.vectors` is returned.

    Returns:
        The stored vector, or None when the lookup raises KeyError
        (out-of-vocabulary word).
    """
    result = None
    try:
        index = embeddings.stoi[word]
        result = embeddings.vectors[index]
    except KeyError:
        # Unknown word: report it as None and let the caller decide.
        result = None
    return result

def get_sentence_vector(embeddings,line):
    """Average the embedding vectors of the in-vocabulary words of a sentence.

    Args:
        embeddings: object exposing `stoi` and `vectors`, as consumed by
            `get_word_vector` (assumes `vectors` is a 2-D tensor of shape
            (vocabulary, dim) — TODO confirm for non-GloVe inputs).
        line: iterable of word strings.

    Returns:
        A 1-D numpy array holding the element-wise mean of the found word
        vectors. When no word is in the vocabulary, a zero vector with the
        embedding width is returned instead of letting `torch.stack` fail on
        an empty list.
    """
    vectors = [
        emb
        for emb in (get_word_vector(embeddings, w) for w in line)
        if emb is not None  # out-of-vocabulary words are skipped
    ]
    if not vectors:
        # Nothing to average: return an explicit all-zero sentence vector.
        return np.zeros(embeddings.vectors.shape[-1], dtype=np.float32)
    return torch.mean(torch.stack(vectors), dim=0).detach().numpy()


def get_embeddings(f,embeddings,lang):
    """Compute one sentence embedding per line of a UTF-8 text file.

    Args:
        f: path of the text file, one sentence per line.
        embeddings: embedding object passed on to `get_sentence_vector`.
        lang: despite its name, the NLP pipeline object handed to `preprocess`
            (which uses its `.tokenizer`).

    Returns:
        A list with one vector per line, in file order. A line with no usable
        word contributes a zero vector of the embedding width.
    """
    sentences_vectors = []

    # The context manager closes the file even if a line fails to process.
    with open(f, encoding="utf8") as file:
        for l in file:
            sentence = preprocess(l, lang)
            try:
                vec = get_sentence_vector(embeddings, sentence)
            except RuntimeError:
                # torch.stack rejects an empty list, i.e. no word of the line
                # had a vector. Only this case becomes zeros; any other error
                # (e.g. a missing import) propagates instead of silently
                # producing an all-zero feature matrix.
                vec = np.zeros(embeddings.vectors.shape[-1])
            sentences_vectors.append(vec)

    return sentences_vectors


In [17]:
import spacy
import torchtext
from torchtext import data

# Training split: machine-translation side (Chinese, word2vec) and source side
# (English, GloVe), one sentence vector per line of each file.
zh_train_mt = get_sentence_embeddings_zh("data/en-zh/train.enzh.mt", mean=False)
zh_train_src = get_embeddings("data/en-zh/train.enzh.src", glove, nlp_en)
# Scores are kept as raw text lines here; the file is closed once it has been read.
with open("data/en-zh/train.enzh.scores", 'r') as f_train_scores:
    zh_train_scores = f_train_scores.readlines()

# Validation split, built the same way from the dev files.
zh_val_src = get_embeddings("data/en-zh/dev.enzh.src", glove, nlp_en)
zh_val_mt = get_sentence_embeddings_zh("data/en-zh/dev.enzh.mt", mean=False)
with open("data/en-zh/dev.enzh.scores", 'r') as f_val_scores:
    zh_val_scores = f_val_scores.readlines()

In [None]:
# Sanity check: within each split the mt and src lists should have the same
# length, because they are concatenated side by side in the next cell.
print(f"Training mt: {len(zh_train_mt)} Training src: {len(zh_train_src)}")
print(f"Validation mt: {len(zh_val_mt)} Validation src: {len(zh_val_src)}")

In [None]:
import numpy as np

# Feature matrices: source and translation sentence vectors joined along axis 1,
# one row per sentence pair.
# (assumes every entry of the src/mt lists is a 1-D vector of equal length — TODO confirm)
X_train = np.concatenate((np.asarray(zh_train_src), np.asarray(zh_train_mt)),axis=1)
# Transposed copy; the training cell below reads X_train, not this array.
X_train_zh = np.array(X_train).transpose()

X_val = np.concatenate((zh_val_src, zh_val_mt),axis=1)
# Transposed copy; the training cell below reads X_val, not this array.
X_val_zh = np.array(X_val).transpose()

# Scores: the text lines read from the score files, converted to a float array.
train_scores = np.array(zh_train_scores).astype(float)
y_train_zh = train_scores

val_scores = np.array(zh_val_scores).astype(float)
y_val_zh = val_scores

# Model

In [None]:
def rmse(predictions, targets):
    """Return the root-mean-square error between `predictions` and `targets`.

    Both arguments must support element-wise subtraction, `**` and `.mean()`
    (numpy arrays in this notebook).
    """
    residuals = predictions - targets
    mean_squared_error = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error)

In [None]:
import torch.nn.functional as F

class Net(torch.nn.Module):
    """Fully connected regressor: n_feature -> 256 -> 64 -> 16 -> n_output.

    A ReLU follows each of the three hidden layers; the output layer is linear.
    """

    def __init__(self, n_feature, n_output):
        """Create the four linear layers.

        Args:
            n_feature: width of the input vectors.
            n_output: width of the prediction.
        """
        super().__init__()
        linear = torch.nn.Linear
        self.hidden = linear(n_feature, 256)
        self.hidden2 = linear(256, 64)
        self.hidden3 = linear(64, 16)
        self.predict = linear(16, n_output)

    def forward(self, x):
        """Run `x` through the hidden layers (each with ReLU) and the output layer."""
        for layer in (self.hidden, self.hidden2, self.hidden3):
            x = F.relu(layer(x))
        return self.predict(x)

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import pearsonr  # public import path; `scipy.stats.stats` is a deprecated alias

# Seed both RNGs so mini-batch sampling and weight initialisation are
# reproducible when the notebook is re-run from a fresh kernel.
SEED = 0
np.random.seed(SEED)
torch.manual_seed(SEED)

# Network, optimiser and loss. The input width follows the feature matrix
# instead of a hard-coded 200.
net = Net(n_feature=X_train.shape[1], n_output=1)
optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
loss_func = torch.nn.MSELoss()
batch_size = 64
steps = 20000
losses = []

log_every = max(1, steps // 10)      # avoids a modulo-by-zero when steps < 10
X_val_tensor = torch.Tensor(X_val)   # validation inputs never change; convert once

for t in range(steps):
    # Sample a mini-batch of distinct training rows.
    curr_bat = np.random.choice(len(X_train), batch_size, replace=False)
    x = torch.Tensor(X_train[curr_bat])
    y = torch.Tensor(y_train_zh[curr_bat]).view(batch_size, -1)

    # One SGD step; gradients are cleared once per step.
    optimizer.zero_grad()
    prediction = net(x)
    loss = loss_func(prediction, y)
    losses.append(loss.item())
    loss.backward()
    optimizer.step()

    # Periodically report RMSE and Pearson correlation on the validation set.
    if t % log_every == 0:
        with torch.no_grad():
            predictions = net(X_val_tensor).flatten().numpy()
            pearson = pearsonr(y_val_zh, predictions)
            print(f'Steps: {t} RMSE: {rmse(predictions,y_val_zh)} Pearson {pearson[0]}')

# Training loss per step, on a log scale.
plt.plot(losses)
plt.xlabel("Steps")
plt.ylabel("Loss")
plt.title("Training")
plt.yscale('log')
plt.show()