In [1]:
import os
from string import punctuation
from nltk.tokenize import word_tokenize

In [2]:
# function to load the europarl data into a list of sentences
def load_sentences(file):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Argument: path of the file to read
    Returns: list of strings, one per '\n'-separated line. Leading and
             trailing whitespace of the whole file is stripped first, so a
             final newline does not produce an empty last entry.
    """
    # The context manager closes the handle even if read() raises
    # (e.g. on a decoding error), which the manual open/close did not.
    with open(file, mode='rt', encoding='utf-8') as doc:
        text = doc.read()
    # Split on '\n' only: str.splitlines() also breaks on other Unicode
    # line separators, which would change the line numbering.
    return text.strip().split('\n')

In [3]:
# Paths to the English and Spanish sides of the Europarl es-en corpus
file_english = 'es-en/europarl-v7.es-en.en'
file_spanish = 'es-en/europarl-v7.es-en.es'
# Each file becomes a list with one string per line.
# NOTE(review): later cells compare english[i] with spanish[i], so the two
# files are presumably line-aligned translations — confirm.
english = load_sentences(file_english)
spanish = load_sentences(file_spanish)

In [37]:
# For simplicity, remove all punctuation and lowercase all text.
# Argument: List of sentences (as strings)
# Returns: List of lists of tokens
def clean(sentences):
    """Tokenize each sentence, drop punctuation and number tokens, lowercase the rest.

    A token is dropped when every one of its characters is in
    string.punctuation (so '...', '--' and the quote tokens produced by the
    tokenizer are removed too) or when str.isnumeric() is true for it.
    Tokens that merely contain punctuation (hyphenated words, "'s") are kept.
    """
    punctuation_chars = set(punctuation)
    cleaned = []
    for sentence in sentences:
        # This is cleaner than splitting on whitespace since it allows us to
        # keep hyphens while removing punctuation, separate possessive apostrophes, etc
        tokens = word_tokenize(sentence)
        # Check character by character: `token in punctuation` would be a
        # substring test against one long string and would let multi-character
        # punctuation tokens through.
        cleaned.append([
            token.lower()
            for token in tokens
            if not token.isnumeric()
            and not all(char in punctuation_chars for char in token)
        ])
    return cleaned

In [38]:
# Tokenize and normalize both corpora; each result is a list of token lists,
# one inner list per input line.
en_clean = clean(english)
es_clean = clean(spanish)

In [40]:
# Count how many times each token occurs across the whole corpus.
def get_tokens(clean_data):
    """Return a dict mapping every token in clean_data to its total occurrence count.

    Argument: list of sentences, each a list of tokens
    Returns: dict of token -> count, with keys in order of first appearance
    """
    counts = {}
    for tokens in clean_data:
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
    return counts

In [41]:
# make a much, much smaller version of the data so we can test quickly!
# Nine short sentences per language; es_mini[i] is the Spanish counterpart of
# en_mini[i], and every English word has exactly one Spanish counterpart.
en_mini = ['that book is red', 'a yellow car', 'was that a red book', 
           'the car was red', 'yellow book', 'a car', 'that is a book', 
           'the yellow car', 'the book']

es_mini = ['eso libro es rojo', 'un carro amarillo', 'era eso un libro rojo',
           'el carro era rojo', 'libro amarillo', 'un carro', 
           'eso es un libro', 'el carro amarillo', 'el libro']

In [51]:
# The mini corpus is already lowercase and punctuation-free, so whitespace
# splitting is enough to get the list-of-token-lists shape.
en_mini_clean = [sentence.split() for sentence in en_mini]
es_mini_clean = [sentence.split() for sentence in es_mini]

In [58]:
# Takes two lists of lists of words and a threshold fraction of sentences that words must either
# both appear in or both not appear in, and returns a dictionary of words that occur in similar places.
def is_word_present_model(source_data, target_data, threshold):
    """Align source words with target words that occur in the same sentences.

    Arguments:
        source_data: list of sentences, each a list of tokens
        target_data: list of sentences parallel to source_data (same length)
        threshold: fraction in [0, 1]; a pair is kept only when the share of
                   sentence indices on which both words are present or both
                   are absent is strictly greater than this value
    Returns: dict mapping every source token to a (possibly empty) list of
             matching target tokens
    Raises: ValueError if the two corpora have different numbers of sentences
    """
    num_sentences = len(source_data)
    # Sentence i of one corpus is compared with sentence i of the other, so
    # unequal lengths would silently give meaningless agreement scores.
    if len(target_data) != num_sentences:
        raise ValueError(
            'source_data and target_data must contain the same number of sentences'
        )

    def presence_vectors(data):
        # One pass over the corpus: vectors[token][i] is 1 if the token occurs
        # in sentence i, else 0. Avoids testing every token against every sentence.
        vectors = {}
        for index, sentence in enumerate(data):
            for token in sentence:
                if token not in vectors:
                    vectors[token] = [0] * num_sentences
                vectors[token][index] = 1
        return vectors

    s_present = presence_vectors(source_data)
    t_present = presence_vectors(target_data)

    source_to_target = {token: [] for token in s_present}

    for s_token, s_vector in s_present.items():
        for t_token, t_vector in t_present.items():
            agreements = sum(i == j for i, j in zip(s_vector, t_vector))
            if agreements / num_sentences > threshold:
                source_to_target[s_token].append(t_token)

    return source_to_target

In [59]:
# this works very well on our contrived, one-to-one correspondent data:
# with 9 sentences and a 0.9 threshold, a pair is only kept when the two words
# are present/absent together in every single sentence
is_word_present_model(en_mini_clean, es_mini_clean, .9)

{'a': ['un'],
 'book': ['libro'],
 'car': ['carro'],
 'is': ['es'],
 'red': ['rojo'],
 'that': ['eso'],
 'the': ['el'],
 'was': ['era'],
 'yellow': ['amarillo']}

In [65]:
# the above function will take a very long time to run on our dataset.
# we can use this to reduce the number of words, since words that only occur a few times
# are likely less important--filter out proper names
def reduce_vocab(data, min_occurrence):
    """Drop every token that occurs fewer than min_occurrence times in data.

    Arguments:
        data: list of sentences, each a list of tokens
        min_occurrence: minimum total count a token needs in order to be kept
    Returns: new list with the same number of sentences (some may become
             empty), so sentence indices still line up with the other language
    """
    tokens = get_tokens(data)
    # A set makes each membership test below constant-time; the previous list
    # was scanned once per token of the corpus.
    vocab = {token for token, count in tokens.items() if count >= min_occurrence}
    return [[token for token in sentence if token in vocab] for sentence in data]

In [73]:
# First 2000 sentences of each language, keeping only tokens that occur at
# least 12 times within that sample.
reduced_en_2000 = reduce_vocab(en_clean[:2000], 12)
reduced_es_2000 = reduce_vocab(es_clean[:2000], 12)

In [77]:
# Align the 2000-sentence sample; a pair must agree on more than 99% of sentences.
dict_2000 = is_word_present_model(reduced_en_2000, reduced_es_2000, .99)

In [85]:
# Larger sample (first 10000 sentences) with a stricter cut-off: tokens must
# occur at least 100 times within the sample to be kept.
reduced_en_10000 = reduce_vocab(en_clean[:10000], 100)
reduced_es_10000 = reduce_vocab(es_clean[:10000], 100)

In [93]:
# Align the 10000-sentence sample; a pair must agree on more than 98.5% of sentences.
dict_10000_take_2 = is_word_present_model(reduced_en_10000, reduced_es_10000, .985)

In [129]:
import numpy as np
import math

# split the above function into two functions because varying the parameters on only the final loop
# shouldn't require recomputing the boolean vectors
def _presence_vectors(data):
    """Map each token in data to a 0/1 list marking the sentences it occurs in."""
    total = len(data)
    vectors = {}
    for index, sentence in enumerate(data):
        for token in sentence:
            if token not in vectors:
                vectors[token] = [0] * total
            vectors[token][index] = 1
    return vectors


def data_to_present_dicts(source_data, target_data):
    """Build per-token sentence-presence vectors for both corpora.

    Arguments:
        source_data: list of sentences, each a list of tokens
        target_data: list of sentences, each a list of tokens
    Returns: tuple (s_present, t_present). Each is a dict mapping a token to a
             list with one entry per sentence of its own corpus: 1 if the token
             occurs in that sentence, 0 otherwise. Keys are in order of first
             appearance.
    """
    # A single pass per corpus replaces the earlier token-by-sentence
    # membership scan, which was quadratic in practice.
    return (_presence_vectors(source_data), _presence_vectors(target_data))

# vary the number of sentences that need to match to align words
def match_threshold(s_present, t_present, threshold, num_sentences):
    """Pair source and target tokens whose presence vectors disagree rarely enough.

    Arguments:
        s_present: dict of source token -> 0/1 presence vector
        t_present: dict of target token -> 0/1 presence vector
        threshold: fraction of sentences on which a pair has to agree
        num_sentences: number of leading sentences to compare
    Returns: dict mapping every source token to a (possibly empty) list of
             target tokens with at most ceil((1 - threshold) * num_sentences)
             disagreeing sentences
    """
    source_to_target = {token: [] for token in s_present}
    s_appearances = {token: np.mean(vector) for token, vector in s_present.items()}
    t_appearances = {token: np.mean(vector) for token, vector in t_present.items()}

    max_frequency_gap = 1 - threshold
    # Round before ceil: in floating point (1 - 0.99) * 100 is
    # 1.0000000000000009, which would otherwise allow one miss too many.
    num_acceptable_misses = math.ceil(round(max_frequency_gap * num_sentences, 9))

    # Convert once so every pair comparison below is one vectorized operation.
    s_vectors = {token: np.asarray(vector[:num_sentences]) for token, vector in s_present.items()}
    t_vectors = {token: np.asarray(vector[:num_sentences]) for token, vector in t_present.items()}

    for s_token, s_vector in s_vectors.items():
        for t_token, t_vector in t_vectors.items():
            # only compare words that occur at a similar frequency to one another
            if abs(s_appearances[s_token] - t_appearances[t_token]) < max_frequency_gap:
                # Count every disagreement, including one on the final
                # sentence, before checking it against the budget.
                num_misses = np.count_nonzero(s_vector != t_vector)
                if num_misses <= num_acceptable_misses:
                    source_to_target[s_token].append(t_token)

    return source_to_target

In [99]:
# Precompute the presence vectors once so thresholds can be varied cheaply afterwards.
(en_present_10000, es_present_10000) = data_to_present_dicts(reduced_en_10000, reduced_es_10000)

In [130]:
# English -> Spanish alignment on the 10000-sentence sample, tolerating
# disagreement on roughly 1.9% of the sentences.
dict_attempt = match_threshold(en_present_10000, es_present_10000, .981, 10000)

In [113]:
# First 25000 sentences, keeping tokens that occur at least 200 times in the
# sample, then precompute the presence vectors for both languages.
en_reduced_25000 = reduce_vocab(en_clean[:25000], 200)
es_reduced_25000 = reduce_vocab(es_clean[:25000], 200)
(en_present_25000, es_present_25000) = data_to_present_dicts(en_reduced_25000, es_reduced_25000)

In [125]:
# English -> Spanish alignment on the 25000-sentence sample at a 98.6% agreement threshold.
large_attempt = match_threshold(en_present_25000, es_present_25000, .986, 25000)

In [135]:
# First 100000 sentences, keeping tokens that occur at least 500 times in the
# sample, then precompute the presence vectors for both languages.
en_reduced_100000 = reduce_vocab(en_clean[:100000], 500)
es_reduced_100000 = reduce_vocab(es_clean[:100000], 500)
(en_present_100000, es_present_100000) = data_to_present_dicts(en_reduced_100000, es_reduced_100000)

In [138]:
# English -> Spanish alignment on the 100000-sentence sample at a 99% agreement threshold.
larger_attempt = match_threshold(en_present_100000, es_present_100000, .99, 100000)

In [140]:
# Same sample with a stricter 99.2% threshold; this result seeds the final dictionary below.
larger_attempt2 = match_threshold(en_present_100000, es_present_100000, .992, 100000)

In [141]:
# Reverse direction (Spanish -> English) with the same sample and threshold.
# NOTE(review): es_to_en is not referenced again in the cells visible here.
es_to_en = match_threshold(es_present_100000, en_present_100000, .992, 100000)

In [159]:
# Show only the English words that received at least one Spanish candidate.
{word: matches for word, matches in larger_attempt2.items() if matches != []}

{'access': ['acceso'],
 'actions': ['acciones'],
 'activities': ['actividades'],
 'agreements': ['acuerdos'],
 'always': ['siempre'],
 'amendment': ['enmienda'],
 'amendments': ['enmiendas'],
 'article': ['artículo'],
 'aspects': ['aspectos'],
 'attention': ['atención'],
 'authorities': ['autoridades'],
 'behalf': ['nombre'],
 'budget': ['presupuesto'],
 'cases': ['casos'],
 'charter': ['carta'],
 'children': ['niños'],
 'citizens': ['ciudadanos'],
 'communication': ['comunicación'],
 'competition': ['competencia'],
 'concern': ['preocupación'],
 'conditions': ['condiciones'],
 'conference': ['conferencia'],
 'context': ['contexto'],
 'control': ['control'],
 'cooperation': ['cooperación'],
 'costs': ['costes'],
 'council': ['consejo'],
 'court': ['tribunal'],
 'create': ['crear'],
 'currently': ['actualmente'],
 'decision': ['decisión'],
 'decisions': ['decisiones'],
 'democracy': ['democracia'],
 'dialogue': ['diálogo'],
 'difficult': ['difícil'],
 'directive': ['directiva'],
 'draft

In [180]:
# add common words to our dictionary in addition to ones that are machine learned
# Work on a copy so that re-running this cell (or the ones below) does not
# keep mutating the learned larger_attempt2 result.
en_to_es = dict(larger_attempt2)
en_to_es['a'] = ['un', 'una']
en_to_es['an'] = ['un', 'una']
en_to_es['and'] = ['y']
en_to_es['be'] = ['ser', 'estar']
en_to_es['because'] = ['porque']
en_to_es['between'] = ['entre']
en_to_es['but'] = ['pero']
en_to_es['can'] = ['puede']
# Accents restored: the learned entries keep their accents (e.g. 'artículo'),
# and without them these are different words ('mas' = "but", 'el' = "the").
en_to_es['day'] = ['día']
en_to_es['days'] = ['días']
en_to_es['do'] = ['hacer']
en_to_es['done'] = ['hecho']
en_to_es['each'] = ['cada']
en_to_es['from'] = ['de']
en_to_es['go'] = ['ir']
en_to_es['he'] = ['él']
en_to_es['if'] = ['si']
en_to_es['is'] = ['es']
en_to_es['more'] = ['más']
en_to_es['no'] = ['no']
en_to_es['not'] = ['no']
en_to_es['of'] = ['de']
en_to_es['our'] = ['nuestro']
en_to_es['she'] = ['ella']
en_to_es['the'] = ['el', 'la']
en_to_es['today'] = ['hoy']

In [182]:
# Different samples and thresholds gave us different results, so fold their
# unambiguous (single-candidate) entries into our dict. Sources are applied in
# order, so a later source overwrites an earlier entry for the same word.
for learned in (dict_attempt, large_attempt, larger_attempt):
    for k, v in learned.items():
        if len(v) == 1:
            en_to_es[k] = v

In [186]:
# Drop words that ended up without any candidate translation.
en_to_es = {word: options for word, options in en_to_es.items() if options != []}

In [194]:
# mostly only useful as a single-word dictionary, but sentences where word order is preserved are translatable
def translate(en, dictionary=None):
    """Translate an English word or sentence by dictionary lookup.

    Arguments:
        en: input string
        dictionary: optional dict of lowercase English word -> list of Spanish
                    candidates; defaults to the notebook-level en_to_es
    Returns:
        - en unchanged when the whole input is numeric
        - for a single word: the list of candidate translations, or the
          string 'Input not in dictionary'
        - for several words: the first candidate of every known word and every
          numeric word, each followed by one space (unknown words are
          skipped), or 'Input not in dictionary.' when nothing was translated
    """
    if dictionary is None:
        dictionary = en_to_es

    # Keep numbers the same
    if en.isnumeric():
        return en

    words = en.split()

    # If it is a single word, return the list of possible translations
    if len(words) == 1:
        # Dictionary keys are lowercase, so look up the lowercased bare word;
        # this also ignores surrounding whitespace.
        return dictionary.get(words[0].lower(), 'Input not in dictionary')

    pieces = []
    for word in words:
        key = word.lower()
        if key in dictionary:
            pieces.append(dictionary[key][0])
        # keep numbers the same
        elif word.isnumeric():
            pieces.append(word)

    if not pieces:
        return 'Input not in dictionary.'

    # Every piece is followed by a space (including the last one), matching
    # the output format of the earlier implementation.
    return ''.join(piece + ' ' for piece in pieces)

In [195]:
# A cherry-picked example--the translator ignores "big" because it doesn't know it.
# Words are looked up one at a time, so the output keeps English word order.
translate('the context of our big decision')

'el contexto de nuestro decisión '

In [197]:
# A multi-word input: each known word is replaced by its first candidate translation.
translate("we want a regulation")

'queremos un reglamento '

In [199]:
# A single-word input returns the full list of candidate translations.
translate('an')

['un', 'una']