I recently added large corpora of sentences to a Postgresql. I want to use these data as new training data to see if we can do a better job of translating. Each corpus has millions of words and are specific to American and British dialects. Keep your fingers crossed...

First, lets see what we have in the database.

In [37]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
from gensim import models, utils
from gensim.models import translation_matrix
import gensim

In [6]:
# Define a database name
# Set your postgres username
dbname = 'corpus'
username = 'dan' # change this to your username

In [7]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
from sqlalchemy import create_engine
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print(engine.url)

postgres://dan@localhost/corpus


In [8]:
conn = None
conn = psycopg2.connect(database = dbname, user = username)
cur = conn.cursor()

# create table one by one

# close communication with the PostgreSQL database server
#cur.close()
# commit the changes
#conn.commit()

In [9]:
cur = conn.cursor()
cur.execute("select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)';")
print (cur.fetchall())



[('training_data',)]


### Input data into Pandas

In [10]:
command = '''SELECT * 
             FROM training_data;
             '''

# Read in the data into Pandas
df = pd.read_sql_query(command, con=engine)



In [11]:
df.describe()

Unnamed: 0,source,sentence
count,13364491,13364491
unique,2,10298943
top,BNC,yeah
freq,12736688,57030


In [17]:
df_non_dup = df.drop_duplicates()

In [18]:
temp_replace = {'BNC':0, 'OANC': 1}
df_non_dup['source'] = df_non_dup['source'].replace(temp_replace)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [20]:
df_non_dup.head()

Unnamed: 0,source,sentence
0,0,have you done much work
1,0,i've done some work
2,0,i've tried
3,0,good
4,0,to sort of


In [21]:
# clean up the poor data

def standardize_text(temp_df, text_field):
    temp_df[text_field] = temp_df[text_field].str.replace(r"http\S+", "")
    temp_df[text_field] = temp_df[text_field].str.replace(r"http", "")
    temp_df[text_field] = temp_df[text_field].str.replace(r"@\S+", "")
    temp_df[text_field] = temp_df[text_field].str.replace(r"[^A-Za-z0-9(),!?@\'\`\"\_\n]", " ")
    temp_df[text_field] = temp_df[text_field].str.replace(r"@", "at")
    temp_df[text_field] = temp_df[text_field].str.lower()
    return temp_df

df_non_dup = standardize_text(df_non_dup, "sentence")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexin

In [22]:
df_non_dup.head()

Unnamed: 0,source,sentence
0,0,have you done much work
1,0,i've done some work
2,0,i've tried
3,0,good
4,0,to sort of


In [23]:
# tokenize the sentences

from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')

df_non_dup["tokens"] = df_non_dup["sentence"].apply(tokenizer.tokenize)
df_non_dup.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,source,sentence,tokens
0,0,have you done much work,"[have, you, done, much, work]"
1,0,i've done some work,"[i, ve, done, some, work]"
2,0,i've tried,"[i, ve, tried]"
3,0,good,[good]
4,0,to sort of,"[to, sort, of]"


Inspect our data a bit more.

In [24]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

all_words = [word for tokens in df_non_dup["tokens"] for word in tokens]
sentence_lengths = [len(tokens) for tokens in df_non_dup["tokens"]]
VOCAB = sorted(list(set(all_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
print("Max sentence length is %s" % max(sentence_lengths))

Using TensorFlow backend.
  return f(*args, **kwds)


105069975 words total, with a vocabulary size of 439001
Max sentence length is 853


### Bag of words

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def cv(data):
    count_vectorizer = CountVectorizer()

    emb = count_vectorizer.fit_transform(data)

    return emb, count_vectorizer

list_corpus = df_non_dup["sentence"].tolist()
list_labels = df_non_dup["source"].tolist()

counts, count_vectorizer = cv(list_corpus)

X_train_counts, X_test_counts, y_train_counts, y_test_counts = train_test_split(counts, list_labels, test_size=0.2, random_state=40)

In [30]:
british_vec = gensim.models.Word2Vec(df_non_dup.loc[df_non_dup['source'] == 0, 'tokens'], size=300)
american_vec = gensim.models.Word2Vec(df_non_dup.loc[df_non_dup['source'] == 1, 'tokens'], size=300)

Save the word vectors

In [31]:
british_vec.wv.save("british_big")
american_vec.wv.save("american_big")

In [46]:
train_file = "training_words1000.txt" # from https://www.ef.edu/english-resources/english-vocabulary/top-1000-words/

with utils.smart_open(train_file, "r") as f:
    word_pair = [tuple(utils.any2unicode(line.lower()).strip().split()) for line in f]
        
print(word_pair)

[('a', 'a'), ('ability', 'ability'), ('able', 'able'), ('about', 'about'), ('above', 'above'), ('accept', 'accept'), ('according', 'according'), ('account', 'account'), ('across', 'across'), ('act', 'act'), ('action', 'action'), ('activity', 'activity'), ('actually', 'actually'), ('add', 'add'), ('address', 'address'), ('administration', 'administration'), ('admit', 'admit'), ('adult', 'adult'), ('affect', 'affect'), ('after', 'after'), ('again', 'again'), ('against', 'against'), ('age', 'age'), ('agency', 'agency'), ('agent', 'agent'), ('ago', 'ago'), ('agree', 'agree'), ('agreement', 'agreement'), ('ahead', 'ahead'), ('air', 'air'), ('all', 'all'), ('allow', 'allow'), ('almost', 'almost'), ('alone', 'alone'), ('along', 'along'), ('already', 'already'), ('also', 'also'), ('although', 'although'), ('always', 'always'), ('american', 'american'), ('among', 'among'), ('amount', 'amount'), ('analysis', 'analysis'), ('and', 'and'), ('animal', 'animal'), ('another', 'another'), ('answer', 'a

In [47]:
transmat = models.translation_matrix.TranslationMatrix(american_vec.wv, british_vec.wv)
transmat.train(word_pair)

In [51]:
transmat.translate(['the', 'favor', 'is', 'full'], topn=3, source_lang_vec=american_vec.wv, target_lang_vec=british_vec.wv)

OrderedDict([('the', ['the', 'a', 'this']),
             ('favor', ['favour', 'continuance', 'respect']),
             ('is', ['is', 'was', 'exists']),
             ('full', ['full', 'complete', 'fullness'])])