I'm going to use newspaper text as training data to develop a Neural Network that can translate between different English dialects. As a start, I'll compare different countries (U.S. vs U.K., but could add South Africa, Australia, Bahamas, Belize, Ghana, etc.), but it could be trained to work on regions within countries.<br /><br />
I'll start by downloading U.K. training data from The Guardian.

In [150]:
import json
import requests
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [8]:


ARTICLES_DIR = join('tempdata', 'articles')
makedirs(ARTICLES_DIR, exist_ok=True)
# Sample URL
#
# http://content.guardianapis.com/search?from-date=2010-01-01&
# to-date=2018-01-01&order-by=newest&show-fields=all&page-size=200
# &api-key=your-api-key-goes-here

MY_API_KEY = open("guardian_key.txt").read().strip()
API_ENDPOINT = 'http://content.guardianapis.com/search'
my_params = {
    'from-date': "2011-05-06",
    'to-date': "2018-01-01",
    'order-by': "newest",
    'show-fields': 'all',
    'page-size': 200, # 200 is the max
    'api-key': MY_API_KEY
}


# day iteration from here:
# http://stackoverflow.com/questions/7274267/print-all-day-dates-between-two-dates
start_date = date(2011, 5, 6)
end_date = date(2018, 1, 1)
dayrange = range((end_date - start_date).days + 1)
for daycount in dayrange:
    dt = start_date + timedelta(days=daycount)
    datestr = dt.strftime('%Y-%m-%d')
    fname = join(ARTICLES_DIR, datestr + '.json')
    if not exists(fname):
        # then let's download it
        print("Downloading", datestr)
        all_results = []
        my_params['from-date'] = datestr
        my_params['to-date'] = datestr
        current_page = 1
        total_pages = 1
        while current_page <= total_pages:
            print("...page", current_page)
            my_params['page'] = current_page
            resp = requests.get(API_ENDPOINT, my_params)
            data = resp.json()
            all_results.extend(data['response']['results'])
            # if there is more than one page
            current_page += 1
            total_pages = data['response']['pages']

        with open(fname, 'w') as f:
            print("Writing to", fname)

            # re-serialize it for pretty indentation
            f.write(json.dumps(all_results, indent=2))

Downloading 2011-05-07
...page 1
...page 2
Writing to tempdata/articles/2011-05-07.json
Downloading 2011-05-08
...page 1
Writing to tempdata/articles/2011-05-08.json
Downloading 2011-05-09
...page 1
...page 2
Writing to tempdata/articles/2011-05-09.json
Downloading 2011-05-10
...page 1
...page 2
Writing to tempdata/articles/2011-05-10.json
Downloading 2011-05-11
...page 1
...page 2
Writing to tempdata/articles/2011-05-11.json
Downloading 2011-05-12
...page 1
...page 2
...page 3
Writing to tempdata/articles/2011-05-12.json
Downloading 2011-05-13
...page 1
...page 2
...page 3
Writing to tempdata/articles/2011-05-13.json
Downloading 2011-05-14
...page 1
...page 2
Writing to tempdata/articles/2011-05-14.json
Downloading 2011-05-15
...page 1
Writing to tempdata/articles/2011-05-15.json
Downloading 2011-05-16
...page 1
...page 2
Writing to tempdata/articles/2011-05-16.json
Downloading 2011-05-17
...page 1
...page 2
...page 3
Writing to tempdata/articles/2011-05-17.json
Downloading 2011-05-18

Writing to tempdata/articles/2011-08-07.json
Downloading 2011-08-08
...page 1
...page 2
Writing to tempdata/articles/2011-08-08.json
Downloading 2011-08-09
...page 1
...page 2
Writing to tempdata/articles/2011-08-09.json
Downloading 2011-08-10
...page 1
...page 2
Writing to tempdata/articles/2011-08-10.json
Downloading 2011-08-11
...page 1
...page 2
Writing to tempdata/articles/2011-08-11.json
Downloading 2011-08-12
...page 1
...page 2
...page 3
Writing to tempdata/articles/2011-08-12.json
Downloading 2011-08-13
...page 1
...page 2
Writing to tempdata/articles/2011-08-13.json
Downloading 2011-08-14
...page 1
Writing to tempdata/articles/2011-08-14.json
Downloading 2011-08-15
...page 1
...page 2
Writing to tempdata/articles/2011-08-15.json
Downloading 2011-08-16
...page 1
...page 2
Writing to tempdata/articles/2011-08-16.json
Downloading 2011-08-17
...page 1
...page 2
Writing to tempdata/articles/2011-08-17.json
Downloading 2011-08-18
...page 1
...page 2
Writing to tempdata/articles/201

...page 2
Writing to tempdata/articles/2011-11-08.json
Downloading 2011-11-09
...page 1
...page 2
Writing to tempdata/articles/2011-11-09.json
Downloading 2011-11-10
...page 1
...page 2
...page 3
Writing to tempdata/articles/2011-11-10.json
Downloading 2011-11-11
...page 1
...page 2
...page 3
Writing to tempdata/articles/2011-11-11.json
Downloading 2011-11-12
...page 1
Writing to tempdata/articles/2011-11-12.json
Downloading 2011-11-13
...page 1
...page 2
Writing to tempdata/articles/2011-11-13.json
Downloading 2011-11-14
...page 1
...page 2
Writing to tempdata/articles/2011-11-14.json
Downloading 2011-11-15
...page 1
...page 2
Writing to tempdata/articles/2011-11-15.json
Downloading 2011-11-16
...page 1
...page 2
...page 3
Writing to tempdata/articles/2011-11-16.json
Downloading 2011-11-17
...page 1
...page 2
...page 3
Writing to tempdata/articles/2011-11-17.json
Downloading 2011-11-18
...page 1
...page 2
...page 3
Writing to tempdata/articles/2011-11-18.json
Downloading 2011-11-19
..

...page 2
Writing to tempdata/articles/2012-02-09.json
Downloading 2012-02-10
...page 1
...page 2
...page 3
Writing to tempdata/articles/2012-02-10.json
Downloading 2012-02-11
...page 1
Writing to tempdata/articles/2012-02-11.json
Downloading 2012-02-12
...page 1
...page 2
Writing to tempdata/articles/2012-02-12.json
Downloading 2012-02-13
...page 1
...page 2
Writing to tempdata/articles/2012-02-13.json
Downloading 2012-02-14
...page 1
...page 2
Writing to tempdata/articles/2012-02-14.json
Downloading 2012-02-15
...page 1
...page 2
Writing to tempdata/articles/2012-02-15.json
Downloading 2012-02-16
...page 1
...page 2
Writing to tempdata/articles/2012-02-16.json
Downloading 2012-02-17
...page 1
...page 2
...page 3
Writing to tempdata/articles/2012-02-17.json
Downloading 2012-02-18
...page 1
Writing to tempdata/articles/2012-02-18.json
Downloading 2012-02-19
...page 1
...page 2
Writing to tempdata/articles/2012-02-19.json
Downloading 2012-02-20
...page 1
...page 2
Writing to tempdata/ar

...page 2
Writing to tempdata/articles/2012-05-12.json
Downloading 2012-05-13
...page 1
Writing to tempdata/articles/2012-05-13.json
Downloading 2012-05-14
...page 1
...page 2
Writing to tempdata/articles/2012-05-14.json
Downloading 2012-05-15
...page 1
...page 2
...page 3
Writing to tempdata/articles/2012-05-15.json
Downloading 2012-05-16
...page 1
...page 2
Writing to tempdata/articles/2012-05-16.json
Downloading 2012-05-17
...page 1
...page 2
...page 3
Writing to tempdata/articles/2012-05-17.json
Downloading 2012-05-18
...page 1
...page 2
...page 3
Writing to tempdata/articles/2012-05-18.json
Downloading 2012-05-19
...page 1
...page 2
Writing to tempdata/articles/2012-05-19.json
Downloading 2012-05-20
...page 1
Writing to tempdata/articles/2012-05-20.json
Downloading 2012-05-21
...page 1
...page 2
Writing to tempdata/articles/2012-05-21.json
Downloading 2012-05-22
...page 1
...page 2
Writing to tempdata/articles/2012-05-22.json
Downloading 2012-05-23
...page 1
...page 2
Writing to t

Writing to tempdata/articles/2012-08-12.json
Downloading 2012-08-13
...page 1
...page 2
Writing to tempdata/articles/2012-08-13.json
Downloading 2012-08-14
...page 1
...page 2
Writing to tempdata/articles/2012-08-14.json
Downloading 2012-08-15
...page 1
...page 2
Writing to tempdata/articles/2012-08-15.json
Downloading 2012-08-16
...page 1
...page 2
Writing to tempdata/articles/2012-08-16.json
Downloading 2012-08-17
...page 1
...page 2
...page 3
Writing to tempdata/articles/2012-08-17.json
Downloading 2012-08-18
...page 1
...page 2
Writing to tempdata/articles/2012-08-18.json
Downloading 2012-08-19
...page 1
Writing to tempdata/articles/2012-08-19.json
Downloading 2012-08-20
...page 1
...page 2
Writing to tempdata/articles/2012-08-20.json
Downloading 2012-08-21
...page 1
...page 2
Writing to tempdata/articles/2012-08-21.json
Downloading 2012-08-22
...page 1
...page 2
Writing to tempdata/articles/2012-08-22.json
Downloading 2012-08-23
...page 1
...page 2
Writing to tempdata/articles/201

ChunkedEncodingError: ("Connection broken: ConnectionResetError(54, 'Connection reset by peer')", ConnectionResetError(54, 'Connection reset by peer'))

In [10]:
!ls tempdata/articles/*.json | wc -l # number of files with news articles

     990


In [11]:
!grep webPublicationDate tempdata/articles/*.json | wc -l # count how many articles we have

  315778


In [35]:
# merge the JSON files into a single file
import glob

read_files = glob.glob("tempdata/articles/*.json")

with open("tempdata/articles/merged_file.json", "w") as outfile:
     for f in read_files:
            with open(f, "r") as infile:
                temp_infile = infile.read()
                outfile.write(temp_infile)
                outfile.write(',\n')


tempdata/articles/2011-12-10.json
tempdata/articles/2011-02-10.json
tempdata/articles/2010-08-09.json
tempdata/articles/2012-01-23.json
tempdata/articles/2010-06-17.json
tempdata/articles/2010-09-24.json
tempdata/articles/2010-05-07.json
tempdata/articles/2011-06-04.json
tempdata/articles/2010-12-03.json
tempdata/articles/2012-09-14.json
tempdata/articles/2010-02-03.json
tempdata/articles/2011-05-14.json
tempdata/articles/2011-07-29.json
tempdata/articles/2010-11-13.json
tempdata/articles/2012-06-27.json
tempdata/articles/2010-01-13.json
tempdata/articles/2011-06-12.json
tempdata/articles/2012-09-02.json
tempdata/articles/2010-02-15.json
tempdata/articles/2010-10-28.json
tempdata/articles/2010-12-15.json
tempdata/articles/2012-05-21.json
tempdata/articles/2011-05-02.json
tempdata/articles/2011-09-21.json
tempdata/articles/2010-01-05.json
tempdata/articles/2010-11-05.json
tempdata/articles/2011-02-06.json
tempdata/articles/2011-12-06.json
tempdata/articles/2010-06-01.json
tempdata/artic

tempdata/articles/2010-08-08.json
tempdata/articles/2010-06-16.json
tempdata/articles/2010-07-30.json
tempdata/articles/2011-10-27.json
tempdata/articles/2012-01-29.json
tempdata/articles/2010-08-03.json
tempdata/articles/2012-03-14.json
tempdata/articles/2010-04-20.json
tempdata/articles/2011-07-23.json
tempdata/articles/2010-11-19.json
tempdata/articles/2012-04-10.json
tempdata/articles/2010-03-24.json
tempdata/articles/2010-01-19.json
tempdata/articles/2011-08-10.json
tempdata/articles/2010-12-09.json
tempdata/articles/2010-02-09.json
tempdata/articles/2011-05-08.json
tempdata/articles/2012-08-25.json
tempdata/articles/2012-04-06.json
tempdata/articles/2011-06-18.json
tempdata/articles/2011-04-25.json
tempdata/articles/2011-08-06.json
tempdata/articles/2012-09-08.json
tempdata/articles/2010-10-22.json
tempdata/articles/2012-07-16.json
tempdata/articles/2011-03-21.json
tempdata/articles/2010-07-26.json
tempdata/articles/2011-10-31.json
tempdata/articles/2010-08-15.json
tempdata/artic

tempdata/articles/2012-02-01.json
tempdata/articles/2010-09-16.json
tempdata/articles/2011-08-28.json
tempdata/articles/2012-05-05.json
tempdata/articles/2010-12-31.json
tempdata/articles/2011-05-26.json
tempdata/articles/2011-09-05.json
tempdata/articles/2010-01-21.json
tempdata/articles/2010-11-21.json
tempdata/articles/2012-06-15.json
tempdata/articles/2012-04-28.json
tempdata/articles/2010-08-17.json
tempdata/articles/2010-06-09.json
tempdata/articles/2010-05-19.json
tempdata/articles/2010-07-24.json
tempdata/articles/2011-03-23.json
tempdata/articles/2012-07-14.json
tempdata/articles/2012-05-29.json
tempdata/articles/2010-10-20.json
tempdata/articles/2011-08-04.json
tempdata/articles/2011-04-27.json
tempdata/articles/2012-04-04.json
tempdata/articles/2010-03-30.json
tempdata/articles/2012-08-27.json
tempdata/articles/2011-09-29.json
tempdata/articles/2012-07-02.json
tempdata/articles/2011-08-12.json
tempdata/articles/2010-03-26.json
tempdata/articles/2012-08-31.json
tempdata/artic

tempdata/articles/2011-02-28.json
tempdata/articles/2011-12-28.json
tempdata/articles/2011-10-15.json
tempdata/articles/2010-04-12.json
tempdata/articles/2012-03-26.json
tempdata/articles/2010-08-31.json
tempdata/articles/2011-07-11.json
tempdata/articles/2010-03-16.json
tempdata/articles/2012-08-01.json
tempdata/articles/2012-04-22.json
tempdata/articles/2011-04-01.json
tempdata/articles/2011-08-22.json
tempdata/articles/2012-02-27.json
tempdata/articles/2010-09-30.json
tempdata/articles/2010-05-13.json
tempdata/articles/2011-11-14.json
tempdata/articles/2011-03-29.json
tempdata/articles/2011-01-14.json
tempdata/articles/2010-06-03.json
tempdata/articles/2011-12-04.json
tempdata/articles/2011-02-04.json
tempdata/articles/2010-11-07.json
tempdata/articles/2010-01-07.json
tempdata/articles/2011-09-23.json
tempdata/articles/2012-05-23.json
tempdata/articles/2010-12-17.json
tempdata/articles/2010-02-17.json
tempdata/articles/2011-06-10.json
tempdata/articles/2010-01-11.json
tempdata/artic

## Get U.S. news articles
I had a hard time finding newspaper APIs that would allow me to download the entire article. Instead, I'm going to use Selenium to scrape articles from USA Today.

In [270]:
import urllib.request, json 

outfile = open('nytimes_data.txt', 'w')

for offset in range(10):
    #address = "https://api.nytimes.com/svc/search/v2/articlesearch.json?offset=%s&api-key=cdbf688a895b41dfa24a692d2b85a96a" % str(offset)
    address = "https://api.nytimes.com/svc/search/v2/articlesearch.json?offset=%s&api-key=cdbf688a895b41dfa24a692d2b85a96a" % str(offset)
    with urllib.request.urlopen(address) as url:
        data = json.loads(url.read().decode())
        print(type(data))
        json.dump(data, outfile)

        #api = articleAPI('cdbf688a895b41dfa24a692d2b85a96a')

HTTPError: HTTP Error 504: GATEWAY_TIMEOUT

In [260]:
articles = api.search(q = '',
     fq = {'source':['The New York Times']}, 
     begin_date = 20001231 )

TypeError: must be str, not bytes

In [256]:
from selenium import webdriver

path_to_chromedriver = '/Users/dan/Downloads/chromedriver' # change path as needed
browser = webdriver.Chrome(executable_path = path_to_chromedriver)


In [257]:
url = 'http://www.usatoday.com'
browser.get(url)

#driver.get("http://www.python.org")
#assert "Python" in driver.title
#elem = driver.find_element_by_name("q")
#elem.clear()
#elem.send_keys("pycon")
#elem.send_keys(Keys.RETURN)
#assert "No results found." not in driver.page_source
#driver.close()

Time to create the database

In [63]:
# This JSON file from The Guardian is too large (3.2 GB) to do much with. Instead, I'm going to put it into a Postgres
# database
#

from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd

In [64]:
# Define a database name
# Set your postgres username
dbname = 'news_articles'
username = 'dan' # change this to your username


In [91]:
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
from sqlalchemy import create_engine
engine = create_engine('postgres://%s@localhost/%s'%(username,dbname))
print(engine.url)

postgres://dan@localhost/news_articles


In [92]:
## create a database (if it doesn't exist)
if not database_exists(engine.url):
    create_database(engine.url)
print(database_exists(engine.url))

True


In [247]:
#Create tables in database
command = (
        """
        CREATE TABLE sentences (
            id VARCHAR(150),
            source VARCHAR(20),
            webPublicationDate VARCHAR(150),
            productionOffice CHAR(2),
            sentence TEXT
            )
        """)
print(command)


        CREATE TABLE sentences (
            id VARCHAR(150),
            source VARCHAR(20),
            webPublicationDate VARCHAR(150),
            productionOffice CHAR(2),
            sentence TEXT
            )
        


In [248]:
conn = None
conn = psycopg2.connect(database = dbname, user = username)
cur = conn.cursor()

# create table one by one
cur.execute(command)
# close communication with the PostgreSQL database server
cur.close()
# commit the changes
conn.commit()

In [253]:
# This will delete the database table named 'sentences'
#import psycopg2
#conn = psycopg2.connect("dbname='news_articles' user='dan'")
#cur = conn.cursor()
#cur.execute('DROP TABLE "sentences";')  
#conn.commit()
#conn.close()

In [250]:
#Check to see if tables exist in db
cur = conn.cursor()
cur.execute("select relname from pg_class where relkind='r' and relname !~ '^(pg_|sql_)';")
print (cur.fetchall())

[('sentences',)]


In [252]:
# read a database from JSON and load it into a pandas dataframe
# here I'm adding data to the database where each row contains the id, source, publicationDate, productionOffice, sentence
# to create the sentences, I'm splitting on each '.'. This is not ideal if there are a lot of abbreviations, but I'm
# assuming there aren't.

import glob, json

#Create query to insert data to products table
add_to_guardian = """INSERT INTO guardian(id,webPublicationDate,productionOffice,bodyText) VALUES(%s,%s,%s,%s)"""
read_files = glob.glob("tempdata/articles/2*.json")

# process a subset of the articles from The Guardian
for read_file in read_files:
    with open(read_file) as json_file:
        print(read_file)
        json_data = json.load(json_file) # this opens all entries. we need to iterate through each one to get the important data
        for json_entry in json_data:
            temp_id = json_entry['id']
            temp_source = 'guardian'
            temp_webPublicationDate = json_entry['webPublicationDate']
            temp_productionOffice = json_entry['fields']['productionOffice']
            temp_bodyText = json_entry['fields']['bodyText']

            for sentence in temp_bodyText.split('.'):
                add_to_guardian = """INSERT INTO sentences(id,source,webPublicationDate,productionOffice,sentence) VALUES(%s,%s,%s,%s,%s)"""
                data = (temp_id,temp_source,temp_webPublicationDate,temp_productionOffice,sentence)
                cur.execute(add_to_guardian,data)


cur.close()
conn.commit()

    


tempdata/articles/2011-12-10.json
tempdata/articles/2011-02-10.json
tempdata/articles/2010-08-09.json
tempdata/articles/2012-01-23.json
tempdata/articles/2010-06-17.json
tempdata/articles/2010-09-24.json
tempdata/articles/2010-05-07.json


KeyboardInterrupt: 

In [139]:
cur = conn.cursor()
cur.execute("SELECT count(bodyText) FROM guardian LIMIT 10;")
print (cur.fetchall())

[(17873,)]


In [140]:
# Read in the data into Pandas
df = pd.read_sql_query('select * from "guardian"',con=engine)

In [141]:
df.head()


Unnamed: 0,id,source,webpublicationdate,productionoffice,bodytext
0,world/2010/jan/13/haiti-comment-madison-smartt...,guardian,2010-01-13T23:58:15Z,UK,Haiti is famously the poorest country in the w...
1,world/2010/jan/13/haiti-earthquake-aftermath-p...,guardian,2010-01-13T23:43:17Z,UK,Haiti was a humanitarian disaster even before ...
2,stage/2010/jan/13/trilogy-review,guardian,2010-01-13T23:30:00Z,UK,"Nothing's perfect and neither is Trilogy, Nic ..."
3,commentisfree/2010/jan/13/china-google-interne...,guardian,2010-01-13T23:00:01Z,UK,As the Twittersphere exploded with news that G...
4,music/2010/jan/13/john-mayer-review,guardian,2010-01-13T23:00:00Z,UK,John Mayer is best known in the UK as a Heat m...


In [148]:
# From here on out we will focus on the text portions of the dataset. Tokenizing the text of article using sci-kits Count Vectorizer(bag of words methodolgy):

# Articles from The Guardian
guardian_count_vect = CountVectorizer(stop_words="english")
guardian_matrix_CV = guardian_count_vect.fit_transform(df['bodytext'][df['source'].isin(['guardian'])])
guardian_features  = guardian_count_vect.get_feature_names()

# Articles NOT from The Guardian
not_guardian_count_vect = CountVectorizer(stop_words="english")
not_guardian_matrix_CV = not_guardian_count_vect.fit_transform(df['bodytext'][df['source'].isin(['fake_npr'])])
not_guardian_features  = not_guardian_count_vect.get_feature_names()


In [149]:
guardian_count_vect.vocabulary_.get(u'algorithm')
not_guardian_count_vect.vocabulary_.get(u'algorithm')

6621

<br /><br /><br /><br />
# WORD2VEC TESTING

In [206]:
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
#from gensim.models.translation_matrix import BackMappingTranslationMatrix
from gensim.models.translation_matrix import TranslationMatrix
import logging

In [172]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences_lower = word2vec.Text8Corpus('~/Downloads/text8')
sentences_upper = word2vec.Text8Corpus('~/Downloads/text8_upper')
 


In [173]:
model_lower = word2vec.Word2Vec(sentences_lower, size=200) # size is the number of layers in NN, bigger requires more data, but could be more accurate
model_upper = word2vec.Word2Vec(sentences_upper, size=200)


2018-01-15 14:30:02,950 : INFO : collecting all words and their counts
2018-01-15 14:30:02,956 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-15 14:30:08,133 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2018-01-15 14:30:08,134 : INFO : Loading a fresh vocabulary
2018-01-15 14:30:08,511 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)
2018-01-15 14:30:08,512 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)
2018-01-15 14:30:08,744 : INFO : deleting the raw counts dictionary of 253854 items
2018-01-15 14:30:08,754 : INFO : sample=0.001 downsamples 38 most-common words
2018-01-15 14:30:08,755 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)
2018-01-15 14:30:08,755 : INFO : estimated required memory for 71290 words and 200 dimensions: 149709000 bytes
2018-01-15 14:30:09,015 : INFO : resetting la

2018-01-15 14:31:16,964 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)
2018-01-15 14:31:17,185 : INFO : deleting the raw counts dictionary of 253854 items
2018-01-15 14:31:17,197 : INFO : sample=0.001 downsamples 38 most-common words
2018-01-15 14:31:17,198 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)
2018-01-15 14:31:17,198 : INFO : estimated required memory for 71290 words and 200 dimensions: 149709000 bytes
2018-01-15 14:31:17,461 : INFO : resetting layer weights
2018-01-15 14:31:18,483 : INFO : training model with 3 workers on 71290 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2018-01-15 14:31:19,498 : INFO : PROGRESS: at 1.65% examples, 1009652 words/s, in_qsize 5, out_qsize 0
2018-01-15 14:31:20,503 : INFO : PROGRESS: at 3.42% examples, 1055134 words/s, in_qsize 5, out_qsize 0
2018-01-15 14:31:21,513 : INFO : PROGRESS: at 5.07% examples, 1044366 words/s, in_qsize 6

In [175]:
model_lower.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)


  """Entry point for launching an IPython kernel.


[('queen', 0.6422957181930542)]

In [176]:
model_upper.most_similar(positive=['WOMAN', 'KING'], negative=['MAN'], topn=1)

  """Entry point for launching an IPython kernel.


[('QUEEN', 0.6810092926025391)]

In [177]:
model_lower.most_similar(positive=['woman', 'king'], negative=['man'], topn=2) # woman + king - man = queen (or prince)


  """Entry point for launching an IPython kernel.


[('queen', 0.6422957181930542), ('prince', 0.5756404399871826)]

In [178]:
model_lower.most_similar(['man'])


  """Entry point for launching an IPython kernel.


[('woman', 0.7012708783149719),
 ('girl', 0.6079072952270508),
 ('creature', 0.5665295124053955),
 ('person', 0.5221549868583679),
 ('boy', 0.5119364261627197),
 ('evil', 0.511446475982666),
 ('thief', 0.5090669393539429),
 ('bride', 0.49297720193862915),
 ('totoro', 0.4882364571094513),
 ('god', 0.4881860315799713)]

In [195]:
model_lower.save('text8_lower.model')
model_lower.wv.save_word2vec_format('text_lower.model.bin', binary=True)
model1_lower = KeyedVectors.load_word2vec_format('text_lower.model.bin', binary=True)
model1_lower.most_similar(['girl', 'father'], ['boy'], topn=3) # girl is to father as boy is to ???


2018-01-15 15:47:37,417 : INFO : saving Word2Vec object under text8_lower.model, separately None
2018-01-15 15:47:37,418 : INFO : storing np array 'syn0' to text8_lower.model.wv.syn0.npy
2018-01-15 15:47:37,460 : INFO : not storing attribute syn0norm
2018-01-15 15:47:37,461 : INFO : storing np array 'syn1neg' to text8_lower.model.syn1neg.npy
2018-01-15 15:47:37,506 : INFO : not storing attribute cum_table
2018-01-15 15:47:37,686 : INFO : saved text8_lower.model
2018-01-15 15:47:37,687 : INFO : storing 71290x200 projection weights into text_lower.model.bin
2018-01-15 15:47:38,116 : INFO : loading projection weights from text_lower.model.bin
2018-01-15 15:47:38,855 : INFO : loaded (71290, 200) matrix from text_lower.model.bin
2018-01-15 15:47:38,882 : INFO : precomputing L2-norms of word weight vectors


[('mother', 0.7659491300582886),
 ('wife', 0.7009589672088623),
 ('grandmother', 0.6990747451782227)]

In [197]:
model_upper.save('text8_upper.model')
model_upper.wv.save_word2vec_format('text_upper.model.bin', binary=True)
model1_upper = KeyedVectors.load_word2vec_format('text_upper.model.bin', binary=True)
model1_upper.most_similar(['GIRL', 'FATHER'], ['BOY'], topn=3) # girl is to father as boy is to ???

2018-01-15 15:48:04,209 : INFO : saving Word2Vec object under text8_upper.model, separately None
2018-01-15 15:48:04,210 : INFO : storing np array 'syn0' to text8_upper.model.wv.syn0.npy
2018-01-15 15:48:04,256 : INFO : not storing attribute syn0norm
2018-01-15 15:48:04,257 : INFO : storing np array 'syn1neg' to text8_upper.model.syn1neg.npy
2018-01-15 15:48:04,302 : INFO : not storing attribute cum_table
2018-01-15 15:48:04,482 : INFO : saved text8_upper.model
2018-01-15 15:48:04,483 : INFO : storing 71290x200 projection weights into text_upper.model.bin
2018-01-15 15:48:04,947 : INFO : loading projection weights from text_upper.model.bin
2018-01-15 15:48:05,683 : INFO : loaded (71290, 200) matrix from text_upper.model.bin
2018-01-15 15:48:05,684 : INFO : precomputing L2-norms of word weight vectors


[('MOTHER', 0.768741250038147),
 ('GRANDMOTHER', 0.7020315527915955),
 ('WIFE', 0.6858149170875549)]

In [198]:
more_examples = ["he is she", "big bigger bad", "going went being"] 

for example in more_examples:
     a, b, x = example.split()
     predicted = model.most_similar([x, b], [a])[0][0]
     print("'%s' is to '%s' as '%s' is to '%s'" % (a, b, x, predicted))


'he' is to 'is' as 'she' is to 'exists'
'big' is to 'bigger' as 'bad' is to 'worse'
'going' is to 'went' as 'being' is to 'was'


  """


In [199]:
tags = [('one', 'ONE'),('two', 'TWO'),('three', 'THREE'),('four', 'FOUR'),('five', 'FIVE'),('six', 'SIX'),('seven', 'SEVEN'),('eight', 'EIGHT'),('nine', 'NINE'),('ten', 'TEN'),('the', 'THE'),('a', 'A'),('that', 'THAT'),('this', 'THIS'),('zero', 'ZERO')]

In [217]:
# look at a translation between the upper and lowercase files that I made
#transmat = BackMappingTranslationMatrix(tagged_docs=, source_lang_vec=model1_lower, target_lang_vec=model1_upper, random_state=None)
transmat = TranslationMatrix(source_lang_vec=model1_lower, target_lang_vec=model1_upper, random_state=None)
transmat.train(tags)
print ("the shape of translation matrix is: ", transmat.translation_matrix.shape)
translated_word = transmat.translate(['words'], topn=3)
#gensim.models.translation_matrix.BackMappingTranslationMatrix(tags, model_lower, model_upper, random_state=None)

the shape of translation matrix is:  (200, 200)


  "The parameter source_lang_vec isn't specified, "
  "The parameter target_lang_vec isn't specified, "


In [233]:
# Now we're going to test our translation

words = [("vary", "ONE"), ("two", "TWO"), ("three", "THREE"), ("four", "FOUR"), ("five", "FIVE")]
source_word, target_word = zip(*words)
translated_word = transmat.translate(source_word, 5,)



  "The parameter source_lang_vec isn't specified, "
  "The parameter target_lang_vec isn't specified, "


In [234]:
#print(dir(translated_word))

for k, v in translated_word.items():
    print ("word ", k, " and translated word", v)

word  vary  and translated word ['THIS', 'TEN', 'MULTIPLE', 'THESE', 'CERTAIN']
word  two  and translated word ['TWO', 'THREE', 'FOUR', 'ZERO', 'FIVE']
word  three  and translated word ['THREE', 'FOUR', 'FIVE', 'SIX', 'TWO']
word  four  and translated word ['FOUR', 'FIVE', 'SIX', 'THREE', 'SEVEN']
word  five  and translated word ['FIVE', 'FOUR', 'SIX', 'THREE', 'SEVEN']


<br /><br /><br /><br />
# WORD2VEC TESTING WITH PHRASES

In [229]:
from gensim.models.phrases import Phrases

In [239]:
phrases_lower = Phrases(open('/Users/dan/Downloads/text8', 'r').readlines())
phrases_upper = Phrases(open('/Users/dan/Downloads/text8_upper', 'r').readlines())

2018-01-15 16:26:12,562 : INFO : collecting all words and their counts
2018-01-15 16:26:12,563 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types


KeyboardInterrupt: 