In [1]:
import gensim
import nltk
import sklearn
import pandas as pd
import keras
import csv
import time
from nltk.tokenize import TreebankWordTokenizer
import nltk, re, pprint
from nltk import word_tokenize

Using TensorFlow backend.


In [2]:
# Location of the CSV file with the articles and the text encoding used to read it.
DATA_PATH = 'Articles.csv'
ENCODING = "utf-8"

In [28]:
import re
# Matches strings made up solely of non-word characters or solely of ASCII digits
# (the empty string matches too). Used below to discard punctuation-only and
# purely numeric tokens.
nonword = re.compile(r'^(\W*|[0-9]*)$')

In [3]:
# Load the articles into a DataFrame; the trailing expression displays its column labels.
corpus = pd.read_csv(DATA_PATH, encoding=ENCODING)
corpus.columns

Index(['Unnamed: 0', 'id', 'title', 'text'], dtype='object')

In [4]:
# Take the row with index label 13 as a single sample article and keep its text
# (the sentences displayed further down show it is the article about Australia).
australia = corpus.loc[13]
australiaArticle = australia.text

In [6]:
# Stop list = NLTK's English stop words followed by punctuation / symbol tokens
# that the tokenizers emit as separate words.
stopwords = [
    *nltk.corpus.stopwords.words('english'),
    *'- -- " \' ? , . ! * ** *** ( ) = == === : ; \'\' ` `` [ ] & %'.split(),
]

In [7]:
from nltk import word_tokenize
# Tokenize the sample article twice: once with NLTK's word_tokenize and once
# with an explicit TreebankWordTokenizer instance.
wt_tokens = word_tokenize(australiaArticle)
tokenizer = TreebankWordTokenizer()
tb_tokens = tokenizer.tokenize(australiaArticle)

In [16]:
from nltk.tokenize import sent_tokenize
# Split the sample article into sentences and display sentences 10 through 14.
sentence_tokens = sent_tokenize(australiaArticle)
sentence_tokens[10:15]

['The continent of Australia, including the island of Tasmania, was separated from the other continents of the world many millions of years ago.',
 'Because of this, many animals and plants live in Australia that do not live anywhere else.',
 'These include animals like the kangaroo, the koala, the emu, the kookaburra, and the platypus.',
 'People first arrived in Australia more than 50,000 years ago.',
 'These native Australians are called the Australian Aborigines.']

In [45]:
def prepare_data(text):
    """Segment every article into sentences and each sentence into filtered tokens.

    Args:
        text: iterable of article texts (e.g. the ``text`` column of the corpus).
            Non-string values are converted with ``str``; missing values
            (``None`` or NaN) are skipped so they do not end up in the output
            as the literal sentences "None" / "nan".

    Returns:
        A flat list with one entry per sentence across all articles. Each entry
        is the list of that sentence's word tokens, excluding tokens found in
        ``stopwords`` and tokens matching ``nonword``.
    """
    # Build the lookup set once per call: membership tests on the stop-word
    # list would otherwise be a linear scan for every token.
    stopword_set = set(stopwords)
    corpus_tokens = []
    for article in text:
        # NaN is the only float that is not equal to itself.
        if article is None or (isinstance(article, float) and article != article):
            continue
        for sentence in sent_tokenize(str(article)):
            corpus_tokens.append([
                word for word in word_tokenize(sentence)
                if word not in stopword_set and not nonword.search(word)
            ])
    return corpus_tokens

In [46]:
# Turn every article's text into filtered, tokenized sentences (the Word2Vec training input).
corpus_tokens = prepare_data(corpus.text)

In [47]:
# Show the token list of the first sentence.
corpus_tokens[0]

['April', '4th', 'month', 'year', 'comes', 'March', 'May']

In [37]:
from gensim.models.word2vec import Word2Vec
import multiprocessing

In [48]:
# Word2Vec hyperparameters, passed below as size / min_count / window / sample.
# NOTE(review): the `size` keyword is the pre-4.0 gensim spelling (later renamed
# `vector_size`) — confirm against the installed gensim version.
num_features = 400
min_word_count = 5
window_size = 6
subsampling = 1e-3

# Use one worker thread per CPU reported by the OS.
num_workers = multiprocessing.cpu_count()

model_name = 'simple_wiki_word2vec_model'

# Train on the tokenized sentences, then write the model to disk under `model_name`.
model = Word2Vec(
    corpus_tokens,
    size=num_features,
    window=window_size,
    min_count=min_word_count,
    sample=subsampling,
    workers=num_workers,
)
model.save(model_name)

In [None]:
# this frees up memory by discarding unneeded data 
# but the model cannot be trained further after this
# model.init_sims(replace=True)

# Alternatively, try this one recommended by gensim docs:
# word_vectors = model.wv
# del model

In [51]:
 # Five words closest to the combination of 'Germany' and 'France'.
 model.wv.most_similar(positive=['Germany', 'France'], topn=5)

[('Belgium', 0.6663845777511597),
 ('Austria', 0.5919970273971558),
 ('Prussia', 0.5819699764251709),
 ('German', 0.5740652680397034),
 ('Netherlands', 0.5707762837409973)]

In [52]:
# Ask the model which of the four words does not belong with the others.
# NOTE(review): "Tindr" looks like a typo for "Tinder"; if it is not in the
# vocabulary it presumably takes no part in the comparison — confirm.
model.wv.doesnt_match("Microsoft Facebook Tindr Qantas".split())

'Qantas'

In [61]:
# Can the model spot a thing of a different kind? Three shapes plus 'vector'.
model.wv.doesnt_match("circle square triangle vector".split())
# Works as hoped: the recorded result is 'vector'.

'vector'

In [62]:
# Can the model spot a word of a different part of speech? Three adjectives plus the noun 'pier'.
model.wv.doesnt_match("beautiful colorful implicit pier".split())
# Does not work: the recorded result is 'implicit' rather than 'pier'.

'implicit'

In [54]:
# Analogy query (king - man + woman); show only the best match.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.6915752291679382)]

In [56]:
# Same analogy query, scored with most_similar_cosmul instead of most_similar.
model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.9017835259437561)]

In [74]:
# Similarity score between the vectors of 'era' and 'epoch'.
model.wv.similarity('era', 'epoch')

0.58739245

In [76]:
# Five words whose vectors are closest to 'Qantas'.
model.wv.similar_by_word("Qantas", topn=5)

[('Jetstar', 0.8130829334259033),
 ('Airbus', 0.8073384761810303),
 ('Cargo', 0.7871814966201782),
 ('A380', 0.7869985103607178),
 ('KLM', 0.772258996963501)]

In [81]:
# Word Mover's Distance between two lowercased, whitespace-split sentences.
# NOTE(review): the recorded run of this cell failed with
#   ModuleNotFoundError: No module named 'pyemd'
# so that package must be installed before wmdistance can be used.
# NOTE(review): the sentences are lowercased here, but prepare_data keeps the
# original casing of corpus tokens (e.g. 'April', 'Germany'), so words such as
# 'obama' or 'chicago' may be missing from the vocabulary — confirm.
# NOTE(review): `similarity` holds the value returned by wmdistance, i.e. a
# distance, so the name is misleading.
sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
sentence_president = 'The president greets the press in Chicago'.lower().split()
similarity = model.wv.wmdistance(sentence_obama, sentence_president)
print("{:.4f}".format(similarity))

ModuleNotFoundError: No module named 'pyemd'

In [160]:
import en_core_web_lg  # python -m spacy download en_core_web_lg --user
# Load the spaCy pipeline and parse the sample article.
parser = en_core_web_lg.load()
doc = parser(australiaArticle)

In [87]:
# Merge every named-entity span of `doc` into a single token (modifies `doc` in place).
with doc.retokenize() as retokenizer:
    for ent in doc.ents:
        retokenizer.merge(doc[ent.start:ent.end])

In [120]:
def tokenize(text):
    """Parse ``text`` with the spaCy pipeline and return its content tokens.

    Each named-entity span is first merged into one token whose LEMMA attribute
    is set to the entity text. A token is then kept only if it is not
    punctuation, its text does not match ``nonword``, and its lowercased text
    is not in ``stopwords``.

    Returns:
        A list of the token objects yielded by iterating the parsed document
        (not plain strings).
    """
    parsed = parser(text)
    with parsed.retokenize() as merger:
        for entity in parsed.ents:
            merger.merge(parsed[entity.start:entity.end], attrs={"LEMMA": entity.text})

    kept = []
    for token in parsed:
        if token.is_punct:
            continue
        surface = str(token)
        if nonword.search(surface):
            continue
        if surface.lower() in stopwords:
            continue
        kept.append(token)
    return kept

In [121]:
# Run the entity-merging tokenizer on the sample article.
test2 = tokenize(australiaArticle)

In [99]:
# Preview the first 20 filtered tokens. The original line referenced `test`,
# which is never defined in this notebook; the tokenizer result is `test2`.
test2[:20]

[,
 Australia,
 formally,
 Commonwealth of Australia,
 country,
 sovereign,
 state,
 southern,
 hemisphere,
 located,
 Oceania,
 capital,
 city,
 Canberra,
 largest,
 city,
 Sydney,
 
 ,
 Australia,
 List]