In [1]:
import gensim, string
import pandas as pd
import numpy as np

In [2]:
from gensim.models import word2vec
print(word2vec.FAST_VERSION)

0


In [3]:
news = pd.read_csv('../data/news.csv')

In [4]:
news.head()

Unnamed: 0,title,score,num_comments,created_utc
0,Texas teen tackled by cop at pool party files ...,0,0,1483629018
1,John Kerry In Leaked Audio Admits U.S. Allowed...,0,0,1483872751
2,2017 American Liberty 225th Anniversary Gold C...,0,0,1484327875
3,Repair broken glass with Sensible cost | Call ...,0,0,1484798494
4,Today is the last day to register for Obamacare,0,0,1485872414


In [5]:
from nltk.tokenize import RegexpTokenizer

pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
tokenizer = RegexpTokenizer(pattern)

def prepare_corpus(sentences, tok):
    tok_sentences = [tok.tokenize(x) for x in sentences]
    return [[x.lower() for x in y if x not in string.punctuation] for y in tok_sentences]

In [6]:
news_corpus = prepare_corpus(news['title'], tokenizer)

In [7]:
news_corpus[0]

['texas',
 'teen',
 'tackled',
 'by',
 'cop',
 'at',
 'pool',
 'party',
 'files',
 'lawsuit']

In [8]:
news_model = gensim.models.Word2Vec(news_corpus,
                                    sg=0, # CBOW vs. skip-gram
                                    size=100, # feature vectors' length
                                    window=5, # window size
                                    min_count=1, # ignore all words with total frequency lower than this
                                    negative=1, # if set to 0, no negative samping is used
                                    seed=123
                                   )

In [9]:
news_model.wv.similarity('poland', 'u.s.')

0.703736903364041

In [10]:
news_model.wv.similarity('poland', 'czech')

0.55689644623002

In [11]:
news_model.wv.similarity('poland', 'dog')

0.22361810944524252

In [17]:
news_model.wv.most_similar(positive=['woman', 'actor'], negative=['man'])

[('actress', 0.8806039690971375),
 ('playboy', 0.8434235453605652),
 ('chris', 0.8396222591400146),
 ('mary', 0.8303409218788147),
 ('vinod', 0.8268942832946777),
 ('singer', 0.8241496682167053),
 ('kapoor', 0.8117820620536804),
 ('mlb', 0.8042533993721008),
 ('bruce', 0.803532600402832),
 ('bryan', 0.800155520439148)]

In [13]:
news_model.wv.similarity('woman', 'man')

0.9511879400496241

In [14]:
games = pd.read_csv('../data/gaming.csv')
games_corpus = prepare_corpus(games['title'], tokenizer)

In [15]:
games_model = gensim.models.Word2Vec(games_corpus,
                                     min_count=1,
                                     seed=123
                                    )

In [16]:
def print_most_similar(word, top_n=10):
    games_set = games_model.wv.most_similar(word, topn=top_n)
    news_set = news_model.wv.most_similar(word, topn=top_n)
    games_set = [x[0] for x in games_set]
    news_set = [x[0] for x in news_set]
    print(word)
    print('Games:', ', '.join(games_set))
    print('News:', ', '.join(news_set))
    print()

    
print_most_similar('mouse')
print_most_similar('bomb')
print_most_similar('blood')
print_most_similar('war')

mouse
Games: keyboard, monitor, headphones, headset, laptop, usb, mechanical, calibrator, wireless, router
News: color, perfume, glasses, lip, bifold, applique, herbalife, workout, hoodies, pawn

bomb
Games: grenade, dirty, headshot, collateral, camper, shotgun, plane, night., quad, rat
News: deadly, attack, incident, syrian, terror, weapons, ship, terrorist, strikes, jewish

blood
Games: knights, chaos, quest, slayer, necromancer, knight, gatez, puscaria, lazarski, samurai
News: cancer, patient, alcohol, dementia, disease, brain, organs, excessive, ptsd, epilepsy

war
Games: mordor, bloodsheddayz, snowzilla, wardayz, hanzozeib, morder, warcraft, annihilation, shortround, 402
News: nuclear, syria, peace, threat, conflict, iran, catastrophe, missiles, moab, democracy

