# Games, news - context matters!

In [1]:
import gensim, string
import pandas as pd
import numpy as np

In [2]:
from gensim.models import word2vec

In [3]:
news = pd.read_csv('../data/news_lang.csv')

In [4]:
news.head()

Unnamed: 0,title,score,num_comments,created_utc
0,Texas teen tackled by cop at pool party files ...,0,0,1483629018
1,John Kerry In Leaked Audio Admits U.S. Allowed...,0,0,1483872751
2,2017 American Liberty 225th Anniversary Gold C...,0,0,1484327875
3,Repair broken glass with Sensible cost | Call ...,0,0,1484798494
4,Today is the last day to register for Obamacare,0,0,1485872414


## Tokenization

`gensims`'s `Word2Vec` as an argument takes a corpus as a list of lists of tokens. Let's prepare the tokenized form of the corpus we have.

A `RegexpTokenizer` (from `nltk`) splits a string into substrings using a regular expression. For example, the following tokenizer forms tokens out of alphabetic and numeric sequences. It splits words on *most* punctuation marks. It keeps acronyms and numerics unsplitted (e.g. *U.S.A.* will not be splitted).


In [36]:
from nltk.tokenize import RegexpTokenizer

pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
tokenizer = RegexpTokenizer(pattern)

In [44]:
line = "U.S.A Count U.S.A. Sec.of U.S. Name:Dr.John Doe J.Doe, 1.11 1,000 10--20 10-20"
tokenizer.tokenize(line)

['U.S.A',
 'Count',
 'U.S.A.',
 'Sec',
 '.',
 'of',
 'U.S.',
 'Name',
 ':',
 'Dr',
 '.',
 'John',
 'Doe',
 'J.',
 'Doe',
 ',',
 '1.11',
 '1,000',
 '10',
 '-',
 '-',
 '20',
 '10',
 '-',
 '20']

In [43]:
def prepare_corpus(sentences, tok):
    tok_sentences = [tok.tokenize(x) for x in sentences]
    return [[x.lower() for x in y if x not in string.punctuation] for y in tok_sentences]

In [8]:
news_corpus = prepare_corpus(news['title'], tokenizer)

In [9]:
news_corpus[0]

['texas',
 'teen',
 'tackled',
 'by',
 'cop',
 'at',
 'pool',
 'party',
 'files',
 'lawsuit']

In [10]:
news_model = gensim.models.Word2Vec(news_corpus,
                                    sg=0, # CBOW vs. skip-gram
                                    size=100, # feature vectors' length
                                    window=5, # window size
                                    min_count=1, # ignore all words with total frequency lower than this
                                    negative=1, # if set to 0, no negative samping is used
                                    seed=123
                                   )

In [11]:
news_model.wv.similarity('poland', 'u.s.')

0.6634550511503782

In [12]:
news_model.wv.similarity('poland', 'czech')

0.6995020961367224

In [13]:
news_model.wv.similarity('poland', 'dog')

0.12151777113689202

In [19]:
news_model.wv.most_similar('afghanistan')

[('iraq', 0.9142078757286072),
 ('yemen', 0.9018725156784058),
 ('syria', 0.8834158778190613),
 ('raqqa', 0.8758918642997742),
 ('iraqi', 0.8730102777481079),
 ('mosul', 0.8689094185829163),
 ('libya', 0.8657315373420715),
 ('afghan', 0.8612012267112732),
 ('kirkuk', 0.8582304120063782),
 ('myanmar', 0.8478946089744568)]

In [20]:
news_model.wv.most_similar(positive=['woman', 'actor'], negative=['man'])

[('actress', 0.8610506057739258),
 ('mary', 0.8366692066192627),
 ('playboy', 0.8277504444122314),
 ('bryan', 0.8207701444625854),
 ('sir', 0.8192210793495178),
 ('chris', 0.8170576691627502),
 ('filmmaker', 0.813789963722229),
 ('singer', 0.8101747632026672),
 ('songwriter', 0.7951234579086304),
 ('jerry', 0.7950710654258728)]

In [32]:
news_model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

[('prince', 0.8167217969894409),
 ('drake', 0.8065531253814697),
 ('stephanie', 0.7968810796737671),
 ('queen', 0.7942199110984802),
 ('sajal', 0.785223126411438),
 ('kristen', 0.7827389240264893),
 ('deepika', 0.7776816487312317),
 ('stewart', 0.7765840291976929),
 ('middleton', 0.774808406829834),
 ('beyonce', 0.7734202146530151)]

In [22]:
games = pd.read_csv('../data/gaming.csv')
games_corpus = prepare_corpus(games['title'], tokenizer)

In [23]:
games_model = gensim.models.Word2Vec(games_corpus,
                                     min_count=1,
                                     seed=123
                                    )

In [28]:
def print_most_similar(word, top_n=10):
    games_set = games_model.wv.most_similar(word, topn=top_n)
    news_set = news_model.wv.most_similar(word, topn=top_n)
    games_set = [x[0] for x in games_set]
    news_set = [x[0] for x in news_set]
    print(word)
    print('Games:', ', '.join(games_set))
    print('News:', ', '.join(news_set))
    print()

    
print_most_similar('mouse')
print_most_similar('bomb')
print_most_similar('blood')
print_most_similar('war')

mouse
Games: keyboard, monitor, headphones, router, headset, laptop, chair, headsets, usb, desktop
News: colorful, bouquet, lip, tubs, perfume, alluring, bra, workout, patterns, clothes

bomb
Games: bush, shotgun, sohjy, raiding, bow, dirty, quad, nuke, uzi, spawn
News: deadly, drill, incident, attack, strikes, damascus, flooding, ship, blast, wwii

blood
Games: knights, harvest, titan, osiris, motoi, flashpoint, lvl2, chaos, fusion, rush
News: obesity, drugs, patient, cancer, ptsd, organs, bees, heroin, body, disease

war
Games: mordor, wardayz, annihilation, snowzilla, warzeib, morose, warcraft, fight3, tomrrow, okage
News: nuclear, preemptive, peace, syria, moab, threat, democracy, preparing, iran, ukraine

