# Games, news - context matters!

In [8]:
import gensim, string 
import pandas as pd
import numpy as np

In [9]:
from gensim.models import word2vec

In [10]:
news = pd.read_csv('../data/news_lang.csv')

In [11]:
news.head()

Unnamed: 0,title,score,num_comments,created_utc
0,Texas teen tackled by cop at pool party files ...,0,0,1483629018
1,John Kerry In Leaked Audio Admits U.S. Allowed...,0,0,1483872751
2,2017 American Liberty 225th Anniversary Gold C...,0,0,1484327875
3,Repair broken glass with Sensible cost | Call ...,0,0,1484798494
4,Today is the last day to register for Obamacare,0,0,1485872414


## Tokenization

`gensims`'s `Word2Vec` as an argument takes a corpus as a list of lists of tokens. Let's prepare the tokenized form of the corpus we have.

A `RegexpTokenizer` (from `nltk`) splits a string into substrings using a regular expression. For example, the following tokenizer forms tokens out of alphabetic and numeric sequences. It splits words on *most* punctuation marks. It keeps acronyms and numerics unsplitted (e.g. *U.S.A.* will not be splitted).


In [15]:
from nltk.tokenize import RegexpTokenizer

pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
tokenizer = RegexpTokenizer(pattern)

In [16]:
line = "U.S.A Count U.S.A. Sec.of U.S. Name:Dr.John Doe J.Doe, 1.11 1,000 10--20 10-20"
tokenizer.tokenize(line)

['U.S.A',
 'Count',
 'U.S.A.',
 'Sec',
 '.',
 'of',
 'U.S.',
 'Name',
 ':',
 'Dr',
 '.',
 'John',
 'Doe',
 'J.',
 'Doe',
 ',',
 '1.11',
 '1,000',
 '10',
 '-',
 '-',
 '20',
 '10',
 '-',
 '20']

In [17]:
def prepare_corpus(sentences, tok):
    tok_sentences = [tok.tokenize(x) for x in sentences]
    return [[x.lower() for x in y if x not in string.punctuation] for y in tok_sentences]

In [18]:
news_corpus = prepare_corpus(news['title'], tokenizer)

In [19]:
news_corpus[0]

['texas',
 'teen',
 'tackled',
 'by',
 'cop',
 'at',
 'pool',
 'party',
 'files',
 'lawsuit']

In [20]:
news_model = gensim.models.Word2Vec(news_corpus,
                                    sg=0, # CBOW vs. skip-gram
                                    size=100, # feature vectors' length
                                    window=5, # window size
                                    min_count=1, # ignore all words with total frequency lower than this
                                    negative=1, # if set to 0, no negative samping is used
                                    seed=123
                                   )

In [21]:
news_model.wv.similarity('poland', 'u.s.')

0.658789823696318

In [22]:
news_model.wv.similarity('poland', 'czech')

0.6328641246562836

In [23]:
news_model.wv.similarity('poland', 'dog')

0.1941660502315914

In [24]:
news_model.wv.most_similar('afghanistan')

[('raqqa', 0.9185310006141663),
 ('iraq', 0.9065133333206177),
 ('yemen', 0.9029085636138916),
 ('libya', 0.896985650062561),
 ('syria', 0.892918050289154),
 ('mosul', 0.8667630553245544),
 ('syrian', 0.8576147556304932),
 ('tehran', 0.8539131283760071),
 ('troops', 0.8490171432495117),
 ('taliban', 0.8487639427185059)]

In [25]:
news_model.wv.most_similar(positive=['woman', 'actor'], negative=['man'])

[('actress', 0.8733424544334412),
 ('mary', 0.8640809655189514),
 ('singer', 0.8306462168693542),
 ('chris', 0.8297317624092102),
 ('columnist', 0.8273946046829224),
 ('playboy', 0.8151363730430603),
 ('filmmaker', 0.8150051832199097),
 ('ian', 0.8096398115158081),
 ('chopra', 0.8077244162559509),
 ('comedian', 0.8040573596954346)]

In [26]:
news_model.wv.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.8345198035240173),
 ('arjun', 0.8182545900344849),
 ('noor', 0.8052791953086853),
 ('dave', 0.7958756685256958),
 ('alexander', 0.795805811882019),
 ('dutt', 0.7911546230316162),
 ('kate', 0.7893512845039368),
 ('kerr', 0.7872197031974792),
 ('emma', 0.7852388620376587),
 ('crown', 0.7845532298088074)]

In [27]:
games = pd.read_csv('../data/gaming.csv')
games_corpus = prepare_corpus(games['title'], tokenizer)

In [28]:
games_model = gensim.models.Word2Vec(games_corpus,
                                     min_count=1,
                                     seed=123
                                    )

In [29]:
def print_most_similar(word, top_n=10):
    games_set = games_model.wv.most_similar(word, topn=top_n)
    news_set = news_model.wv.most_similar(word, topn=top_n)
    games_set = [x[0] for x in games_set]
    news_set = [x[0] for x in news_set]
    print(word)
    print('Games:', ', '.join(games_set))
    print('News:', ', '.join(news_set))
    print()

    
print_most_similar('mouse')
print_most_similar('bomb')
print_most_similar('blood')
print_most_similar('war')

mouse
Games: keyboard, headphones, headset, monitor, laptop, headsets, usb, chair, desktop, router
News: color, cake, workout, lip, gloss, bra, fashionable, fabulous, makeup, trendy

bomb
Games: spawn, bullets, raiding, chopper, camping, dirty, crossbow, grenade, shotgun, bullet
News: deadly, attack, incident, syrian, flooding, strikes, drill, weapons, ship, weapon

blood
Games: knights, wine, chaos, harvest, dragons, playthrough, wonderboy, flashpoint, temple, handcarved
News: patient, alcohol, cancer, disease, epilepsy, body, organs, depression, failure, cells

war
Games: mordor, wardayz, morose, unrest, warcraft, hydro, cutscreen, snowzilla, earth, vegalta
News: nuclear, syria, democracy, moab, divided, threat, iran, preemptive, dprk, regime

