In [94]:
%matplotlib inline
import graphlab
import graphlab.aggregate as agg
import numpy as np
import matplotlib.pyplot as plt  
import string
from nltk import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
nltk.download("stopwords")

stemmer = PorterStemmer()

plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\David\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [29]:
sf = graphlab.SFrame('./data/boardgames-no-comments')
sf.remove_columns(['age', 'average', 'bayesaverage', 'categories', 'maxplayers', 'maxplaytime', 'mechanics', 'minplayers', 'minplaytime', 'owned', 'playingtime', 'stddev', 'subdomains', 'trading', 'wanting', 'wishing', 'yearpublished'])
sf = sf[sf['usersrated'] > 10]
sf.head(1)

description,id,name,usersrated
Marjapussi (Berry bag) is a marriage-style trick- ...,26721,Marjapussi,13


In [116]:
tokenizer = RegexpTokenizer(r'[a-zA-Z-]+')
sf['text'] = sf.select_column('description').apply(lambda x: ' '.join([stemmer.stem(w) for w in tokenizer.tokenize(BeautifulSoup(x, 'html.parser').get_text(strip=True)) if w not in stopwords.words('english')]).lower())

In [117]:
sf['word_count'] = graphlab.text_analytics.count_words(sf['text'])
sf['tfidf'] = graphlab.text_analytics.tf_idf(sf['word_count'])['docs']

In [118]:
avalon = sf[sf['name'] == 'The Resistance: Avalon']

In [119]:
avalon[['tfidf']].stack('tfidf', new_column_name=['word', 'count']).sort('count', ascending=False)

word,count
resist,14.2839014158
arthur,12.0716068541
evil,11.7558646722
avalon,9.41786497281
mordr,8.19528767641
futur,7.56094243782
unscrupul,7.50214049585
hidden,7.33255770277
merlin,6.85155292971
riddl,5.9797139604


In [122]:
kvn_model = graphlab.nearest_neighbors.create(sf, features=['tfidf'], label='name', distance='cosine')
kvn_model.summary()

PROGRESS: Starting brute force nearest neighbors model training.
Class                         : NearestNeighborsModel

Attributes
----------
Method                        : brute_force
Number of distance components : 1
Number of examples            : 21743
Number of feature columns     : 1
Number of unpacked features   : 64581
Total training time (seconds) : 0.6505



In [121]:
print avalon['text']
kvn_model.query(avalon)

['the resist avalon pit forc good evil battl control futur civil arthur repres futur britain promis prosper honor yet hidden among brave warrior mordr unscrupul minion these forc evil number knowledg remain hidden one arthur servant merlin alon know agent evil must speak riddl if true ident discov lost the resist avalon standalon game the resist requir play game compat combin', ... ]
PROGRESS: Starting pairwise querying.
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | 0            | 1       | 0.00459918  | 17.015ms     |
PROGRESS: | Done         |         | 100         | 42.029ms     |
PROGRESS: +--------------+---------+-------------+--------------+


query_label,reference_label,distance,rank
0,The Resistance: Avalon,-2.22044604925e-16,1
0,The Resistance: Merlin/Assassin Promo ...,0.61418844099,2
0,Mordred,0.744776373825,3
0,The Resistance: Hidden Agenda ...,0.766776466237,4
0,Apokalypse,0.769739556687,5


## 

In [123]:
kingTokyo = sf[sf['name'] == 'King of Tokyo']

In [124]:
kvn_model.query(kingTokyo, k=10)

PROGRESS: Starting pairwise querying.
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | 0            | 1       | 0.00459918  | 15.015ms     |
PROGRESS: | Done         |         | 100         | 43.02ms      |
PROGRESS: +--------------+---------+-------------+--------------+


query_label,reference_label,distance,rank
0,King of Tokyo,0.0,1
0,Crisis: Tokyo,0.411084747353,2
0,King of Tokyo: Halloween,0.537321316646,3
0,King of Tokyo: Power Up!,0.540388719721,4
0,King of New York: 55 Central Park West ...,0.592674585042,5
0,King of Tokyo Promo Cards,0.622978777592,6
0,Tokyo Express,0.661254436376,7
0,"Oh No, There Goes Tokyo!",0.728458025722,8
0,Tokyo Train,0.74514885236,9
0,King of New York,0.752883790715,10


In [125]:
kingYork = sf[sf['name'] == 'King of New York']

In [126]:
kvn_model.query(kingYork, k=10)

PROGRESS: Starting pairwise querying.
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | Query points | # Pairs | % Complete. | Elapsed Time |
PROGRESS: +--------------+---------+-------------+--------------+
PROGRESS: | 0            | 1       | 0.00459918  | 16.012ms     |
PROGRESS: | Done         |         | 100         | 39.026ms     |
PROGRESS: +--------------+---------+-------------+--------------+


query_label,reference_label,distance,rank
0,King of New York,0.0,1
0,Monsters Menace America,0.706762709647,2
0,Moongha Invaders: Mad Scientists and Atomic ...,0.708842036898,3
0,King of Tokyo: Halloween,0.716156743283,4
0,Smash Monster Rampage!,0.716302206206,5
0,Kleine Monster,0.747344516472,6
0,Monsters Ravage America,0.749983647311,7
0,King of Tokyo,0.752883790715,8
0,Mega Monster City Smash!,0.757591735549,9
0,Monopoly: New York City,0.758311320454,10


In [127]:
set(kingYork[['tfidf']].stack('tfidf', new_column_name=['word', 'count'])['word']).intersection(
set(kingTokyo[['tfidf']].stack('tfidf', new_column_name=['word', 'count'])['word']))

{'attack',
 'becom',
 'card',
 'destroy',
 'dice',
 'effect',
 'energi',
 'game',
 'heal',
 'keep',
 'king',
 'monster',
 'occupi',
 'play',
 'point',
 'purchas',
 'roll',
 'six',
 'the',
 'three',
 'tokyo',
 'turn',
 'victori',
 'whether'}

In [137]:
sf[['tfidf']].stack('tfidf', new_column_name=['word', 'count'])['word'].sketch_summary()


+------------------+---------+----------+
|       item       |  value  | is exact |
+------------------+---------+----------+
|      Length      | 1444741 |   Yes    |
| # Missing Values |    5    |   Yes    |
| # unique values  |  64287  |    No    |
+------------------+---------+----------+

Most frequent items:
+-------+-------+-------+--------+------+------+------+------+------+------+
| value |  game |  the  | player | card | one  | play | use  | turn | two  |
+-------+-------+-------+--------+------+------+------+------+------+------+
| count | 17809 | 15372 | 14570  | 9861 | 9689 | 9414 | 6977 | 6323 | 6030 |
+-------+-------+-------+--------+------+------+------+------+------+------+
+------+
| new  |
+------+
| 5845 |
+------+
