In [1]:
from sqlalchemy import create_engine
import numpy as np
import pandas as pd
import pickle
import json

from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.neighbors import NearestNeighbors

# Load data
eng = create_engine('sqlite:///data/boardgames.db', echo=False)
nn = pickle.load(open('0827test.p', 'rb'))
raw = pd.read_sql_query("SELECT * FROM boardgames", eng).drop(['index', 'designer', 'publisher'], axis=1)



In [125]:
processed.avgrating.describe()

count    17575.000000
mean         0.652484
std          0.092524
min          0.000000
25%          0.640695
50%          0.645855
75%          0.662628
max          1.000000
Name: avgrating, dtype: float64

In [126]:
processed.columns[17:]

Index(['Take That', 'Card Drafting', 'Simultaneous Action Selection',
       'Trick-taking', 'Pick-up and Deliver', 'Press Your Luck',
       'Player Elimination', 'Pattern Recognition', 'Modular Board',
       'Roll / Spin and Move', 'Tile Placement', 'Simulation',
       'Hand Management', 'Partnerships', 'Trading', 'Role Playing',
       'Set Collection', 'Route/Network Building', 'Area Movement', 'Memory',
       'Secret Unit Deployment', 'Action / Movement Programming',
       'Variable Phase Order', 'Action Point Allowance System',
       'Point to Point Movement', 'Campaign / Battle Card Driven',
       'Betting/Wagering', 'Stock Holding', 'Area Control / Area Influence',
       'Commodity Speculation', 'Cooperative Play', 'Dice Rolling',
       'Auction/Bidding', 'Area Enclosure', 'Grid Movement', 'Hex-and-Counter',
       'Rock-Paper-Scissors', 'Crayon Rail System', 'Variable Player Powers',
       'Pattern Building', 'Voting', 'Territory Building',
       'American Civil War'

In [118]:
def preprocess(df):
    # Perform scaling and grab relevant features
    relevant = ['id', 'name', 'ratingscount', 'avgrating', 'published',
       'minplayers', 'maxplayers', 'best', 'recommended', 'not_recommended',
       'playingtime', 'minplaytime', 'maxplaytime', 'minage', 'suggestedage',
       'language_dependence']

    outliers = ['published','best','maxplayers','maxplaytime','minplaytime','not_recommended','playingtime','ratingscount','recommended']
    normal = ['language_dependence','minage','minplayers','suggestedage','avgrating']

    df[relevant] = df[relevant].apply(lambda x: x.fillna(x.median()) if x.dtype != np.dtype('O') else x,axis=0)

    robust = RobustScaler()
    df[outliers] = robust.fit_transform(df[outliers])
    minmax = MinMaxScaler()
    df[normal] = minmax.fit_transform(df[normal])

    return df

In [150]:
processed.avgrating.describe()

count    17575.000000
mean         0.652484
std          0.092524
min          0.000000
25%          0.640695
50%          0.645855
75%          0.662628
max          1.000000
Name: avgrating, dtype: float64

In [4]:
def dropcols(df):
    # Return only relevant features for KNN
    to_drop = ['id', 'name', 'description', 'avgrating']
    return df.drop(to_drop, axis=1)

In [5]:
def get_test_array(names):
    # Aggregate data for list of names to seed recommendation
    inputs = dropcols(processed[processed['name'].isin(names)])
    return inputs.mean().values.reshape(1, -1)

In [26]:
input_array = get_test_array(['Catan'])

In [31]:
input_array[0]

array([ 2.73665615e+02, -8.23529412e-01,  3.00000000e-01,  0.00000000e+00,
        5.00000000e-01,  1.00000000e+00,  0.00000000e+00,  1.25000000e+00,
        7.50000000e-01,  1.25000000e+00,  4.00000000e-01,  3.00000000e-01,
        2.50000000e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  

In [37]:
dists, neighbors = nn.kneighbors(input_array,3)

In [43]:
dists=dists.tolist()[0]

In [44]:
neighbors=neighbors.tolist()[0]

In [45]:
tuple(zip(dists,neighbors))

((0.0, 12), (4.120048739612911, 709), (7.418377153805491, 7726))

In [166]:
def get_nearest(names, mechanics, n=10):
    # Grab info for given games
    if names:
        input_array = get_test_array(names)
        # Find the nearest neighbors
        dists, neighbors = nn.kneighbors(input_array, n+len(names))
        dists = dists.tolist()[0]
        neighbors = neighbors.tolist()[0]
        neighborhood = list(zip(dists,neighbors))
        # Scale distances by inverse of avgrating
        weights = processed.query('id == @neighbors')[['id','avgrating']]
        print(weights)
        print(neighbors)
        if mechanics:
            # Prefer games with matching mechanics
            mech_games = filter_mechanics(mechanics)
            weights.apply(lambda x: x['avgrating']*10 if x['id'] in mech_games else x['avgrating'],axis=1)
#         neighborhood = [(resident[0]/(weight+1),resident[1]) for resident, weight in list(zip(neighborhood, weights.avgrating))]
        # Sort results by new scaled distance
        print(neighborhood)
        neighbors = [results[1] for results in sorted([(resident[0]/(weight.query('id==@resident[1]').avgrating+.01), resident[1]) for resident, weight in list(zip(list(zip(dists,neighbors)),weights))],key=lambda x: x[0])]
        print(neighbors)
    # Return results not in the given names
    return list(filter(lambda g: g['id'] in neighbors and g['name'] not in names, game_json))[:5]


In [167]:
get_nearest(['Catan','Codenames'],[])

        id  avgrating
11      12   0.851999
612    709   0.643287
1751  2110   0.638605
1927  2356   0.630587
3206  4797   0.637938
4626  8689   0.594629
4878  9577   0.645191
[8226, 9523, 4797, 7777, 2356, 8689, 7726, 709, 12, 14026, 2110, 9577]
[(4.03571837071512, 8226), (4.428712295993859, 9523), (26.429322888696664, 4797), (38.72834436778695, 7777), (40.20068702707624, 2356), (48.90822739893822, 8689), (48.9587794763071, 7726), (52.732089463910576, 709), (53.62922325889055, 12), (53.629223258890576, 14026), (60.62211086317115, 2110), (64.11773689676143, 9577)]


AttributeError: 'float' object has no attribute 'query'

In [92]:
weights = processed.query('id == @neighbors')[['id','avgrating']]

In [113]:
weights

Unnamed: 0,id,avgrating
11,12,9.398441
612,709,-0.117067


In [131]:
neighbors_test = [results[1] for results in sorted([(resident[0]/weight, resident[1]) for resident, weight in list(zip(list(zip(dists,neighbors)),weights.avgrating))],key=lambda x: x[0])]

In [109]:
dists_test

(-35.193948948297006, 709)

In [132]:
neighbors_test

[709, 12]

In [56]:
processed.query('id == @neighbors')[['id','avgrating']]

Unnamed: 0,id,avgrating
11,12,9.398441
612,709,-0.117067


In [72]:
processed.query('id == @neighbors')[['id','avgrating']].apply(lambda x: x['avgrating']/10 if x['id']==12 else x['avgrating'],axis=1)


11     0.939844
612   -0.117067
dtype: float64

In [55]:
for row in processed.query('id == @neighbors')[['id','avgrating']]:
    print (row)

id
avgrating


In [76]:
def filter_mechanics(mechanics, df = processed):
    for mechanic in mechanics:
        df = df.query(mechanic+'==1')
    return df['id'].values.tolist()

In [8]:
with open('data.json') as f:
    game_json = json.load(f)

In [123]:
processed = preprocess(raw)

In [124]:
processed.head()

Unnamed: 0,id,name,description,ratingscount,avgrating,published,minplayers,maxplayers,best,recommended,...,Adventure,Puzzle,Novel-based,Humor,Real-time,Racing,Fantasy,Print & Play,Exploration,Dice
0,1,Die Macher,Die Macher is a game about seven sequential po...,14.832808,0.834453,-1.352941,0.3,0.5,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
1,2,Dragonmaster,Dragonmaster is a trick-taking card game based...,1.290221,0.676818,-1.647059,0.3,0.0,0.5,1.0,...,0,0,0,0,0,0,1,0,0,0
2,3,Samurai,"Part of the Knizia tile-laying trilogy, Samura...",42.643533,0.843664,-0.647059,0.2,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,4,Tal der Könige,When you see the triangular box and the luxuri...,0.659306,0.664694,-1.0,0.2,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,0,0
4,5,Acquire,"In Acquire, each player strategically invests ...",53.690852,0.835071,-2.647059,0.2,1.0,0.5,1.0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
mechanics=['Adventure','Puzzle']

In [52]:
filter_mechanics(processed,mechanics)

[10765,
 11531,
 31133,
 33959,
 38430,
 42361,
 47170,
 83330,
 127312,
 127398,
 146508,
 161297,
 169984,
 174805,
 185196,
 198287,
 205059,
 207243,
 207991,
 214484,
 218421,
 229965,
 231618,
 234378,
 234439,
 235465,
 242317,
 244769,
 258451]