In [2]:
import pandas as pd
import numpy as np 
import matplotlib
# import matplotlib.pyplot as plt 
import seaborn as sns
import turicreate
import sklearn as sk
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

### Item - Item similarity model in Turicreate

In [3]:
data_copy = pd.read_pickle("../Data_Cleaning/clean_steam_advanced.pkl")
game_map = pd.read_pickle("../Data_Cleaning/gameMap.pkl")

In [4]:
data_copy.head()

Unnamed: 0,userId,gameName,play,Actions,gameId
41879,5250,Portal 2,1.0,0.06011,3223
41877,5250,Cities Skylines,1.0,0.636464,852
41878,5250,Deus Ex Human Revolution,1.0,0.274033,1248
41880,5250,Alien Swarm,1.0,0.021657,227
41881,5250,Team Fortress 2,1.0,0.003536,4257


According to the documentation [here](https://apple.github.io/turicreate/docs/userguide/recommender/using-trained-models.html), turicreate excludes the items that are observed for each user. In order to test the accuracy of the model, it is important to split a proportion of user's data into train and test for validation. 
but for now, let us just feed all the data in to our model.

There are 3 similarity measurements for similarity type - jaccard, cosine and pearson. In our case, We choose Pearson to measure the loss.

In [5]:
# create SFrame from DataFrame
from turicreate import SFrame

game_sf = SFrame(data=data_copy)

item_item_reco = turicreate.recommender.item_similarity_recommender.create( \
    game_sf, user_id='userId', item_id='gameId', \
    target="Actions", similarity_type='pearson')

In [6]:
# Get the k most similar items for each item in items. Default number is 10.
similar_games = item_item_reco.get_similar_items()

now We have created the model. Turicreate provides a nice functionality for users to check the k most similar items of an input. We would like to expriment our model with a few sample inputs.

In [7]:
# for example to check some results
game_map_sframe = turicreate.SFrame(game_map)
similar_games_named = similar_games.join(game_map_sframe, on="gameId", how="left") \
    .join(game_map_sframe, on={"similar":"gameId"}, how="left")\
    .rename({"gameName.1":"similar_game"}, True) \
    .select_columns(["gameId", "gameName", "similar", "similar_game", "score", "rank"])

# print the result of one game for testing
a =similar_games_named[similar_games_named['gameName']== "Half-Life 2 Episode One"]
a.print_rows(num_rows=10, num_columns=6) 

+--------+-------------------------+---------+-------------------------------+
| gameId |         gameName        | similar |          similar_game         |
+--------+-------------------------+---------+-------------------------------+
|  2072  | Half-Life 2 Episode One |   2070  |          Half-Life 2          |
|  2072  | Half-Life 2 Episode One |   2077  |  Half-Life Deathmatch Source  |
|  2072  | Half-Life 2 Episode One |   2073  |    Half-Life 2 Episode Two    |
|  2072  | Half-Life 2 Episode One |   2074  |     Half-Life 2 Lost Coast    |
|  2072  | Half-Life 2 Episode One |   3627  |       SEGA Bass Fishing       |
|  2072  | Half-Life 2 Episode One |   1003  |           Crazy Taxi          |
|  2072  | Half-Life 2 Episode One |   480   | Battlefield Bad Company 2 ... |
|  2072  | Half-Life 2 Episode One |   3222  |             Portal            |
|  2072  | Half-Life 2 Episode One |   1336  |             Dota 2            |
|  2072  | Half-Life 2 Episode One |   2076  |      

In [8]:
# print the result of one game for testing
b =similar_games_named[similar_games_named['gameName']==  'Rocket League']
b.print_rows(num_rows=10, num_columns=6) 

+--------+---------------+---------+-------------------------------+
| gameId |    gameName   | similar |          similar_game         |
+--------+---------------+---------+-------------------------------+
|  3570  | Rocket League |   602   |           Bloons TD5          |
|  3570  | Rocket League |   3564  | Robot Roller-Derby Disco D... |
|  3570  | Rocket League |   1049  |           Cubemen 2           |
|  3570  | Rocket League |   1718  |        Fight The Dragon       |
|  3570  | Rocket League |   4305  |        The Banner Saga        |
|  3570  | Rocket League |   3508  |     Retro City Rampage DX     |
|  3570  | Rocket League |   3569  |          Rock of Ages         |
|  3570  | Rocket League |   4656  |   Tony Hawk's Pro Skater HD   |
|  3570  | Rocket League |   1938  | Glorkian Warrior The Trial... |
|  3570  | Rocket League |   3809  | Shower With Your Dad Simul... |
+--------+---------------+---------+-------------------------------+
+---------------------+------+
|  

It seems the result we get is actually reasonable and impressive.  The model is able to find games in similar genres, and sometimes even the same game of other versions. This proves that our model we build is able to capture the relationships embedded behind the mappings of items and users.

### Let's pick a user and see what recommendations he / she gets:

In [8]:
rec_result = item_item_reco.recommend(diversity=1,random_seed=0).join(game_map_sframe, on="gameId", how="left")
print (rec_result)

+--------+--------+-------+------+--------------------------------+
| userId | gameId | score | rank |            gameName            |
+--------+--------+-------+------+--------------------------------+
|  5250  |  2955  |  1.0  |  1   | Nobunaga's Ambition Souzou...  |
|  5250  |  813   |  1.0  |  2   |   Championship Manager 2010    |
|  5250  |  1271  |  1.0  |  3   |          Diaper Dash           |
|  5250  |  4934  |  1.0  |  4   |     Warrior Kings Battles      |
|  5250  |  4459  |  1.0  |  5   |       The Promised Land        |
|  5250  |  838   |  1.0  |  6   |         Choplifter HD          |
|  5250  |  3600  |  1.0  |  7   | Rugby League Team Manager 2015 |
|  5250  |  1705  |  1.0  |  8   |    Fast & Furious Showdown     |
|  5250  |  2306  |  1.0  |  9   |           Jack Keane           |
|  5250  |  2265  |  1.0  |  10  |            Insane 2            |
+--------+--------+-------+------+--------------------------------+
[123930 rows x 5 columns]
Note: Only the head of

### In order to better estimate the accuracy of our model, we decide to split the training and testing set, and use "precision_recall" as our metric. 
check [here](https://apple.github.io/turicreate/docs/api/generated/turicreate.recommender.item_similarity_recommender.ItemSimilarityRecommender.evaluate.html#turicreate.recommender.item_similarity_recommender.ItemSimilarityRecommender.evaluate) for more info.

In [10]:
game_sf2 = SFrame(data=data_copy)

#The test dataset is generated by first choosing max_num_users out of the total number of users in dataset. Then, for each of the chosen test users, 
#a portion of the user’s items (determined by item_test_proportion) is randomly chosen to be included in the test set. 
#This split allows the training data to retain enough information about the users in the testset, so that adequate recommendations can be made. 
#The total number of users in the test set may be fewer than max_num_users if a user was chosen for the test set but none of their items are selected.

train_sframe, test_sframe = turicreate.recommender.util.random_split_by_user(game_sf2, user_id="userId", item_id="gameId", max_num_users=500,item_test_proportion=0.2,random_seed = 0)

In [11]:
data_copy.head()

Unnamed: 0,userId,gameName,play,Actions,gameId
41879,5250,Portal 2,1.0,0.06011,3223
41877,5250,Cities Skylines,1.0,0.636464,852
41878,5250,Deus Ex Human Revolution,1.0,0.274033,1248
41880,5250,Alien Swarm,1.0,0.021657,227
41881,5250,Team Fortress 2,1.0,0.003536,4257


In [12]:
item_item_reco = turicreate.recommender.item_similarity_recommender.create( \
    train_sframe, user_id='userId', item_id='gameId', \
    target="Actions", similarity_type='pearson')
rec = item_item_reco.recommend(k= 50)
rec = rec.join(game_map_sframe, on="gameId", how="left") 
from turicreate.toolkits.recommender.util import precision_recall_by_user
result = precision_recall_by_user(test_sframe, rec[ ["userId", "gameId"] ] )

In [13]:
result.print_rows(num_rows=50, num_columns=5)

+---------+--------+-----------+--------+-------+
|  userId | cutoff | precision | recall | count |
+---------+--------+-----------+--------+-------+
|   5250  |   10   |    0.0    |  1.0   |   0   |
|  76767  |   10   |    0.0    |  1.0   |   0   |
|  86540  |   10   |    0.0    |  1.0   |   0   |
|  103360 |   10   |    0.0    |  1.0   |   0   |
|  144736 |   10   |    0.0    |  1.0   |   0   |
|  181212 |   10   |    0.0    |  1.0   |   0   |
|  229911 |   10   |    0.0    |  1.0   |   0   |
|  298950 |   10   |    0.0    |  1.0   |   0   |
|  299153 |   10   |    0.0    |  1.0   |   0   |
|  381543 |   10   |    0.0    |  1.0   |   0   |
|  547685 |   10   |    0.0    |  1.0   |   0   |
|  554278 |   10   |    0.0    |  1.0   |   0   |
|  561758 |   10   |    0.0    |  1.0   |   0   |
|  577614 |   10   |    0.0    |  1.0   |   0   |
|  604988 |   10   |    0.0    |  1.0   |   0   |
|  622362 |   10   |    0.0    |  1.0   |   0   |
|  635733 |   10   |    0.0    |  1.0   |   0   |


In [14]:
#look at the data of user 858433 in recommendation

# Filtering
filter_sf =rec[(rec['userId']== 8585433 )] 

# Displaying
(filter_sf[['userId','gameId','gameName','score']]). print_rows(num_rows=50, num_columns=4)

+---------+--------+--------------------------------+--------------------+
|  userId | gameId |            gameName            |       score        |
+---------+--------+--------------------------------+--------------------+
| 8585433 |  3848  |         SimpleRockets          |        1.0         |
| 8585433 |  4042  |      Stargate Resistance       |        1.0         |
| 8585433 |  831   |       Chip's Challenge 2       |        1.0         |
| 8585433 |  2265  |            Insane 2            |        1.0         |
| 8585433 |  4459  |       The Promised Land        |        1.0         |
| 8585433 |  1705  |    Fast & Furious Showdown     |        1.0         |
| 8585433 |  1271  |          Diaper Dash           |        1.0         |
| 8585433 |  3364  |            RECYCLE             |        1.0         |
| 8585433 |  5061  |          Xpand Rally           |        1.0         |
| 8585433 |  1607  |        FIFA Manager 10         |        1.0         |
| 8585433 |  1324  |     

In [15]:
# Filtering
test_sf = test_sframe[(test_sframe['userId']==   8585433 )] 
# Displaying
test_sf.materialize()
(test_sf[['userId', 'gameName', 'gameId','Actions']]). print_rows(num_rows=21, num_columns=4)

+---------+-------------------------------+--------+------------------------+
|  userId |            gameName           | gameId |        Actions         |
+---------+-------------------------------+--------+------------------------+
| 8585433 |             VVVVVV            |  4797  | 0.0009546376638652595  |
| 8585433 |         Super Meat Boy        |  4146  | 0.00017047101140451064 |
| 8585433 | Sang-Froid - Tales of Were... |  3696  | 0.00022161231482586383 |
| 8585433 |    Ori and the Blind Forest   |  3034  |  0.001636521709483302  |
| 8585433 | Rising Storm/Red Orchestra... |  3553  | 8.523550570225532e-06  |
| 8585433 |             Saira             |  3662  | 8.523550570225532e-06  |
| 8585433 |  The Walking Dead Season Two  |  4517  | 0.0014490035969383403  |
| 8585433 |       Shadowrun Returns       |  3772  | 8.523550570225532e-06  |
| 8585433 |            Distance           |  1294  | 0.00011932970798315743 |
| 8585433 |         Mirror's Edge         |  2748  | 0.000852355

In [16]:
# Filtering
train_sf = train_sframe[(train_sframe['userId']==   8585433  )] 
# Displaying
train_sf.materialize()
(train_sf[['userId', 'gameName', 'gameId','Actions']]). print_rows(num_rows=35, num_columns=4)

+---------+-------------------------------+--------+------------------------+
|  userId |            gameName           | gameId |        Actions         |
+---------+-------------------------------+--------+------------------------+
| 8585433 |          Cave Story+          |  807   | 0.00022161231482586383 |
| 8585433 |           Black Ice           |  535   | 0.0001875181125449617  |
| 8585433 |            Hacknet            |  2064  | 0.0010569202707079658  |
| 8585433 |     FTL Faster Than Light     |  1635  | 0.0009716847650057106  |
| 8585433 |      Monday Night Combat      |  2761  | 0.0009716847650057106  |
| 8585433 |           Gauntlet            |  1900  | 0.0009375905627248085  |
| 8585433 |       Quest of Dungeons       |  3345  | 0.0001875181125449617  |
| 8585433 |      Dungeon Defenders II     |  1407  | 0.0011421557764102212  |
| 8585433 |            Ricochet           |  3531  | 8.523550570225532e-06  |
| 8585433 |          Left 4 Dead          |  2474  | 0.002164981

In [14]:
# Filtering
total_sf = game_sf2[(game_sf2['userId']==   8542204 )] 
# Displaying
total_sf.materialize()
(total_sf[['userId', 'gameName', 'gameId','Actions']]). print_rows(num_rows=44, num_columns=4)

+---------+-------------------------------+--------+-----------------------+
|  userId |            gameName           | gameId |        Actions        |
+---------+-------------------------------+--------+-----------------------+
| 8542204 |             Dota 2            |  1336  |   0.4533473906167634  |
| 8542204 |           Dead Space          |  1152  | 0.0010542962572482868 |
| 8542204 |           Metro 2033          |  2712  |  0.001405728342997716 |
| 8542204 |      Natural Selection 2      |  2901  |  0.001405728342997716 |
| 8542204 |   Call of Duty Black Ops II   |  730   | 0.0017571604287471448 |
| 8542204 |         Left 4 Dead 2         |  2475  | 0.0017571604287471448 |
| 8542204 | Microsoft Flight Simulator... |  2721  |  0.000702864171498858 |
| 8542204 | Call of Duty Advanced Warfare |  724   | 0.0001757160428747145 |
| 8542204 |             Arma 2            |  322   |  0.000702864171498858 |
| 8542204 |   Arma 2 Operation Arrowhead  |  326   |  0.000351432085749429 |

In [44]:
result = test_sframe.join(result, on="userId", how="left") \
   .select_columns(["userId","gameId", "gameName", "precision", "recall", "count"])
result.print_rows(num_rows=100, num_columns=6)

+----------+--------+-------------------------------+-----------+--------+-------+
|  userId  | gameId |            gameName           | precision | recall | count |
+----------+--------+-------------------------------+-----------+--------+-------+
| 1024319  |  2078  |    Half-Life Opposing Force   |    0.0    |  0.0   |   1   |
| 1364546  |  1179  |       Deathmatch Classic      |    0.0    |  0.0   |   1   |
| 4325465  |  1179  |       Deathmatch Classic      |    0.0    |  0.0   |   1   |
| 8542204  |  2712  |           Metro 2033          |    0.0    |  0.0   |   9   |
| 8542204  |  4527  |  The Witcher Enhanced Edition |    0.0    |  0.0   |   9   |
| 8542204  |  1978  |       Grand Theft Auto V      |    0.0    |  0.0   |   9   |
| 8542204  |  4343  |            The Crew           |    0.0    |  0.0   |   9   |
| 8542204  |  726   |     Call of Duty Black Ops    |    0.0    |  0.0   |   9   |
| 8542204  |  3223  |            Portal 2           |    0.0    |  0.0   |   9   |
| 85

In [59]:
# Filtering
nonzero_sframe = result[(result['precision']!=  0.0)] 
nonzero_sframe.materialize()
print(nonzero_sframe)

# this is weird. we need more investigation

+--------+--------+-----------+--------+-------+
| userId | cutoff | precision | recall | count |
+--------+--------+-----------+--------+-------+
+--------+--------+-----------+--------+-------+
[0 rows x 5 columns]

