<a href="https://colab.research.google.com/github/francoisdoanp/projects/blob/master/graph_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Link prediction for Steam dataset

Our goal is to predict games for a recommender system based on link prediction on graphs. We use the Steam 200k users dataset [available here](https://www.kaggle.com/tamber/steam-video-games/version/1) as well as games information to supplement our node features, available [here](https://www.kaggle.com/nikdavis/steam-store-games#steam.csv).

## Installs & Imports

In [None]:
# Colab-only setup: PyDrive for Google Drive access, stellargraph for the graph models.
# NOTE(review): stellargraph is installed without a version pin; the recorded run below
# pulled 0.11.1 — consider pinning once the required version is confirmed.
!pip install -U -q PyDrive
!pip install stellargraph

Collecting stellargraph
[?25l  Downloading https://files.pythonhosted.org/packages/13/f8/91a0f8597064b3084f57f314273f10d214b5cfa34cc03a83af981dc32f65/stellargraph-0.11.1-py3-none-any.whl (348kB)
[K     |████████████████████████████████| 358kB 2.8MB/s 
Installing collected packages: stellargraph
Successfully installed stellargraph-0.11.1


In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import pandas as pd
import datetime
import matplotlib.pyplot as plt
import numpy as np

import stellargraph as sg
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_regression, link_classification
from tensorflow.keras import Model, optimizers, losses, metrics
import tensorflow.keras.backend as K
from stellargraph.data import EdgeSplitter, BiasedRandomWalk
from gensim.models import Word2Vec

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn import model_selection, preprocessing
scaler = preprocessing.StandardScaler()

import multiprocessing

In [None]:
# Authenticate the Colab user, then hand the resulting application-default
# credentials to PyDrive so `drive` can fetch files by id in the cells below.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# Shareable Google Drive links for the two input files
linku = 'https://drive.google.com/open?id=1v4bZDymvxhXz9K8bt6QdUFltqUycwV1i'  # User CSV file link
linkg = 'https://drive.google.com/open?id=1Aj105y410QmR7uZFoTyh8BAIpfAoNk87'  # Games CSV file link

# The Drive file id is the text after the single '=' in each link
_, idu = linku.split('=')
_, idg = linkg.split('=')

In [None]:
# User interactions: download from Drive, then read the header-less CSV
user_columns = ['userID', 'gameTitle', 'behavior', 'value']
dlu = drive.CreateFile({'id': idu})
dlu.GetContentFile('steam-200k.csv')
temp_users = pd.read_csv('steam-200k.csv', header=None, usecols=[0, 1, 2, 3], names=user_columns)
# Ids become strings prefixed with 'u' (game ids get a 'g' prefix below)
temp_users['userID'] = 'u' + temp_users['userID'].astype(str)

# Game metadata: same download-then-read pattern
dlg = drive.CreateFile({'id': idg})
dlg.GetContentFile('steam.csv')
temp_games = pd.read_csv('steam.csv')
temp_games['appid'] = 'g' + temp_games['appid'].astype(str)

## Data Preprocessing

In [None]:
# Modifying game titles so as to match the most games possibles:
# lower-case, turn every run of non-word characters into one space, then collapse whitespace.
# `regex=True` is passed explicitly: newer pandas treats the pattern of `str.replace`
# as a literal string by default, which would silently skip the normalisation.

temp_users['gameTitle'] = temp_users.gameTitle.str.lower()
temp_users['gameTitle'] = temp_users.gameTitle.str.replace(r'\W+', ' ', regex=True).replace(r'\s+', ' ', regex=True)
temp_games['name'] = temp_games.name.str.lower()
temp_games['name'] = temp_games.name.str.replace(r'\W+', ' ', regex=True).replace(r'\s+', ' ', regex=True)

# Removing games from games_df that are not in the user interaction dataset and vice versa
games_df = temp_games[temp_games.name.isin(temp_users.gameTitle)]
userst = temp_users[temp_users.gameTitle.isin(temp_games.name)]

# The matching drops a large share of the original interactions;
# the shapes printed below show how many rows and titles remain.
print(games_df.name.nunique())
print(userst.gameTitle.nunique())
print(userst.userID.nunique())
print(userst.shape)
print(games_df.shape)

# There also seems to be duplicate games in our dataset. We will only keep the first observation.
games = games_df.drop_duplicates(subset=['name'])
print(games.duplicated(subset=['name']).any())
print(games.shape)


3057
3057
11369
(135163, 4)
(3066, 18)
False
(3057, 18)


### Building game features

In [None]:
#########################
# WITH LESS FEATURES

# The developper, publisher, platforms, categories, genres, steamspy_tags columns all contain
# semicolons as separators when there are multiple entries. We need to transform those to dummies
mult = ['developer', 'publisher', 'platforms', 'categories', 'genres', 'steamspy_tags']

# Start from the de-duplicated games table every time, so this cell can be re-run
# (the previous version mutated `games` in place and failed on a second run).
games_base = games_df.drop_duplicates(subset=['name'])

# Dummies for the owners variable, followed by one dummy block per multi-valued column
dummy_blocks = [games_base['owners'].str.get_dummies()]
dummy_blocks += [games_base[col].str.get_dummies(sep=';') for col in mult]

# Single concat instead of growing the frame inside a loop
games = pd.concat([games_base] + dummy_blocks, axis=1).drop(mult, axis=1)
print(games.shape)

# Dropping columns with the same name (first occurrence kept), which leads to a
# restricted number of features, then the two placeholder dummies
games = games.loc[:, ~games.columns.duplicated()].drop([' ', '(none)'], axis=1)
print(games.shape)


# We also need to change to format of the date variable. Here, we will extract year and month.
release_dates = pd.DatetimeIndex(games['release_date'])
games = games.assign(year=release_dates.year, month=release_dates.month)

(3057, 3986)
(3057, 2827)


In [None]:
# Building dictionary mapping game title -> game id.
# Select the single `appid` column before converting: `to_dict()` on the whole frame
# would build one dictionary per feature column only to throw all but one away.
games_dict = games.set_index('name')['appid'].to_dict()


In [None]:
#########################
# WITH EVEN LESS FEATURES
# Alternative to the cell above: running it replaces `games` with a narrower feature table.

# The developper, publisher, platforms, categories, genres, steamspy_tags columns all contain
# semicolons as separators when there are multiple entries. We need to transform those to dummies
mult = ['developer', 'publisher', 'platforms', 'categories', 'genres', 'steamspy_tags']

# Rebuild from the de-duplicated games table: the previous feature cell has already
# dropped the `mult` columns from `games`, so reusing `games` here raised a KeyError.
games_base = games_df.drop_duplicates(subset=['name'])

# Dummies for the owners variable, followed by one dummy block per multi-valued column
dummy_blocks = [games_base['owners'].str.get_dummies()]
dummy_blocks += [games_base[col].str.get_dummies(sep=';') for col in mult]

games = pd.concat([games_base] + dummy_blocks, axis=1).drop(mult, axis=1)
print(games.shape)

# Dropping columns with the same name, which leads to a restricted number of features
games = games.loc[:, ~games.columns.duplicated()].drop([' ', '(none)'], axis=1)
print(games.shape)

# Dropping dummy columns that are set for fewer than 10 games.
# Dummies start right after the original non-multi-valued columns (12 for this dataset).
n_base_cols = games_base.shape[1] - len(mult)
dummy_totals = games.iloc[:, n_base_cols:].sum()
rare_dummies = dummy_totals[dummy_totals < 10].index  # replaces Series.iteritems(), removed in pandas 2.0
games = games.drop(rare_dummies, axis=1)


# We also need to change to format of the date variable. Here, we will extract year and month.
release_dates = pd.DatetimeIndex(games['release_date'])
games = games.assign(year=release_dates.year, month=release_dates.month)


(3057, 3986)
(3057, 2827)


In [None]:
"""

# This code is for FULL FEATURES, which leads to about 3500 variables. We used a restricted dataset

# Adding dummies for the owners variable in the games df
games = pd.concat([games, games['owners'].str.get_dummies()], axis=1)

# The developper, publisher, platforms, categories, genres, steamspy_tags columns all contain
# semicolons as separators when there are multiple entries. We need to transform those to dummies

mult = ['developer', 'publisher', 'platforms', 'categories', 'genres', 'steamspy_tags']
prefix = ['d_', 'p_', 'pl_', 'cat_', 'gen_', 'ss_']

tempg = games

for col,pf in zip(mult, prefix):
  games = pd.concat([games, tempg[col].str.get_dummies(sep=';').add_prefix(pf)], axis=1)

games.drop(mult, axis=1, inplace=True)

# We also need to change to format of the date variable. Here, we will extract year and month.

games['year'] = pd.DatetimeIndex(games['release_date']).year
games['month'] = pd.DatetimeIndex(games['release_date']).month

"""


### Building user features

In [None]:
# User features, computed on the original (unfiltered) interactions whether or not
# the games involved survived the title matching:
# - nPurchases: sum of `value` over the user's 'purchase' rows
# - totalPlaytime: sum of `value` over the user's 'play' rows

purchase_rows = temp_users[temp_users['behavior'] == 'purchase']
play_rows = temp_users[temp_users['behavior'] == 'play']

n_purchases = purchase_rows.groupby(['userID'])['value'].sum()
n_playtime = play_rows.groupby(['userID'])['value'].sum()

# Left join keeps every purchaser; users with no 'play' rows get a playtime of 0
users = pd.merge(n_purchases, n_playtime, left_on='userID', right_on='userID', how='left')
users = users.reset_index()
users = users.rename(columns={'value_x' : 'nPurchases', 'value_y':'totalPlaytime'})
users = users.fillna(0)


## Model 1: Link prediction with playtime as weights

### Data preparation


In [None]:
# Removing purchase observations
temp_u = userst[userst.behavior == 'play']

# Building user dataframe with retained users.
# An explicit copy plus non-inplace calls keeps later column assignments from
# triggering SettingWithCopyWarning on a slice of `users`.
uniqueu_w = temp_u.userID.unique()
users_w = users[users.userID.isin(uniqueu_w)].copy().set_index('userID')

# Building game dataframe with retained games, dropping the columns we will not
# be using and indexing by game id
uniqueg_w = temp_u.gameTitle.unique()
games_w = (
    games[games.name.isin(uniqueg_w)]
    .copy()
    .drop(['release_date', 'owners', 'name'], axis=1)
    .set_index('appid')
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# Normalizing values (the shared `scaler` is refit by each fit_transform call)
user_num_cols = ['nPurchases', 'totalPlaytime']
game_num_cols = [
    'positive_ratings', 'negative_ratings', 'average_playtime', 'median_playtime',
    'price', 'year', 'month', 'required_age', 'achievements',
]

users_w[user_num_cols] = scaler.fit_transform(users_w[user_num_cols])
games_w[game_num_cols] = scaler.fit_transform(games_w[game_num_cols])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_array(key, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

In [None]:
# Building edgelist: one row per play interaction, game titles swapped for game ids

edgelist_w = (
    temp_u.reset_index(drop=True)
    .drop(['behavior'], axis=1)
    .replace({'gameTitle' : games_dict})
)

### Final data exploration


### Building Model


In [None]:
# Building graph: two node types carrying their feature frames, one 'play' edge type

node_frames_w = {'users': users_w, 'games': games_w}
edge_frames_w = {'play': edgelist_w[['userID', 'gameTitle']]}

graph_w = sg.StellarGraph(
    node_frames_w,
    edge_frames_w,
    source_column='userID',
    target_column='gameTitle',
)

In [None]:
# Summary of node/edge types and feature sizes, as a sanity check before training
print(graph_w.info())

StellarDiGraph: Directed multigraph
 Nodes: 12908, Edges: 51330

 Node types:
  users: [10492]
    Features: float32 vector, length 2
    Edge types: users-play->games
  games: [2416]
    Features: float32 vector, length 2825
    Edge types: none

 Edge types:
    users-play->games: [51330]


In [None]:
# Train test split (80/20 over edges). Seeded so the partition, and therefore the
# reported metrics, can be reproduced across kernel restarts.

edges_train_w, edges_test_w = model_selection.train_test_split(
    edgelist_w, train_size=0.8, test_size=0.2, random_state=42
)
edgelist_train_w = list(edges_train_w[['userID', 'gameTitle']].itertuples(index=False))
edgelist_test_w = list(edges_test_w[['userID', 'gameTitle']].itertuples(index=False))

# Regression target: the `value` column of the play rows
labels_train_w = edges_train_w["value"]
labels_test_w = edges_test_w["value"]

In [None]:
# Parameters
# NOTE(review): the saved output of this cell is an IndexError; the cause is not
# visible from this cell alone — TODO investigate before relying on Model 1.

batch_size_w = 50
epochs_w = 50
num_samples_w = [20,10]      # must have one entry per HinSAGE layer (asserted below)
hinsage_layer_w = [256,256]
num_workers_w = multiprocessing.cpu_count()

# Link generator over (user, game) pairs; training batches are shuffled, test batches are not
generator_w = HinSAGELinkGenerator(graph_w, batch_size_w, num_samples_w, head_node_types=['users', 'games'])
train_gen_w = generator_w.flow(edgelist_train_w, labels_train_w, shuffle=True)
test_gen_w = generator_w.flow(edgelist_test_w, labels_test_w)

assert len(hinsage_layer_w) == len(num_samples_w)

hinsage_w = HinSAGE(layer_sizes=hinsage_layer_w, generator=generator_w, bias=True)

# Regression head on the concatenated pair of node embeddings
x_inw, x_outw = hinsage_w.in_out_tensors()
pred_w = link_regression(edge_embedding_method="concat")(x_outw)

IndexError: ignored

In [None]:
# Defining loss function

def root_mean_square_error(s_true, s_pred):
    """Return the root of the mean squared difference between targets and predictions."""
    squared_error = K.pow(s_true - s_pred, 2)
    return K.sqrt(K.mean(squared_error))

# Building model: MSE is optimised, RMSE and MAE are reported alongside it.
# `learning_rate` is the supported keyword; the `lr` alias is deprecated in
# tf.keras and rejected by newer releases.

model_w = Model(inputs=x_inw, outputs=pred_w)
model_w.compile(
    optimizer=optimizers.Adam(learning_rate=0.1),
    loss=losses.mean_squared_error,
    metrics=[root_mean_square_error, metrics.mae],
)

In [None]:
# Training options gathered in one place; the test generator doubles as validation data
fit_options_w = dict(
    validation_data=test_gen_w,
    epochs=epochs_w,
    verbose=1,
    shuffle=False,
    use_multiprocessing=False,
    workers=num_workers_w,
)

history_w = model_w.fit(train_gen_w, **fit_options_w)

## Model 2: Restricted data


### Data preparation

In [None]:
# Keeping only play observations (the filter below selects 'play', not 'purchase')

temp_ur = userst[userst.behavior == 'play']

# Building user dataframe with retained users.
# An explicit copy plus non-inplace calls keeps later column assignments from
# triggering SettingWithCopyWarning on a slice of `users`.
uniqueu_r = temp_ur.userID.unique()
users_r = users[users.userID.isin(uniqueu_r)].copy().set_index('userID')

# Building game dataframe with retained games, dropping the columns we will not
# be using and indexing by game id
uniqueg_r = temp_ur.gameTitle.unique()
games_r = (
    games[games.name.isin(uniqueg_r)]
    .copy()
    .drop(['release_date', 'owners', 'name'], axis=1)
    .set_index('appid')
)


In [None]:
# Normalizing values (the shared `scaler` is refit by each fit_transform call)
user_num_cols = ['nPurchases', 'totalPlaytime']
game_num_cols = [
    'positive_ratings', 'negative_ratings', 'average_playtime', 'median_playtime',
    'price', 'year', 'month', 'required_age', 'achievements',
]

users_r[user_num_cols] = scaler.fit_transform(users_r[user_num_cols])
games_r[game_num_cols] = scaler.fit_transform(games_r[game_num_cols])


In [None]:
# Building edgelist: one row per play interaction, game titles swapped for game ids

edgelist_r = (
    temp_ur.reset_index(drop=True)
    .drop(['behavior'], axis=1)
    .replace({'gameTitle' : games_dict})
)

In [None]:
# Keeping only the edges whose user AND game each occur more than 5 times in the edgelist
# (ids occurring 5 times or fewer are removed).
v = edgelist_r[['userID', 'gameTitle']]
# Occurrence count per id, taken over both columns stacked together
id_counts = v.stack().value_counts()
# Look each id's count up with `map`; DataFrame.replace with a mapping this large is very slow
is_frequent = v.apply(lambda ids: ids.map(id_counts)).gt(5).all(axis=1)
edgelist_reduced = edgelist_r[is_frequent].reset_index(drop=True)


In [None]:
# Ids that survive in the reduced edgelist
u_u = edgelist_reduced['userID'].unique()
u_g = edgelist_reduced['gameTitle'].unique()

# Restrict both feature frames (indexed by id) to those ids
users_reduced = users_r.loc[users_r.index.isin(u_u)]
games_reduced = games_r.loc[games_r.index.isin(u_g)]

### Building model

In [None]:
# Graph for Model 2, built from the reduced users/games/edges.
# NOTE(review): stored as `graph_b`, a name the Model 3 section later rebinds to a
# different graph; the generator cell below reads this name, so run cells in order.
graph_b = sg.StellarGraph({'users': users_reduced, 'games': games_reduced}, 
                        {'play': edgelist_reduced[['userID', 'gameTitle']]},
                        source_column='userID',
                        target_column='gameTitle')

In [None]:
# Train test split (80/20 over edges). Seeded so the partition, and therefore the
# reported metrics, can be reproduced across kernel restarts.

edges_train_r, edges_test_r = model_selection.train_test_split(
    edgelist_reduced, train_size=0.8, test_size=0.2, random_state=42
)
edgelist_train_r = list(edges_train_r[['userID', 'gameTitle']].itertuples(index=False))
edgelist_test_r = list(edges_test_r[['userID', 'gameTitle']].itertuples(index=False))

# Regression target: the `value` column of the play rows
labels_train_r = edges_train_r["value"]
labels_test_r = edges_test_r["value"]

In [None]:
# Parameters

batch_size_r, epochs_r = 50, 50
num_samples_r = [20,10]
hinsage_layer_r = [256,256]
num_workers_r = multiprocessing.cpu_count()

# One generator over (user, game) pairs feeds both flows; only training is shuffled
generator_r = HinSAGELinkGenerator(
    graph_b,
    batch_size_r,
    num_samples_r,
    head_node_types=['users', 'games'],
)
train_gen_r = generator_r.flow(edgelist_train_r, labels_train_r, shuffle=True)
test_gen_r = generator_r.flow(edgelist_test_r, labels_test_r)

hinsage_r = HinSAGE(layer_sizes=hinsage_layer_r, generator=generator_r, bias=True)

# Regression head on the concatenated pair of node embeddings
x_inr, x_outr = hinsage_r.in_out_tensors()
regression_head_r = link_regression(edge_embedding_method="concat")
pred_r = regression_head_r(x_outr)

link_regression: using 'concat' method to combine node embeddings into edge embeddings


In [None]:
# Defining loss function
# NOTE(review): same definition as in the Model 1 section; this one rebinds the name.

def root_mean_square_error(s_true, s_pred):
    """Return the root of the mean squared difference between targets and predictions."""
    return K.sqrt(K.mean(K.pow(s_true - s_pred, 2)))

# Building model: MSE is optimised, RMSE and MAE are reported alongside it.
# `learning_rate` is the supported keyword; the `lr` alias is deprecated in
# tf.keras and rejected by newer releases.

model_r = Model(inputs=x_inr, outputs=pred_r)
model_r.compile(
    optimizer=optimizers.Adam(learning_rate=0.1),
    loss=losses.mean_squared_error,
    metrics=[root_mean_square_error, metrics.mae],
)

In [None]:
# Training options gathered in one place; the test generator doubles as validation data
fit_options_r = dict(
    validation_data=test_gen_r,
    epochs=epochs_r,
    verbose=1,
    shuffle=False,
    use_multiprocessing=False,
    workers=num_workers_r,
)

history_r = model_r.fit(train_gen_r, **fit_options_r)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-67-0c4235456590>", line 8, in <module>
    workers=num_workers_r,
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 875, in fit
    return_dict=True)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 66, in _method_wrapper
    return method(self, *args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py", line 1084, in evaluate
    tmp_logs = test_function(iterator)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/eager/def_function.py", line 580, in __call__

KeyboardInterrupt: ignored

## Model 3: Binary link prediction

Keeping only observations for purchases

### Data preparation

In [None]:
# Keeping only purchases

temp_ub = userst[userst.behavior == 'purchase']

# Building user dataframe with retained users.
# An explicit copy plus non-inplace calls keeps later column assignments from
# triggering SettingWithCopyWarning on a slice of `users`.
uniqueu_b = temp_ub.userID.unique()
users_b = users[users.userID.isin(uniqueu_b)].copy().set_index('userID')

# Building game dataframe with retained games
uniqueg_b = temp_ub.gameTitle.unique()
games_b = games[games.name.isin(uniqueg_b)].copy()

# Dropping columns we will not be using from games dataframe
games_b = games_b.drop(['release_date', 'owners', 'name'], axis=1)
# One-column frame of game ids, captured before `appid` becomes the index
games_list = games_b[['appid']]
games_b = games_b.set_index('appid')


In [None]:
# Normalizing values (the shared `scaler` is refit by each fit_transform call)
user_num_cols = ['nPurchases', 'totalPlaytime']
game_num_cols = [
    'positive_ratings', 'negative_ratings', 'average_playtime', 'median_playtime',
    'price', 'year', 'month', 'required_age', 'achievements',
]

users_b[user_num_cols] = scaler.fit_transform(users_b[user_num_cols])
games_b[game_num_cols] = scaler.fit_transform(games_b[game_num_cols])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_array(key, value)


In [None]:
# Building edgelist: unique (user, game id) purchase pairs with a fresh 0..n-1 index
edgelist_b = (
    temp_ub.drop(['behavior', 'value'], axis=1)
    .replace({'gameTitle' : games_dict})
    .drop_duplicates(subset=['userID', 'gameTitle'])
    .reset_index(drop=True)
)

### Creating validation data for 10 users

In [None]:
# Sampling 10 users that have at least 20 distinct purchased games.
# The count is taken on the `gameTitle` column explicitly: whether `groupby().nunique()`
# also returns the grouping column depends on the pandas version, so dropping
# "the first column" could remove the counts themselves.
user_count = edgelist_b.groupby('userID')['gameTitle'].nunique().reset_index()
# Seeded so the same validation users are drawn on every run
sample_user = user_count[user_count.gameTitle >= 20].sample(10, random_state=42)

# Edge list for samples users
temp_edge = edgelist_b[edgelist_b.userID.isin(sample_user.userID)]

# Removing 10 observations for each userID in our list
# (tail with n=-10 returns every row of the group except its first 10)
edge_10 = temp_edge.groupby('userID', group_keys=False, sort=False).apply(pd.DataFrame.tail, n=-10)

# Removing sampled users from our original edgelist
temp_edge2 = edgelist_b[~edgelist_b.userID.isin(sample_user.userID)]
# Adding back the edgelist with 10 observations per user sampled removed
edge_wo_10 = pd.concat([temp_edge2, edge_10], axis=0).reset_index(drop=True)



In [None]:
# Creating interactions with all games for our validation users.
# Each user id is repeated once per game, and the game list is tiled once per user;
# the tile count follows `sample_user` instead of a hard-coded 10 so the two
# columns always line up.
n_validation_users = len(sample_user)
user_repeat = sample_user['userID'].repeat(len(games_list)).reset_index(drop=True)
games_repeat = pd.concat([games_list] * n_validation_users).reset_index(drop=True)

games_repeat['userID'] = user_repeat
games_repeat = games_repeat.reindex(columns=['userID', 'appid'])

# Removing from the validation examples the observations that will go into the edgelist

g_index = games_repeat.set_index(['userID', 'appid']).index
edge_10_index = edge_10.set_index(['userID', 'gameTitle']).index

mask = ~g_index.isin(edge_10_index)

validation_examples = games_repeat[mask]

# Creating labels: 1 for the sampled users' purchases that were left out of
# `edge_10` (rows present in `temp_edge` only), 0 for every other pair

obs_100 = pd.concat([temp_edge, edge_10]).drop_duplicates(keep=False)
index_obs = obs_100.set_index(['userID', 'gameTitle']).index
index_val = validation_examples.set_index(['userID', 'appid']).index

validation_labels = index_val.isin(index_obs).astype(int)

validation_examples = np.array(validation_examples)

### Building model

In [None]:
# Building graph from the edgelist that has the validation purchases removed

node_frames_b = {'users': users_b, 'games': games_b}
edge_frames_b = {'play': edge_wo_10[['userID', 'gameTitle']]}

graph_b = sg.StellarGraph(
    node_frames_b,
    edge_frames_b,
    source_column='userID',
    target_column='gameTitle',
)

In [None]:
# Splitting train/test
# Both splits use the same settings: 5% of the 'play' edges, sampled globally
split_options = dict(p=0.05, method="global", edge_label='play')

# Test set: split taken from the full graph
edge_splitter_test = EdgeSplitter(graph_b)
graph_test, examples_test, labels_test = edge_splitter_test.train_test_split(**split_options)

# Training set: split taken from the already reduced test graph
edge_splitter_train = EdgeSplitter(graph_test, graph_b)
graph_train, examples, labels = edge_splitter_train.train_test_split(**split_options)


Network has 83194 edges of type play
Removed 1000 edges
Removed 2000 edges
Removed 3000 edges
Removed 4000 edges
Network has 83194 edges of type play
Sampled 1000 negative edges
Sampled 2000 negative edges
Sampled 3000 negative edges
Sampled 4000 negative edges
** Sampled 4159 positive and 4159 negative edges. **
Network has 79035 edges of type play
Removed 1000 edges
Removed 2000 edges
Removed 3000 edges
Network has 79035 edges of type play
Sampled 1000 negative edges
Sampled 2000 negative edges
Sampled 3000 negative edges
** Sampled 3951 positive and 3951 negative edges. **


In [None]:
# Parameters

batch_size_b = 50
epochs_b = 20
num_samples_b = [8,8,4,4]
hinsage_layer_b = [256,256,256,256]
num_workers_b = multiprocessing.cpu_count()

# Training generator and flow
generator_train_b = HinSAGELinkGenerator(graph_train, batch_size_b, num_samples_b, head_node_types=['games', 'users'])
train_flow_b = generator_train_b.flow(examples, labels, shuffle=True)

# test generator and flow
test_gen_b = HinSAGELinkGenerator(graph_test, batch_size_b, num_samples_b, head_node_types=['games', 'users'])
test_flow_b = test_gen_b.flow(examples_test, labels_test)

# valid generator and flow
# NOTE(review): head node types are ['users', 'games'] here but ['games', 'users'] for the
# train/test generators, while the model is built from the training generator — confirm
# the pair order matches what the model expects.
valid_gen_b = HinSAGELinkGenerator(graph_b, batch_size_b, num_samples_b, head_node_types=['users', 'games'])
valid_flow_b = valid_gen_b.flow(validation_examples, validation_labels)

hinsage_b = HinSAGE(layer_sizes=hinsage_layer_b, generator=generator_train_b, bias=True, dropout=0.3)

x_inb, x_outb = hinsage_b.in_out_tensors()
# Sigmoid keeps the single output in [0, 1], which is what the binary cross-entropy loss
# and the later use of the output as a probability require; "relu" is unbounded above
# and outputs exact zeros.
pred_b = link_classification(output_dim=1, output_act="sigmoid", edge_embedding_method="ip")(x_outb)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [None]:
# Model: binary cross-entropy on the link score, reporting accuracy and precision.
# `learning_rate` is the supported keyword; the `lr` alias is deprecated in
# tf.keras and rejected by newer releases.

model_b = Model(inputs=x_inb, outputs=pred_b)

model_b.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss=losses.binary_crossentropy,
    metrics=["acc", metrics.Precision()],
)

In [None]:
# Training options gathered in one place; the test flow doubles as validation data
fit_options_b = dict(
    validation_data=test_flow_b,
    epochs=epochs_b,
    verbose=1,
    shuffle=False,
    use_multiprocessing=False,
    workers=num_workers_b,
)

history_b = model_b.fit(train_flow_b, **fit_options_b)

Epoch 1/20

In [None]:
# Predict and evaluate on the validation examples built for the sampled users.
# `y_pred` is read by the next cell, so the prediction has to actually run here
# (it was commented out, which left the name undefined).
# The validation flow is created without shuffle=True, so rows are presumably
# returned in the order of `validation_examples` — verify before joining them.
y_pred = model_b.predict(valid_flow_b)
y_eval = model_b.evaluate(valid_flow_b)



In [None]:
# Put each (user, game) validation pair side by side with its predicted score
predictions_final = pd.concat([pd.DataFrame(validation_examples), pd.DataFrame(y_pred)], axis=1)

In [None]:
predictions_final.columns = ['userID', 'gameID', 'prob']

# Per user, order the candidate games from highest to lowest predicted score
pred_f = predictions_final.groupby('userID').apply(
    lambda user_rows: user_rows.sort_values('prob', ascending=False)
)

# Model 4 - Node2Vec

## Data Preparation (same as model 3)


In [None]:
# Keeping only purchases

temp_ub = userst[userst.behavior == 'purchase']

# Building user dataframe with retained users.
# An explicit copy plus non-inplace calls keeps later column assignments from
# triggering SettingWithCopyWarning on a slice of `users`.
uniqueu_b = temp_ub.userID.unique()
users_b = users[users.userID.isin(uniqueu_b)].copy().set_index('userID')

# Building game dataframe with retained games, dropping the columns we will not
# be using and indexing by game id
uniqueg_b = temp_ub.gameTitle.unique()
games_b = (
    games[games.name.isin(uniqueg_b)]
    .copy()
    .drop(['release_date', 'owners', 'name'], axis=1)
    .set_index('appid')
)

In [None]:
# Normalizing values (the shared `scaler` is refit by each fit_transform call)
user_num_cols = ['nPurchases', 'totalPlaytime']
game_num_cols = [
    'positive_ratings', 'negative_ratings', 'average_playtime', 'median_playtime',
    'price', 'year', 'month', 'required_age', 'achievements',
]

users_b[user_num_cols] = scaler.fit_transform(users_b[user_num_cols])
games_b[game_num_cols] = scaler.fit_transform(games_b[game_num_cols])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.loc._setitem_with_indexer((slice(None), indexer), value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_array(key, value)


In [None]:
# Building edgelist: (user, game id) purchase pairs
edgelist_b = (
    temp_ub.reset_index(drop=True)
    .drop(['behavior', 'value'], axis=1)
    .replace({'gameTitle' : games_dict})
)

## Model

In [None]:
# Building graph
# NOTE(review): the edges come from `edge_wo_10`, which is created in the binary link
# prediction section, not from the `edgelist_b` rebuilt just above — this cell needs
# that earlier section to have been run. Confirm this is intended.

graph_b = sg.StellarGraph({'users': users_b, 'games': games_b}, 
                        {'play': edge_wo_10[['userID', 'gameTitle']]},
                        source_column='userID',
                        target_column='gameTitle')

In [None]:
# Splitting train/test

# Test set
edge_splitter_test = EdgeSplitter(graph_b)
graph_test, examples_test, labels_test = edge_splitter_test.train_test_split(p=0.05, method="global", edge_label='play')

# Train set
edge_splitter_train = EdgeSplitter(graph_test, graph_b)
graph_train, examples, labels = edge_splitter_train.train_test_split(p=0.05, method="global", edge_label='play')

# 75/25 split of the training examples into classifier-training and model-selection parts.
# The function is reached through `model_selection`, which is what the notebook imports;
# the bare name `train_test_split` is never imported and raised a NameError.
(
    examples_train,
    examples_model_selection,
    labels_train,
    labels_model_selection,
) = model_selection.train_test_split(examples, labels, train_size=0.75, test_size=0.25)


Network has 83733 edges of type play
Removed 1000 edges
Removed 2000 edges
Removed 3000 edges
Removed 4000 edges
Network has 83733 edges of type play
Sampled 1000 negative edges
Sampled 2000 negative edges
Sampled 3000 negative edges
Sampled 4000 negative edges
** Sampled 4186 positive and 4186 negative edges. **
Network has 79547 edges of type play
Removed 1000 edges
Removed 2000 edges
Removed 3000 edges
Network has 79547 edges of type play
Sampled 1000 negative edges
Sampled 2000 negative edges
Sampled 3000 negative edges
** Sampled 3977 positive and 3977 negative edges. **


In [None]:
# Parameters

p, q = 1.0, 1.0           # BiasedRandomWalk p and q
dimensions = 128          # Word2Vec vector size
num_walks = 10
walk_length = 80
window_size = 10
num_iter_n2 = 1
workers_n2 = multiprocessing.cpu_count()

def node2vec_embedding(graph, name):
    """Run biased random walks on `graph`, fit Word2Vec on them and return a node lookup.

    `name` is only used in the progress message. The returned function maps a
    node id to that node's vector in the trained model.
    """
    walker = BiasedRandomWalk(graph)
    walks = walker.run(graph.nodes(), n=num_walks, length=walk_length, p=p, q=q)
    print(f"Number of random walks for '{name}': {len(walks)}")

    word2vec_options = dict(
        size=dimensions,
        window=window_size,
        min_count=0,
        sg=1,
        workers=workers_n2,
        iter=num_iter_n2,
    )
    model = Word2Vec(walks, **word2vec_options)

    def get_embedding(u):
        return model.wv[u]

    return get_embedding

In [None]:
# Node lookup trained on the training graph; used as features by the link classifiers below
embed_train = node2vec_embedding(graph_train, "Train Graph")

Number of random walks for 'Train Graph': 144260


### Function definitions

In [None]:
def link_examples_to_features(link_examples, transform_node, binary_operator):
    """Turn (src, dst) pairs into one feature value per pair.

    Each endpoint is passed through `transform_node`, and the two results are
    combined with `binary_operator`. Returns a list in the order of `link_examples`.
    """
    features = []
    for src, dst in link_examples:
        features.append(binary_operator(transform_node(src), transform_node(dst)))
    return features


# 2. training classifier
def train_link_prediction_model(link_examples, link_labels, get_embedding, binary_operator):
    """Fit a fresh link classifier on features built from the given node pairs.

    Returns the fitted classifier produced by `link_prediction_classifier()`.
    """
    clf = link_prediction_classifier()
    link_features = link_examples_to_features(
        link_examples,
        transform_node=get_embedding,
        binary_operator=binary_operator,
    )
    clf.fit(link_features, link_labels)
    return clf


def link_prediction_classifier(max_iter=2000):
    """Build an unfitted pipeline: standard scaling, then cross-validated logistic regression.

    `max_iter` is forwarded to LogisticRegressionCV.
    """
    steps = [
        ("sc", preprocessing.StandardScaler()),
        ("clf", LogisticRegressionCV(Cs=10, cv=10, scoring="roc_auc", max_iter=max_iter)),
    ]
    return Pipeline(steps=steps)


# 3. and 4. evaluate classifier
def evaluate_link_prediction_model(clf, link_examples_test, link_labels_test, get_embedding, binary_operator):
    """Return the average-precision score of `clf` on the given test pairs.

    Features are built from the pairs the same way as for training.
    """
    test_features = link_examples_to_features(
        link_examples_test,
        transform_node=get_embedding,
        binary_operator=binary_operator,
    )
    return evaluate_average_precision(clf, test_features, link_labels_test)


def evaluate_roc_auc(clf, link_features, link_labels):
    """Return the ROC AUC of `clf`'s predicted probability for class 1."""
    probabilities = clf.predict_proba(link_features)

    # locate the probability column that belongs to the positive class (label 1)
    positive_idx = list(clf.classes_).index(1)
    positive_scores = probabilities[:, positive_idx]
    return roc_auc_score(link_labels, positive_scores)

    
def evaluate_average_precision(clf, link_features, link_labels):
    """Return the average precision of `clf`'s predicted probability for class 1."""
    probabilities = clf.predict_proba(link_features)

    # locate the probability column that belongs to the positive class (label 1)
    positive_idx = list(clf.classes_).index(1)
    positive_scores = probabilities[:, positive_idx]
    return average_precision_score(link_labels, positive_scores)

In [None]:
def operator_hadamard(u, v):
    """Combine two embeddings by multiplying them (`u * v`)."""
    product = u * v
    return product


def operator_l1(u, v):
    """Combine two embeddings by the absolute value of their difference."""
    difference = u - v
    return np.abs(difference)


def operator_l2(u, v):
    """Combine two embeddings by the square of their difference."""
    difference = u - v
    return difference ** 2


def operator_avg(u, v):
    """Combine two embeddings by averaging them."""
    total = u + v
    return total / 2.0


def run_link_prediction(binary_operator):
    """Train a link classifier with `binary_operator` and score it on the held-out split.

    The classifier is fitted on `examples_train` / `labels_train` only and scored on
    `examples_model_selection`. Fitting on the full `examples` set, as before, put the
    model-selection pairs (a subset of `examples`) into the training data and inflated
    the score.

    Returns a dict with the fitted classifier, the operator and its score.
    """
    clf = train_link_prediction_model(
        examples_train, labels_train, embed_train, binary_operator
    )
    score = evaluate_link_prediction_model(
        clf,
        examples_model_selection,
        labels_model_selection,
        embed_train,
        binary_operator,
    )

    return {
        "classifier": clf,
        "binary_operator": binary_operator,
        "score": score,
    }

# Operators compared in the model-selection step
binary_operators = [operator_hadamard, operator_l1, operator_l2, operator_avg]

## Model training and testing

In [None]:

# One training/evaluation run per operator; the best one is picked by its score
results = []
for op in binary_operators:
    results.append(run_link_prediction(op))
best_result = max(results, key=lambda result: result["score"])

print(f"Best result from '{best_result['binary_operator'].__name__}'")

# Score table, one row per operator (displayed as the cell output)
score_rows = [(result["binary_operator"].__name__, result["score"]) for result in results]
pd.DataFrame(score_rows, columns=("name", "Average Precision Score")).set_index("name")

In [None]:
# Node lookup trained on the test graph
# NOTE(review): not used by any later cell visible in this notebook
embedding_test = node2vec_embedding(graph_test, "Test Graph")

Number of random walks for 'Test Graph': 144260


In [None]:
# Node lookup trained on the full graph `graph_b`
# NOTE(review): not used by any later cell visible in this notebook
embedding_valid = node2vec_embedding(graph_b, "Valid Graph")

Number of random walks for 'Valid Graph': 144260
