In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
from sklearn import linear_model
import statistics
import matplotlib.pyplot as plt
import os
import sys
from surprise import SVD, Reader, Dataset
import tensorflow as tf

In [2]:
def assertFloat(x):
    """Check that ``x`` converts to a built-in ``float``.

    Any exception raised by ``float(x)`` itself (for example on a
    non-numeric string) propagates to the caller unchanged.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Check that ``items`` has exactly ``N`` entries, each convertible to float.

    The length is checked first; then every entry is passed through
    ``float()`` (conversion errors propagate) and the resulting types are
    compared against ``N`` copies of ``float``.
    """
    assert len(items) == N
    observed = []
    for entry in items:
        observed.append(type(float(entry)))
    assert observed == [float] * N

In [3]:
def readGz(path):
    """Yield one parsed record per line of a gzip-compressed text file.

    Each line must contain a single Python literal (dict, list, number,
    string, True/False/None, ...). Lines are parsed with
    ``ast.literal_eval`` rather than ``eval`` so that content in the data
    file cannot execute arbitrary code.

    Args:
        path: Path of the gzip file, opened in text mode.

    Yields:
        The Python object described by each line, in file order.

    Raises:
        ValueError, SyntaxError: if a line is not a valid Python literal.
    """
    import ast  # local import keeps this helper self-contained

    # The context manager closes the handle once the generator is exhausted
    # or garbage-collected, instead of leaking it.
    with gzip.open(path, 'rt') as handle:
        for line in handle:
            yield ast.literal_eval(line)

In [4]:
def readJSON(path):
    """Yield ``(userID, gameID, record)`` for each data line of a gzip file.

    Every line after the first must contain one Python-literal dict with at
    least the keys ``'userID'`` and ``'gameID'``. Lines are parsed with
    ``ast.literal_eval`` rather than ``eval`` so the data file cannot run
    arbitrary code.

    Args:
        path: Path of the gzip file, opened in text mode.

    Yields:
        Tuples ``(u, g, d)`` where ``d`` is the parsed dict and ``u``/``g``
        are its ``'userID'`` and ``'gameID'`` values.

    Raises:
        ValueError, SyntaxError: if a line is not a valid Python literal.
        KeyError: if a record lacks ``'userID'`` or ``'gameID'``.
    """
    import ast  # local import keeps this helper self-contained

    with gzip.open(path, 'rt') as f:
        # The first line is always discarded.
        # NOTE(review): presumably a header row -- if the file has no header
        # this silently drops the first record; confirm against the data.
        f.readline()
        for l in f:
            d = ast.literal_eval(l)
            yield d['userID'], d['gameID'], d

## Load the dataset

In [6]:
# One pass over the training file, collecting per-game aggregates.
allHours = []                    # every interaction record, in file order
gameCount = defaultdict(int)     # gameID -> number of interactions
gameHours = defaultdict(int)     # gameID -> summed 'hours_transformed'
games = set()                    # set consisting of distinct games of entire set
totalPlayed = 0                  # number of interaction records read

for totalPlayed, (user, game, l) in enumerate(readJSON("train.json.gz"), start=1):
    games.add(game)
    gameCount[game] += 1
    gameHours[game] += l['hours_transformed']
    allHours.append(l)

In [7]:
# Summed transformed hours for one game, shown next to max() over the dict.
# NOTE(review): max(gameHours) iterates the dict's keys, so it returns the
# largest gameID string, not the game with the most hours -- confirm intent.
gameHours['g99934462'], max(gameHours)

(161.0416462670569, 'g99934462')

In [8]:
# Number of interaction records counted while reading the training file.
totalPlayed

174999

In [9]:
# Positional split: the first 165000 records form the training set and
# everything after them forms the validation set.
hoursTrain, hoursValid = allHours[:165000], allHours[165000:]

In [11]:
# Spot-check one validation record and the sizes of the two splits.
hoursValid[7], len(hoursTrain), len(hoursValid)

({'hours': 11.9,
  'gameID': 'g01549519',
  'hours_transformed': 3.689299160535892,
  'early_access': True,
  'date': '2017-02-25',
  'text': "Great Pirate game.\nI waited for such a game for very long, it's a great game especially being made by 2 men. It's great.\nHighly recommended, nice people and nice Gameplay. The only problem I have is that the pvp part is very clunky and spongy.",
  'userID': 'u92465525'},
 165000,
 9999)

## Data Pre-processing

In [12]:
# Build per-user views of the training/validation splits and write a simple
# baseline (user mean, falling back to the global mean) for the hours task.
allHours_1 = []                    # transformed hours of every training record
userHours = defaultdict(list)      # userID -> transformed hours in the training set
user_game_v1 = defaultdict(list)   # user-game pairs of original validation set
user_game_v = defaultdict(list)    # user-game pairs of validation set (extended with negatives in a later cell)
user_game_t = defaultdict(list)    # user-game pairs of training set
ground = list()                    # left empty here (the label append is disabled)
users, gamesd = set(), set()       # distinct users / games seen in training

for d in hoursTrain:
    h = d['hours_transformed']
    user = d['userID']
    allHours_1.append(h)
    userHours[user].append(h)
    user_game_t[user].append(d['gameID'])
    users.add(user)
    gamesd.add(d['gameID'])

for d in hoursValid:
    user = d['userID']
    user_game_v[user].append(d['gameID'])
    user_game_v1[user].append(d['gameID'])

globalAverage = sum(allHours_1) / len(allHours_1)
userAverage = {u: sum(hs) / len(hs) for u, hs in userHours.items()}

# Context managers guarantee both files are closed even if a row is malformed.
with open("predictions_Hours.csv", 'w') as predictions, open("pairs_Hours.csv") as pairs:
    for l in pairs:
        if l.startswith("userID"):
            # header row is copied through unchanged
            predictions.write(l)
            continue
        u, g = l.strip().split(',')
        # Users never seen in training fall back to the global mean.
        predictions.write(u + ',' + g + ',' + str(userAverage.get(u, globalAverage)) + '\n')

globalAverage, len(ground), sum(ground)

(3.716088074007024, 0, 0)

## Jaccard Similarity

In [13]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2|.

    ``s1`` must be a set (its ``intersection``/``union`` methods are used);
    ``s2`` may be any iterable those methods accept. When the union is
    empty the similarity is defined as 0.
    """
    shared = s1.intersection(s2)
    combined = s1.union(s2)
    if not combined:
        return 0
    return len(shared) / len(combined)

## User-Game Interaction Dictionary

In [14]:
# Assign each training user/game a dense integer index (in order of first
# appearance) and collect the (user, game, hours) triples.
userIDs = {}        # userID string -> integer index
itemIDs = {}        # gameID string -> integer index
interactions = []   # (userID, gameID, hours_transformed) tuples

for d in hoursTrain:
    u, i, r = d['userID'], d['gameID'], d['hours_transformed']
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(i, len(itemIDs))
    interactions.append((u, i, r))

In [37]:
# Keep the first 165000 interactions as the training subset.
# NOTE(review): this cell's execution count is out of sequence and the saved
# output shows len(interactions) == 174999, which only holds after the later
# cell that appends the validation interactions has run. On a fresh top-to-
# bottom run `interactions` holds only the training triples at this point.
interactionsTrain = interactions[:165000]
len(interactionsTrain), len(interactions)

(165000, 174999)

In [16]:
# Mean transformed hours over the training interactions.
mu = sum(hours for (_, _, hours) in interactionsTrain) / len(interactionsTrain)
mu

3.716088074007024

## User list - Game list

In [17]:
# Sets (rather than lists) give O(1) membership tests; the BPR negative
# sampler checks `j in itemsPerUser[u]` for every sampled triple. This also
# matches the set-based definitions of the same names later in the notebook.
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated

for u,i,r in interactionsTrain:
    itemsPerUser[u].add(i)
    usersPerItem[i].add(u)

In [18]:
# Snapshot of every known gameID; used as the pool for negative sampling.
items = list(itemIDs)

## Bayesian Personalized Ranking

In [19]:
class BPRbatch(tf.keras.Model):
    """Batch Bayesian Personalized Ranking model.

    The compatibility score of user ``u`` with item ``i`` is
    ``betaI[i] + <gammaU[u], gammaI[i]>``. Parameter tables are sized from
    the module-level ``userIDs`` / ``itemIDs`` dicts *at construction time*,
    so indices assigned after the model is built are outside the tables.

    Args:
        K: Number of latent dimensions.
        lamb: L2 regularization coefficient.
    """

    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    # Prediction for a single instance
    # NOTE(review): this name shadows tf.keras.Model.predict -- kept for
    # backward compatibility with existing callers.
    def predict(self, u, i):
        """Score a single (user index, item index) pair."""
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        """Return ``lamb`` times the summed L2 losses of all parameters."""
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))

    def score(self, sampleU, sampleI):
        """Score aligned batches of user indices and item indices.

        Returns a 1-D tensor with one score per (user, item) position.
        """
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def score_prob(self, sampleU, sampleI):
        """Return ``sigmoid(score)`` for aligned user/item index batches."""
        # Reuse score() instead of duplicating the lookup logic.
        return tf.math.sigmoid(self.score(sampleU, sampleI))

    def call(self, sampleU, sampleI, sampleJ):
        """BPR loss: mean of ``-log sigmoid(x_ui - x_uj)`` over the batch.

        ``sampleI`` holds positive items and ``sampleJ`` the matching
        negative items for the users in ``sampleU``.
        """
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # log_sigmoid is the numerically stable form of log(sigmoid(x)):
        # it avoids log(0) = -inf when the score difference is very negative.
        return -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))

In [20]:
# Adam with a 0.1 learning rate; BPR model with 2 latent dimensions and a
# small L2 penalty.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
modelBPR = BPRbatch(K=2, lamb=0.00001)

2024-02-04 12:29:10.370061: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-02-04 12:29:10.370116: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-02-04 12:29:10.370128: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-02-04 12:29:10.370851: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-02-04 12:29:10.371445: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [21]:
def trainingStepBPR(model, interactions, Nsamples=50000):
    """Run one BPR gradient step on a freshly sampled batch.

    For each sample a positive (user, item) pair is drawn from
    ``interactions`` and a negative item is drawn from the module-level
    ``items`` list, rejecting items found in ``itemsPerUser[user]``.
    Gradients are applied with the module-level ``optimizer``.

    Args:
        model: Model whose call takes ``(sampleU, sampleI, sampleJ)`` and
            returns the loss, and which exposes ``reg()``.
        interactions: Sequence of ``(userID, gameID, hours)`` triples.
        Nsamples: Number of (user, positive, negative) triples per step.
            Defaults to the previously hard-coded 50000.

    Returns:
        The regularized loss for this batch as a NumPy scalar.
    """
    # Sampling is plain Python, so it is done outside the gradient tape;
    # only the differentiable forward pass needs to be recorded.
    sampleU, sampleI, sampleJ = [], [], []
    for _ in range(Nsamples):
        u, i, _r = random.choice(interactions) # positive sample
        played = itemsPerUser[u]
        j = random.choice(items) # negative sample
        while j in played:
            j = random.choice(items)
        sampleU.append(userIDs[u])
        sampleI.append(itemIDs[i])
        sampleJ.append(itemIDs[j])

    with tf.GradientTape() as tape:
        loss = model(sampleU, sampleI, sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    # Skip variables that received no gradient in this step.
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [22]:
# 100 BPR steps, reporting the objective after every tenth one.
for i in range(1, 101):
    obj = trainingStepBPR(modelBPR, interactionsTrain)
    if i % 10 == 0:
        print("iteration " + str(i) + ", objective = " + str(obj))

iteration 10, objective = 0.5167789
iteration 20, objective = 0.4877952
iteration 30, objective = 0.47672123
iteration 40, objective = 0.47458503
iteration 50, objective = 0.46697414
iteration 60, objective = 0.4662987
iteration 70, objective = 0.46800274
iteration 80, objective = 0.46280053
iteration 90, objective = 0.4658589
iteration 100, objective = 0.45968518


In [24]:
# Register validation users/games and append their interactions.
# NOTE(review): modelBPR was constructed before this cell, and its parameter
# tables were sized from len(userIDs)/len(itemIDs) at that time, so indices
# assigned here fall outside the trained tables.
for d in hoursValid:
    u, i, r = d['userID'], d['gameID'], d['hours_transformed']
    userIDs.setdefault(u, len(userIDs))
    itemIDs.setdefault(i, len(itemIDs))
    interactions.append((u, i, r))

## Classifying Most popular Games

In [25]:
# Rank games by interaction count and by summed hours (both descending).
mostPopular = sorted(((gameCount[x], x) for x in gameCount), reverse=True)
mosttimed = sorted(((gameHours[y], y) for y in gameHours), reverse=True)
accuracy = list()

# return1: the most popular games that together cover just over 70% of all
# interactions.
return1 = set()
count = 0
popularityCutoff = (totalPlayed * 70) / 100
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > popularityCutoff:
        break

In [26]:
# Give every validation user as many sampled "negative" games as they have
# positives. Negatives are drawn uniformly, without repetition, from games
# that are neither in the user's validation list nor in the popular set
# `return1` (same exclusion rule as before).
user_game1 = defaultdict(list)

# Built once instead of calling list(games) for every single draw.
negativePool = [g for g in games if g not in return1]

# Rebuilding from the untouched user_game_v1 makes the cell idempotent:
# re-running it no longer keeps doubling each user's list.
for u, played in user_game_v1.items():
    seen = set(played)
    candidates = [g for g in negativePool if g not in seen]
    # min() avoids the endless rejection loop that occurred when fewer
    # eligible games exist than negatives requested.
    negatives = random.sample(candidates, min(len(played), len(candidates)))
    user_game_v[u] = list(played) + negatives

In [27]:
# Accumulate total transformed hours per user and per game (training split).
hoursPerUser = defaultdict(float) # userID -> summed hours
hoursPerItem = defaultdict(float) # gameID -> summed hours
for d in hoursTrain:
    hr = d['hours_transformed']
    hoursPerUser[d['userID']] += hr
    hoursPerItem[d['gameID']] += hr

In [28]:
# Output file stays open on purpose: a later cell writes the prediction
# rows to it and then closes it.
predictions = open("dinesh_q1_predictions_Played.csv", 'w')
user_game_test = defaultdict(list)   # userID -> games to predict for that user
stat_pred = defaultdict(list)
nk = 0                               # number of (user, game) pairs read

# The input is fully consumed here, so close it as soon as we are done.
with open("pairs_Played.csv") as pairs:
    for l in pairs:
        if l.startswith("userID"):
            # header row is copied through unchanged
            predictions.write(l)
            continue
        u,g = l.strip().split(',')
        user_game_test[u].append(g)
        nk +=1

In [29]:
# Sanity checks on the test pairs: number of distinct users, max() over the
# dict (which compares its userID keys), total pairs read, and the number of
# pairs for one specific user.
len(user_game_test), max(user_game_test), nk, len(user_game_test['u32377855'])

(4738, 'u99971403', 20000, 36)

## Predicting Games for users

In [30]:
#Algorithm for predicting/recommending games to user
for u in user_game_test.keys():
    pred2 = []
    sampleUx = list()
    if u in userIDs:
        sampleUx.append(userIDs[u])
        for g in user_game_test[u]:
            sampleI2x = list()
            sampleI2x.append(itemIDs[g])
            pred2.append((u,g,modelBPR.score(sampleUx, sampleI2x).numpy()))
            
        pred2.sort(key = lambda x: x[2], reverse = True)
        for k in pred2[:len(pred2)//2]:
            predictions.write(u + ',' + k[1] + ',' + str(1) + '\n')

        for k in pred2[len(pred2)//2:]:
            predictions.write(u + ',' + k[1] + ',' + str(0) + '\n')
    else:
        for j in user_game_test[u]:
            if j in return1:
                predictions.write(u + ',' + j + ',' + str(1) + '\n')
            else:
                predictions.write(u + ',' + j + ',' + str(0) + '\n')

predictions.close()

# Game time prediction for user-game pair

In [32]:
# Transformed hours of every training record, and their mean.
trainHours = []
for r in hoursTrain:
    trainHours.append(r['hours_transformed'])
alpha = float(sum(trainHours)) / len(trainHours)

In [33]:
# Show the training mean next to one sample value and its source record.
alpha, trainHours[10], hoursTrain[10]

(3.716088074007024,
 1.84799690655495,
 {'hours': 2.6,
  'gameID': 'g21544048',
  'hours_transformed': 1.84799690655495,
  'early_access': False,
  'date': '2011-03-01',
  'text': 'SAY GOOD BYE TO YOUR CTRL KEY',
  'userID': 'u27998358'})

In [34]:
# Set-based adjacency maps plus a direct (user, game) -> hours lookup, all
# built from the training split.
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
playedhDict = {} # To retrieve a rating for a specific user/item pair

for d in hoursTrain:
    user, item = d['userID'], d['gameID']
    playedhDict[(user, item)] = d['hours_transformed']
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)

In [35]:
# Total transformed hours per user and per game over the training split.
# (Recomputes the same totals an earlier cell already builds.)
hoursPerUser = defaultdict(float) # Maps time played by user
hoursPerItem = defaultdict(float) # Maps time to a game played
for d in hoursTrain:
    hr = d['hours_transformed']
    hoursPerUser[d['userID']] += hr
    hoursPerItem[d['gameID']] += hr

In [36]:
# Starting values for the bias terms: each user's / game's mean transformed
# hours (total hours divided by the number of distinct games / users).
betaU = {u: hoursPerUser[u] / len(itemsPerUser[u]) for u in hoursPerUser}
betaI = {g: hoursPerItem[g] / len(usersPerItem[g]) for g in hoursPerItem}

## Latent Factor Model

In [41]:
# Latent Factor Model
def iterationY(alpha, lamb1, lamb2):
    """Run one round of alternating updates for the bias-only model.

    The prediction is ``alpha + betaU[user] + betaI[game]``. The function
    reads the module-level ``hoursTrain``, ``itemsPerUser``,
    ``usersPerItem`` and ``playedhDict`` and updates the module-level
    ``betaU`` and ``betaI`` dicts in place.

    Args:
        alpha: Ignored on entry; the offset is recomputed from the current
            biases before it is used.
        lamb1: Regularization strength added to the user-bias denominator.
        lamb2: Regularization strength added to the game-bias denominator.

    Returns:
        The recomputed global offset ``alpha``.
    """
    # Global offset: mean residual once the current biases are removed.
    residual = 0
    for rec in hoursTrain:
        residual += rec['hours_transformed'] - (betaU[rec['userID']] + betaI[rec['gameID']])
    alpha = residual / len(hoursTrain)

    # User biases, given the new offset and the current game biases.
    for user, played in itemsPerUser.items():
        total = 0
        for game in played:
            total += playedhDict[user, game] - (alpha + betaI[game])
        betaU[user] = total / (lamb1 + len(played))

    # Game biases, given the new offset and the just-updated user biases.
    for game, players in usersPerItem.items():
        total = 0
        for user in players:
            total += playedhDict[user, game] - (alpha + betaU[user])
        betaI[game] = total / (lamb2 + len(players))

    return alpha  # Return the updated alpha

# Start from the global mean, then run 95 alternating update rounds with
# user regularization 8 and game regularization 2.5.
alpha = globalAverage

for i in range(95):
    alpha = iterationY(alpha, lamb1=8, lamb2=2.5)

In [42]:
# Ground truth and bias-model predictions for the validation split.
gt, pred2 = [], []
for d in hoursValid:
    u,g,h = d['userID'], d['gameID'], d['hours_transformed']
    gt.append(h)
    # Users/games absent from training have no learned bias; treat it as 0
    # (i.e. fall back towards alpha) instead of raising KeyError.
    pred2.append(alpha + betaU.get(u, 0.0) + betaI.get(g, 0.0))

In [43]:
def MSE(y, ypred):
    """Return the mean squared error between two aligned sequences.

    Pairs are formed with ``zip``, so any extra elements of the longer
    sequence are ignored. Raises ``ZeroDivisionError`` when no pairs exist.
    """
    total = 0
    pairs = 0
    for actual, guess in zip(y, ypred):
        total += (actual - guess) ** 2
        pairs += 1
    return total / pairs

In [44]:
# Mean squared error of the bias-model predictions on the validation split.
validMSE = MSE(gt, pred2)
validMSE

2.985489116761304

In [45]:
# Write bias-model predictions for every (user, game) pair in pairs_Hours.csv.
# Context managers close both files even if a row fails to parse.
with open("Game_Time_Prediction.csv", 'w') as predictions, open("pairs_Hours.csv") as pairs:
    for l in pairs:
        if l.startswith("userID"):
            # header row is copied through unchanged
            predictions.write(l)
            continue
        u,g = l.strip().split(',')
        # Users/games absent from training have no learned bias; use 0 so the
        # prediction falls back towards alpha instead of raising KeyError.
        estimate = alpha + betaU.get(u, 0.0) + betaI.get(g, 0.0)
        predictions.write(u + ',' + g + ',' + str(estimate) + '\n')