In [149]:
import datetime
import os
import random
import lightfm
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from lightfm import LightFM
from scipy import sparse
from collections import defaultdict
from typing import List
from sklearn import linear_model
from sklearn.metrics import accuracy_score
import tensorflow.keras as keras
from tensorflow.keras.layers import (
    Concatenate,
    Dense,
    Embedding,
    Flatten,
    Input,
    Multiply,
)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

In [6]:

def readJSON(path):
    #f = gzip.open(path, 'rt')
    f = open(path)
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        h = d['hours_transformed']
        yield u,g,h

In [7]:
userIDs = {}
gameIDs = {}

f = open("train.json")
f.readline()
for l in f:
    d = eval(l)
    u = d['userID']
    g = d['gameID']
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not g in gameIDs: gameIDs[g] = len(gameIDs)

In [1226]:
rawData = []
for l in readJSON("train.json"):
    rawData.append(l)

In [1310]:
random.shuffle(rawData)

In [1311]:
usersPerGame = defaultdict(set)
gamesPerUser = defaultdict(set)
for user,game,_ in rawData:
    usersPerGame[game].add(user)
    gamesPerUser[user].add(game)
games = list(usersPerGame.keys())
users = list(gamesPerUser.keys())

In [1312]:
nTrain = int(len(rawData) * 0.95)
nTest = len(rawData) - nTrain
train = rawData[:nTrain]
test = rawData[nTrain:]

In [1314]:
df_train = pd.DataFrame(train, columns=["user", "item", "interaction"])

In [1315]:
df_train["user_id"] = df_train["user"].map(userIDs)
df_train["item_id"] = df_train["item"].map(gameIDs)

In [1316]:
df_train = df_train[["user_id", "item_id", "interaction"]]

In [1329]:
def create_ncf(
    number_of_users: int,
    number_of_items: int,
    latent_dim_mf: int = 2,
    latent_dim_mlp: int = 2,
    reg_mf: int = 0.001,
    reg_mlp: int = 0.002,
    dense_layers: List[int] = [16, 8],
    reg_layers: List[int] = [0.01, 0.01],
#     activation_dense: str = "relu",
) -> keras.Model:

    # input layer
    user = Input(shape=(), dtype="int32", name="user_id")
    item = Input(shape=(), dtype="int32", name="item_id")

    # embedding layers
    mf_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mf,
        name="mf_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,
    )
    mf_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mf,
        name="mf_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,
    )

    mlp_user_embedding = Embedding(
        input_dim=number_of_users,
        output_dim=latent_dim_mlp,
        name="mlp_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,
    )
    mlp_item_embedding = Embedding(
        input_dim=number_of_items,
        output_dim=latent_dim_mlp,
        name="mlp_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,
    )

    # MF vector
    mf_user_latent = Flatten()(mf_user_embedding(user))
    mf_item_latent = Flatten()(mf_item_embedding(item))
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])

    # MLP vector
    mlp_user_latent = Flatten()(mlp_user_embedding(user))
    mlp_item_latent = Flatten()(mlp_item_embedding(item))
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])

    mlp_vector = mlp_cat_latent

    # build dense layers for model
    for i in range(len(dense_layers)):
        layer = Dense(
            dense_layers[i],
            activity_regularizer=l2(reg_layers[i]),
            name="layer%d" % i,
        )
        mlp_vector = layer(mlp_vector)

    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    result = Dense(
        1, name="interaction"
    )

    output = result(predict_layer)

    model = Model(
        inputs=[user, item],
        outputs=[output],
    )

    return model

In [1330]:
n_users = df_train['user_id'].nunique()
n_items = df_train['item_id'].nunique()
ncf_model = create_ncf(n_users, n_items)

ncf_model.compile(
    optimizer=Adam(),
    #loss="binary_crossentropy",
    loss="mean_squared_error",
    metrics=[
            tf.keras.metrics.RootMeanSquaredError(name='root_mean_squared_error')
    ],
)
ncf_model._name = "neural_collaborative_filtering"
#ncf_model.summary()



In [1331]:
def make_tf_dataset(
    df: pd.DataFrame,
    targets: List[str],
    val_split: float = 0.05,
    batch_size: int = 4096,
    seed=None,
):

    n_val = round(df.shape[0] * val_split)
    if seed:
        # shuffle all the rows
        x = df.sample(frac=1, random_state=seed).to_dict("series")
    else:
        x = df.to_dict("series")
    y = dict()
    for t in targets:
        y[t] = x.pop(t)
    ds = tf.data.Dataset.from_tensor_slices((x, y))

    ds_val = ds.take(n_val).batch(batch_size)
    ds_train = ds.skip(n_val).batch(batch_size)
    return ds_train, ds_val

In [1332]:
ds_train, ds_val = make_tf_dataset(df_train, ["interaction"])

In [1333]:

N_EPOCHS = 30
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error", patience=1
)

train_hist = ncf_model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=N_EPOCHS,
    callbacks=[tensorboard_callback, early_stopping_callback],
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30


In [1334]:
df_test = pd.DataFrame(test, columns=["user", "item", "interaction"])
df_test["user_id"] = df_test["user"].map(userIDs)
df_test["item_id"] = df_test["item"].map(gameIDs)
df_test=df_test[["user_id", "item_id", "interaction"]]
ds_test, _ = make_tf_dataset(df_test, ["interaction"], val_split=0, seed=None)
ncf_predictions = ncf_model.predict(ds_test)
df_test["ncf_predictions"] = ncf_predictions



In [1335]:
df_test

Unnamed: 0,user_id,item_id,interaction,ncf_predictions
0,5697,968,2.867896,1.753195
1,3395,415,3.655352,6.697020
2,2364,1033,1.137504,5.564166
3,88,1533,3.321928,3.046285
4,3686,1317,2.432959,3.790053
...,...,...,...,...
8745,1388,645,2.405992,2.356273
8746,94,1245,3.277985,4.075471
8747,1297,152,1.321928,1.648593
8748,6032,1419,4.781360,5.744066


In [1336]:
np.mean(np.square(np.array(df_test['interaction']) - np.array(df_test['ncf_predictions'])))

3.019717403456202

In [1325]:
dfc = pd.read_csv("pairs_Hours.csv")
dfc["user_id"] = dfc["userID"].map(userIDs)
dfc["item_id"] = dfc["gameID"].map(gameIDs)
dfc["interaction"] = dfc["prediction"]
dfc=dfc[["user_id", "item_id", "interaction"]]
dsc, _ = make_tf_dataset(dfc, ["interaction"], val_split=0, seed=None)
preds = ncf_model.predict(dsc)
dfc["ncf_predictions"] = preds



In [1326]:
dfc

Unnamed: 0,user_id,item_id,interaction,ncf_predictions
0,1374,459,,4.074759
1,5565,590,,1.117396
2,759,281,,5.171386
3,1928,784,,2.958941
4,6216,600,,3.097157
...,...,...,...,...
9995,4630,392,,6.688177
9996,5520,481,,4.045860
9997,2930,1889,,3.361379
9998,892,1728,,3.649117


In [1327]:
sample = dfc.values.tolist()
#sample

In [1328]:
i=0
predictions = open("predictions_Hours.csv", 'w')
for l in open("pairs_Hours.csv"):
    if l.startswith("userID"):
        predictions.write(l)
        continue
    u,g = l.strip().split(',')
    
    pred = sample[i][3]
    #print(sample[i])
    i = i+1
    
    #print(pred)
    
    _ = predictions.write(u + ',' + g + ',' + str(pred) + '\n')

predictions.close()