In [None]:
import csv
import matplotlib.pyplot as plt
import os
import pandas as pd
import pytz
import numpy as np
import seaborn as sns
import sys
import time
import yaml
import csv
import itertools

from datetime import datetime
from datetime import timedelta
from pprint import pprint

from sklearn.model_selection import train_test_split

from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k

import warnings

warnings.simplefilter("ignore")

%matplotlib inline

In [None]:
# Load the base model hyperparameters from the YAML file into `hyperparameters`.
# yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and
# raises TypeError on PyYAML >= 6; safe_load parses plain scalars, lists and
# mappings and refuses to construct arbitrary Python objects from the file.
with open("hyperparameters.yaml", "r") as file:
    hyperparameters = yaml.safe_load(file)

In [None]:
# Number of threads passed as `num_threads` to LightFM training and evaluation.
# os.cpu_count() returns None when the core count cannot be determined, so
# fall back to a single thread instead of handing None to the library.
NUM_THREADS = os.cpu_count() or 1
EPOCHS = 50  # epoch count constant

### Parameter search

In [None]:
def sample_hyperparameters():
    """
    Endlessly generate randomly drawn hyperparameter dictionaries.

    Every value is drawn from numpy's global random state, so seeding
    ``np.random`` beforehand makes the sequence repeatable. Each yielded
    dict holds the keys ``no_components``, ``learning_schedule``, ``loss``,
    ``learning_rate``, ``max_sampled`` and ``num_epochs``.
    """
    schedule_options = ["adagrad", "adadelta"]
    loss_options = ["warp", "bpr"]

    while True:
        # The draws happen one per key, in key order.
        candidate = {}
        candidate["no_components"] = np.random.randint(50, 150)
        candidate["learning_schedule"] = np.random.choice(schedule_options)
        candidate["loss"] = np.random.choice(loss_options)
        candidate["learning_rate"] = np.random.exponential(0.05)
        candidate["max_sampled"] = np.random.randint(5, 15)
        candidate["num_epochs"] = np.random.randint(50, 150)
        yield candidate


def random_search(i_train, i_test, w_train, w_test, dwriter, num_samples=10):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.

    Besides its arguments, this function reads the module-level names
    ``item_features_m``, ``user_features_m`` and ``NUM_THREADS``.

    Parameters
    ----------

    i_train: sparse matrix of shape [n_users, n_items]
        Training interactions, passed to ``LightFM.fit`` and used as
        ``train_interactions`` during evaluation.
    i_test: sparse matrix of shape [n_users, n_items]
        Test interactions the fitted model is scored on.
    w_train: sparse matrix or None
        Sample weights for ``i_train``. Converted to COO before fitting.
    w_test: sparse matrix or None
        Currently unused; kept so call sites can pass the full split.
    dwriter: object with a ``writerow(dict)`` method
        Receives one row per trial. For a ``csv.DictWriter`` the fieldnames
        must cover the sampled hyperparameter keys plus ``num_epochs``,
        ``train_time`` and ``score``.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Returns
    -------

    generator of (mean precision@50, hyperparameter dict, fitted model)

    """
    # LightFM rejects sample weights that are not COO; tocoo() on a matrix
    # that is already COO returns it unchanged.
    if w_train is not None and hasattr(w_train, "tocoo"):
        train_weights = w_train.tocoo()
    else:
        train_weights = w_train

    sampler = sample_hyperparameters()

    for _ in range(num_samples):
        hyperparams = next(sampler)
        print("---Training with hyperparameters---")
        pprint(hyperparams)
        # num_epochs is an argument of fit(), not of the LightFM constructor.
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        start = time.time()
        model.fit(
            i_train,
            item_features=item_features_m,
            user_features=user_features_m,
            sample_weight=train_weights,
            verbose=False,
            epochs=num_epochs,
            num_threads=NUM_THREADS
        )
        train_time = int(time.time() - start)

        # Score on the split that was passed in, not on whatever
        # m_i_train / m_i_test globals happen to exist in the kernel.
        score = precision_at_k(
            model,
            test_interactions=i_test,
            train_interactions=i_train,
            user_features=user_features_m,
            item_features=item_features_m,
            k=50,
            num_threads=NUM_THREADS,
            check_intersections=False
        ).mean()

        # Re-attach the run metadata so the logged row and the yielded
        # dict describe the complete trial.
        hyperparams["num_epochs"] = num_epochs
        hyperparams["train_time"] = train_time
        hyperparams["score"] = score
        dwriter.writerow(hyperparams)
        yield (score, hyperparams, model)

In [None]:
# Sweep the number of training epochs for a fixed hyperparameter set and log
# training time, recall@50, precision@50 and AUC for each setting to a CSV file.

# Seed shared by the interaction split and the weight split (see below).
SPLIT_SEED = 42

# newline="" is what the csv module asks for on files it writes; without it
# platforms that translate "\n" end up with blank rows between records.
with open("hyperparameters-warp-100.csv", "w", newline="") as outcsv, open("hyperparameters.yaml", "r") as hypfile:
    dwriter = csv.DictWriter(
        outcsv,
        fieldnames=[
            "num_epochs",
            "train_time",
            "mar@k",
            "map@k",
            "auc"
        ]
    )

    dwriter.writeheader()

    # yaml.load() without a Loader is deprecated and fails on PyYAML >= 6.
    hyperparams = yaml.safe_load(hypfile)

    # The interaction data does not depend on num_epochs, so build it once.
    male_interactions, male_weights = dataset_m.build_interactions(
        list(zip(male_interaction_df.user_id, male_interaction_df.target_user_id, male_interaction_df.score))
    )

    # Split individual interactions, keeping the full [n_users, n_items] shape.
    # sklearn's train_test_split slices whole rows of a sparse matrix, which
    # yields re-indexed matrices whose rows no longer correspond to the rows of
    # the feature matrices or of each other. Using two RandomState objects with
    # the same seed applies the same shuffle to interactions and weights, so
    # the weight entries stay aligned with the interaction entries.
    m_i_train, m_i_test = random_train_test_split(
        male_interactions,
        test_percentage=0.33,
        random_state=np.random.RandomState(SPLIT_SEED)
    )
    m_w_train, m_w_test = random_train_test_split(
        male_weights,
        test_percentage=0.33,
        random_state=np.random.RandomState(SPLIT_SEED)
    )

    # Keyword arguments shared by all three evaluation metrics.
    eval_kwargs = dict(
        test_interactions=m_i_test,
        train_interactions=m_i_train,
        user_features=user_features_m,
        item_features=item_features_m,
        num_threads=NUM_THREADS,
        check_intersections=False
    )

    # Every epoch count is trained and scored on the same split, so the rows
    # differ only by num_epochs.
    for num_epochs in range(50, 151, 10):
        start = time.time()

        model = LightFM(**hyperparams)
        model.fit(
            m_i_train,
            item_features=item_features_m,
            user_features=user_features_m,
            sample_weight=m_w_train,
            verbose=False,
            epochs=num_epochs,
            num_threads=NUM_THREADS
        )

        train_time = int(time.time() - start)

        mapk = precision_at_k(model, k=50, **eval_kwargs).mean()
        mark = recall_at_k(model, k=50, **eval_kwargs).mean()
        auc = auc_score(model, preserve_rows=False, **eval_kwargs).mean()

        dwriter.writerow(
            {
                "num_epochs": num_epochs,
                "train_time": train_time,
                "mar@k": mark,
                "map@k": mapk,
                "auc": auc,
            }
        )
        # Push each row to disk so an interrupted sweep keeps its results.
        outcsv.flush()