In [112]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict

import torch
import torch.nn as nn
import torch.optim as optim
from livelossplot import PlotLosses

# Fix the dying kernel problem (only a problem in some installations - you can remove it, if it works without it)
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load the dataset for recommenders

In [113]:
data_path = os.path.join("data", "hotel_data")

interactions_df = pd.read_csv(os.path.join(data_path, "hotel_data_interactions_df.csv"), index_col=0)

base_item_features = ['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay']

column_values_dict = {
    'term': ['WinterVacation', 'Easter', 'OffSeason', 'HighSeason', 'LowSeason', 'MayLongWeekend', 'NewYear', 'Christmas'],
    'length_of_stay_bucket': ['[0-1]', '[2-3]', '[4-7]', '[8-inf]'],
    'rate_plan': ['Standard', 'Nonref'],
    'room_segment': ['[0-160]', '[160-260]', '[260-360]', '[360-500]', '[500-900]'],
    'n_people_bucket': ['[1-1]', '[2-2]', '[3-4]', '[5-inf]'],
    'weekend_stay': ['True', 'False']
}

interactions_df.loc[:, 'term'] = pd.Categorical(
    interactions_df['term'], categories=column_values_dict['term'])
interactions_df.loc[:, 'length_of_stay_bucket'] = pd.Categorical(
    interactions_df['length_of_stay_bucket'], categories=column_values_dict['length_of_stay_bucket'])
interactions_df.loc[:, 'rate_plan'] = pd.Categorical(
    interactions_df['rate_plan'], categories=column_values_dict['rate_plan'])
interactions_df.loc[:, 'room_segment'] = pd.Categorical(
    interactions_df['room_segment'], categories=column_values_dict['room_segment'])
interactions_df.loc[:, 'n_people_bucket'] = pd.Categorical(
    interactions_df['n_people_bucket'], categories=column_values_dict['n_people_bucket'])
interactions_df.loc[:, 'weekend_stay'] = interactions_df['weekend_stay'].astype('str')
interactions_df.loc[:, 'weekend_stay'] = pd.Categorical(
    interactions_df['weekend_stay'], categories=column_values_dict['weekend_stay'])

display(HTML(interactions_df.head(15).to_html()))

Unnamed: 0,user_id,item_id,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay
0,1,0,WinterVacation,[2-3],Standard,[260-360],[5-inf],True
1,2,1,WinterVacation,[2-3],Standard,[160-260],[3-4],True
2,3,2,WinterVacation,[2-3],Standard,[160-260],[2-2],False
3,4,3,WinterVacation,[4-7],Standard,[160-260],[3-4],True
4,5,4,WinterVacation,[4-7],Standard,[0-160],[2-2],True
5,6,5,Easter,[4-7],Standard,[260-360],[5-inf],False
6,7,6,OffSeason,[2-3],Standard,[260-360],[5-inf],True
7,8,7,HighSeason,[2-3],Standard,[160-260],[1-1],False
8,9,8,HighSeason,[2-3],Standard,[0-160],[1-1],True
9,8,7,HighSeason,[2-3],Standard,[160-260],[1-1],False


# (Optional) Prepare numerical user features

The method below is left here for convenience if you want to experiment with content-based user features as an input for your neural network.

In [114]:
def prepare_users_df(interactions_df):

    # Write your code here
    users_df = interactions_df.drop(columns=['item_id'], errors='ignore')
    users_df = pd.get_dummies(users_df, columns=['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay'], prefix='', prefix_sep='')
    users_df = users_df.drop_duplicates(subset = ["user_id"])
    drop_values = []

    for column_name in interactions_df.columns[2:]:
        num_of_unique_values = interactions_df[column_name].nunique()
        if num_of_unique_values > 2:
            top_num = int(num_of_unique_values/2)
            worst_values = interactions_df[column_name].value_counts().sort_values(ascending=True).head(top_num).index
            for v in worst_values:
                drop_values.append(v)


    users_df.drop(drop_values, axis=1, inplace=True)
    users_df.add_prefix('user_')
    user_features = users_df.columns.values[1:].tolist()

    return users_df, user_features


users_df, user_features = prepare_users_df(interactions_df)

print(user_features)

display(users_df.loc[users_df['user_id'].isin([706, 1736, 7779, 96, 1, 50, 115])].head(15))

['WinterVacation', 'OffSeason', 'HighSeason', 'LowSeason', '[2-3]', '[4-7]', 'Standard', 'Nonref', '[0-160]', '[160-260]', '[260-360]', '[2-2]', '[3-4]', 'True', 'False']


Unnamed: 0,user_id,WinterVacation,OffSeason,HighSeason,LowSeason,[2-3],[4-7],Standard,Nonref,[0-160],[160-260],[260-360],[2-2],[3-4],True,False
0,1,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0
42,50,0,1,0,0,1,0,1,0,0,1,0,0,0,1,0
333,706,1,0,0,0,1,0,1,0,0,1,0,0,0,0,1
569,115,0,1,0,0,0,1,1,0,0,1,0,0,0,0,1
1350,1736,0,1,0,0,0,1,0,1,0,1,0,1,0,1,0
2967,96,0,0,1,0,1,0,0,1,0,1,0,0,1,1,0
12667,7779,0,1,0,0,0,1,1,0,0,1,0,0,1,1,0


# (Optional) Prepare numerical item features

The method below is left here for convenience if you want to experiment with content-based item features as an input for your neural network.

In [115]:
def prepare_items_df(interactions_df):

    # Write your code here
    items_df = interactions_df.drop(columns=['user_id'], errors='ignore')
    items_df = pd.get_dummies(items_df, columns=['term', 'length_of_stay_bucket', 'rate_plan', 'room_segment', 'n_people_bucket', 'weekend_stay'])
    items_df = items_df.drop_duplicates(subset = ["item_id"])
    item_features = items_df.columns.values[1:].tolist()

    return items_df, item_features


items_df, item_features = prepare_items_df(interactions_df)

print(item_features)

display(items_df.loc[items_df['item_id'].isin([0, 1, 2, 3, 4, 5, 6])].head(15))

['term_WinterVacation', 'term_Easter', 'term_OffSeason', 'term_HighSeason', 'term_LowSeason', 'term_MayLongWeekend', 'term_NewYear', 'term_Christmas', 'length_of_stay_bucket_[0-1]', 'length_of_stay_bucket_[2-3]', 'length_of_stay_bucket_[4-7]', 'length_of_stay_bucket_[8-inf]', 'rate_plan_Standard', 'rate_plan_Nonref', 'room_segment_[0-160]', 'room_segment_[160-260]', 'room_segment_[260-360]', 'room_segment_[360-500]', 'room_segment_[500-900]', 'n_people_bucket_[1-1]', 'n_people_bucket_[2-2]', 'n_people_bucket_[3-4]', 'n_people_bucket_[5-inf]', 'weekend_stay_True', 'weekend_stay_False']


Unnamed: 0,item_id,term_WinterVacation,term_Easter,term_OffSeason,term_HighSeason,term_LowSeason,term_MayLongWeekend,term_NewYear,term_Christmas,length_of_stay_bucket_[0-1],...,room_segment_[160-260],room_segment_[260-360],room_segment_[360-500],room_segment_[500-900],n_people_bucket_[1-1],n_people_bucket_[2-2],n_people_bucket_[3-4],n_people_bucket_[5-inf],weekend_stay_True,weekend_stay_False
0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0
1,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
2,2,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,1
3,3,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5,5,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,1
6,6,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0


# Neural network recommender

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Code a recommender based on a neural network model. You are free to choose any network architecture you find appropriate. The network can use the interaction vectors for users and items, embeddings of users and items, as well as user and item features (you can use the features you developed in the first project).

Remember to keep control over randomness - in the init method add the seed as a parameter and initialize the random seed generator with that seed (both for numpy and pytorch):

```python
self.seed = seed
self.rng = np.random.RandomState(seed=seed)
```
in the network model:
```python
self.seed = torch.manual_seed(seed)
```

You are encouraged to experiment with:
  - the number of layers in the network, the number of neurons and different activation functions,
  - different optimizers and their parameters,
  - batch size and the number of epochs,
  - embedding layers,
  - content-based features of both users and items.

In [129]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from recommenders.recommender import Recommender

class NNRecommender(Recommender):
    """
    Linear recommender class based on user and item features.
    """

    def __init__(self, seed=6789, n_neg_per_pos=5):
        """
        Initialize recommender params and variables.
        """
        super().__init__()
        self.model = None
        self.n_neg_per_pos = n_neg_per_pos
        self.recommender_df = pd.DataFrame(columns=['user_id', 'item_id', 'score'])
        self.users_df = None
        self.user_features = None
        self.seed = seed
        self.rng = np.random.RandomState(seed=seed)

def fit(self, interactions_df, users_df, items_df):
    """
    Training of the recommender.

    :param pd.DataFrame interactions_df: DataFrame with recorded interactions between users and items
        defined by user_id, item_id and features of the interaction.
    :param pd.DataFrame users_df: DataFrame with users and their features defined by user_id and the user feature columns.
    :param pd.DataFrame items_df: DataFrame with items and their features defined by item_id and the item feature columns.
    """

    interactions_df = interactions_df.copy()



    users_df, user_features = prepare_users_df(interactions_df)

    self.users_df = users_df
    self.user_features = user_features

    items_df, item_features = prepare_items_df(interactions_df)
    items_df = items_df.loc[:, ['item_id'] + item_features]



    interactions_df = interactions_df.loc[:, ['user_id', 'item_id']]

    interactions_df.loc[:, 'interacted'] = 1


    user_id_mapping = {user_id: i for i, user_id in enumerate(interactions_df['user_id'].unique())}
    item_id_mapping = {item_id: i for i, item_id in enumerate(interactions_df['item_id'].unique())}

    self.user_id_mapping = user_id_mapping
    self.item_id_mapping = item_id_mapping

    interactions_df['user_id'] = interactions_df['user_id'].map(user_id_mapping)
    interactions_df['item_id'] = interactions_df['item_id'].map(item_id_mapping)


    negative_interactions = self.generate_negative_interactions(interactions_df)
    negative_interactions['interacted'] = 0

    interactions_df = pd.concat([interactions_df, negative_interactions], ignore_index=True)

    interactions_df = pd.merge(interactions_df, users_df, on='user_id', how='left')
    interactions_df = pd.merge(interactions_df, items_df, on='item_id', how='left')


    inputs = torch.Tensor(interactions_df.drop(columns=['user_id', 'item_id', 'interacted']).values.astype('float32'))
    labels = torch.Tensor(interactions_df['interacted'].values.reshape(-1, 1).astype('float32'))
    input_dim = len(user_features) + len(item_features)
    hidden_dim = 60

    self.model = nn.Sequential(
        nn.Linear(input_dim, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, 1)
    )
    mse = nn.MSELoss()
    optimizer = optim.Adam(self.model.parameters(), lr=0.001)
    num_epochs = 12
    batch_size = 30
    num_batches = len(inputs) / batch_size
    for epoch in range(num_epochs):
        for batch in range(num_batches):
            start = batch * batch_size
            end = start + batch_size

            optimizer.zero_grad()
            batch_inputs = inputs[start:end]
            batch_labels = labels[start:end]
            outputs = self.model(batch_inputs)
            loss = mse(outputs, batch_labels)

            loss.backward()
            optimizer.step()

def recommend(self, users_df, items_df, n_recommendations=1):

    self.recommender_df = self.recommender_df[:0]

    items_df = items_df.copy()
    items_df['item_id'] = items_df['item_id'].map(self.item_id_mapping)

    for ix, user in users_df.iterrows():
        recommendations = []
        user_id = user['user_id']

        if user_id in self.user_id_mapping:
            mapped_user_id = self.user_id_mapping[user_id]


            scores = []

            chosen_pos = np.argsort(-scores)[:n_recommendations]

            for item_pos in chosen_pos:
                item_id = items_df.loc[item_pos, 'item_id']
                score = scores[item_pos]

                recommendations.append({
                    'user_id': user_id,
                    'item_id': item_id,
                    'score': score
                })

        self.recommender_df = self.recommender_df.append(recommendations, ignore_index=True)

    return self.recommender_df




# Quick test of the recommender

In [130]:
items_df = interactions_df.loc[:, ['item_id'] + base_item_features].drop_duplicates()

In [131]:
# Fit method
nn_recommender = NNRecommender()
nn_recommender.fit(interactions_df, None, None)

In [132]:
# Recommender method

recommendations = nn_recommender.recommend(pd.DataFrame([[1], [2], [3], [4], [5]], columns=['user_id']), items_df, 10)

recommendations = pd.merge(recommendations, items_df, on='item_id', how='left')
display(HTML(recommendations.to_html()))

Unnamed: 0,user_id,item_id,score,term,length_of_stay_bucket,rate_plan,room_segment,n_people_bucket,weekend_stay
0,1,-1,3.0,,,,,,
1,1,-1,3.0,,,,,,
2,1,-1,3.0,,,,,,
3,1,-1,3.0,,,,,,
4,1,-1,3.0,,,,,,
5,1,-1,3.0,,,,,,
6,1,-1,3.0,,,,,,
7,1,-1,3.0,,,,,,
8,1,-1,3.0,,,,,,
9,1,-1,3.0,,,,,,


# Tuning method

In [None]:
from evaluation_and_testing.testing import evaluate_train_test_split_implicit

seed = 6789

In [None]:
from hyperopt import hp, fmin, tpe, Trials
import traceback

def tune_recommender(recommender_class, interactions_df, items_df, 
                     param_space, max_evals=1, show_progressbar=True, seed=6789):
    # Split into train_validation and test sets

    shuffle = np.arange(len(interactions_df))
    rng = np.random.RandomState(seed=seed)
    rng.shuffle(shuffle)
    shuffle = list(shuffle)

    train_test_split = 0.8
    split_index = int(len(interactions_df) * train_test_split)

    train_validation = interactions_df.iloc[shuffle[:split_index]]
    test = interactions_df.iloc[shuffle[split_index:]]

    # Tune

    def loss(tuned_params):
        recommender = recommender_class(seed=seed, **tuned_params)
        hr1, hr3, hr5, hr10, ndcg1, ndcg3, ndcg5, ndcg10 = evaluate_train_test_split_implicit(
            recommender, train_validation, items_df, seed=seed)
        return -hr10

    n_tries = 1
    succeded = False
    try_id = 0
    while not succeded and try_id < n_tries:
        try:
            trials = Trials()
            best_param_set = fmin(loss, space=param_space, algo=tpe.suggest, 
                                  max_evals=max_evals, show_progressbar=show_progressbar, trials=trials, verbose=True)
            succeded = True
        except:
            traceback.print_exc()
            try_id += 1
            
    if not succeded:
        return None
        
    # Validate
    
    recommender = recommender_class(seed=seed, **best_param_set)

    results = [[recommender_class.__name__] + list(evaluate_train_test_split_implicit(
        recommender, {'train': train_validation, 'test': test}, items_df, seed=seed))]

    results = pd.DataFrame(results, 
                           columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

    display(HTML(results.to_html()))
    
    return best_param_set

## Tuning of the recommender

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Tune your model using the code below. You only need to put the class name of your recommender and choose an appropriate parameter space.

In [None]:
param_space = {
    'n_neg_per_pos': hp.quniform('n_neg_per_pos', 1, 10, 1),
    ########################
    # Write your code here #
    ########################
}

best_param_set = tune_recommender(NNRecommender, interactions_df, items_df,
                                  param_space, max_evals=10, show_progressbar=True, seed=seed)

print("Best parameters:")
print(best_param_set)

# Final evaluation

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Run the final evaluation of your recommender and present its results against the Amazon and Netflix recommenders' results. You just need to give the class name of your recommender and its tuned parameters below.

It's optional, but for better effect you can include here the results from all recommenders created during in this class.

In [44]:
nn_recommender = NNRecommender(n_neg_per_pos=1)  # Initialize your recommender here

# Give the name of your recommender in the line below
nn_tts_results = [['NNRecommender'] + list(evaluate_train_test_split_implicit(
    nn_recommender, interactions_df, items_df))]

nn_tts_results = pd.DataFrame(
    nn_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(nn_tts_results.to_html()))

NameError: name 'evaluate_train_test_split_implicit' is not defined

In [None]:
from recommenders.amazon_recommender import AmazonRecommender

amazon_recommender = AmazonRecommender()

amazon_tts_results = [['AmazonRecommender'] + list(evaluate_train_test_split_implicit(
    amazon_recommender, interactions_df, items_df))]

amazon_tts_results = pd.DataFrame(
    amazon_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(amazon_tts_results.to_html()))

In [None]:
from recommenders.netflix_recommender import NetflixRecommender

netflix_recommender = NetflixRecommender(embedding_dim=8, n_epochs=200, print_type='live')

netflix_tts_results = [['NetflixRecommender'] + list(evaluate_train_test_split_implicit(
    netflix_recommender, interactions_df, items_df))]

netflix_tts_results = pd.DataFrame(
    netflix_tts_results, columns=['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10'])

display(HTML(netflix_tts_results.to_html()))

In [None]:
tts_results = pd.concat([nn_tts_results, amazon_tts_results, netflix_tts_results]).reset_index(drop=True)
display(HTML(tts_results.to_html()))

# Summary

<span style="color:red"><font size="4">**Task:**</font></span><br> 
Write a summary of your experiments. What worked well and what did not? What are your thoughts how could you possibly further improve the model?

In [None]:
###########################
# Functions in data preprocessing and features were improved, but recommender did not work correctly. Neutral network model was created and there was no error, but no matter how scores were calculated in recommend method, quick test of the recommender was showing -1 item ids and array with NaNs.
# I think neutral network model should be improved.#
###########################