In [None]:
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display, HTML
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format='retina'
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


import warnings
warnings.filterwarnings('ignore')

In [None]:
from hwer.utils import normalize_affinity_scores_by_user_item, normalize_affinity_scores_by_user

from hwer.utils import unit_length, build_user_item_dict, build_item_user_dict, cos_sim, shuffle_copy
from hwer import HybridRecommender
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from typing import List, Dict, Tuple, Sequence, Type, Set, Optional

from surprise import Dataset
from surprise import accuracy
import pandas as pd
from pathlib import Path

from surprise.model_selection import train_test_split
import numpy as np
from tqdm import tqdm,tqdm_notebook


In [None]:


# Load the three tab-separated tables from the working directory, in the
# order users -> movies -> ratings.
users, movies, ratings = (
    pd.read_csv(csv_name, sep="\t", engine="python")
    for csv_name in ("users.csv", "movies.csv", "ratings.csv")
)

# Ids are handled as strings everywhere below (e.g. user='1'), so cast them once here.
users = users.astype({'user_id': str})
movies = movies.astype({'movie_id': str})
ratings = ratings.astype({'movie_id': str, 'user_id': str})

print(*(table.shape for table in (users, movies, ratings)))


# Re-import the local package so edits made to it are picked up.
# NOTE(review): names already pulled in with `from hwer import ...` are not
# refreshed by this reload.
from importlib import reload
import hwer
reload(hwer)




In [None]:
from ast import literal_eval

# `genres` and `keywords` are stored as stringified Python lists; a missing
# value is parsed as an empty list.
movies["genres"] = movies["genres"].fillna("[]").apply(literal_eval)
movies["keywords"] = movies["keywords"].fillna("[]").apply(literal_eval).apply(" ".join)

# A missing release year is encoded as -1.
movies["year"] = movies["year"].fillna(-1).astype(int)

# Blank out missing text so the concatenation below never sees NaN.
text_columns = ["title","keywords","overview","tagline","original_title"]
movies[text_columns] = movies[text_columns].fillna("")

# One space-separated free-text field per movie, in `text_columns` order.
movies["text"] = movies["title"].str.cat(
    [movies[column] for column in text_columns[1:]], sep=" "
)

# Simple numeric side features.
movies["title_length"] = movies["title"].map(len)
movies["overview_length"] = movies["overview"].map(len)
movies["runtime"] = movies["runtime"].fillna(0.0)


In [None]:
# Peek at the raw rows, then keep the first three fields of every rating row.
# NOTE(review): presumably the column order is (user_id, movie_id, rating) -
# confirm against ratings.csv.
ratings.head().values
user_item_affinities = [
    [first, second, third] for first, second, third, *_ in ratings.values
]


In [None]:

from hwer import MultiCategoricalEmbedding, FlairGlove100AndBytePairEmbedding, CategoricalEmbedding, NumericEmbedding, FlairGlove100Embedding
from hwer import Feature, FeatureSet, ContentRecommendation, FeatureType

# One embedding strategy per feature name; the keys must match the
# `feature_name` of the Feature objects built below.
embedding_mapper = {
    'gender': CategoricalEmbedding(n_dims=1),
    'age': CategoricalEmbedding(n_dims=1),
    'occupation': CategoricalEmbedding(n_dims=2),
    'zip': CategoricalEmbedding(n_dims=2),
    'text': FlairGlove100Embedding(),
    'numeric': NumericEmbedding(2),
    'genres': MultiCategoricalEmbedding(n_dims=2),
}

recsys = ContentRecommendation(
    embedding_mapper=embedding_mapper,
    knn_params=None,
    n_output_dims=8,
    rating_scale=(1, 5),
)

# User-side features: every column is treated as categorical.
u1 = Feature(feature_name="gender", feature_type=FeatureType.CATEGORICAL,
             values=users["gender"].values)
u2 = Feature(feature_name="age", feature_type=FeatureType.CATEGORICAL,
             values=users["age"].astype(str).values)
u3 = Feature(feature_name="occupation", feature_type=FeatureType.CATEGORICAL,
             values=users["occupation"].astype(str).values)
u4 = Feature(feature_name="zip", feature_type=FeatureType.CATEGORICAL,
             values=users["zip"].astype(str).values)
user_data = FeatureSet([u1, u2, u3, u4])

# Item-side features: free text, multi-valued genres and three numeric columns.
i1 = Feature(feature_name="text", feature_type=FeatureType.STR,
             values=movies["text"].values)
i2 = Feature(feature_name="genres", feature_type=FeatureType.MULTI_CATEGORICAL,
             values=movies["genres"].values)
i3 = Feature(feature_name="numeric", feature_type=FeatureType.NUMERIC,
             values=movies[["title_length", "overview_length", "runtime"]].values)
item_data = FeatureSet([i1, i2, i3])

kwargs = {'user_data': user_data, 'item_data': item_data}

user_vectors, item_vectors = recsys.fit(
    users["user_id"].values,
    movies["movie_id"].values,
    user_item_affinities,
    **kwargs,
)




In [None]:
# Ask the recommender for items for user '1' and keep the first 100 ids.
# NOTE(review): `dist` is presumably a per-item distance/score - confirm
# against `find_items_for_user`.
res, dist = zip(*recsys.find_items_for_user(user='1', positive=[], negative=[]))
res = res[:100]

# Titles of the recommended movies vs. titles the user actually rated.
preds = set(movies.loc[movies["movie_id"].isin(res), "title"])
actuals = set(
    movies.merge(ratings.loc[ratings["user_id"] == '1'], on='movie_id')["title"]
)

# Number of recommended titles the user has rated.
len(preds & actuals)


# Code Graveyard

In [None]:
# Make Base Hybrid Recsys
# Do train-test split before testing
# Use Content Vectors as initialisers
# Train with fixed mu + bu for collaborative embeddings
# Train prediction network with mu+bu+bi

In [None]:
# Normalise the affinities per user and rebind `user_item_affinities` to the result.
# NOTE(review): `user_item_affinities_orig` is not defined anywhere in the
# visible notebook, so this cell raises NameError on a fresh kernel - confirm
# where it was meant to come from (presumably a copy of `user_item_affinities`).
# NOTE(review): presumably `bu` / `bi` are per-user / per-item biases - verify
# against `normalize_affinity_scores_by_user`.
mean, bu, bi, spread, user_item_affinities = normalize_affinity_scores_by_user(user_item_affinities_orig)

In [None]:


class FixedNorm(tf.keras.constraints.Constraint):
    """Weight constraint that rescales a weight tensor to a fixed L2 norm `m`.

    The tensor is transposed, divided by the L2 norm taken over the first
    axis of the transposed tensor (plus 1e-6 to avoid division by zero),
    transposed back and multiplied by `m`. For a 2-D weight matrix this
    normalises every row.

    Refer:
    https://github.com/keras-team/keras/issues/1580
    https://github.com/tensorflow/tensorflow/issues/33755
    """
    def __init__(self, m=1.):
        # Target norm of each normalised slice.
        self.m = m

    def __call__(self, p):
        # Plain `tf` ops are used instead of the `K` backend alias, because
        # `K` is only imported in a later cell of this notebook.
        p = tf.transpose(p)
        unit_norm = p / (tf.sqrt(tf.reduce_sum(tf.square(p), axis=0)) + 1e-6)
        unit_norm = tf.transpose(unit_norm)
        return unit_norm * self.m

    def get_config(self):
        # Only constructor arguments belong here: Keras rebuilds the object
        # with `cls(**config)`, and `__init__` has no `name` parameter.
        return {'m': self.m}

In [None]:
# Alias the fitted recommender as `self`; later cells read
# `self.item_id_to_index` and `self.n_output_dims` at notebook level.
self = recsys

In [None]:
import tensorflow.keras.backend as K

In [None]:

def __entity_entity_affinities_trainer__(entity_ids: List[str],
                                         entity_entity_affinities: List[Tuple[str, str, float]],
                                         entity_id_to_index: Dict[str, int],
                                         vectors: np.ndarray,
                                         n_output_dims: int,
                                         lr=0.001,
                                         epochs=15,
                                         batch_size = 512) -> np.ndarray:
    """Fine-tune entity embeddings so that the similarity of a pair matches its affinity.

    A shared encoder (embedding table initialised from `vectors`, followed by
    dense layers and L2 normalisation) is applied to both members of a pair.
    The tanh of the normalised dot product of the two encodings is regressed
    onto the pair's affinity with mean squared error. The affinities are split
    50/50; the model is fit on the first half (validating on the second) and
    then on the second half (validating on the first).

    :param entity_ids: ids whose encodings are returned, in this order; its
        length is also the number of rows of the embedding table.
    :param entity_entity_affinities: `(id_1, id_2, affinity)` triples; the
        affinity is clipped to [-1, 1].
    :param entity_id_to_index: maps an id to its row in `vectors`.
    :param vectors: initial embedding matrix.
        NOTE(review): assumed to be of shape (len(entity_ids), n_output_dims),
        since the embedding layer is created with exactly that shape - confirm.
    :param n_output_dims: width of the embedding table and of the output vectors.
    :param lr: learning rate, re-applied before the second fit.
    :param epochs: maximum number of epochs for each of the two fits.
    :param batch_size: batch size; also used as the shuffle buffer size.
    :return: encoder output for every id in `entity_ids`.
    """
    # Imported locally on purpose: the notebook imports `train_test_split`
    # from sklearn and afterwards from surprise, so the module-level name is
    # the surprise version, which expects a surprise Dataset, not a list.
    from sklearn.model_selection import train_test_split

    train_affinities, validation_affinities = train_test_split(entity_entity_affinities, test_size=0.5)

    def generate_training_samples(affinities: List[Tuple[str, str, float]]):
        """Return a generator function yielding `((index_1, index_2), clipped_affinity)`."""
        def generator():
            for i, j, r in affinities:
                first_item = entity_id_to_index[i]
                second_item = entity_id_to_index[j]
                r = np.clip(r, -1.0, 1.0)
                yield (first_item, second_item), r

        return generator

    output_shapes = (((), ()), ())
    output_types = ((tf.int64, tf.int64), tf.float32)

    train = tf.data.Dataset.from_generator(generate_training_samples(train_affinities),
                                           output_types=output_types, output_shapes=output_shapes, )
    validation = tf.data.Dataset.from_generator(generate_training_samples(validation_affinities),
                                                output_types=output_types,
                                                output_shapes=output_shapes, )
    train = train.shuffle(batch_size).batch(batch_size)
    validation = validation.shuffle(batch_size).batch(batch_size)

    input_1 = keras.Input(shape=(1,))
    input_2 = keras.Input(shape=(1,))

    def build_base_network(embedding_size, vectors):
        """Build the shared encoder: embedding -> noise -> 3 ReLU layers -> linear -> L2 norm."""
        avg_value = np.mean(vectors)
        i1 = keras.Input(shape=(1,))

        embeddings_initializer = tf.keras.initializers.Constant(vectors)
        embeddings = keras.layers.Embedding(len(entity_ids), embedding_size, input_length=1,
                                            embeddings_initializer=embeddings_initializer)
        item = embeddings(i1)
        item = tf.keras.layers.Flatten()(item)
        # Noise level is tied to the mean of the initial vectors.
        item = tf.keras.layers.GaussianNoise(0.01 * avg_value)(item)
        for _ in range(3):
            item = keras.layers.Dense(embedding_size * 8, activation="relu", )(item)
        item = keras.layers.Dense(embedding_size, activation="linear", )(item)
        # Normalise once; the original applied a second, redundant
        # l2_normalize directly after this layer.
        item = tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=-1))(item)
        return keras.Model(inputs=i1, outputs=item)

    bn = build_base_network(n_output_dims, vectors)

    item_1 = bn(input_1)
    item_2 = bn(input_2)

    pred = tf.keras.layers.Dot(axes=1, normalize=True)([item_1, item_2])
    # NOTE(review): tanh of a value in [-1, 1] stays within about +/-0.76,
    # while the targets are clipped to [-1, 1]; confirm the squashing is intended.
    pred = K.tanh(pred)

    model = keras.Model(inputs=[input_1, input_2],
                        outputs=[pred])
    # The encoder shares its weights with `model`, so training `model` trains it.
    encoder = bn

    adam = tf.keras.optimizers.Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.01, amsgrad=False)
    model.compile(optimizer=adam,
                  loss=['mean_squared_error'])

    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=8, verbose=0, )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.2, patience=2, min_lr=0.0001)
    callbacks = [es, reduce_lr]

    model.fit(train, epochs=epochs,
              validation_data=validation, callbacks=callbacks)

    # ReduceLROnPlateau may have lowered the rate; restore it before the second pass.
    K.set_value(model.optimizer.lr, lr)

    # Second pass with the roles of the two halves swapped.
    model.fit(validation, epochs=epochs,
              validation_data=train, callbacks=callbacks)

    return encoder.predict(
        tf.data.Dataset.from_tensor_slices([entity_id_to_index[i] for i in entity_ids]).batch(batch_size))


In [None]:
import tensorflow.keras.backend as K
def __item_item_affinities_triplet_trainer__(self,
                                   item_ids: List[str],
                                   item_item_affinities: List[Tuple[str, str, int]],
                                   item_vectors: np.ndarray) -> np.ndarray:
    """Fine-tune item embeddings with a weighted triplet (anchor / close / far) objective.

    Each affinity `(i, j, r)` is turned into a triplet anchored at `i`. For a
    positive `r`, `j` is the close item and a far item is sampled; for a
    negative `r`, `j` is the far item and a close item is sampled. With
    probability `random_pair_proba` the sampled item is a uniformly random
    item with weight `random_pair_weight`; otherwise it comes from the
    anchor's known far/close partners, falling back to a random item.

    :param self: object providing `item_id_to_index` (id -> row index) and
        `n_output_dims` (embedding width).
    :param item_ids: ids whose encodings are returned, in this order; its
        length is also the number of rows of the embedding table.
    :param item_item_affinities: `(id_1, id_2, affinity)` triples with a
        non-zero affinity.
    :param item_vectors: initial embedding matrix.
        NOTE(review): assumed to be of shape (len(item_ids), self.n_output_dims),
        since the embedding layer is created with exactly that shape - confirm.
    :return: encoder output for every id in `item_ids`.
    """
    # Imported locally on purpose: the notebook imports `train_test_split`
    # from sklearn and afterwards from surprise, so the module-level name is
    # the surprise version, which expects a surprise Dataset, not a list.
    from sklearn.model_selection import train_test_split

    train_affinities, validation_affinities = train_test_split(item_item_affinities, test_size=0.5)
    batch_size = 512

    def generate_training_samples(affinities: List[Tuple[str, str, int]],
                                  random_pair_proba=0.25, random_pair_weight=0.25):
        """Return a generator function yielding
        `((anchor, close, far, close_weight, far_weight), close_weight)`."""
        # All known ids are candidates for random sampling (this deliberately
        # shadows the outer `item_ids` argument, as in the original code).
        item_ids = list(self.item_id_to_index.keys())
        item_close_dict = {}
        item_far_dict = {}
        # Index every pair under both of its members: positive affinities in
        # the "close" lookup, negative ones in the "far" lookup.
        for i, j, r in affinities:
            assert r != 0
            if r > 0:
                lookup = item_close_dict
            elif r < 0:
                lookup = item_far_dict
            else:
                continue
            lookup.setdefault(i, []).append((j, r))
            lookup.setdefault(j, []).append((i, r))

        def random_item_index():
            """Index of a uniformly random known item."""
            return self.item_id_to_index[item_ids[np.random.randint(0, len(item_ids))]]

        def sample_counterpart(lookup, anchor):
            """Pick a stored partner of `anchor` from `lookup`, or a random
            item with the default weight when the anchor has none."""
            if anchor in lookup:
                partners = lookup[anchor]
                partner_id, weight = partners[np.random.randint(0, len(partners))]
                return self.item_id_to_index[partner_id], weight
            return random_item_index(), random_pair_weight

        def generator():
            for i, j, r in affinities:
                first_item = self.item_id_to_index[i]
                second_item = self.item_id_to_index[j]
                if np.random.rand() < random_pair_proba:
                    # Pair the anchor with a purely random item.
                    distant_item = random_item_index()
                    distant_item_weight = random_pair_weight
                    if r < 0:
                        # `j` is the far item; the random item takes the close slot.
                        distant_item, second_item = second_item, distant_item
                        distant_item_weight, r = r, distant_item_weight
                elif r > 0:
                    # `j` is close; look for a known far partner of the anchor.
                    distant_item, distant_item_weight = sample_counterpart(item_far_dict, i)
                else:
                    # `j` is far; look for a known close partner of the anchor,
                    # then swap so the close item sits in the second slot.
                    distant_item, distant_item_weight = sample_counterpart(item_close_dict, i)
                    distant_item, second_item = second_item, distant_item
                    distant_item_weight, r = r, distant_item_weight
                yield (first_item, second_item, distant_item, r, distant_item_weight), r
        return generator

    output_shapes = (((), (), (), (), ()), ())
    output_types = ((tf.int64, tf.int64, tf.int64, tf.float32, tf.float32), tf.float32)

    train = tf.data.Dataset.from_generator(generate_training_samples(train_affinities),
                                           output_types=output_types, output_shapes=output_shapes, )
    validation = tf.data.Dataset.from_generator(generate_training_samples(validation_affinities),
                                                output_types=output_types,
                                                output_shapes=output_shapes,)

    train = train.shuffle(batch_size).batch(batch_size)
    validation = validation.shuffle(batch_size).batch(batch_size)

    input_1 = keras.Input(shape=(1,))
    input_2 = keras.Input(shape=(1,))
    input_3 = keras.Input(shape=(1,))

    close_weight = keras.Input(shape=(1,))
    far_weight = keras.Input(shape=(1,))

    def build_base_network(embedding_size, item_vectors):
        """Build the shared encoder: embedding -> noise -> 3 ReLU layers -> linear -> L2 norm."""
        avg_value = np.mean(item_vectors)
        i1 = keras.Input(shape=(1,))

        embeddings_initializer = tf.keras.initializers.Constant(item_vectors)
        embeddings = keras.layers.Embedding(len(item_ids), self.n_output_dims, input_length=1,
                                     embeddings_initializer=embeddings_initializer)
        item = embeddings(i1)
        item = tf.keras.layers.Flatten()(item)
        # Noise level is tied to the mean of the initial vectors.
        item = tf.keras.layers.GaussianNoise(0.001*avg_value)(item)
        dense = keras.layers.Dense(embedding_size*8, activation="relu",)
        item = dense(item)
        dense_0 = keras.layers.Dense(embedding_size*8, activation="relu",)
        item = dense_0(item)
        dense_1 = keras.layers.Dense(embedding_size*8, activation="relu",)
        item = dense_1(item)
        dense_2 = keras.layers.Dense(embedding_size, activation="linear",)
        item = dense_2(item)
        item = K.l2_normalize(item, axis=-1)
        base_network = keras.Model(inputs=i1, outputs=item)
        return base_network

    bn = build_base_network(self.n_output_dims, item_vectors)

    item_1 = bn(input_1)
    item_2 = bn(input_2)
    item_3 = bn(input_3)

    # Distance to the close item, scaled up by its weight.
    i1_i2_dist = tf.keras.layers.Dot(axes=1, normalize = True)([item_1, item_2])
    i1_i2_dist = 1 - K.tanh(i1_i2_dist)
    i1_i2_dist = close_weight * i1_i2_dist

    # Distance to the far item, divided by the magnitude of its weight.
    i1_i3_dist = tf.keras.layers.Dot(axes=1, normalize = True)([item_1, item_3])
    i1_i3_dist = 1 - K.tanh(i1_i3_dist)
    i1_i3_dist = i1_i3_dist / K.abs(far_weight)

    margin = 1.0
    loss = K.relu(i1_i2_dist - i1_i3_dist + margin)

    model = keras.Model(inputs=[input_1, input_2, input_3, close_weight, far_weight ],
                        outputs=[loss])
    # The encoder shares its weights with `model`, so training `model` trains it.
    encoder = bn

    adam = tf.keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.01, amsgrad=False)
    # NOTE(review): the model output is the hinge term itself, and it is
    # regressed (MSE) onto the label yielded by the generator, which is the
    # close-pair weight `r`, not 0. Confirm this target is intended.
    model.compile(optimizer=adam,
                  loss=['mean_squared_error'])

    es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.0, patience=5, verbose=0, )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.4, patience=2, min_lr=0.00001)
    callbacks=[es, reduce_lr]

    model.fit(train, epochs=30,
              validation_data=validation, callbacks=callbacks)

    # Second pass with the roles of the two halves swapped.
    model.fit(validation, epochs=30,
              validation_data=train, callbacks=callbacks)

    return encoder.predict(tf.data.Dataset.from_tensor_slices([self.item_id_to_index[i] for i in item_ids]).batch(batch_size))


In [None]:
class FakeRec:
    """Minimal stand-in for a recommender object.

    It carries only the two attributes that
    `__item_item_affinities_triplet_trainer__` reads from `self`.
    """

    def __init__(self, item_id_to_index, n_output_dims):
        # Mapping from item id to row index, and the embedding width.
        self.item_id_to_index, self.n_output_dims = item_id_to_index, n_output_dims
        
# Toy fixture: eight items "1".."8" mapped to rows 0..7, embedded in 2 dimensions.
self = FakeRec(dict(zip(map(str,range(1,9)),range(0,8))),2)
self
item_ids = list(self.item_id_to_index.keys())
ivs = np.random.randn(len(self.item_id_to_index.keys()),2)
ivs

# Two groups, {1,2,3,4} and {5,6,7,8}: +1 within a group, -1 across groups.
random_item_item_aff = [("1","2",1),("2","3",1),("3","4",1),("1","3",1),("1","4",1),("2","4",1),
                        ("5","6",1),("6","7",1),("7","8",1),("5","7",1),("5","8",1),("6","8",1),
                        ("1","5",-1),("2","6",-1),("3","7",-1),("4","8",-1),
                        ("2","5",-1),("3","6",-1),("4","7",-1),("1","8",-1),
                        ("2","6",-1),("1","7",-1),("3","8",-1),("4","5",-1),
                        ("1","6",-1),("1","8",-1),("2","7",-1),("2","8",-1),
                        ("3","8",-1),("4","5",-1),("3","5",-1),("4","6",-1)]
# The original called `__item_item_affinities_trainer__`, which is not defined
# in this notebook; the trainer defined above is the `_triplet_` variant.
sample_embeddings = __item_item_affinities_triplet_trainer__(self,item_ids,random_item_item_aff,ivs)


In [None]:
# Switch back to the real recommender and its full item list.
self = recsys
item_ids = list(self.item_id_to_index.keys())

# Synthetic affinities: every item is paired with one random item; the sign is
# +1 for even ids and -1 for odd ids (ids must therefore parse as integers).
random_item_item_aff = [
    (item_id, item_ids[np.random.randint(0, len(item_ids))], -1 if int(item_id) % 2 else 1)
    for item_id in item_ids
]

sample_embeddings = __entity_entity_affinities_trainer__(
    entity_ids=item_ids,
    entity_entity_affinities=random_item_item_aff,
    entity_id_to_index=self.item_id_to_index,
    vectors=item_vectors,
    n_output_dims=self.n_output_dims,
)


In [None]:
# Scatter of the first two columns of the toy input vectors, coloured by item id.
# NOTE(review): in top-to-bottom order `item_ids` was last rebound from
# `recsys`, while `ivs` still holds the 8 toy rows - confirm the lengths match.
plt.figure(figsize=(8, 8))
sns.scatterplot(x=ivs[:, 0], y=ivs[:, 1], hue=list(map(int, item_ids)))
plt.show()

In [None]:
# Scatter of the first two columns of the trained embeddings, coloured by item id.
plt.figure(figsize=(8, 8))
sns.scatterplot(
    x=sample_embeddings[:, 0],
    y=sample_embeddings[:, 1],
    hue=list(map(int, item_ids)),
)
plt.show()

In [None]:

# Pair every synthetic affinity with the cosine similarity of the two items'
# vectors in `item_vectors`.
# NOTE(review): this scores `item_vectors`, not `sample_embeddings` - confirm
# that the baseline is what was meant to be measured.
actual_vs_pred = [
    (affinity, cos_sim(item_vectors[self.item_id_to_index[left]],
                       item_vectors[self.item_id_to_index[right]]))
    for left, right, affinity in random_item_item_aff
]
# Root-mean-square error between affinity and cosine similarity.
np.sqrt(np.mean(np.square(np.array([actual - predicted for actual, predicted in actual_vs_pred]))))

In [None]:

import tensorflow.keras.backend as K
batch_size = 2


def generate_training_samples():
    """Return a generator function for a toy regression dataset.

    Each sample is `((vec_a, vec_b, scalar), 5)`: two random 3-vectors, one
    random scalar and the constant label 5. `batch_size * 10` samples are
    produced; `batch_size` is read from the notebook namespace.
    """
    def generator():
        for i in range(batch_size*10):
            yield (np.random.rand(3),np.random.rand(3), np.random.rand()), 5
    return generator

# `(3)` is just the integer 3; the shapes are spelled as the one-element
# tuple `(3,)` so that they read as shapes.
output_shapes = (((3,), (3,), ()), ())
output_types = (((tf.float32), (tf.float32), tf.float32), tf.float32)
train = tf.data.Dataset.from_generator(generate_training_samples(),
                                       output_types=output_types, output_shapes=output_shapes,)

train = train.shuffle(batch_size).batch(batch_size)

from tensorflow.keras import layers

# Toy three-input regression model used to try out multi-input tf.data pipelines.
# Inputs mirror the generator above: two 3-wide vectors and one scalar.
input_1 = keras.Input(shape=(3,))
input_2 = keras.Input(shape=(3,))
input_3 = keras.Input(shape=(1,))

# Join the three inputs into a single feature tensor before the dense stack.
inputs = K.concatenate([input_1, input_2, input_3])
inputs = tf.keras.layers.Flatten()(inputs)
dense_1 = layers.Dense(16, activation='relu')


x = dense_1(inputs)

x = layers.Dense(8, activation="relu")(x)

# Single linear output unit.
pred = layers.Dense(1, activation='linear')(x)

model = keras.Model(inputs=[input_1, input_2, input_3],
                    outputs=[pred])

adam = tf.keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.1, amsgrad=False)
model.compile(optimizer=adam,
              loss=['mean_squared_error'])


# `train` is the batched dataset built from `generate_training_samples`.
model.fit(train, epochs=2)


def generate_prediction_samples():
    """Return a generator function yielding unlabeled toy inputs.

    Each sample is `(vec_a, vec_b, scalar)`: two random 3-vectors and one
    random scalar. `batch_size * 2` samples are produced; `batch_size` is
    read from the notebook namespace.
    """
    def generator():
        for i in range(batch_size*2):
            yield np.random.rand(3),np.random.rand(3), np.random.rand()
    return generator

# `(3)` is just the integer 3; the shapes are spelled as the one-element
# tuple `(3,)` so that they read as shapes.
output_shapes = ((3,), (3,), ())
output_types = (tf.float32, tf.float32, tf.float32)
predict = tf.data.Dataset.from_generator(generate_prediction_samples(),
                                       output_types=output_types, output_shapes=output_shapes,)

predict = predict.batch(batch_size)
next(iter(predict))

# A fresh iterator is created here, so this batch is drawn anew and is not
# the one displayed by the previous statement.
model.predict(next(iter(predict)))

# model.predict_generator(iter(predict), steps=2)




In [None]:
def generate_prediction_samples():
    """Return a generator function yielding one pre-batched toy input per step.

    Each item is a list of three arrays: two random vectors reshaped to
    (1, 3) and a one-element array. `batch_size * 2` items are produced;
    `batch_size` is read from the notebook namespace.

    NOTE(review): this rebinds the `generate_prediction_samples` defined in
    the previous cell, which yielded plain tuples.
    """
    def generator():
        for i in range(batch_size*2):
            # Earlier two-input variant, kept for reference:
#             yield np.random.rand(3),np.random.rand(3)
            yield [np.random.rand(3).reshape((-1,3)),np.random.rand(3).reshape((-1,3)), np.array([np.random.rand()])]


    return generator


# The same call twice: once with the generator wrapped in `iter`, once without.
# NOTE(review): `predict_generator` is presumably deprecated in recent
# TensorFlow 2.x releases in favour of `model.predict` - confirm against the
# installed version.
model.predict_generator(iter(generate_prediction_samples()()), steps=4)
model.predict_generator(generate_prediction_samples()(), steps=4)


# model.predict(next(iter(generate_prediction_samples()())))
