It is highly recommended to use a powerful **GPU**; you can use one for free by uploading this notebook to [Google Colab](https://colab.research.google.com/notebooks/intro.ipynb).
<table align="center">
 <td align="center"><a target="_blank" href="https://colab.research.google.com/github/ezponda/intro_deep_learning/blob/main/class/NLP/Embedding_layer.ipynb">
        <img src="https://colab.research.google.com/img/colab_favicon_256px.png"  width="50" height="50" style="padding-bottom:5px;" />Run in Google Colab</a></td>
  <td align="center"><a target="_blank" href="https://github.com/ezponda/intro_deep_learning/blob/main/class/NLP/Embedding_layer.ipynb">
        <img src="https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png"  width="50" height="50" style="padding-bottom:5px;" />View Source on GitHub</a></td>
</table>

In [None]:
import pathlib
import os
import sklearn
import tensorflow as tf
import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

## Embedding layer

Take a look at the documentation of the [Embedding](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding) layer.

The Embedding layer can be understood as a lookup table that maps integer indices to dense vectors (their embeddings).

```python
tf.keras.layers.Embedding(
    input_dim, output_dim, input_length=None
)
```

- **input_dim** Integer. Number of different categories (size of the vocabulary, number of films, ...), i.e. maximum integer index + 1.
- **output_dim** Integer. Dimension of the dense embedding.
- **input_length** Length of the input sequences. It is not needed if you are not working with sequences.

In [None]:
# Lookup table with 100 rows (valid indices 0..99), each row a 5-dimensional vector.
embedding_layer = tf.keras.layers.Embedding(100, 5, input_length=None)

In [None]:
# Look up the vectors stored for indices 0, 1 and 2; every call receives a
# one-element batch holding a single index.
vector_ind_0, vector_ind_1, vector_ind_2 = [
    embedding_layer(tf.constant([entity])) for entity in range(3)
]

print(vector_ind_0.shape)
for entity, vector in enumerate([vector_ind_0, vector_ind_1, vector_ind_2]):
    print('Embedding of entity with index {}: '.format(entity), vector.numpy().flatten())

In [None]:
# Embed a whole sequence of indices with a single call. Index 1 appears twice,
# so the second and fourth rows of the result are the same vector.
input_sequence = [0, 1, 2, 1]
sequence = embedding_layer(tf.constant(input_sequence))
for label, value in [
    ('input sequence', input_sequence),
    ('sequence embeddings shape', sequence.shape),
    ('sequence embeddings', sequence.numpy()),
]:
    print(label, value)

# Applications: Recommender System

We are going to use the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/); the cell below downloads the `ml-latest-small` archive. We can create a model to recommend movies for a given user.

In [None]:
import os
import pathlib
import tensorflow as tf

dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

# Download and unpack the archive, caching it under its own file name
# ('ml-latest-small.zip') so that stripping the suffix below names the folder.
zip_path = tf.keras.utils.get_file(
    origin=dataset_url,
    fname=os.path.basename(dataset_url),
    extract=True,
)

# Same path without '.zip' -> .../ml-latest-small
data_dir = pathlib.Path(zip_path).with_suffix('')
print(os.listdir(data_dir))

In [None]:
ratings_file = data_dir / "ratings.csv"

# Shuffle the rows with a fixed seed: the training cells below hold out a
# fraction of the rows through `validation_split`, so a seeded order keeps the
# train/validation rows identical from one run to the next.
df_ratings = pd.read_csv(ratings_file).sample(frac=1, random_state=42)
df_ratings.head()

In [None]:
# How many times each rating value occurs
from collections import Counter
rating_counts = Counter(df_ratings['rating'])
print(rating_counts)

In [None]:
# Map every raw userId to a contiguous index (0, 1, 2, ...) and back
user_ids = sorted(df_ratings["userId"].unique())
user2index = dict(zip(user_ids, range(len(user_ids))))
index2user = dict(enumerate(user_ids))
print(list(user2index.items())[:3])

In [None]:
# Map every raw movieId to a contiguous index (0, 1, 2, ...) and back
movie_ids = sorted(df_ratings["movieId"].unique())
movie2index = dict(zip(movie_ids, range(len(movie_ids))))
index2movie = dict(enumerate(movie_ids))
print(list(movie2index.items())[-2:])

In [None]:
# Add the contiguous indices as new columns; every id is a key of its mapping
# because the mappings were built from these same columns.
df_ratings["user"] = df_ratings["userId"].map(user2index)
df_ratings["movie"] = df_ratings["movieId"].map(movie2index)

In [None]:
# Sizes of the two embedding tables built below
num_users = len(user2index)
num_movies = len(movie2index)
print(f"Number of users: {num_users} \nNumber of Movies: {num_movies}")

In [None]:
# Movie catalogue: one row per movieId with its title and genres
movies_file = data_dir.joinpath("movies.csv")
df_movies = pd.read_csv(movies_file)
df_movies.head(5)

In [None]:
# movie index -> (title, genres), restricted to movies that have an index,
# i.e. movies that occur in the ratings table
movie2title_genres = {
    movie2index[raw_id]: (title, genres)
    for raw_id, title, genres in zip(
        df_movies['movieId'], df_movies['title'], df_movies['genres'])
    if raw_id in movie2index
}
labelled_movies = list(movie2title_genres.items())
print(labelled_movies[:3])
print(labelled_movies[-2:])

### Create the model

In [None]:
# Length of each user and movie embedding vector in the baseline model below.
embedding_size = 5

In [None]:
# One input tensor with two columns per example; the training cell feeds it
# (user index, movie index) pairs.
inputs = tf.keras.Input(shape=(2, ), name='user_input')

# User branch: column 0 is looked up in a table with one row per user.
user_input = inputs[:, 0]
user_embedding = layers.Embedding(num_users,
                                  embedding_size,
                                  name='user_embedding')(user_input)

# Movie branch: column 1 is looked up in a table with one row per movie.
movie_input = inputs[:, 1]
movie_embedding = layers.Embedding(num_movies,
                                   embedding_size,
                                   name='movie_embedding')(movie_input)

# Join both embeddings into a single feature vector per example.
concat = layers.concatenate([user_embedding, movie_embedding], axis=1)

# Hidden layer on top of the joined embeddings.
layer_1 = layers.Dense(128, activation="relu", name='layer_1')(concat)

# Output layer: one non-negative value (ReLU), trained against the rating.
outputs = layers.Dense(1, activation='relu', name='output')(layer_1)

model = tf.keras.Model(inputs=inputs, outputs=outputs, name='movie')

model.summary()

In [None]:
from tensorflow.keras.utils import plot_model
# Draw the model's layer graph, annotated with tensor shapes.
plot_model(model, show_shapes=True)

In [None]:
# Features: one (user index, movie index) pair per rating.
x = df_ratings[["user", "movie"]].to_numpy()
# Target: the rating on its original scale (it is not rescaled to [0, 1]).
y = df_ratings["rating"].to_numpy()

model.compile(optimizer='adam', loss='mse')
history = model.fit(x, y, batch_size=64, epochs=4, verbose=1, validation_split=0.2)

In [None]:
# Print every layer's position in `model.layers` next to its name
for position, layer in enumerate(model.layers):
    print(position, layer.name)

In [None]:
# Load the TensorBoard notebook extension (needed by the `%tensorboard` cell further down).
%load_ext tensorboard

In [None]:
from tensorboard.plugins import projector
import os
# Set up a logs directory, so Tensorboard knows where to look for files
log_dir = './logs/mov/'
os.makedirs(log_dir, exist_ok=True)

# Fetch the movie embedding matrix by layer name. Indexing `model.layers` by
# position silently picks a different layer as soon as the architecture (or the
# order in which Keras lists the layers) changes.
weights = tf.Variable(model.get_layer('movie_embedding').get_weights()[0])

# Save labels line by line. Line k of the metadata file labels row k of the
# embedding matrix, so write exactly one title per row, in row order, with a
# placeholder for any movie index that has no entry in `movie2title_genres`.
with open(os.path.join(log_dir, 'metadata.tsv'), "w", encoding="utf-8") as f:
    for movie_index in range(weights.shape[0]):
        title = movie2title_genres.get(movie_index, ('unknown', ''))[0]
        f.write("{}\n".format(title))

# Create a checkpoint from embedding, the filename and key are
# name of the tensor.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Set up config
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [None]:
# Open TensorBoard on the directory the projector files were written to.
%tensorboard --logdir ./logs/mov/

### Show recommendations

In [None]:
def show_recommendations(user,
                         model,
                         df_ratings,
                         movie2title_genres,
                         n_movies=10):
    """Print a rating report and model-based recommendations for one user.

    NOTE: a later cell of this notebook defines ``show_recommendations`` again
    with the same parameters; when the cells are run top to bottom, that later
    definition is the one subsequent calls use.

    Args:
        user: Value of the ``user`` column identifying the user.
        model: Object whose ``predict`` accepts an ``(n, 2)`` array of
            ``[user, movie]`` rows and returns one score per row.
        df_ratings: DataFrame with ``user``, ``movie`` and ``rating`` columns.
        movie2title_genres: Maps a ``movie`` value to ``(title, genres)``.
        n_movies: Maximum number of recommended movies to print.

    Returns:
        None. The report is printed.
    """
    rated_line = 'Movie: {0} | Rating: {1}, Genres: {2}'
    predicted_line = 'Movie: {0} | Rating pred: {1:.1f}, Genres: {2}'

    def print_section(heading, movie_rating_pairs, line_format):
        # One heading, a separator, then one formatted line per movie.
        print(heading)
        print('-' * 50)
        for movie, rating in movie_rating_pairs:
            title, genres = movie2title_genres[movie]
            print(line_format.format(title, rating, genres))

    # This user's ratings, best first.
    df_user = df_ratings[df_ratings.user == user].sort_values(
        by="rating", ascending=False)
    rated = list(zip(df_user['movie'], df_user['rating']))

    print_section("Movies with high ratings from user {0}".format(user),
                  rated[:5], rated_line)
    print()
    print_section("Movies with low ratings from user {0}".format(user),
                  rated[-5:], rated_line)

    # Candidates are the movies this user has not rated yet.
    movies_unwatched = sorted(
        set(df_ratings['movie'].unique()) - set(df_user['movie']))
    if not movies_unwatched:
        # Nothing to score: do not hand the model an empty batch.
        print()
        print("User {0} has rated every movie; nothing to recommend".format(user))
        return

    movies_array = np.array(movies_unwatched)
    user_array = np.full(len(movies_array), user)
    x = np.vstack([user_array, movies_array]).T

    ratings = np.asarray(model.predict(x)).flatten()

    best = [(movies_array[i], ratings[i]) for i in np.argsort(-ratings)[:n_movies]]
    print()
    print_section("Movies recommended to user {0}".format(user), best,
                  predicted_line)

    worst = [(movies_array[i], ratings[i]) for i in np.argsort(ratings)[:3]]
    print()
    print_section("Movies NOT recommended to user {0}".format(user), worst,
                  predicted_line)

In [None]:
from typing import Dict, Tuple
import keras

def show_recommendations(
    user: int,
    model: "keras.Model",
    df_ratings: pd.DataFrame,
    movie2title_genres: Dict[int, Tuple[str, str]],
    n_movies: int = 10
) -> None:
    """
    Display:
      - Top and bottom 5 movies the user has rated
      - Top N movie recommendations (predicted highest ratings)
      - A few least recommended movies (predicted lowest ratings)

    Args:
        user: Value of the ``user`` column identifying the user.
        model: Object whose ``predict`` accepts an ``(n, 2)`` array of
            ``[user, movie]`` rows and returns one score per row. The
            annotation is quoted so that defining this function does not
            require ``keras`` to be importable at definition time.
        df_ratings: DataFrame with ``user``, ``movie`` and ``rating`` columns.
        movie2title_genres: Maps a ``movie`` value to ``(title, genres)``.
        n_movies: Maximum number of recommendations to print.

    If the user has already rated every movie in ``df_ratings`` there is
    nothing to score; a short message is printed instead of calling
    ``model.predict`` on an empty batch.
    """
    def print_rated(header: str, rated: pd.DataFrame) -> None:
        # Movies the user rated, with the rating they gave.
        print(header)
        print('-' * 50)
        for movie, rating in zip(rated['movie'], rated['rating']):
            title, genres = movie2title_genres[movie]
            print(f"Movie: {title} | Rating: {rating} | Genres: {genres}")

    def print_predicted(header: str, positions: np.ndarray) -> None:
        # Unrated movies (by position in `unwatched`) with their predicted rating.
        print(header)
        print('-' * 50)
        for idx in positions:
            title, genres = movie2title_genres[unwatched[idx]]
            print(f"Movie: {title} | Predicted Rating: {preds[idx]:.1f} | Genres: {genres}")

    # 1. This user's ratings, best first
    user_df = (
        df_ratings[df_ratings['user'] == user]
        .sort_values(by='rating', ascending=False)
    )

    # 2. Report the user's top and bottom 5 rated movies
    print_rated(f"Top 5 movies rated by user {user}", user_df.head(5))
    print_rated(f"\nBottom 5 movies rated by user {user}", user_df.tail(5))

    # 3. Candidates: every movie in the ratings table the user has not rated
    all_movies = set(df_ratings['movie'].unique())
    unwatched = np.array(sorted(all_movies - set(user_df['movie'])), dtype=int)
    if unwatched.size == 0:
        print(f"\nUser {user} has rated every movie; nothing to recommend")
        return

    # 4. Score all [user, movie] candidate pairs in one batch
    user_array = np.full(shape=unwatched.shape, fill_value=user, dtype=int)
    input_pairs = np.column_stack((user_array, unwatched))
    preds = np.asarray(model.predict(input_pairs)).flatten()

    # 5. Highest predictions first, then a few of the lowest
    print_predicted(f"\nTop {n_movies} recommendations for user {user}",
                    np.argsort(-preds)[:n_movies])
    bottom_count = min(3, len(preds))
    print_predicted(f"\nMovies least recommended for user {user}",
                    np.argsort(preds)[:bottom_count])


In [None]:
# Draw one user index at random and print that user's report
unique_users = df_ratings['user'].unique()
user = np.random.choice(unique_users)
show_recommendations(user=user,
                     model=model,
                     df_ratings=df_ratings,
                     movie2title_genres=movie2title_genres,
                     n_movies=10)

## Question 1: Change the embedding dimensions and add more complexity to the model

In [None]:
# TODO (exercise): replace each `...` with the embedding dimension you want to try.
user_embedding_size = ...
movie_embedding_size = ...

In [None]:
# Exercise template: every `...` below is a blank to fill in.
inputs = tf.keras.Input(shape=(2, ), name='user_input')

## user embeddings
user_input = inputs[:, 0]
user_embedding = layers.Embedding(num_users,
                                  ...,
                                  name='user_embedding')(user_input)

## movie embeddings
movie_input = inputs[:, 1]
movie_embedding = layers.Embedding(num_movies,
                                   ...,
                                   name='movie_embedding')(movie_input)

## Concat embeddings
# Use the Keras layer, as the baseline model does: a raw TensorFlow op such as
# `tf.concat` cannot take symbolic Keras tensors in Keras 3.
concat = layers.concatenate([user_embedding, movie_embedding], axis=1)

## Hidden layer(s)
layer_1 = ...(concat)

## Predict rating
outputs = layers.Dense(1, activation='relu', name='output')(layer_1)

model = tf.keras.Model(inputs=inputs, outputs=outputs, name='movie')


In [None]:
# Train the Question 1 model with the baseline settings; `x` and `y` are the
# arrays built in the earlier training cell.
model.compile(optimizer='adam', loss='mse')
history = model.fit(x, y, batch_size=64, epochs=4, validation_split=0.2, verbose=1)

In [None]:
# Report for a randomly drawn user, using the Question 1 model
unique_users = df_ratings['user'].unique()
user = np.random.choice(unique_users)
show_recommendations(user=user,
                     model=model,
                     df_ratings=df_ratings,
                     movie2title_genres=movie2title_genres,
                     n_movies=10)

## Bigger Dataset


In [None]:
import os
import pathlib
import tensorflow as tf

dataset_url = 'http://files.grouplens.org/datasets/movielens/ml-latest.zip'

# Download and unpack the archive, caching it under its own file name
# ('ml-latest.zip') so that stripping the suffix below names the folder.
archive_name = os.path.basename(dataset_url)
zip_path = tf.keras.utils.get_file(fname=archive_name, origin=dataset_url, extract=True)

# Same path without '.zip' -> .../ml-latest
data_dir = pathlib.Path(zip_path).with_suffix('')

# Show what was extracted
print(os.listdir(data_dir))


In [None]:
# CSV files of the larger dataset, consumed by prepare_data below
movies_file = data_dir.joinpath("movies.csv")
ratings_file = data_dir.joinpath("ratings.csv")

### Data Preparation

In [None]:
def prepare_data(ratings_file, movies_file, nrows=750000, random_state=None):
    """Load ratings and movies and re-index users and movies contiguously.

    Args:
        ratings_file: CSV with ``userId``, ``movieId`` and ``rating`` columns.
        movies_file: CSV with ``movieId``, ``title`` and ``genres`` columns.
        nrows: Number of rating rows to read from the start of ``ratings_file``.
        random_state: Seed for the row shuffle. ``None`` (the default) gives a
            different order on every call; pass an int for a reproducible one.

    Returns:
        Tuple ``(df_ratings, movie2title_genres, num_users, num_movies)``:
        the shuffled ratings with added ``user`` / ``movie`` index columns, a
        dict from movie index to ``(title, genres)``, and the number of
        distinct users and movies among the rows that were read.
    """
    # Shuffle the rows while keeping their original index labels.
    df_ratings = pd.read_csv(ratings_file, nrows=nrows).sample(
        frac=1, random_state=random_state)

    # Contiguous indices assigned in ascending order of the raw ids
    user2index = {
        u: ind for ind, u in enumerate(sorted(df_ratings["userId"].unique()))}
    movie2index = {
        m: ind for ind, m in enumerate(sorted(df_ratings["movieId"].unique()))}

    # Every id is a key of its mapping, since the mappings come from these columns.
    df_ratings["user"] = df_ratings["userId"].map(user2index)
    df_ratings["movie"] = df_ratings["movieId"].map(movie2index)

    num_users, num_movies = (len(user2index), len(movie2index))
    print("Number of users: {0} \nNumber of Movies: {1}".format(
        num_users, num_movies))

    # Keep title/genres only for movies that occur in the ratings that were read.
    df_movies = pd.read_csv(movies_file)
    movie2title_genres = {
        movie2index[movie_id]: (title, genres)
        for movie_id, title, genres in zip(
            df_movies['movieId'], df_movies['title'], df_movies['genres'])
        if movie_id in movie2index
    }

    return df_ratings, movie2title_genres, num_users, num_movies

# Rebuild the ratings table and lookup structures from the larger dataset
(df_ratings,
 movie2title_genres,
 num_users,
 num_movies) = prepare_data(ratings_file, movies_file)
print(df_ratings.shape[0])

### Question 2: Obtain a better model and compare the number of parameters

In [None]:
# Exercise template: every `...` below is a blank to fill in.
embedding_size = ...
inputs = tf.keras.Input(shape=(2, ), name='user_input')

## user embeddings
user_input = inputs[:, 0]
user_embedding = layers.Embedding(
    num_users,
    ...,
)(user_input)

## movie embeddings
movie_input = inputs[:, 1]
movie_embedding = layers.Embedding(num_movies, ...)(movie_input)

## Concat embeddings
# Use the Keras layer, as the baseline model does: a raw TensorFlow op such as
# `tf.concat` cannot take symbolic Keras tensors in Keras 3.
concat = layers.concatenate([user_embedding, movie_embedding], axis=1)

## Hidden layer(s)
layer_1 = ...
## Predict rating
outputs = layers.Dense(1, activation='relu', name='output')(...)

model = tf.keras.Model(inputs=inputs, outputs=outputs, name='movie')

model.summary()

In [None]:
# Features: one (user index, movie index) pair per rating.
x = df_ratings[["user", "movie"]].values
# Target: the rating on its original scale (it is not rescaled to [0, 1]).
y = df_ratings["rating"].values

# Rating prediction is a regression problem, so train with mean squared error
# as in the earlier training cells. Binary cross-entropy expects targets in
# [0, 1] and is not meaningful for unscaled ratings with a ReLU output.
model.compile(
    loss='mse', optimizer='adam'
)
history = model.fit(
    x,
    y,
    validation_split=0.1,
    epochs=5,
    verbose=1,
    batch_size=512,
    shuffle=True
)


In [None]:
# Report for a randomly drawn user of the larger dataset
unique_users = df_ratings['user'].unique()
user = np.random.choice(unique_users)
show_recommendations(user=user,
                     model=model,
                     df_ratings=df_ratings,
                     movie2title_genres=movie2title_genres,
                     n_movies=10)