# Recommender System

In this tutorial we build a recommendation system using deep learning.

# Set Up

In [0]:
# @title Imports (run this cell)
from __future__ import print_function

import numpy as np
import pandas as pd
import collections
from mpl_toolkits.mplot3d import Axes3D
from IPython import display
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)

# Load and Preprocess Data

## Load MovieLens Dataset
The dataset we use for this tutorial is the [MovieLens](https://movielens.org/) dataset.

In [0]:
from urllib.request import urlretrieve
import zipfile

# Download the MovieLens 100k archive into the working directory.
urlretrieve('http://files.grouplens.org/datasets/movielens/ml-100k.zip',
            'movielens.zip')

# Extract with a context manager so the archive's file handle is closed as
# soon as extraction finishes instead of staying open for the kernel's life.
with zipfile.ZipFile('movielens.zip', 'r') as zip_:
    zip_.extractall()

In [0]:
# Read the three MovieLens tables (users, ratings, movies) into DataFrames.

# Column layouts. The raw files carry no header row, so names are supplied here.
users_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
ratings_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

# One 0/1 indicator column per genre, in file order.
genre_cols = [
    'genre_unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Movie metadata columns followed by the genre indicators.
movies_cols = [
    'movie_id', 'title', 'release_date', 'video_release_data', 'imdb_url',
    *genre_cols]

users = pd.read_csv(
    'ml-100k/u.user', sep='|', names=users_cols, encoding='latin-1')
ratings = pd.read_csv(
    'ml-100k/u.data', sep='\t', names=ratings_cols, encoding='latin-1')
movies = pd.read_csv(
    'ml-100k/u.item', sep='|', names=movies_cols, encoding='latin-1')


def _zero_based(raw_id):
    """Shift a 1-based ID to 0-based and return it as a string."""
    return str(raw_id - 1)


# IDs in the raw files start at 1; shift them so they start at 0.
users["user_id"] = users["user_id"].apply(_zero_based)
movies["movie_id"] = movies["movie_id"].apply(_zero_based)
ratings["movie_id"] = ratings["movie_id"].apply(_zero_based)
ratings["user_id"] = ratings["user_id"].apply(_zero_based)

# The release year is the last '-'-separated piece of the release date string.
movies["year"] = [str(date).split('-')[-1] for date in movies['release_date']]

# Ratings are used as a continuous target, so store them as floats.
ratings["rating"] = ratings["rating"].astype(float)

Since some movies can belong to more than one genre, we create two different genre columns as follows:
- `all_genres`: all active genres for a movie 
- `genre`: randomly sampled from the active genres

In [0]:
def mark_genres(movies, genres, rng=None):
    """Add `genre` and `all_genres` label columns to `movies` in place.

    Args:
        movies: DataFrame holding one 0/1 indicator column per entry of
            `genres`.
        genres: Sequence of indicator column names; also the order in which
            active genres are joined.
        rng: Optional object exposing `choice(seq)` (for example a
            `np.random.RandomState`) used for the random pick. When omitted,
            the global `np.random` state is used.

    Returns:
        None. Two columns are written to `movies`:
        - `genre`: one active genre picked at random, or 'Other' when no
          indicator is set.
        - `all_genres`: every active genre joined with '-', or 'Other' when
          no indicator is set.
    """
    chooser = np.random if rng is None else rng

    # Scan the indicator columns once and reuse the result for both labels.
    indicator_rows = zip(*[movies[genre] for genre in genres])
    active_per_row = [
        [genre for genre, flag in zip(genres, row) if flag == 1]
        for row in indicator_rows]

    movies['genre'] = [
        chooser.choice(active) if active else 'Other'
        for active in active_per_row]
    movies['all_genres'] = [
        '-'.join(active) if active else 'Other'
        for active in active_per_row]


mark_genres(movies, genre_cols)

In [0]:
# Single LabelEncoder instance; its `fit_transform` is applied to each
# categorical column of `movie_lens` further down, re-fitting it per column.
le = LabelEncoder()

In [6]:
# Attach user demographics and movie metadata to every rating, then keep only
# the model inputs plus the `rating` target.
movie_lens = (
    users
    .merge(ratings, on=['user_id'])
    .merge(movies, on=['movie_id'])
    [['age', 'sex', 'occupation', 'zip_code', 'rating', 'year', 'all_genres']]
)

movie_lens.head()

Unnamed: 0,age,sex,occupation,zip_code,rating,year,all_genres
0,24,M,technician,85711,4.0,1994,Drama
1,47,M,educator,29206,4.0,1994,Drama
2,35,F,other,37212,4.0,1994,Drama
3,27,M,programmer,52246,5.0,1994,Drama
4,49,M,educator,8403,4.0,1994,Drama


In [0]:
def normalize(X):
    """Standardize `X` to zero mean and unit (population) standard deviation.

    Args:
        X: 1-D sequence of numbers (list, NumPy array or pandas Series).

    Returns:
        A list with one z-score per element of `X`. An empty input gives an
        empty list, and a constant input (standard deviation of zero) gives
        all zeros instead of dividing by zero.
    """
    values = np.asarray(X, dtype=float)
    if values.size == 0:
        return []

    mean = np.mean(X)
    std = np.std(X)
    if std == 0:
        # Every value equals the mean, so every z-score is 0.
        return [0.0] * values.size

    # Vectorized instead of a per-element Python loop.
    return list((values - mean) / std)

# Columns that get label-encoded, and the column that gets standardized.
CATEGORICAL_FEATURES = ['sex', 'occupation', 'zip_code', 'year', 'all_genres']
NUMERICAL_FEATURES = ['age']
# Column-wise apply (axis 0): `le` is fit on each categorical column in turn
# and that column is replaced by the encoder's output.
movie_lens[CATEGORICAL_FEATURES] = movie_lens[CATEGORICAL_FEATURES].apply(le.fit_transform, 0)
# Column-wise apply of `normalize` to each numerical column.
movie_lens[NUMERICAL_FEATURES] = movie_lens[NUMERICAL_FEATURES].apply(normalize, 0)

In [0]:
# NOTE(review): `dataset` is first bound to a NumPy array and then rebound to
# a tf.data.Dataset; no later cell in this notebook section reads it.
# `from_tensors` wraps the whole array as a single element. If one element
# per row was intended, `from_tensor_slices` is the API for that — TODO
# confirm intent.
dataset = np.array(movie_lens)
dataset = tf.data.Dataset.from_tensors(dataset)


In [0]:
# Feature matrix (six columns, in the order listed) and the rating target.
X = np.array(movie_lens[['age', 'sex', 'occupation', 'zip_code',
                         'year', 'all_genres']])
y = np.array(movie_lens['rating'])

# Fix the shuffle seed so a fresh Restart & Run All yields the same
# train/test partition, and therefore comparable metrics, every time.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [0]:
# Feature columns: each key matches a column name of the `movie_lens` frame.
age = tf.feature_column.numeric_column('age')
sex = tf.feature_column.numeric_column('sex')
occupation = tf.feature_column.numeric_column('occupation')
zip_code = tf.feature_column.numeric_column('zip_code')
year = tf.feature_column.numeric_column('year')
# The DataFrame column is named 'all_genres', so the key must use that name.
genres = tf.feature_column.numeric_column('all_genres')

optimizer = tf.keras.optimizers.RMSprop(0.001)
# `hidden_units` must be an iterable with one entry per hidden layer; a bare
# int cannot be iterated when the network is built.
# NOTE(review): `rating` is stored as a float in this notebook, while a
# classifier expects integer class ids — tf.estimator.DNNRegressor may be the
# intended estimator; TODO confirm.
estimator = tf.estimator.DNNClassifier(
    hidden_units=[64],
    feature_columns=[age, sex, occupation, zip_code, year, genres],
    optimizer=optimizer)

In [0]:
# tf.data views of the training features and labels, sliced along the first
# axis.
# NOTE(review): the `model.fit` call in this notebook is given the NumPy
# arrays directly, not these two datasets.
train_dataset = tf.data.Dataset.from_tensor_slices(X_train)
train_labels = tf.data.Dataset.from_tensor_slices(y_train)

In [0]:
def build_model(n_features=6, hidden_units=64, dropout_rate=0.3,
                learning_rate=1e-6):
    """Build and compile the feed-forward rating regressor.

    The network is two ReLU dense layers, a dropout layer and a single linear
    output unit, compiled with MSE loss and an RMSprop optimizer.

    Args:
        n_features: Width of one input row; must match the number of columns
            in the feature matrix passed to `fit`.
        hidden_units: Number of units in each of the two hidden layers.
        dropout_rate: Dropout rate applied before the output layer.
        learning_rate: RMSprop learning rate. The default of 1e-6 is the
            value this notebook has always used; it is far smaller than the
            0.001 given to the estimator's RMSprop in this notebook, so pass
            a larger value if training converges too slowly.

    Returns:
        A compiled `tf.keras.Sequential` model that reports 'mae' and 'mse'.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(hidden_units, activation='relu',
                              input_shape=[n_features]),
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dropout(rate=dropout_rate),
        tf.keras.layers.Dense(1)
    ])

    optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate)
    model.compile(loss='mse', optimizer=optimizer, metrics=['mae', 'mse'])
    return model

In [0]:
model = build_model()

In [19]:
model.fit(X_train, y_train, epochs=100)

Train on 75000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/10

<tensorflow.python.keras.callbacks.History at 0x7fdcf9abdf60>

In [20]:
model.evaluate(X_test, y_test)



[1.5049378686141968, 1.0057887, 1.5049382]

In [0]:
y_hat = model.predict(X_test)

In [55]:
# Some predictions come out above 5; cap them in place so that every entry
# greater than 5 becomes exactly 5.
np.putmask(y_hat, y_hat > 5., 5)

array([4.973141 , 4.8510685, 4.881345 , 5.       , 4.866328 , 4.8612633,
       4.91703  , 4.8830423, 5.       , 4.802253 , 4.867864 , 5.       ,
       5.       , 5.       , 4.8253403, 5.       , 5.       , 5.       ,
       4.8666697, 5.       , 5.       , 5.       , 4.924727 , 5.       ,
       5.       , 5.       , 4.930423 , 4.8336596, 5.       , 4.8932214,
       4.895593 , 5.       , 5.       , 4.847019 , 5.       , 4.813768 ,
       4.8088655, 5.       , 5.       , 4.872428 , 4.836344 , 5.       ,
       4.8302813, 5.       , 4.8366776, 5.       , 4.8516464, 4.841177 ,
       5.       , 5.       , 4.8973064, 4.8512096, 4.900649 , 5.       ,
       4.802587 , 4.8379874, 4.88648  , 5.       , 4.9676666, 4.891804 ,
       5.       ], dtype=float32)

In [21]:
!pip install plotly==4.4.1

Collecting plotly==4.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/8e/ce/6ea5683c47b682bffad39ad41d10913141b560b1b875a90dbc6abe3f4fa9/plotly-4.4.1-py2.py3-none-any.whl (7.3MB)
[K     |████████████████████████████████| 7.3MB 3.2MB/s 
Installing collected packages: plotly
  Found existing installation: plotly 4.1.1
    Uninstalling plotly-4.1.1:
      Successfully uninstalled plotly-4.1.1
Successfully installed plotly-4.4.1


In [0]:
# Project the test features onto their first two principal components and
# keep each component as its own 1-D array.
pca = PCA(n_components=2)
pca_fit = pca.fit_transform(X_test)
pc1, pc2 = pca_fit[:, 0], pca_fit[:, 1]

In [90]:
# Cluster the test points in the 2-D PCA plane.
# Both components must be passed as one (n_samples, 2) feature matrix: the
# second positional argument of `fit_predict` is `y`, which KMeans ignores,
# so passing pc2 there would cluster on pc1 alone.
# `random_state` pins the centroid initialisation so cluster ids are
# reproducible across runs.
kmeans = KMeans(n_clusters=64, random_state=0)
kmeans_fit = kmeans.fit_predict(np.column_stack((pc1, pc2)))
kmeans_fit

array([19, 53,  1, ..., 15,  4, 40], dtype=int32)

In [91]:
import plotly.express as px

# `y_hat` may be an (n, 1) array straight from `model.predict`; flatten it so
# `z` is 1-D like `x` and `y`. Label the axes and title the figure so it can
# be read on its own.
fig = px.scatter_3d(
    x=pc1, y=pc2, z=np.ravel(y_hat), color=kmeans_fit,
    labels={'x': 'PC1', 'y': 'PC2', 'z': 'predicted rating',
            'color': 'cluster'},
    title='Predicted rating over the PCA plane, coloured by k-means cluster')

fig.show()