In [416]:
# We have downloaded the dataset from https://www.kaggle.com/rounakbanik/the-movies-dataset and stored in archive folder
# Load the data using pandas
import tensorflow as tf
import numpy as np
import pandas as pd

In [431]:
# Load movie metadata 
df_data = pd.read_csv('archive/movies_metadata.csv', low_memory=False)

In [432]:
df_keywords = pd.read_csv('archive/keywords.csv')

# Some ids have irregular format, so we will remove them
df_cb = df_data.copy(deep=True)[df_data.id.apply(lambda x: x.isnumeric())]
df_cb['id'] = df_cb['id'].astype(int)
df_keywords['id'] = df_keywords['id'].astype(int)

# Merging keywords, credits of movies with main data set
df_movies_data = pd.merge(df_cb, df_keywords, on='id')
df_movies_data = df_movies_data.drop(['status','tagline','vote_average','vote_count','keywords', 'budget', 'adult', 'original_language','production_companies','production_countries', 'revenue','runtime','spoken_languages','belongs_to_collection', 'homepage', 'imdb_id', 'original_title', 'overview', 'poster_path', 'release_date', 'video'], axis=1)

In [433]:
#We are processing the genres column to do the hot encoding, This could be done better, at this time, I'm trying this approach.
import json

data = {'color': [[{'name':'red'},{'name':'blue'}], [{'name':'blue'}], [{'name':'green'}], [{'name':'red'}], [{'name':'blue'}]]}
df = pd.DataFrame(data)
cat = []
def list_me(df):
    a = []
    dict = {}
    X = df['genres']
    # Replace single quotes with double quotes
    X = str(X).replace("'", '"')
    
    #convert string to  object
    X = json.loads(X)
    for i in range(len(X)):
        x = X[i]
        a.append(x['name'])
        if x['name'] not in cat: 
            cat.append(x['name'])
    dict = {}
    dict["colors"] = a
    #return json.dumps(dict, indent = 4)
    return dict

df_movies_data['genres'] = df_movies_data.apply(list_me, axis = 1)
print(df_movies_data['genres'])
temp = pd.get_dummies(pd.DataFrame.from_records(df_movies_data.genres.values)['colors'].explode())
temp = temp.replace({True: 1, False: 0})
temp = temp.groupby(temp.index).sum()

0         {'colors': ['Animation', 'Comedy', 'Family']}
1        {'colors': ['Adventure', 'Fantasy', 'Family']}
2                     {'colors': ['Romance', 'Comedy']}
3            {'colors': ['Comedy', 'Drama', 'Romance']}
4                                {'colors': ['Comedy']}
                              ...                      
46477                   {'colors': ['Drama', 'Family']}
46478                             {'colors': ['Drama']}
46479       {'colors': ['Action', 'Drama', 'Thriller']}
46480                                    {'colors': []}
46481                                    {'colors': []}
Name: genres, Length: 46482, dtype: object


In [434]:
df_movies_data = df_movies_data.drop(columns=['genres'])
df_movies_data = df_movies_data.join(temp, how="inner")
df_movies_data = df_movies_data.sort_values(by=['id']).reset_index()
df_x1 = df_movies_data.drop(columns=['index','id','title','popularity'])
print(df_movies_data)
# Now we have the movie data with some features. This is our X.


       index      id popularity                             title  Action  \
0       4356       2   3.860491                             Ariel       0   
1      12991       3    2.29211               Shadows in Paradise       0   
2         17       5   9.026586                        Four Rooms       0   
3        474       6   5.538671                    Judgment Night       1   
4        256      11  42.149697                         Star Wars       1   
...      ...     ...        ...                               ...     ...   
46477  46095  465044   0.281008                         Abduction       0   
46478  46289  467731   0.001189       Tragedy in a Temporary Town       0   
46479  21965  468343   0.001202          Silja - nuorena nukkunut       0   
46480  46414  468707   0.347806  Thick Lashes of Lauri Mäntyvaara       0   
46481  20268  469172   0.001097   Manuel on the Island of Wonders       0   

       Adventure  Animation  Comedy  Crime  Documentary  ...  History  Horr

In [435]:
# In the dataset , we have only user id as the user feature.Now, we need to generate some features from the data. For instance, avaerage rating per genre for each user.


In [436]:
df_ratings = pd.read_csv('archive/ratings_small.csv', low_memory=False)

In [437]:
# importing reduce()
from functools import reduce
 
def average(lst):
    return reduce(lambda a, b: a + b, lst) / len(lst)


ratings = df_ratings.pivot(index='userId', columns='movieId', values='rating')
ratings = ratings.fillna(0)
ratings_another = ratings.copy()
movieIds = []
for col in ratings.columns:
    movieIds.append(col)
genres_name= []
for col in df_movies_data.columns:
    genres_name.append(col)
genres_name = genres_name[4:]

# Initialize data of lists
#data = [{'b': 2, 'c': 3}, {'a': 10, 'b': 20, 'c': 30}]
data = {}
  
# Creates pandas DataFrame by passing
# Lists of dictionaries and row index.
#df = pd.DataFrame(data, index=['first', 'second'])
df_x2 = pd.DataFrame(data, index=[])

# Iterate all rows using DataFrame.iterrows()
for user, row in ratings.iterrows():
    
    # All the movie rated by the user
    row = row.to_numpy()

    # Get the column index of  movies with a rating greater than 0.0. The column index is the id of the film.
    r_movies = np.where(row > 0.0)

    # Converted to a list
    rated_movies = r_movies[0].tolist()
    user_ratings = {}

    #iterate through ids
    for i in range(len(rated_movies)):
        
        # One movie at a time rate by the user j
        movie_df = df_movies_data.loc[df_movies_data['id'] == rated_movies[i]] 
        score = row[rated_movies[i]]
        if movie_df.shape[0] == 1:
            
            # Convert the movie data to a list
            movie = movie_df.values[0].tolist()
            
            # Get the genres-only movie data
            genres_all = np.array(movie[4:])

            # Get the genres that are being activated for this particular movie.
            genres = np.where(genres_all == 1)[0]

            # List through genres
            for k in range(len(genres.tolist())):
                genre = genres_name[genres.tolist()[k]]
                if genre in user_ratings.keys():
                    user_ratings[genre].append(score)
                else:
                    user_ratings[genre] = []
                    user_ratings[genre].append(score)
    #break

    for key in user_ratings:
        user_ratings[key] = round(average(user_ratings[key]),2)

    index = []
    index.append(user)
    df_x2 = pd.concat([df_x2, pd.DataFrame(user_ratings, index=index)])    


In [438]:
df_x2 = df_x2.fillna(0)
print(df_x2)

     Animation  Science Fiction  Drama  Romance  Horror  Thriller  Action  \
1         2.50             2.50   3.20     3.00    4.00      3.00    2.00   
2         4.00             3.00   3.63     3.70    4.00      3.55    3.31   
3         0.00             0.00   3.73     4.00    3.50      3.61    3.58   
4         5.00             4.56   4.52     4.50    4.30      4.50    4.12   
5         4.00             4.00   3.76     3.92    4.00      4.11    3.95   
..         ...              ...    ...      ...     ...       ...     ...   
667       4.50             3.50   3.59     3.69    4.33      4.00    3.67   
668       0.00             3.00   3.67     3.75    0.00      3.50    4.00   
669       0.00             4.00   3.12     4.33    4.00      2.50    4.00   
670       3.50             4.50   3.67     4.40    2.50      4.50    5.00   
671       4.75             3.75   3.95     4.05    4.25      3.88    4.03   

     Crime  Mystery  Comedy  Music  Adventure  Fantasy  Family  Documentary

In [439]:
# Creating the NN arcitecture
num_outputs = 32
tf.random.set_seed(1)
user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear')
])

item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear')
])

num_user_features = len(df_x2.columns)
num_item_features = len(df_x1.columns)

# Create the user input and point to the base network
input_user = tf.keras.layers.Input(shape=(num_user_features))
vu = user_NN(input_user)
vu = tf.linalg.l2_normalize(vu, axis=1)

# Create the item input and point to the base network
input_item = tf.keras.layers.Input(shape=(num_item_features))
vm = item_NN(input_item)
vm = tf.linalg.l2_normalize(vm, axis=1)

# Compute the dot product of the two vectors vu and vm
output = tf.keras.layers.Dot(axes=1)([vu, vm])

# Specify the inputs and output of the model
model = tf.keras.Model([input_user, input_item], output)

model.summary()

Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_25 (InputLayer)          [(None, 20)]         0           []                               
                                                                                                  
 input_26 (InputLayer)          [(None, 20)]         0           []                               
                                                                                                  
 sequential_26 (Sequential)     (None, 32)           42400       ['input_25[0][0]']               
                                                                                                  
 sequential_27 (Sequential)     (None, 32)           42400       ['input_26[0][0]']               
                                                                                           

In [440]:
# Setup the training data such a way that input to the NNs are a user and his/her movies. The plan is to create same number of movies and user data. So, we need to duplicate the user data as user has rated multiple movies.
# For instance,
# movie 1 - user 1
# movie2 - user 2
# movie5 - user 3
#
item_list = []
user_list = []
y = []

# Iterate all rows using DataFrame.iterrows()
for user, row in ratings_another.iterrows():
    
    # All the movie rated by the user
    row = row.to_numpy()

    # Get the column index of  movies with a rating greater than 0.0. The column index is the id of the film.
    movies = np.where(row > 0.0)
    movies = movies[0].tolist()
    for movie in movies:
        movie_df = df_movies_data.loc[df_movies_data['id'] == movie] 
        movie_df = movie_df.drop(columns=['index', 'id', 'title','popularity'])
        if np.any(movie_df.to_numpy()):
            item_list.append(movie_df.to_numpy())
            user_list.append(df_x2.loc[[user]].to_numpy())
            y.append(row[movie])

    if user == 100:
        break

item_train = np.vstack(item_list)
user_train = np.vstack(user_list)
item_train = item_train[:user_train.shape[0]:]
y = np.array(y)
print(item_train.shape)
print(user_train.shape)
print(y.shape)

(6877, 20)
(6877, 20)
(6877,)


In [441]:
!pip install scikit-learn



In [442]:
# Scale the data
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

item_train_unscaled = item_train
user_train_unscaled = user_train
y_train_unscaled    = y

scalerItem = StandardScaler()
scalerItem.fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler()
scalerUser.fit(user_train)
user_train = scalerUser.transform(user_train)

scalerTarget = MinMaxScaler((-1, 1))
scalerTarget.fit(y.reshape(-1, 1))
y_train = scalerTarget.transform(y.reshape(-1, 1))

In [443]:
# Split train-test set
item_train, item_test = train_test_split(item_train, train_size=0.80, random_state=1)
user_train, user_test = train_test_split(user_train, train_size=0.80, random_state=1)
y_train, y_test       = train_test_split(y_train,    train_size=0.80, random_state=1)

In [444]:
# Train the NN
print(item_train)
tf.random.set_seed(1)
cost_fn = tf.keras.losses.MeanSquaredError()
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss=cost_fn)
model.fit([user_train, item_train], y_train, epochs=60)

[[-0.50263423 -0.42282333 -0.16764798 ... -0.55579448 -0.18768337
  -0.15872554]
 [-0.50263423 -0.42282333 -0.16764798 ... -0.55579448 -0.18768337
  -0.15872554]
 [-0.50263423 -0.42282333 -0.16764798 ... -0.55579448 -0.18768337
  -0.15872554]
 ...
 [-0.50263423 -0.42282333 -0.16764798 ... -0.55579448 -0.18768337
  -0.15872554]
 [-0.50263423 -0.42282333 -0.16764798 ... -0.55579448 -0.18768337
  -0.15872554]
 [-0.50263423 -0.42282333 -0.16764798 ... -0.55579448 -0.18768337
  -0.15872554]]
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
E

<keras.callbacks.History at 0x7f626c6ccf50>

In [445]:
model.evaluate([user_test, item_test], y_test)



0.22523756325244904