In [1]:
import pandas as pd
from datetime import datetime
import tensorflow as tf
import numpy as np

In [2]:
credits = pd.read_csv('the-movies-dataset/credits.csv')
keywords = pd.read_csv('the-movies-dataset/keywords.csv')
movies = pd.read_csv('the-movies-dataset/movies_metadata.csv', low_memory=False).\
    drop(['belongs_to_collection', 'homepage', 'imdb_id', 'poster_path', 'status', 'title', 'video'], axis=1)

movies['id'] = movies['id'].apply(pd.to_numeric, errors='coerce')
movies.dropna(inplace=True)
movies['id'] = movies['id'].astype('int64')

df = movies.merge(keywords, on='id').merge(credits, on='id')

df['original_language'] = df['original_language'].fillna('')
df['runtime'] = df['runtime'].fillna(0)
df['tagline'] = df['tagline'].fillna('')

df.dropna(inplace=True)

In [3]:
def get_text(text, obj='name'):
    text = eval(text)
    
    if len(text) == 1:
        for i in text:
            return i[obj]
    else:
        s = []
        for i in text:
            s.append(i[obj])
        return ', '.join(s)

df['genres'] = df['genres'].apply(get_text)
df['production_companies'] = df['production_companies'].apply(get_text)
df['production_countries'] = df['production_countries'].apply(get_text)
df['crew'] = df['crew'].apply(get_text)
df['spoken_languages'] = df['spoken_languages'].apply(get_text)
df['keywords'] = df['keywords'].apply(get_text)

# New columns
df['characters'] = df['cast'].apply(get_text, obj='character')
df['actors'] = df['cast'].apply(get_text)

df.drop('cast', axis=1, inplace=True)
df = df[~df['original_title'].duplicated()]
df = df.reset_index(drop=True)

In [4]:
df.sample(5)

Unnamed: 0,adult,budget,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,...,revenue,runtime,spoken_languages,tagline,vote_average,vote_count,keywords,crew,characters,actors
2053,False,75000000,"Drama, Science Fiction, Thriller",2900,en,The Astronaut's Wife,When astronaut Spencer Armacost returns to Ear...,14.705865,"New Line Cinema, Mad Chance",United States of America,...,19598588.0,109.0,English,How well do you know the one you love?,5.5,263.0,"wife husband relationship, space travel, space...","Richard Pagano, Isis Mussenden, Leslie A. Pope...","Commander Spencer Armacost, Jillian Armacost, ...","Johnny Depp, Charlize Theron, Joe Morton, Clea..."
5878,False,0,"Thriller, Drama, Fantasy",29267,de,Der müde Tod,As a young couple stops and rests in a small v...,2.8041,Decla-Bioscop AG,Germany,...,0.0,105.0,Deutsch,Love is Stronger Than Death,7.4,25.0,"china, self sacrifice, fiancé, wall, death, si...","Erich Pommer, Fritz Lang, Fritz Lang, Fritz La...",Young Woman / Das junge Mädchen / Zobeide / Mo...,"Lil Dagover, Walter Janssen, Bernhard Goetzke,..."
3901,False,27000000,"Comedy, Drama, Romance",9583,en,Divine Secrets of the Ya-Ya Sisterhood,A mother and daughter dispute is resolved by t...,4.327218,"All Girl Productions, Gaylord Productions",United States of America,...,0.0,116.0,English,The Secret Is Out.,5.6,65.0,"secret society, conciliation, marriage, mother...","Callie Khouri, Rebecca Wells, Mark Andrus, Cal...","Siddalee 'Sidda' Walker, Viviane Joan 'Vivi' A...","Sandra Bullock, Ellen Burstyn, Fionnula Flanag..."
6659,False,0,"History, War, Drama",18770,en,Thirty Seconds Over Tokyo,"In the wake of Pearl Harbor, a young lieutenan...",1.210978,Metro-Goldwyn-Mayer (MGM),United States of America,...,0.0,138.0,"English, 普通话","Heart-Warming Romance . . . Stark, Sensational...",6.5,8.0,"pearl harbor, bomber pilot","Mervyn LeRoy, Harold Rosson, Sam Zimbalist, Ro...","Lt. Col. Jimmy Doolittle, Ted Lawson, David Th...","Spencer Tracy, Van Johnson, Robert Walker, Tim..."
16429,False,0,Horror,289097,en,Cell 213,Cocky young attorney Michael Gray finds himsel...,2.68548,Access Motion Pictures,Canada,...,0.0,109.0,"English, Português","""You're Next""",5.1,18.0,"reference to god, location in title, two word ...","Stephen T. Kay, Maninder Chana","Warden, Michael Grey, Ray Clement, Audrey Davis","Bruce Greenwood, Eric Balfour, Michael Rooker,..."


In [5]:
ratings_df = pd.read_csv('the-movies-dataset/ratings.csv')
ratings_df.sample(5)

ratings_df['date'] = ratings_df['timestamp'].apply(lambda x: datetime.fromtimestamp(x))
ratings_df.drop('timestamp', axis=1, inplace=True)

ratings_df = ratings_df.merge(df[['id', 'original_title', 'genres', 'overview']], left_on='movieId',right_on='id', how='left')
ratings_df = ratings_df[~ratings_df['id'].isna()]
ratings_df.drop('id', axis=1, inplace=True)
ratings_df.reset_index(drop=True, inplace=True)


In [6]:
movies_df = df.loc[:, ['id', 'original_title', 'genres', 'overview', 'production_companies', 'actors']]
movies_df.rename(columns={'id':'movieId'}, inplace=True)
ratings_df.sample(5)

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
642999,22375,750,4.0,2003-05-30 21:57:01,Murder She Said,"Drama, Crime, Mystery, Comedy",Miss Marple believes she’s seen a murder in a ...
5568912,194137,1955,4.0,2002-10-22 09:31:29,The Elephant Man,"Drama, History",A Victorian surgeon rescues a heavily disfigur...
5880575,204961,6373,3.5,2003-12-28 19:54:59,"Martha – Meet Frank, Daniel and Laurence","Comedy, Romance",Laurence recounts to his neighbour how his lif...
268413,9278,1653,4.5,2005-03-23 02:58:15,Diarios de motocicleta,Drama,"""The Motorcycle Diaries"" is based on the journ..."
2363815,82073,708,3.0,1996-12-11 06:23:35,The Living Daylights,"Action, Adventure, Thriller",James Bond helps a Russian General escape into...


In [7]:
min_movie_ratings = 10000
filter_movies = (ratings_df.groupby('movieId')['rating'].sum()>min_movie_ratings)
filter_movies = filter_movies[filter_movies].index.tolist()

min_user_ratings = 200
filter_users = (ratings_df.groupby('userId')['rating'].sum()>min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

df_filterd = ratings_df[(ratings_df['movieId'].isin(filter_movies)) & (ratings_df['userId'].isin(filter_users))]

print('Shape User-Ratings unfiltered:\t{}'.format(ratings_df.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filterd.shape))

Shape User-Ratings unfiltered:	(7754672, 7)
Shape User-Ratings filtered:	(3788984, 7)


In [8]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df_filterd, test_size=0.3)
train_df.shape, test_df.shape

((2652288, 7), (1136696, 7))

In [9]:
user_id_mapping = {id:i for i, id in enumerate(ratings_df['userId'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(ratings_df['movieId'].unique())}


train_user_data = train_df['userId'].map(user_id_mapping)
train_movie_data = train_df['movieId'].map(movie_id_mapping)

test_user_data = test_df['userId'].map(user_id_mapping)
test_movie_data = test_df['movieId'].map(movie_id_mapping)

In [10]:
df_id_descriptions = ratings_df.loc[:, ['movieId', 'overview']].set_index('movieId')
df_id_descriptions['overview'] = df_id_descriptions['overview'].str.lower()
df_id_descriptions

Unnamed: 0_level_0,overview
movieId,Unnamed: 1_level_1
147,"for young parisian boy antoine doinel, life is..."
858,a young boy who tries to set his dad up on a d...
1246,when he loses a highly publicized virtual boxi...
1968,alex whitman (matthew perry) is a designer fro...
2762,derrick de marney finds himself in a 39 steps ...
...,...
44191,"brad, steve, hue, and marvin are four get-nowh..."
45722,a sexy spanish siren named elvira schools a yo...
46578,a busload containing three cheerleading teams ...
48385,scientific experiments accidentally revive an ...


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_hybrid = tfidf.fit_transform(df_id_descriptions['overview'])

In [29]:
tfidf_hybrid.shape

(7754672, 19561)

In [30]:
tfidf.get_feature_names_out()

array(['00', '000', '007', ..., 'émigré', 'étienne', 'ñan'], dtype=object)

In [5]:
def get_model1(len_users : int, len_movies : int, embedding_size = 10, input_length = 1):
    from keras.layers import Input, Embedding, Reshape, Dot
    from keras.models import Model

    
    user_id_input = Input(shape=[1], name='user')
    movie_id_input = Input(shape=[1], name='movie')

    # Create embedding layers for users and movies
    user_embedding = Embedding(output_dim=embedding_size, 
                                input_dim=len_users,
                                input_length=input_length, 
                                name='user_embedding')(user_id_input)
    movie_embedding = Embedding(output_dim=embedding_size, 
                                input_dim=len_movies,
                                input_length=input_length, 
                                name='item_embedding')(movie_id_input)

    # Reshape the embedding layers
    user_vector = Reshape([embedding_size])(user_embedding)
    movie_vector = Reshape([embedding_size])(movie_embedding)

    # Compute dot-product of reshaped embedding layers as prediction
    y = Dot(1, normalize=False)([user_vector, movie_vector])

    # Setup model
    model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
    return model

In [10]:
users = len(user_id_mapping)
movies = len(movie_id_mapping)

model = get_model1(users, movies)

model.compile(loss='mse', optimizer='adam')

model.fit([train_user_data, train_movie_data],
          train_df['rating'],
          batch_size=256, 
          epochs=1,
          validation_split=0.1,
          shuffle=True)



<keras.callbacks.History at 0x2a6b63ad1f0>

In [11]:
from sklearn.metrics import mean_squared_error

y_pred = model.predict([test_user_data, test_movie_data])
y_true = test_df['rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Matrix-Factorization: {:.4f} RMSE'.format(rmse))



Testing Result With Keras Matrix-Factorization: 0.8781 RMSE


In [322]:
def EmbeddingRec(EMBEDDING_SIZE, NUM_MOVIES, NUM_USERS, ROW_COUNT):
    from keras.layers import Input, Embedding, Dense, Dropout, Flatten, Dot
    from keras.models import Model

    movie_input = Input(shape=(1,), name='movie_id')

    movie_emb = Embedding(output_dim=EMBEDDING_SIZE, input_dim=NUM_MOVIES, input_length=ROW_COUNT, name='movie_emb')(movie_input)
    movie_vec = Flatten(name='FlattenMovie')(movie_emb)

    movie_model = Model(inputs=movie_input, outputs=movie_vec)
    
    user_input = Input(shape=(1,), name='user_id')

    user_emb = Embedding(output_dim=EMBEDDING_SIZE, input_dim=NUM_USERS, input_length=ROW_COUNT, name='user_emb')(user_input)
    user_vec = Flatten(name='FlattenUser')(user_emb)

    user_model = Model(inputs=user_input, outputs=user_vec)
    
    merged = Dot(name = 'dot_product', normalize = True, axes = 2)([movie_emb, user_emb])
    merged = Dropout(0.2)(merged)
    
    
    dense_1 = Dense(70,name='FullyConnected-1')(merged)
    dropout_1 = Dropout(0.2,name='Dropout_1')(dense_1)

    dense_2 = Dense(50,name='FullyConnected-2')(dropout_1)
    dropout_2 = Dropout(0.2,name='Dropout_2')(dense_2)

    dense_3 = Dense(20,name='FullyConnected-3')(dropout_2)
    dropout_3 = Dropout(0.2,name='Dropout_3')(dense_3)

    dense_4 = Dense(10,name='FullyConnected-4', activation='relu')(dropout_3)

    result = Dense(1, name='result', activation="relu") (dense_4)
    model = Model([movie_input, user_input], result)
    model.compile(loss='mse', optimizer='adam')
    return model, movie_model, user_model

In [323]:
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
mvoie_encoder = LabelEncoder()

ratings_df['userId'] = user_encoder.fit_transform(ratings_df['userId'])
ratings_df['movieId'] = mvoie_encoder.fit_transform(ratings_df['movieId'])

ratings_df

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
0,0,82,4.5,2015-03-09 18:07:15,Les Quatre Cents Coups,Drama,"For young Parisian boy Antoine Doinel, life is..."
1,0,528,5.0,2015-03-09 17:52:03,Sleepless in Seattle,"Comedy, Drama, Romance",A young boy who tries to set his dad up on a d...
2,0,637,5.0,2015-03-09 17:52:36,Rocky Balboa,Drama,When he loses a highly publicized virtual boxi...
3,0,905,4.0,2015-03-09 18:02:28,Fools Rush In,"Drama, Comedy, Romance",Alex Whitman (Matthew Perry) is a designer fro...
4,0,1140,4.5,2015-03-09 17:48:20,Young and Innocent,"Drama, Crime",Derrick De Marney finds himself in a 39 Steps ...
...,...,...,...,...,...,...,...
7754667,261011,2504,5.0,2009-10-31 18:25:46,Loose Screws,Comedy,"Brad, Steve, Hue, and Marvin are four get-nowh..."
7754668,261011,2532,3.5,2009-10-31 18:28:35,No mires para abajo,"Drama, Romance",A sexy Spanish siren named Elvira schools a yo...
7754669,261011,2542,4.5,2009-10-31 19:07:17,The Great American Girl Robbery,"Comedy, Crime",A busload containing three cheerleading teams ...
7754670,261011,2580,4.0,2009-10-31 18:32:19,Indestructible Man,"Crime, Horror, Science Fiction",Scientific experiments accidentally revive an ...


In [324]:
min_rating = min(ratings_df["rating"])
max_rating = max(ratings_df["rating"])
min_rating, max_rating

(0.5, 5.0)

In [409]:
train, test = train_test_split(ratings_df, test_size=0.2)

In [410]:
ROW_COUNT = train.shape[0]
EMBEDDING_SIZE = 10
NUM_USERS = ratings_df['userId'].nunique()
NUM_MOVIES = ratings_df['movieId'].nunique()
UNIQUE_MOVIE_IDS = ratings_df['movieId'].unique()
MOVIE_METADATA = ratings_df.groupby('movieId', as_index=False)[['original_title', 'genres', 'overview']].first()

NUM_USERS, NUM_MOVIES, ROW_COUNT

(261012, 3761, 6203737)

In [411]:
MOVIE_METADATA

Unnamed: 0,movieId,original_title,genres,overview
0,0,Four Rooms,"Crime, Comedy",It's Ted the Bellhop's first night on the job....
1,1,Judgment Night,"Action, Thriller, Crime","While racing to a boxing match, Frank, Mike, J..."
2,2,Star Wars,"Adventure, Action, Science Fiction",Princess Leia is captured and held hostage by ...
3,3,Finding Nemo,"Animation, Family","Nemo, an adventurous young clownfish, is unexp..."
4,4,Forrest Gump,"Comedy, Drama, Romance",A man with a low IQ has accomplished great thi...
...,...,...,...,...
3756,3756,Ferocious,Thriller,"Amanda Crew stars as Leigh Parrish, a successf..."
3757,3757,Cheap Thrills,"Drama, Comedy, Crime","Recently fired and facing eviction, a new dad ..."
3758,3758,A Madea Christmas,"Comedy, Drama",Madea dispenses her unique form of holiday spi...
3759,3759,Foreign Letters,,"A bittersweet coming-of-age film, Foreign Lett..."


In [328]:
model, movie_model, user_model = EmbeddingRec(EMBEDDING_SIZE, NUM_MOVIES, NUM_USERS, ROW_COUNT)

In [237]:
train['movieId'].shape, train['userId'].shape,train['rating'].shape

((6203737,), (6203737,), (6203737,))

In [238]:
test['movieId'].shape, test['userId'].shape,test['rating'].shape


((1550935,), (1550935,), (1550935,))

In [239]:
min(train['movieId'])

0

In [333]:
from keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [EarlyStopping('val_loss', patience=10),
             ModelCheckpoint('besttest.h5', save_best_only=True)]

history = model.fit([train['movieId'], train['userId']],
          train['rating'],
          batch_size=256, 
          epochs=1,
          validation_split=0.1,
          shuffle=True,
          callbacks = callbacks)



In [334]:
y_pred = model.predict([test['movieId'], test['userId']]).reshape(-1)
y_true = test['rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Matrix-Factorization: {:.4f} RMSE'.format(rmse))



Testing Result With Keras Matrix-Factorization: 1.0658 RMSE


In [158]:
# asd = list(ratings_df[ratings_df['userId']==0]['movieId'])

# asd2 = []
# for i in UNIQUE_MOVIE_IDS:
#     if i not in asd:
#         asd2.append(i)
# m1 = np.array(asd2)
# u1 = np.zeros(len(m1))
# u1.shape, m1

((3755,), array([   0,   13,   21, ..., 3402, 3632, 3491], dtype=int64))

In [335]:
asd = model.predict([np.zeros(len(UNIQUE_MOVIE_IDS)), UNIQUE_MOVIE_IDS]).reshape(-1)



In [337]:
max(asd), asd.argmax(), UNIQUE_MOVIE_IDS[asd.argmax()]

(3.5315704, 779, 642)

In [338]:
pos = np.argsort(asd)[::-1][:5]
for i in pos:
    print(i, asd[i])

ratings_df.loc[ratings_df['userId']==0]



779 3.5315704
1501 3.5315552
737 3.5315523
220 3.5315468
2694 3.5315466


Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
0,0,82,4.5,2015-03-09 18:07:15,Les Quatre Cents Coups,Drama,"For young Parisian boy Antoine Doinel, life is..."
1,0,528,5.0,2015-03-09 17:52:03,Sleepless in Seattle,"Comedy, Drama, Romance",A young boy who tries to set his dad up on a d...
2,0,637,5.0,2015-03-09 17:52:36,Rocky Balboa,Drama,When he loses a highly publicized virtual boxi...
3,0,905,4.0,2015-03-09 18:02:28,Fools Rush In,"Drama, Comedy, Romance",Alex Whitman (Matthew Perry) is a designer fro...
4,0,1140,4.5,2015-03-09 17:48:20,Young and Innocent,"Drama, Crime",Derrick De Marney finds himself in a 39 Steps ...
5,0,1167,4.0,2015-03-09 17:53:21,License to Wed,Comedy,"Newly engaged, Ben and Sadie can't wait to sta..."


In [339]:
model.predict([np.array([0]), np.array([82])])



array([[[3.5313687]]], dtype=float32)

In [340]:
def recommend_movies(user, topN = 5):
    pred = np.argsort(model.predict([np.ones(len(UNIQUE_MOVIE_IDS))*user, UNIQUE_MOVIE_IDS]).reshape(-1))[::-1]

    movies_seen = list(ratings_df[ratings_df['userId']==user]['movieId'])
    
    cont, pos = 0, 0
    top_pred = []
    while cont < topN:
        if pred[pos] not in movies_seen:
            top_pred.append(pred[pos])
            cont+=1
        pos+=1

    df_indices = pd.DataFrame(top_pred, columns = ['movieId'])
    return df_indices.merge(MOVIE_METADATA,on='movieId')[MOVIE_METADATA.columns]

In [341]:
recommend_movies(0, 10)



Unnamed: 0,movieId,original_title,genres,overview
0,779,Ray,"Drama, Music",Born on a sharecropping plantation in Northern...
1,1501,Sky Captain and the World of Tomorrow,"Mystery, Action, Thriller, Science Fiction, Ad...","When gigantic robots attack New York City, ""Sk..."
2,737,A Shot in the Dark,"Comedy, Crime",A Shot in the Dark is the second and more succ...
3,220,Batman Returns,"Action, Fantasy","Having defeated the Joker, Batman now faces th..."
4,2694,The Bucks County Massacre,Horror,"On Saturday, May 17th, 2008, police responded ..."
5,1517,Kiss Kiss Bang Bang,"Action, Comedy, Crime, Mystery",A petty thief posing as an actor is brought to...
6,2751,Pufnstuf,"Comedy, Family, Fantasy, Music",Jimmy (Jack Wild) ventures to Living Island wi...
7,323,Die Hard,"Action, Thriller","NYPD cop, John McClane's plan to reconcile wit..."
8,3447,Scary or Die,Horror,"The creation of filmmaker Michael Emanuel, SCA..."
9,2422,Terror Train,"Horror, Thriller",A masked killer targets six college kids respo...


In [368]:
asd = df['genres']
tags = [i.split(", ") for i in df['genres'].unique()]
genres = list(set([i for lst in tags for i in lst]))
genres.remove('')
print(genres)

['Family', 'Romance', 'Horror', 'Drama', 'TV Movie', 'Documentary', 'Fantasy', 'Western', 'Foreign', 'Music', 'Action', 'Science Fiction', 'War', 'Crime', 'Thriller', 'Mystery', 'Adventure', 'Comedy', 'History', 'Animation']


In [369]:
for col in genres:
    MOVIE_METADATA[col] = MOVIE_METADATA["genres"].apply(lambda x: 1 if col in x else 0)
MOVIE_METADATA.head()

Unnamed: 0,movieId,original_title,genres,overview,Family,Romance,Horror,Drama,TV Movie,Documentary,...,Action,Science Fiction,War,Crime,Thriller,Mystery,Adventure,Comedy,History,Animation
0,0,Four Rooms,"Crime, Comedy",It's Ted the Bellhop's first night on the job....,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
1,1,Judgment Night,"Action, Thriller, Crime","While racing to a boxing match, Frank, Mike, J...",0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,0,0
2,2,Star Wars,"Adventure, Action, Science Fiction",Princess Leia is captured and held hostage by ...,0,0,0,0,0,0,...,1,1,0,0,0,0,1,0,0,0
3,3,Finding Nemo,"Animation, Family","Nemo, an adventurous young clownfish, is unexp...",1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,4,Forrest Gump,"Comedy, Drama, Romance",A man with a low IQ has accomplished great thi...,0,1,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0


In [408]:
#ratings_df = ratings_df.merge(df[['id', 'original_title', 'genres', 'overview']], left_on='movieId',right_on='id', how='left')
for col in genres:
    ratings_df[col] = ratings_df["genres"].apply(lambda x: 1 if col in x else 0)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview,Family,Romance,Horror,...,Action,Science Fiction,War,Crime,Thriller,Mystery,Adventure,Comedy,History,Animation
0,0,82,4.5,2015-03-09 18:07:15,Les Quatre Cents Coups,Drama,"For young Parisian boy Antoine Doinel, life is...",0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,528,5.0,2015-03-09 17:52:03,Sleepless in Seattle,"Comedy, Drama, Romance",A young boy who tries to set his dad up on a d...,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,0,637,5.0,2015-03-09 17:52:36,Rocky Balboa,Drama,When he loses a highly publicized virtual boxi...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,905,4.0,2015-03-09 18:02:28,Fools Rush In,"Drama, Comedy, Romance",Alex Whitman (Matthew Perry) is a designer fro...,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1140,4.5,2015-03-09 17:48:20,Young and Innocent,"Drama, Crime",Derrick De Marney finds himself in a 39 Steps ...,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [412]:
def get_model3(EMBEDDING_SIZE, NUM_MOVIES, NUM_USERS, ROW_COUNT, NUM_GENRES):
    from keras.layers import Input, Embedding, Dense, Dropout, Flatten, Dot, Reshape, Concatenate
    from keras.models import Model

    movie_input = Input(shape=(1,), name='movie_id')

    movie_emb = Embedding(output_dim=EMBEDDING_SIZE, input_dim=NUM_MOVIES, input_length=ROW_COUNT, name='movie_emb')(movie_input)

    movie_res = Reshape(target_shape=(EMBEDDING_SIZE,), name='movie_res')(movie_emb)

    user_input = Input(shape=(1,), name='user_id')

    user_emb = Embedding(output_dim=EMBEDDING_SIZE, input_dim=NUM_USERS, input_length=ROW_COUNT, name='user_emb')(user_input)

    user_res = Reshape(target_shape=(EMBEDDING_SIZE,), name='user_res')(user_emb)
    
    merged = Dot(name = 'dot_product', normalize = True, axes = 1)([movie_res, user_res])

    #############################
    genres_input = Input(shape=(NUM_GENRES,),name='genres')
    dense_gen_1 = Dense(70,name='FullyConnected-gen_1')(genres_input)
    dropout_gen_1 = Dropout(0.2,name='Dropout_gen_1')(dense_gen_1)

    dense_gen_2 = Dense(50,name='FullyConnected-gen_2')(dropout_gen_1)
    dropout_gen_2 = Dropout(0.2,name='Dropout_gen_2')(dense_gen_2)

    dense_gen_3 = Dense(20,name='FullyConnected-gen_3')(dropout_gen_2)
    dropout_gen_3 = Dropout(0.2,name='Dropout_gen_3')(dense_gen_3)

    dense_gen_4 = Dense(10,name='FullyConnected-gen_4', activation='relu')(dropout_gen_3)
    #############################

    merged = Dropout(0.2)(merged)
    
    dense_1 = Dense(70,name='FullyConnected-1')(merged)
    dropout_1 = Dropout(0.2,name='Dropout_1')(dense_1)

    dense_2 = Dense(50,name='FullyConnected-2')(dropout_1)
    dropout_2 = Dropout(0.2,name='Dropout_2')(dense_2)

    dense_3 = Dense(20,name='FullyConnected-3')(dropout_2)
    dropout_3 = Dropout(0.2,name='Dropout_3')(dense_3)

    dense_4 = Dense(10,name='FullyConnected-4', activation='relu')(dropout_3)

    concat = Concatenate()([dense_4, dense_gen_4])

    result = Dense(1, name='result', activation="relu") (concat)
    model = Model([movie_input, user_input, genres_input], result)
    model.compile(loss='mse', optimizer='adam')
    return model

In [413]:
model = get_model3(EMBEDDING_SIZE, NUM_MOVIES, NUM_USERS, ROW_COUNT, len(genres))

In [415]:
history = model.fit([train['movieId'], train['userId'], train[genres]],
          train['rating'],
          batch_size=256, 
          epochs=1,
          validation_split=0.1,
          shuffle=True,
          callbacks = callbacks)



In [416]:
y_pred = model.predict([test['movieId'], test['userId'], test[genres]]).reshape(-1)
y_true = test['rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Matrix-Factorization: {:.4f} RMSE'.format(rmse))



Testing Result With Keras Matrix-Factorization: 1.0473 RMSE


In [148]:
def get_model2(len_users : int, len_movies : int):
    from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
    from keras.models import Model

    # Setup variables
    user_embedding_size = 20
    movie_embedding_size = 10


    ##### Create model
    # Set input layers
    user_id_input = Input(shape=[1], name='userId')
    movie_id_input = Input(shape=[1], name='movieId')

    # Create embedding layers for users and movies
    user_embedding = Embedding(output_dim=user_embedding_size, 
                            input_dim=len_users,
                            input_length=1, 
                            name='user_embedding')(user_id_input)
    movie_embedding = Embedding(output_dim=movie_embedding_size, 
                                input_dim=len_movies,
                                input_length=1, 
                                name='item_embedding')(movie_id_input)

    # Reshape the embedding layers
    user_vector = Reshape([user_embedding_size])(user_embedding)
    movie_vector = Reshape([movie_embedding_size])(movie_embedding)

    # Concatenate the reshaped embedding layers
    concat = Concatenate()([user_vector, movie_vector])

    # Combine with dense layers
    dense = Dense(256)(concat)
    y = Dense(1)(dense)

    # Setup model
    model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
    return model

In [149]:
users = len(user_id_mapping)
movies = len(movie_id_mapping)

model = get_model2(users, movies)

model.compile(loss='mse', optimizer='adam')

model.fit([train_user_data, train_movie_data],
          train_df['rating'],
          batch_size=512, 
          epochs=5,
          validation_split=0.1,
          shuffle=True)

# Test model
y_pred = model.predict([test_user_data, test_movie_data])
y_true = test_df['rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Deep Learning: {:.4f} RMSE'.format(rmse))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Testing Result With Keras Deep Learning: 0.8416 RMSE


In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_hybrid = tfidf.fit_transform(df_filterd['overview'].str.lower())

In [81]:
mapping = {id:i for i, id in enumerate(df_filterd['movieId'])}

train_tfidf = []
# Iterate over all movie-ids and save the tfidf-vector
for id in train_df['movieId'].values:
    index = mapping[id]
    train_tfidf.append(tfidf_hybrid[index])

test_tfidf = []
# Iterate over all movie-ids and save the tfidf-vector
for id in test_df['movieId'].values:
    index = mapping[id]
    test_tfidf.append(tfidf_hybrid[index])

KeyError: 867

In [80]:
test_tfidf = np.array(test_tfidf)
train_tfidf = np.array(train_df)

NameError: name 'test_tfidf' is not defined

In [78]:
test_tfidf.shape, train_tfidf.shape

NameError: name 'test_tfidf' is not defined

In [154]:
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

user_embed = 10
movie_embed = 10


# Create two input layers
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')
tfidf_input = Input(shape=[24144], name='tfidf', sparse=True)

# Create separate embeddings for users and movies
user_embedding = Embedding(output_dim=user_embed,
                           input_dim=len(user_id_mapping),
                           input_length=1,
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=movie_embed,
                            input_dim=len(movie_id_mapping),
                            input_length=1,
                            name='movie_embedding')(movie_id_input)

# Dimensionality reduction with Dense layers
tfidf_vectors = Dense(128, activation='relu')(tfidf_input)
tfidf_vectors = Dense(32, activation='relu')(tfidf_vectors)

# Reshape both embedding layers
user_vectors = Reshape([user_embed])(user_embedding)
movie_vectors = Reshape([movie_embed])(movie_embedding)

# Concatenate all layers into one vector
both = Concatenate()([user_vectors, movie_vectors, tfidf_vectors])

# Add dense layers for combinations and scalar output
dense = Dense(512, activation='relu')(both)
dense = Dropout(0.2)(dense)
output = Dense(1)(dense)


# Create and compile model
model = Model(inputs=[user_id_input, movie_id_input, tfidf_input], outputs=output)
model.compile(loss='mse', optimizer='adam')


# Train and test the network
model.fit([train_df['userId'], train_df['movieId']],
          train_df['rating'],
          batch_size=1024, 
          epochs=2,
          validation_split=0.1,
          shuffle=True)



Epoch 1/2


ValueError: in user code:

    File "c:\Users\gita2\anaconda3\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\gita2\anaconda3\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\gita2\anaconda3\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\gita2\anaconda3\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\gita2\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\gita2\anaconda3\lib\site-packages\keras\engine\input_spec.py", line 216, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model_13" expects 3 input(s), but it received 2 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'IteratorGetNext:1' shape=(None, 1) dtype=int64>]


In [None]:
y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

In [None]:
def get_model3(len_users : int, len_movies : int):
    from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
    from keras.models import Model

    # Setup variables
    user_embedding_size = 20
    movie_embedding_size = 10


    ##### Create model
    # Set input layers
    user_id_input = Input(shape=[1], name='userId')
    movie_id_input = Input(shape=[1], name='movieId')

    # Create embedding layers for users and movies
    user_embedding = Embedding(output_dim=user_embedding_size, 
                            input_dim=len_users,
                            input_length=1, 
                            name='user_embedding')(user_id_input)
    movie_embedding = Embedding(output_dim=movie_embedding_size, 
                                input_dim=len_movies,
                                input_length=1, 
                                name='item_embedding')(movie_id_input)

    # Reshape the embedding layers
    user_vector = Reshape([user_embedding_size])(user_embedding)
    movie_vector = Reshape([movie_embedding_size])(movie_embedding)

    # Concatenate the reshaped embedding layers
    concat = Concatenate()([user_vector, movie_vector])

    # Combine with dense layers
    dense = Dense(256)(concat)
    y = Dense(1)(dense)

    # Setup model
    model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
    return model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import vstack


user_id_mapping = {id:i for i, id in enumerate(df['userId'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df['movieId'].unique())}

# Use mapping to get better ids
df['userId'] = df['userId'].map(user_id_mapping)
df['movieId'] = df['movieId'].map(movie_id_mapping)


##### Combine both datasets to get movies with metadata
# Preprocess metadata
tmp_metadata = df_filterd[['original_title' ,'overview']].set_index('original_title')
tmp_metadata.index = tmp_metadata.index.str.lower()


# Preprocess titles
tmp_titles = df_filterd[['movieId', 'original_title']].rename(columns={'movieId':'id'}).set_index('id')
tmp_titles = tmp_titles.reset_index().set_index('original_title')
tmp_titles.index = tmp_titles.index.str.lower()

# Combine titles and metadata
df_id_descriptions = tmp_titles.join(tmp_metadata).dropna().set_index('Id')
df_id_descriptions['overview'] = df_id_descriptions['overview'].str.lower()
del tmp_metadata,tmp_titles


# Filter all ratings with metadata
df_hybrid = df.drop('Date', axis=1).set_index('Movie').join(df_id_descriptions).dropna().drop('overview', axis=1).reset_index().rename({'index':'Movie'}, axis=1)


# Split train- & testset
n = 100000
df_hybrid = df_hybrid.sample(frac=1).reset_index(drop=True)
df_hybrid_train = df_hybrid[:1500000]
df_hybrid_test = df_hybrid[-n:]


# Create tf-idf matrix for text comparison
tfidf = TfidfVectorizer(stop_words='english')
tfidf_hybrid = tfidf.fit_transform(df_id_descriptions['overview'])


# Get mapping from movie-ids to indices in tfidf-matrix
mapping = {id:i for i, id in enumerate(df_id_descriptions.index)}

train_tfidf = []
# Iterate over all movie-ids and save the tfidf-vector
for id in df_hybrid_train['movieId'].values:
    index = mapping[id]
    train_tfidf.append(tfidf_hybrid[index])
    
test_tfidf = []
# Iterate over all movie-ids and save the tfidf-vector
for id in df_hybrid_test['movieId'].values:
    index = mapping[id]
    test_tfidf.append(tfidf_hybrid[index])


# Stack the sparse matrices
train_tfidf = vstack(train_tfidf)
test_tfidf = vstack(test_tfidf)


##### Setup the network
# Network variables
user_embed = 10
movie_embed = 10


# Create two input layers
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')
tfidf_input = Input(shape=[24144], name='tfidf', sparse=True)

# Create separate embeddings for users and movies
user_embedding = Embedding(output_dim=user_embed,
                           input_dim=len(user_id_mapping),
                           input_length=1,
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=movie_embed,
                            input_dim=len(movie_id_mapping),
                            input_length=1,
                            name='movie_embedding')(movie_id_input)

# Dimensionality reduction with Dense layers
tfidf_vectors = Dense(128, activation='relu')(tfidf_input)
tfidf_vectors = Dense(32, activation='relu')(tfidf_vectors)

# Reshape both embedding layers
user_vectors = Reshape([user_embed])(user_embedding)
movie_vectors = Reshape([movie_embed])(movie_embedding)

# Concatenate all layers into one vector
both = Concatenate()([user_vectors, movie_vectors, tfidf_vectors])

# Add dense layers for combinations and scalar output
dense = Dense(512, activation='relu')(both)
dense = Dropout(0.2)(dense)
output = Dense(1)(dense)


# Create and compile model
model = Model(inputs=[user_id_input, movie_id_input, tfidf_input], outputs=output)
model.compile(loss='mse', optimizer='adam')


# Train and test the network
model.fit([df_hybrid_train['userId'], df_hybrid_train['movieId'], train_tfidf],
          df_hybrid_train['rating'],
          batch_size=1024, 
          epochs=2,
          validation_split=0.1,
          shuffle=True)

y_pred = model.predict([df_hybrid_test['userId'], df_hybrid_test['movieId'], test_tfidf])
y_true = df_hybrid_test['rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

MemoryError: Unable to allocate 300. GiB for an array with shape (40328309078,) and data type int64

# Pruebas con tfidf...

In [None]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,date,original_title,genres,overview
0,1,147,4.5,2015-03-09 18:07:15,Les Quatre Cents Coups,Drama,"For young Parisian boy Antoine Doinel, life is..."
1,1,858,5.0,2015-03-09 17:52:03,Sleepless in Seattle,"Comedy, Drama, Romance",A young boy who tries to set his dad up on a d...
2,1,1246,5.0,2015-03-09 17:52:36,Rocky Balboa,Drama,When he loses a highly publicized virtual boxi...
3,1,1968,4.0,2015-03-09 18:02:28,Fools Rush In,"Drama, Comedy, Romance",Alex Whitman (Matthew Perry) is a designer fro...
4,1,2762,4.5,2015-03-09 17:48:20,Young and Innocent,"Drama, Crime",Derrick De Marney finds himself in a 39 Steps ...


In [None]:
df = ratings_df.loc[:, ['userId', 'movieId', 'rating', 'date']]
df.rename(columns={'userId':'User', 'movieId':'Movie', 'date':'Date', 'rating':'Rating'}, inplace=True)
df['User'].max()

270896

In [None]:
# Load a movie metadata dataset
movie_metadata = pd.read_csv('the-movies-dataset/movies_metadata.csv', low_memory=False)[['original_title', 'overview', 'vote_count']].set_index('original_title').dropna()
# Remove the long tail of rarly rated moves
movie_metadata = movie_metadata[movie_metadata['vote_count']>10].drop('vote_count', axis=1)

print('Shape Movie-Metadata:\t{}'.format(movie_metadata.shape))
movie_metadata.sample(5)

Shape Movie-Metadata:	(21604, 1)


Unnamed: 0_level_0,overview
original_title,Unnamed: 1_level_1
Son of Batman,"Batman learns that he has a violent, unruly pr..."
Village of the Damned,An American village is visited by some unknown...
விண்ணைத்தாண்டி வருவாயா,"A young, aspiring filmmaker falls in love with..."
Çalgı Çengi,"Salih and Gürkan, two cousins from Ankara, are..."
The Brave Little Toaster,A group of dated appliances find themselves st...


In [None]:
movie_titles = pd.read_csv('the-movies-dataset/movies_metadata.csv', low_memory=False)[['original_title', 'id']]
movie_titles.rename(columns={'id':'Id', 'original_title':'Name'}, inplace=True)
movie_titles.set_index('Id')


Unnamed: 0_level_0,Name
Id,Unnamed: 1_level_1
862,Toy Story
8844,Jumanji
15602,Grumpier Old Men
31357,Waiting to Exhale
11862,Father of the Bride Part II
...,...
439050,رگ خواب
111109,Siglo ng Pagluluwal
67758,Betrayal
227506,Satana likuyushchiy


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from numpy import vstack
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

# Create user- & movie-id mapping
user_id_mapping = {id:i for i, id in enumerate(df['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df['Movie'].unique())}

# Use mapping to get better ids
df['User'] = df['User'].map(user_id_mapping)
df['Movie'] = df['Movie'].map(movie_id_mapping)


##### Combine both datasets to get movies with metadata
# Preprocess metadata
tmp_metadata = movie_metadata.copy()
tmp_metadata.index = tmp_metadata.index.str.lower()

# Preprocess titles
tmp_titles = movie_titles.copy()
tmp_titles = tmp_titles.reset_index().set_index('Name')
tmp_titles.index = tmp_titles.index.str.lower()

# Combine titles and metadata
df_id_descriptions = tmp_titles.join(tmp_metadata).dropna().set_index('Id')
df_id_descriptions['overview'] = df_id_descriptions['overview'].str.lower()
del tmp_metadata,tmp_titles


# Filter all ratings with metadata
df_hybrid = df.drop('Date', axis=1).set_index('Movie').join(df_id_descriptions).dropna().drop('overview', axis=1).reset_index().rename({'index':'Movie'}, axis=1)

df.drop('Date', axis=1).set_index('Movie').join(df_id_descriptions)

Unnamed: 0,User,Rating,index,overview
0,0,4.5,,
0,22,4.0,,
0,66,4.5,,
0,136,4.0,,
0,144,2.0,,
...,...,...,...,...
3756,255092,3.5,,
3757,257842,3.5,,
3758,260273,3.5,,
3759,260273,1.5,,


In [None]:


# Split train- & testset
n = 100000
df_hybrid = df_hybrid.sample(frac=1).reset_index(drop=True)
df_hybrid_train = df_hybrid[:1500000]
df_hybrid_test = df_hybrid[-n:]


# Create tf-idf matrix for text comparison
tfidf = TfidfVectorizer(stop_words='english')
tfidf_hybrid = tfidf.fit_transform(df_id_descriptions['overview'])


# Get mapping from movie-ids to indices in tfidf-matrix
mapping = {id:i for i, id in enumerate(df_id_descriptions.index)}

train_tfidf = []
# Iterate over all movie-ids and save the tfidf-vector
for id in df_hybrid_train['Movie'].values:
    index = mapping[id]
    train_tfidf.append(tfidf_hybrid[index])
    
test_tfidf = []
# Iterate over all movie-ids and save the tfidf-vector
for id in df_hybrid_test['Movie'].values:
    index = mapping[id]
    test_tfidf.append(tfidf_hybrid[index])


# Stack the sparse matrices
# train_tfidf = vstack(train_tfidf)
# test_tfidf = vstack(test_tfidf)


##### Setup the network
# Network variables
user_embed = 10
movie_embed = 10


# Create two input layers
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')
tfidf_input = Input(shape=[24144], name='tfidf', sparse=True)

# Create separate embeddings for users and movies
user_embedding = Embedding(output_dim=user_embed,
                           input_dim=len(user_id_mapping),
                           input_length=1,
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=movie_embed,
                            input_dim=len(movie_id_mapping),
                            input_length=1,
                            name='movie_embedding')(movie_id_input)

# Dimensionality reduction with Dense layers
tfidf_vectors = Dense(128, activation='relu')(tfidf_input)
tfidf_vectors = Dense(32, activation='relu')(tfidf_vectors)

# Reshape both embedding layers
user_vectors = Reshape([user_embed])(user_embedding)
movie_vectors = Reshape([movie_embed])(movie_embedding)

# Concatenate all layers into one vector
both = Concatenate()([user_vectors, movie_vectors, tfidf_vectors])

# Add dense layers for combinations and scalar output
dense = Dense(512, activation='relu')(both)
dense = Dropout(0.2)(dense)
output = Dense(1)(dense)


# Create and compile model
model = Model(inputs=[user_id_input, movie_id_input, tfidf_input], outputs=output)
model.compile(loss='mse', optimizer='adam')


# Train and test the network
model.fit([df_hybrid_train['User'], df_hybrid_train['Movie'], train_tfidf],
          df_hybrid_train['Rating'],
          batch_size=1024, 
          epochs=2,
          validation_split=0.1,
          shuffle=True)

y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

In [None]:
df_hybrid

Unnamed: 0,level_0,User,Rating,Movie
