In [2]:
import pandas as pd

In [3]:
# Load the three raw tables (movie, tag, link) from the local datasets/ folder.
# The paths are read in the same order as the names on the left-hand side.
movies, tags, links = (
    pd.read_csv(csv_path)
    for csv_path in ('datasets/movie.csv', 'datasets/tag.csv', 'datasets/link.csv')
)

In [5]:
# Attach tags and then external links to each movie row.
# Both joins use pandas' default inner join on 'movieId', so a movie missing
# from either `tags` or `links` is dropped from the result.
df = (
    movies
    .merge(tags, on='movieId')
    .merge(links, on='movieId')
)

In [6]:
# Keep only the columns used by the later cells, in this order.
df = df.loc[:, ['movieId', 'title', 'tag', 'tmdbId']]

In [7]:
# Collapse the one-row-per-(movie, tag) frame to one row per movie.
#
# The previous version joined *every* column with ', ' and then recovered the
# title with `split(',')[0]`. That truncates any title that itself contains a
# comma (e.g. "American President, The (1995)" became "American President").
# Taking the value from the first row of each group avoids the string
# round-trip entirely.
#
# Output columns are unchanged: movieId, title, tag, tmdbId.
#   * title  - the group's title, untouched.
#   * tag    - all tags of the movie as one space-separated document with
#              commas stripped (identical to the old join + replace).
#   * tmdbId - string form of the first tmdbId, kept as `str` so the column
#              dtype/format matches what the old code produced.
df = (
    df.groupby('movieId')
    .agg(
        title=('title', lambda titles: titles.iloc[0]),
        tag=('tag', lambda tags_: ' '.join(str(t).replace(',', '') for t in tags_)),
        tmdbId=('tmdbId', lambda ids: str(ids.iloc[0])),
    )
    .reset_index()
)

In [9]:
# Sanity check: print the index range, per-column non-null counts, dtypes and
# memory usage of the per-movie frame before vectorising the tags.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19545 entries, 0 to 19544
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  19545 non-null  int64 
 1   title    19545 non-null  object
 2   tag      19545 non-null  object
 3   tmdbId   19545 non-null  object
dtypes: int64(1), object(3)
memory usage: 610.9+ KB


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [11]:
# TF-IDF vectoriser over word tokens, using both unigrams and bigrams.
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
)

In [12]:
# Vectorise each movie's tag document, then compute all pairwise cosine
# similarities between movies.
tfidf_matrix = tf.fit_transform(df['tag'])
cosine_sim = cosine_similarity(tfidf_matrix)

# Label both axes with movie titles so similarities can be looked up by name.
# NOTE(review): this is a dense n_movies x n_movies matrix; with the row count
# reported by df.info() it will be several GB - confirm that is acceptable.
# NOTE(review): titles are used as labels; if two movieIds share a title the
# index will contain duplicates - verify against the data.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=df['title'],
    columns=df['title'],
)

In [14]:
# Fit a nearest-neighbour index that returns 11 neighbours per query
# (presumably the query movie itself plus 10 recommendations - TODO confirm).
# NOTE(review): the estimator is fitted on the similarity matrix, i.e. each
# movie's feature vector is its row of cosine similarities. `model` is also
# not part of the pickled payload in the cells visible here - confirm it is
# used elsewhere.
model = (
    NearestNeighbors(n_neighbors=11)
    .fit(cosine_sim_df)
)

In [15]:
import pickle

In [16]:
# Bundle the artefacts to persist: the title-labelled similarity matrix and
# the per-movie metadata frame.
data = dict(
    similarity_df=cosine_sim_df,
    df=df,
)

In [17]:
# Serialise the bundle to disk with the default pickle protocol.
# Only load this file from a trusted source: unpickling can execute code.
with open('recommender.pkl', 'wb') as file:
    pickle.Pickler(file).dump(data)