In [1]:
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from sklearn.metrics.pairwise import linear_kernel

In [4]:
# Load only the metadata columns this notebook uses.
MOVIE_COLUMNS = ["id", "overview", "title", "vote_average", "vote_count", "release_date"]

# `id` is pinned to text on purpose: pandas does not infer it as an integer column
# (the dtypes check further down reports it as `object`), so leaving it to type
# inference can yield a column whose values are not all the same Python type.
# Reading it as `str` keeps the column `object`-typed but uniform; it is converted
# to an integer key later, once the rows with missing values have been removed.
movies = pd.read_csv(
    "movies_metadata.csv",
    usecols=MOVIE_COLUMNS,
    dtype={"id": str},
)


In [5]:
# Preview the first five rows to sanity-check the load.
movies.head(n=5)

Unnamed: 0,id,overview,release_date,title,vote_average,vote_count
0,862,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story,7.7,5415.0
1,8844,When siblings Judy and Peter discover an encha...,1995-12-15,Jumanji,6.9,2413.0
2,15602,A family wedding reignites the ancient feud be...,1995-12-22,Grumpier Old Men,6.5,92.0
3,31357,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale,6.1,34.0
4,11862,Just when George Banks has recovered from his ...,1995-02-10,Father of the Bride Part II,5.7,173.0


In [6]:
# Count missing values per column before any cleaning.
movies.isna().sum()

id                0
overview        954
release_date     87
title             6
vote_average      6
vote_count        6
dtype: int64

In [7]:
# Keep only fully populated rows: a row is removed if ANY of its columns is missing.
movies = movies.dropna(axis="index", how="any")

In [8]:
# Re-count missing values to confirm the previous step left none behind.
movies.isna().sum()

id              0
overview        0
release_date    0
title           0
vote_average    0
vote_count      0
dtype: int64

In [9]:
# Inspect the dtype of every column in the movies frame after dropping missing rows.
movies.dtypes

id               object
overview         object
release_date     object
title            object
vote_average    float64
vote_count      float64
dtype: object

In [10]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
movies.duplicated(keep="first").sum()

28

In [11]:
# Remove rows that are exact copies of an earlier row, keeping the first occurrence.
# NOTE(review): the table stored beneath this cell lists ratings columns
# (userId, movieId, rating, timestamp, date); an assignment produces no output,
# so that output looks stale — clear it and re-run the notebook top to bottom.
movies = movies.drop_duplicates(keep="first")

Unnamed: 0,userId,movieId,rating,timestamp,date
0,1,31,2.5,1260759144,2009-12-14 02:52:24
1,1,1029,3.0,1260759179,2009-12-14 02:52:59
2,1,1061,3.0,1260759182,2009-12-14 02:53:02
3,1,1129,2.0,1260759185,2009-12-14 02:53:05
4,1,1172,4.0,1260759205,2009-12-14 02:53:25
...,...,...,...,...,...
99999,671,6268,2.5,1065579370,2003-10-08 02:16:10
100000,671,6269,4.0,1065149201,2003-10-03 02:46:41
100001,671,6365,4.0,1070940363,2003-12-09 03:26:03
100002,671,6385,2.5,1070979663,2003-12-09 14:21:03


In [12]:
# (rows, columns) of the movies frame after removing missing rows and exact duplicates.
movies.shape

(44407, 6)

In [20]:
# Load the user ratings table.
# NOTE(review): the execution counter jumps from 12 to 20 at this cell, so cells
# were removed or run out of order — verify with Restart Kernel -> Run All.
ratings_path = "ratings_small.csv"
ratings = pd.read_csv(ratings_path)

In [21]:
# Show the freshly loaded ratings frame as the cell output.
ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [23]:
# Derive a datetime column from the integer `timestamp` column, interpreting the
# values as a count of seconds (unit="s").
rating_time = pd.to_datetime(ratings["timestamp"], unit="s")
ratings["date"] = rating_time


In [24]:
# Show the ratings frame again, now including the derived `date` column.
ratings


Unnamed: 0,userId,movieId,rating,timestamp,date
0,1,31,2.5,1260759144,2009-12-14 02:52:24
1,1,1029,3.0,1260759179,2009-12-14 02:52:59
2,1,1061,3.0,1260759182,2009-12-14 02:53:02
3,1,1129,2.0,1260759185,2009-12-14 02:53:05
4,1,1172,4.0,1260759205,2009-12-14 02:53:25
...,...,...,...,...,...
99999,671,6268,2.5,1065579370,2003-10-08 02:16:10
100000,671,6269,4.0,1065149201,2003-10-03 02:46:41
100001,671,6365,4.0,1070940363,2003-12-09 03:26:03
100002,671,6385,2.5,1070979663,2003-12-09 14:21:03


In [25]:
# Count missing values per column in the ratings frame.
ratings.isna().sum()


userId       0
movieId      0
rating       0
timestamp    0
date         0
dtype: int64

In [26]:
# Number of ratings rows that exactly repeat an earlier row.
ratings.duplicated(keep="first").sum()

0

In [27]:
# Count the distinct movie ids (missing values are not counted).
# NOTE(review): the recorded result (44405) is smaller than the row count recorded
# for `movies.shape` (44407), so a few ids appear on more than one row even after
# exact duplicates were dropped — confirm this is acceptable before using the id
# as a join key.
movies["id"].nunique(dropna=True)

44405

In [28]:
# Give the movie key the same column name the ratings frame uses (`movieId`).
id_to_movie_id = {"id": "movieId"}
movies = movies.rename(columns=id_to_movie_id)

In [29]:
# Show the movies frame to confirm the key column is now called `movieId`.
movies


Unnamed: 0,movieId,overview,release_date,title,vote_average,vote_count
0,862,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story,7.7,5415.0
1,8844,When siblings Judy and Peter discover an encha...,1995-12-15,Jumanji,6.9,2413.0
2,15602,A family wedding reignites the ancient feud be...,1995-12-22,Grumpier Old Men,6.5,92.0
3,31357,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale,6.1,34.0
4,11862,Just when George Banks has recovered from his ...,1995-02-10,Father of the Bride Part II,5.7,173.0
...,...,...,...,...,...,...
45460,30840,"Yet another version of the classic epic, with ...",1991-05-13,Robin Hood,5.7,26.0
45462,111109,An artist struggles to finish his work while a...,2011-11-17,Century of Birthing,9.0,3.0
45463,67758,"When one of her hits goes wrong, a professiona...",2003-08-01,Betrayal,3.8,6.0
45464,227506,"In a small town live two brothers, one a minis...",1917-10-21,Satan Triumphant,0.0,0.0


In [30]:
# Inspect the movies dtypes; the column of interest is `movieId`, which is compared
# against the ratings frame's `movieId` dtype in the next cell.
movies.dtypes


movieId          object
overview         object
release_date     object
title            object
vote_average    float64
vote_count      float64
dtype: object

In [31]:
# Inspect the ratings dtypes to see what type `movieId` has on this side.
ratings.dtypes

userId                int64
movieId               int64
rating              float64
timestamp             int64
date         datetime64[ns]
dtype: object

In [32]:
# Convert the movie key to int64 so it has the same dtype as `ratings["movieId"]`.
# The cast raises if any remaining id is not a valid integer.
movie_ids_as_int = movies["movieId"].astype("int64")
movies["movieId"] = movie_ids_as_int


In [33]:
# Show the movies frame after converting `movieId` to an integer column.
movies


Unnamed: 0,movieId,overview,release_date,title,vote_average,vote_count
0,862,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story,7.7,5415.0
1,8844,When siblings Judy and Peter discover an encha...,1995-12-15,Jumanji,6.9,2413.0
2,15602,A family wedding reignites the ancient feud be...,1995-12-22,Grumpier Old Men,6.5,92.0
3,31357,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale,6.1,34.0
4,11862,Just when George Banks has recovered from his ...,1995-02-10,Father of the Bride Part II,5.7,173.0
...,...,...,...,...,...,...
45460,30840,"Yet another version of the classic epic, with ...",1991-05-13,Robin Hood,5.7,26.0
45462,111109,An artist struggles to finish his work while a...,2011-11-17,Century of Birthing,9.0,3.0
45463,67758,"When one of her hits goes wrong, a professiona...",2003-08-01,Betrayal,3.8,6.0
45464,227506,"In a small town live two brothers, one a minis...",1917-10-21,Satan Triumphant,0.0,0.0
