# Data extraction

In [2]:
import pandas as pd
from pandas.io import gbq


In [42]:
# Load the movie catalogue; the relative path is resolved against the kernel's working directory.
movies = pd.read_csv(filepath_or_buffer='movies.csv')

In [43]:
# Preview the loaded movies frame (pandas elides the middle rows of a long frame).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [44]:
# Load the per-user ratings; the relative path is resolved against the kernel's working directory.
ratings = pd.read_csv(filepath_or_buffer='ratings.csv')

In [45]:
# Preview the loaded ratings frame (pandas elides the middle rows of a long frame).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [78]:
# Load the movieId -> imdbId/tmdbId lookup; the relative path is resolved against the kernel's working directory.
links = pd.read_csv(filepath_or_buffer='links.csv')

In [79]:
# Preview the loaded links frame (pandas elides the middle rows of a long frame).
links


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


# Data transformation

In [74]:
# Attach each rating to its movie.
# The join key is spelled out: without `on`, pandas joins on every column name
# the two frames share, so an extra shared column would silently change the result.
# `validate` makes the merge fail loudly if a movieId appears more than once in
# `movies`, which would otherwise duplicate rating rows.
movies_ratings = pd.merge(
    movies,
    ratings,
    how='inner',
    on='movieId',
    validate='one_to_many',
)

In [75]:
# Inspect the movies/ratings join result.
movies_ratings

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,964982703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,847434962
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,1106635946
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,1510577970
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,1305696483
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,1537109082
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,1537109545
100833,193585,Flint (2017),Drama,184,3.5,1537109805
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,1537110021


In [76]:
# Drop the rating timestamp column.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# column is already gone.
movies_ratings = movies_ratings.drop(columns=['timestamp'], errors='ignore')

In [77]:
# Check the frame after the timestamp column has been dropped.
movies_ratings


Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5
...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5
100833,193585,Flint (2017),Drama,184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5


In [80]:
# Add the external identifiers from `links` to every rated movie row.
# The join key is spelled out: without `on`, pandas joins on every column name
# the two frames share, so an extra shared column would silently change the result.
# `validate` makes the merge fail loudly if a movieId appears more than once in
# `links`, which would otherwise duplicate rating rows.
movies_ratings_links = pd.merge(
    movies_ratings,
    links,
    how='inner',
    on='movieId',
    validate='many_to_one',
)

In [81]:
# Inspect the result of joining in the links columns.
movies_ratings_links

Unnamed: 0,movieId,title,genres,userId,rating,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,114709,862.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,114709,862.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,114709,862.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,114709,862.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,114709,862.0
...,...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,5476944,432131.0
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,5914996,445030.0
100833,193585,Flint (2017),Drama,184,3.5,6397426,479308.0
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,8391976,483455.0


In [82]:
# Drop the tmdbId column; imdbId is kept.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps this
# cell re-runnable: a second execution no longer raises KeyError because the
# column is already gone.
movies_ratings_links = movies_ratings_links.drop(columns=['tmdbId'], errors='ignore')

In [83]:
# Check the frame after the tmdbId column has been dropped.
movies_ratings_links

Unnamed: 0,movieId,title,genres,userId,rating,imdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,114709
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,114709
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,114709
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.5,114709
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,114709
...,...,...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,5476944
100832,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184,3.5,5914996
100833,193585,Flint (2017),Drama,184,3.5,6397426
100834,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184,3.5,8391976


# Data type conversion and filtering

In [84]:
# Non-null cell count per column; equal counts across columns mean no column has gaps the others lack.
movies_ratings_links.count(axis='index')

movieId    100836
title      100836
genres     100836
userId     100836
rating     100836
imdbId     100836
dtype: int64

In [85]:
# Show the dtype pandas assigned to each column before any casting.
movies_ratings_links.dtypes

movieId      int64
title       object
genres      object
userId       int64
rating     float64
imdbId       int64
dtype: object

In [92]:
# Cast the title column to str; copy/errors are left at their pandas defaults
# (copy=True, errors='raise'), which is what the explicit arguments requested.
movies_ratings_links = movies_ratings_links.astype({'title': str})

In [95]:
# Cast the genres column to str; copy/errors are left at their pandas defaults
# (copy=True, errors='raise'), which is what the explicit arguments requested.
movies_ratings_links = movies_ratings_links.astype({'genres': str})

In [96]:
# Keep only the rows whose rating is 4.0 or higher; the original row index is preserved.
better_movies_ratings = movies_ratings_links.loc[
    movies_ratings_links['rating'] >= 4.0
]

In [97]:
# Inspect the filtered frame (ratings of 4.0 and above).
better_movies_ratings

Unnamed: 0,movieId,title,genres,userId,rating,imdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,4.0,114709
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5,4.0,114709
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,4.5,114709
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17,4.5,114709
6,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,4.0,114709
...,...,...,...,...,...,...
100825,191005,Gintama (2017),Action|Adventure|Comedy|Sci-Fi,184,4.5,5805470
100828,193571,Silver Spoon (2014),Comedy|Drama,184,4.0,3110014
100829,193573,Love Live! The School Idol Movie (2015),Animation,184,4.0,3837248
100831,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184,4.0,5476944
