In [178]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import datetime as dt

import sqlalchemy as db
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect

from flask import Flask, jsonify, render_template

import pandas as pd

# Open the SQLite database file that sits next to this notebook.
engine = create_engine("sqlite:///movie_ratings_db.sqlite")

# Reflect the existing schema into automapped classes.
Base = automap_base()
Base.prepare(autoload_with=engine)

# Bind one module-level name per reflected table.
Links, Movies, Ratings, Tags = (
    Base.classes.links,
    Base.classes.movies,
    Base.classes.ratings,
    Base.classes.tags,
)

# Show which tables were mapped.
print(Base.classes.keys())


['links', 'movies', 'ratings', 'tags']


In [179]:
# https://www.geeksforgeeks.org/sqlalchemy-orm-conversion-to-pandas-dataframe/#

# Load the links table into a DataFrame.
# Columns are passed to select() positionally: the legacy list form
# select([...]) is deprecated in SQLAlchemy 1.4 and rejected by 2.0.
links_df = pd.read_sql_query(
    sql=db.select(Links.movieId, Links.imdbId, Links.tmdbId),
    con=engine,
)

print(len(links_df)) # Length should be 9742
links_df.head()

# We can ignore this table

9742


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [180]:
# Load the movies table (id, title, pipe-separated genres) into a DataFrame.
# Columns are passed to select() positionally: the legacy list form
# select([...]) is deprecated in SQLAlchemy 1.4 and rejected by 2.0.
movies_df = pd.read_sql_query(
    sql=db.select(Movies.movieId, Movies.title, Movies.genres),
    con=engine,
)

print(len(movies_df)) # Length should be 9742
movies_df.head()

9742


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [181]:
# Load the ratings table (one row per user/movie rating) into a DataFrame.
# Columns are passed to select() positionally: the legacy list form
# select([...]) is deprecated in SQLAlchemy 1.4 and rejected by 2.0.
ratings_df = pd.read_sql_query(
    sql=db.select(
        Ratings.userId,
        Ratings.movieId,
        Ratings.rating,
        Ratings.timestamp,
    ),
    con=engine,
)

print(len(ratings_df)) # Length should be 100836
print(len(ratings_df["userId"].unique()))  # number of distinct raters
ratings_df.head(25)


100836
610


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [182]:
# Load the tags table (one row per user/movie/tag) into a DataFrame.
# Columns are passed to select() positionally: the legacy list form
# select([...]) is deprecated in SQLAlchemy 1.4 and rejected by 2.0.
tags_df = pd.read_sql_query(
    sql=db.select(
        Tags.userId,
        Tags.movieId,
        Tags.tag,
        Tags.timestamp,
    ),
    con=engine,
)

print(len(tags_df)) # Length should be 3683
print(len(tags_df["userId"].unique()))  # number of distinct taggers
tags_df.head()

3683
58


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [183]:
# Keep only the first genre listed for each movie, assuming that's the most relevant.
# Element 0 of the "|"-split is taken directly instead of expanding every genre
# into its own column and dropping a hard-coded genre2..genre10 list, which
# breaks as soon as the data has a different maximum number of genres per movie.
first_genre = movies_df["genres"].str.split("|").str[0]

# Replace the raw genres column with the single-genre column; movies_df is left untouched.
movies_trimmed = movies_df.drop(columns=["genres"]).assign(genre=first_genre)
print(len(movies_trimmed))

# Discard rows with a missing movieId, title or genre.
movies_trimmed = movies_trimmed.dropna()
print(len(movies_trimmed["title"].unique()))

# Index by movieId so later cells can look movies up by id.
movies_trimmed = movies_trimmed.set_index("movieId")
movies_trimmed.head(100)

9742
9737


Unnamed: 0_level_0,title,genre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure
2,Jumanji (1995),Adventure
3,Grumpier Old Men (1995),Comedy
4,Waiting to Exhale (1995),Comedy
5,Father of the Bride Part II (1995),Comedy
...,...,...
107,Muppet Treasure Island (1996),Adventure
108,Catwalk (1996),Documentary
110,Braveheart (1995),Action
111,Taxi Driver (1976),Crime


In [184]:
# add movie title and genre to tags dataframe based on movieId
# A left join against the movieId index of movies_trimmed replaces the
# row-by-row lookup: it keeps every tag row in its original order, does not
# depend on tags_df having a 0..n-1 index, and leaves title/genre as NaN
# (instead of raising KeyError) when a movie is absent from movies_trimmed.
tags_movies = tags_df.join(movies_trimmed[["title", "genre"]], on="movieId")
print(len(tags_movies["userId"].unique()))

tags_movies.head(25)

58


Unnamed: 0,userId,movieId,tag,timestamp,title,genre
0,2,60756,funny,1445714994,Step Brothers (2008),Comedy
1,2,60756,Highly quotable,1445714996,Step Brothers (2008),Comedy
2,2,60756,will ferrell,1445714992,Step Brothers (2008),Comedy
3,2,89774,Boxing story,1445715207,Warrior (2011),Drama
4,2,89774,MMA,1445715200,Warrior (2011),Drama
5,2,89774,Tom Hardy,1445715205,Warrior (2011),Drama
6,2,106782,drugs,1445715054,"Wolf of Wall Street, The (2013)",Comedy
7,2,106782,Leonardo DiCaprio,1445715051,"Wolf of Wall Street, The (2013)",Comedy
8,2,106782,Martin Scorsese,1445715056,"Wolf of Wall Street, The (2013)",Comedy
9,7,48516,way too long,1169687325,"Departed, The (2006)",Crime


In [185]:
# Add ratings for each user/movie combo to movies + tags dataframe
# One left merge on (userId, movieId) replaces a loop that rescanned the whole
# ratings table once per tag row. validate="many_to_one" raises if a user has
# more than one rating for the same movie, so duplicates still fail loudly
# rather than silently multiplying tag rows.
tags_movies_ratings = tags_movies.merge(
    ratings_df[["userId", "movieId", "rating"]],
    on=["userId", "movieId"],
    how="left",
    validate="many_to_one",
)

# Tag rows with no matching rating keep the "" placeholder used previously, so
# the column stays object-typed and downstream cells see the same values.
# NOTE(review): a float column with NaN would be cleaner, but it would change
# how the later get_dummies cell treats "rating" - confirm before switching.
has_rating = tags_movies_ratings["rating"].notna()
tags_movies_ratings["rating"] = (
    tags_movies_ratings["rating"].astype(object).where(has_rating, "")
)

tags_movies_ratings.head(25)

Unnamed: 0,userId,movieId,tag,timestamp,title,genre,rating
0,2,60756,funny,1445714994,Step Brothers (2008),Comedy,5.0
1,2,60756,Highly quotable,1445714996,Step Brothers (2008),Comedy,5.0
2,2,60756,will ferrell,1445714992,Step Brothers (2008),Comedy,5.0
3,2,89774,Boxing story,1445715207,Warrior (2011),Drama,5.0
4,2,89774,MMA,1445715200,Warrior (2011),Drama,5.0
5,2,89774,Tom Hardy,1445715205,Warrior (2011),Drama,5.0
6,2,106782,drugs,1445715054,"Wolf of Wall Street, The (2013)",Comedy,5.0
7,2,106782,Leonardo DiCaprio,1445715051,"Wolf of Wall Street, The (2013)",Comedy,5.0
8,2,106782,Martin Scorsese,1445715056,"Wolf of Wall Street, The (2013)",Comedy,5.0
9,7,48516,way too long,1169687325,"Departed, The (2006)",Crime,1.0


In [186]:
# ML model (supervised learning - logistic regression?)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

In [187]:
# Encode the merged tags/movies/ratings frame with pandas' get_dummies.
# NOTE(review): presumably this is meant to produce an all-numeric feature
# matrix for the model mentioned above - confirm which columns should really be
# encoded, since free-text columns such as tag and title may yield a very wide frame.
mtr_num = pd.get_dummies(tags_movies_ratings)