In [3]:
# dependencies
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import datetime as dt
import pandas as pd

import sqlalchemy as db
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect

from flask import Flask, jsonify, render_template


In [4]:
import sqlite3
from contextlib import closing

# Read every table of the ratings database into its own DataFrame.
# `closing` guarantees the SQLite connection is released once the reads are
# done; sqlite3's own `with con:` only scopes a transaction and leaves the
# connection open.
with closing(sqlite3.connect("movie_ratings_db.sqlite")) as con:
    links_df = pd.read_sql_query("SELECT * FROM Links", con)
    movies_df = pd.read_sql_query("SELECT * FROM Movies", con)
    ratings_df = pd.read_sql_query("SELECT * FROM Ratings", con)
    tags_df = pd.read_sql_query("SELECT * FROM Tags", con)

In [5]:
links_df.head(n=10)  # preview of the first 10 rows; author noted 9742 rows in total

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
5,6,113277,949.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0
9,10,113189,710.0


In [6]:
movies_df.head(n=10)  # preview of the first 10 rows; author noted 9742 rows in total

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [7]:
ratings_df.head(n=10)  # preview of the first 10 rows; author noted 100836 rows in total

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [8]:
tags_df.head(n=10)  # preview of the first 10 rows; author noted 3683 rows in total

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
5,2,89774,Tom Hardy,1445715205
6,2,106782,drugs,1445715054
7,2,106782,Leonardo DiCaprio,1445715051
8,2,106782,Martin Scorsese,1445715056
9,7,48516,way too long,1169687325


In [9]:
# Data preprocessing
# Split the "|"-separated genres column into one column per genre, then keep
# only the first listed genre (renamed to "genre") and discard the rest.
# NOTE(review): the original assumption is that the first genre is the most
# relevant one, but the sample rows list genres alphabetically
# (e.g. "Adventure|Animation|Children|Comedy|Fantasy") -- confirm that
# "first" really means "most relevant" for this dataset.
movies_df_copy = movies_df.copy()

# One column per genre position; the column labels are the integers 0, 1, 2, ...
new = movies_df_copy["genres"].str.split("|", expand=True)

for i in new:
    movies_df_copy[f"genre{i + 1}"] = new[i]
movies_df_copy.drop(columns=["genres"], inplace=True)

# Derive the secondary genre columns from the split itself rather than
# hard-coding genre2..genre10, so the cell keeps working if the data has
# fewer (previously a KeyError) or more (previously stray columns) genres.
extra_genre_cols = [f"genre{i + 1}" for i in new.columns[1:]]
movies_trimmed = (
    movies_df_copy
    .rename(columns={"genre1": "genre"})
    .drop(columns=extra_genre_cols)
)
print(len(movies_trimmed))

# Drop rows with any missing value, then report the number of distinct titles.
movies_trimmed = movies_trimmed.dropna()
print(len(movies_trimmed["title"].unique()))

# Index by movieId so later cells can look movies up by id.
movies_trimmed = movies_trimmed.set_index("movieId")
movies_trimmed.head(20)

9742
9737


Unnamed: 0_level_0,title,genre
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure
2,Jumanji (1995),Adventure
3,Grumpier Old Men (1995),Comedy
4,Waiting to Exhale (1995),Comedy
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action
7,Sabrina (1995),Comedy
8,Tom and Huck (1995),Adventure
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action


In [None]:
# Collapse the tags table to one row per movie, with all of that movie's tags
# concatenated into a single comma-separated string.
tags_df_agg = (
    tags_df
    .groupby('movieId')['tag']
    .agg(','.join)
    .reset_index()
)

# Left-merge the aggregated tags onto the trimmed movies table on movieId.
df_temp = movies_trimmed.merge(tags_df_agg, on='movieId', how='left')

# Pull the four-digit release year out of the "(YYYY)" part of the title.
year_pattern = r'\((\d{4})\)'
df_temp['Year'] = df_temp['title'].str.extract(year_pattern)

# Cast both id columns of ratings_df to strings (this rewrites ratings_df
# itself), then gather, per movie, the comma-separated ids of the users who
# rated it.
for id_col in ('userId', 'movieId'):
    ratings_df[id_col] = ratings_df[id_col].astype('str')
ratings_df_user_agg = (
    ratings_df
    .groupby('movieId')['userId']
    .agg(','.join)
    .reset_index()
)
# A matching per-movie aggregation of the 'rating' column was left disabled by
# the author:
#ratings_df_rate_agg = ratings_df.groupby('movieId')['rating'].agg(lambda x: ','.join(x)).reset_index()

In [None]:
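# The big dataframe: combine all of the dataframes on movieId.
# TODO: the original note trails off ("starting with the ...") -- decide which dataframe the merge starts from.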

In [10]:
# Add each tagged movie's title and genre to the tags dataframe.
# movies_trimmed is indexed by movieId, so a single join on that key replaces
# a per-row Python loop of .at/.loc lookups and writes. The join returns a new
# frame (tags_df is left untouched) and keeps tags_df's row order and index.
# A movieId that is absent from movies_trimmed now yields NaN in title/genre
# instead of raising KeyError part-way through the loop.
tags_movies = tags_df.join(movies_trimmed[["title", "genre"]], on="movieId")
print(len(tags_movies["userId"].unique()))

tags_movies.head(20)

58


Unnamed: 0,userId,movieId,tag,timestamp,title,genre
0,2,60756,funny,1445714994,Step Brothers (2008),Comedy
1,2,60756,Highly quotable,1445714996,Step Brothers (2008),Comedy
2,2,60756,will ferrell,1445714992,Step Brothers (2008),Comedy
3,2,89774,Boxing story,1445715207,Warrior (2011),Drama
4,2,89774,MMA,1445715200,Warrior (2011),Drama
5,2,89774,Tom Hardy,1445715205,Warrior (2011),Drama
6,2,106782,drugs,1445715054,"Wolf of Wall Street, The (2013)",Comedy
7,2,106782,Leonardo DiCaprio,1445715051,"Wolf of Wall Street, The (2013)",Comedy
8,2,106782,Martin Scorsese,1445715056,"Wolf of Wall Street, The (2013)",Comedy
9,7,48516,way too long,1169687325,"Departed, The (2006)",Crime


In [11]:
# Model 1: (supervised learning - logistic regression)
# scikit-learn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

In [None]:
# TensorFlow import (presumably for Keras Tuner -- TODO confirm)
import tensorflow as tf
