In [3]:
import os
import re
from datetime import date
import ast

import pandas as pd
from dotenv import load_dotenv
from nameparser import HumanName
from sqlalchemy import create_engine, text


In [4]:
load_dotenv()

DB_USER = os.getenv("DB_USER")
DB_PASSWORD = os.getenv("DB_PASSWORD")

connection_string = f"postgresql://{DB_USER}:{DB_PASSWORD}@localhost/recommender"

%load_ext sql
%sql $connection_string

The sql extension is already loaded. To reload it, use:
  %reload_ext sql


# Implementing new schema:

## Merging all dataframes into one, joining on title, year, score

Renaming some fields to match for the merge:

In [6]:
dfs = [credits_df, titles_df]
for df in dfs:
    df.rename(columns={'id':'content_id', 'imdb_score':'score'}, inplace=True)

NameError: name 'credits_df' is not defined

In [None]:
merged_df = titles_df.merge(best_movies_df, 
                            on=['title', 'release_year', 'score'], 
                            how='left',
                            suffixes=('', '_best_movies'))\
                     .merge(best_shows_df,
                            on=['title', 'release_year', 'score'],
                            how='left',
                            suffixes=('', '_best_shows'))\
                     .merge(best_movies_yearly_df,
                            on=['title', 'release_year', 'score'],
                            how='left',
                            suffixes=('', '_best_movies_yearly'))\
                     .merge(best_shows_yearly_df,
                            on=['title', 'release_year', 'score'],
                            how='left',
                            suffixes=('', '_best_shows_yearly'))

merged_df.head(15)

Unnamed: 0,content_id,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,...,number_of_votes_best_shows,duration_best_shows,number_of_seasons,main_genre_best_shows,main_production_best_shows,main_genre_best_movies_yearly,main_production_best_movies_yearly,number_of_seasons_best_shows_yearly,main_genre_best_shows_yearly,main_production_best_shows_yearly
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,...,,,,,,,,,,
1,tm84618,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,...,,,,,,crime,US,,,
2,tm127384,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,...,,,,,,comedy,GB,,,
3,tm70993,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,...,,,,,,comedy,GB,,,
4,tm190788,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,...,,,,,,horror,US,,,
5,ts22164,Monty Python's Flying Circus,SHOW,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,tt0063929,...,72895.0,30.0,4.0,comedy,GB,,,4.0,comedy,GB
6,tm14873,Dirty Harry,MOVIE,1971,R,102,"['thriller', 'crime', 'action']",['US'],,tt0066999,...,,,,,,thriller,US,,,
7,tm185072,My Fair Lady,MOVIE,1964,G,170,"['drama', 'music', 'romance', 'family']",['US'],,tt0058385,...,,,,,,drama,US,,,
8,tm98978,The Blue Lagoon,MOVIE,1980,R,104,"['romance', 'drama']",['US'],,tt0080453,...,,,,,,romance,US,,,
9,tm119281,Bonnie and Clyde,MOVIE,1967,R,110,"['drama', 'crime', 'action']",['US'],,tt0061418,...,,,,,,drama,US,,,


Dropping redundant columns from our new, very wide merged_df:

In [None]:
merged_df.columns

Index(['content_id', 'title', 'type', 'release_year', 'age_certification',
       'runtime', 'genres', 'production_countries', 'seasons', 'imdb_id',
       'score', 'imdb_votes', 'number_of_votes', 'duration', 'main_genre',
       'main_production', 'number_of_votes_best_shows', 'duration_best_shows',
       'number_of_seasons', 'main_genre_best_shows',
       'main_production_best_shows', 'main_genre_best_movies_yearly',
       'main_production_best_movies_yearly',
       'number_of_seasons_best_shows_yearly', 'main_genre_best_shows_yearly',
       'main_production_best_shows_yearly'],
      dtype='object')

In [None]:
merged_df.drop(columns=['number_of_votes_best_shows', 'duration_best_shows', 'number_of_seasons_best_shows_yearly', 'duration', 'number_of_votes'], inplace=True)
merged_df.columns

Index(['content_id', 'title', 'type', 'release_year', 'age_certification',
       'runtime', 'genres', 'production_countries', 'seasons', 'imdb_id',
       'score', 'imdb_votes', 'main_genre', 'main_production',
       'number_of_seasons', 'main_genre_best_shows',
       'main_production_best_shows', 'main_genre_best_movies_yearly',
       'main_production_best_movies_yearly', 'main_genre_best_shows_yearly',
       'main_production_best_shows_yearly'],
      dtype='object')

Setting main_genre equal to any field among other main_genre columns that is not null. This will default to the earliest encountered genre, and that is fine, the show is the same and the genre is the same, so it doesn't matter which one we choose.:

In [None]:
mask1 = merged_df['main_genre'].isna() & merged_df['main_genre_best_shows'].notna()
merged_df.loc[mask1, 'main_genre'] = merged_df.loc[mask1, 'main_genre_best_shows']

mask2 = merged_df['main_genre'].isna() & merged_df['main_genre_best_movies_yearly'].notna()
merged_df.loc[mask2, 'main_genre'] = merged_df.loc[mask2, 'main_genre_best_movies_yearly']

mask3 = merged_df['main_genre'].isna() & merged_df['main_genre_best_shows_yearly'].notna()
merged_df.loc[mask3, 'main_genre'] = merged_df.loc[mask3, 'main_genre_best_shows_yearly']


In [None]:
merged_df.drop(columns=['main_genre_best_shows', 'main_genre_best_movies_yearly', 'main_genre_best_shows_yearly'], inplace=True)

Setting main_production equal to any field among other main_production columns that is not null.:

In [None]:
mask1 = merged_df['main_production'].isna() & merged_df['main_production_best_shows'].notna()
merged_df.loc[mask1, 'main_production'] = merged_df.loc[mask1, 'main_production_best_shows']

mask2 = merged_df['main_production'].isna() & merged_df['main_production_best_movies_yearly'].notna()
merged_df.loc[mask2, 'main_production'] = merged_df.loc[mask2, 'main_production_best_movies_yearly']

mask3 = merged_df['main_production'].isna() & merged_df['main_production_best_shows_yearly'].notna()
merged_df.loc[mask3, 'main_production'] = merged_df.loc[mask3, 'main_production_best_shows_yearly']

In [None]:
merged_df.drop(columns=['main_production_best_shows', 'main_production_best_movies_yearly', 'main_production_best_shows_yearly'], inplace=True)

In [None]:
merged_df.shape

(5806, 15)

In [None]:
merged_df.columns

Index(['content_id', 'title', 'type', 'release_year', 'age_certification',
       'runtime', 'genres', 'production_countries', 'seasons', 'imdb_id',
       'score', 'imdb_votes', 'main_genre', 'main_production',
       'number_of_seasons'],
      dtype='object')

In [None]:
merged_df.head()

Unnamed: 0,content_id,title,type,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,score,imdb_votes,main_genre,main_production,number_of_seasons
0,ts300399,Five Came Back: The Reference Films,SHOW,1945,TV-MA,48,['documentation'],['US'],1.0,,,,,,
1,tm84618,Taxi Driver,MOVIE,1976,R,113,"['crime', 'drama']",['US'],,tt0075314,8.3,795222.0,crime,US,
2,tm127384,Monty Python and the Holy Grail,MOVIE,1975,PG,91,"['comedy', 'fantasy']",['GB'],,tt0071853,8.2,530877.0,comedy,GB,
3,tm70993,Life of Brian,MOVIE,1979,R,94,['comedy'],['GB'],,tt0079470,8.0,392419.0,comedy,GB,
4,tm190788,The Exorcist,MOVIE,1973,R,133,['horror'],['US'],,tt0070047,8.1,391942.0,horror,US,


## Creating title_df (per new schema):

In [None]:
def create_title_df(df):
    title_df = df[['content_id', 'title', 'release_year', 'type', 'age_certification', 'runtime', 'number_of_seasons', 'imdb_id', 'score', 'imdb_votes']].copy()
    title_df.rename(columns={'score': 'imdb_score'}, inplace=True)
    title_df['is_best_year'] = False 
    title_df['is_all_time_best'] = False
    condition1 = title_df['title'].isin(best_movies_yearly_df['title']) | title_df['title'].isin(best_shows_yearly_df['title'])
    title_df['is_best_year'].loc[condition1] = True
    condition2 = title_df['title'].isin(best_movies_df['title']) | title_df['title'].isin(best_shows_df['title'])
    title_df['is_all_time_best'].loc[condition2] = True
    title_df = title_df.applymap(lambda r: r.strip() if isinstance(r, str) else r)
    return title_df

title_df = create_title_df(merged_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  title_df['is_best_year'].loc[condition1] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  title_df['is_all_time_best'].loc[condition2] = True


In [None]:
title_df.head(50)

Unnamed: 0,content_id,title,release_year,type,age_certification,runtime,number_of_seasons,imdb_id,imdb_score,imdb_votes,is_best_year,is_all_time_best
0,ts300399,Five Came Back: The Reference Films,1945,SHOW,TV-MA,48,,,,,False,False
1,tm84618,Taxi Driver,1976,MOVIE,R,113,,tt0075314,8.3,795222.0,True,True
2,tm127384,Monty Python and the Holy Grail,1975,MOVIE,PG,91,,tt0071853,8.2,530877.0,True,True
3,tm70993,Life of Brian,1979,MOVIE,R,94,,tt0079470,8.0,392419.0,True,True
4,tm190788,The Exorcist,1973,MOVIE,R,133,,tt0070047,8.1,391942.0,True,True
5,ts22164,Monty Python's Flying Circus,1969,SHOW,TV-14,30,4.0,tt0063929,8.8,72895.0,True,True
6,tm14873,Dirty Harry,1971,MOVIE,R,102,,tt0066999,7.7,153463.0,True,True
7,tm185072,My Fair Lady,1964,MOVIE,G,170,,tt0058385,7.8,94121.0,True,True
8,tm98978,The Blue Lagoon,1980,MOVIE,R,104,,tt0080453,5.8,69053.0,True,False
9,tm119281,Bonnie and Clyde,1967,MOVIE,R,110,,tt0061418,7.7,111189.0,True,True


## Creating genres df:

In [None]:
def create_genres_df(merged_df):
    genre_df = merged_df[['content_id', 'genres', 'main_genre']].copy()
    genre_df['genres'] = genre_df['genres'].apply(lambda genres: ast.literal_eval(genres))
    genre_df = genre_df.explode('genres')
    genre_df['genres'] = genre_df['genres'].str.strip()
    genre_df.drop_duplicates(inplace=True)
    genre_df['is_main_genre'] = False
    genre_df['is_main_genre'].loc[genre_df['genres'] == genre_df['main_genre']] = True
    genre_df.drop(columns='main_genre', inplace=True)
    return genre_df

genre_df = create_genres_df(merged_df)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_df['is_main_genre'].loc[genre_df['genres'] == genre_df['main_genre']] = True


In [None]:
merged_df.columns

Index(['content_id', 'title', 'type', 'release_year', 'age_certification',
       'runtime', 'genres', 'production_countries', 'seasons', 'imdb_id',
       'score', 'imdb_votes', 'main_genre', 'main_production',
       'number_of_seasons'],
      dtype='object')

In [None]:
def create_prod_country_df(merged_df):
    prod_country_df = merged_df[['content_id', 'production_countries', 'main_production']].copy()
    prod_country_df.rename(columns={'production_countries': 'country'}, inplace=True)
    prod_country_df['country'] = prod_country_df['country'].apply(lambda countries: ast.literal_eval(countries))
    prod_country_df = prod_country_df.explode('country')
    prod_country_df['country'] = prod_country_df['country'].str.strip()
    prod_country_df.drop_duplicates(inplace=True)
    prod_country_df['is_main_production'] = False
    prod_country_df['is_main_production'].loc[prod_country_df['country'] == prod_country_df['main_production']] = True
    prod_country_df.drop(columns='main_production', inplace=True)
    condition = prod_country_df.duplicated(subset=['content_id'])
    prod_country_df['is_main_production'].loc[~condition] = True
    prod_country_df.loc[pd.isna(prod_country_df['country']), 'is_main_production'] = None
    return prod_country_df

prod_country_df = create_prod_country_df(merged_df)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prod_country_df['is_main_production'].loc[prod_country_df['country'] == prod_country_df['main_production']] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prod_country_df['is_main_production'].loc[~condition] = True


In [None]:
prod_country_df.head(50)

Unnamed: 0,content_id,country,is_main_production
0,ts300399,US,True
1,tm84618,US,True
2,tm127384,GB,True
3,tm70993,GB,True
4,tm190788,US,True
5,ts22164,GB,True
6,tm14873,US,True
7,tm185072,US,True
8,tm98978,US,True
9,tm119281,US,True


## Splitting names in credits df:

In [None]:
credits_df.head()

Unnamed: 0,person_id,content_id,name,character,role
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR


In [None]:
def split_credits_names(credits_df):
    credits_df['name_obj'] = credits_df['name'].apply(lambda name: HumanName(name))
    credits_df['first_name'] = credits_df['name_obj'].apply(lambda name: name.first if name else None)
    credits_df['middle_name'] = credits_df['name_obj'].apply(lambda name: name.middle if name else None)
    credits_df['last_name'] = credits_df['name_obj'].apply(lambda name: name.last if name else None)
    
    credits_df.drop(columns=['name', 'name_obj'], inplace=True)
    return credits_df

credits_df = split_credits_names(credits_df)



In [None]:
credits_df.head(50)

Unnamed: 0,person_id,content_id,character,role,first_name,middle_name,last_name
0,3748,tm84618,Travis Bickle,ACTOR,Robert,,De Niro
1,14658,tm84618,Iris Steensma,ACTOR,Jodie,,Foster
2,7064,tm84618,Tom,ACTOR,Albert,,Brooks
3,3739,tm84618,Matthew 'Sport' Higgins,ACTOR,Harvey,,Keitel
4,48933,tm84618,Betsy,ACTOR,Cybill,,Shepherd
5,32267,tm84618,Wizard,ACTOR,Peter,,Boyle
6,519612,tm84618,Senator Charles Palantine,ACTOR,Leonard,,Harris
7,29068,tm84618,Concession Girl,ACTOR,Diahnne,,Abbott
8,519613,tm84618,Policeman at Rally,ACTOR,Gino,,Ardito
9,3308,tm84618,Passenger Watching Silhouette,ACTOR,Martin,,Scorsese


In [None]:
%%sql
DROP TABLE IF EXISTS titles CASCADE;
CREATE TABLE titles (
    title_id varchar(15) PRIMARY KEY,
    title varchar(200),
    content_type varchar(5) NOT NULL CHECK (content_type IN('movie', 'MOVIE', 'show', 'SHOW')),
    release_year smallint,
    age_certification varchar(10),
    runtime varchar(6),
    num_seasons smallint,
    imdb_id varchar(15),
    imdb_score real,
    imdb_num_votes bigint
);