In [1]:
import os, json, math, time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import regex as re
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine,inspect
from sqlalchemy import text
from sqlalchemy_utils import create_database, database_exists


In [2]:
#Environment Variables
# Folder and file names of the compressed CSV inputs
data_dir = 'data/'

data_basics = 'title_basics.csv.gz'
data_ratings = 'title_ratings.csv.gz'
data_tmbd =  'tmdb_results_combined.csv.gz'

# Target MySQL schema and the title_basics columns that get dropped later on
schema = 'movies'
columns_drop_basics = ['originalTitle','isAdult','titleType','genres','endYear']

# The CODINGDOJO environment variable holds the path of a JSON file with the credentials.
# os.environ keeps this cell plain Python (the %env magic only parses inside IPython)
# and raises a KeyError naming the variable when it is not set.
KeyPath = os.environ['CODINGDOJO']
with open(KeyPath) as f:
    login = json.load(f)

# URL-escape the password so characters such as '@', ':' or '/' cannot corrupt the URL
connection = f"mysql+pymysql://root:{quote_plus(str(login['MySQL']))}@localhost:3306/{schema}"

engine = create_engine(connection)


# Create the MySQL database from the data collected in Parts 1 & 2

In [4]:
# Read the compressed title basics file and preview the first rows
title_basics_path = data_dir + data_basics
df_title_basics = pd.read_csv(title_basics_path)
df_title_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,Comedy


## Create a title_genres table from title_basics genres column

In [5]:
# Gather the distinct genre names; a set comprehension de-duplicates while it collects
genres = {
    genre
    for line in df_title_basics['genres']
    for genre in line.split(',')
}

In [6]:
# Give every genre an integer id that follows alphabetical order
genres_map = {name: position for position, name in enumerate(sorted(genres))}
genres_map

{'Action': 0,
 'Adult': 1,
 'Adventure': 2,
 'Animation': 3,
 'Biography': 4,
 'Comedy': 5,
 'Crime': 6,
 'Drama': 7,
 'Family': 8,
 'Fantasy': 9,
 'Game-Show': 10,
 'History': 11,
 'Horror': 12,
 'Music': 13,
 'Musical': 14,
 'Mystery': 15,
 'News': 16,
 'Reality-TV': 17,
 'Romance': 18,
 'Sci-Fi': 19,
 'Short': 20,
 'Sport': 21,
 'Talk-Show': 22,
 'Thriller': 23,
 'War': 24,
 'Western': 25}

In [7]:
# explode() needs list-like cells, so turn each comma-separated genres string into a list.
# Values that are not strings (for example lists left by an earlier run of this cell)
# are kept as they are: calling .str.split on them again would silently yield NaN.
df_title_basics['genres'] = [
    entry.split(',') if isinstance(entry, str) else entry
    for entry in df_title_basics['genres']
]
df_title_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"[Comedy, Fantasy, Romance]"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,[Drama]
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,[Drama]
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"[Comedy, Horror, Sci-Fi]"
4,tt0094859,movie,Chief Zabu,Chief Zabu,0,2016,,74,[Comedy]


In [8]:
# One row per (title, genre) pair: explode() repeats tconst for every entry of the genres list
df_title_genres = df_title_basics[['tconst','genres']].explode('genres')
print(f'duplicates : {df_title_genres.duplicated().sum()} \n ')
# info() prints its report itself and returns None, so wrapping it in display()
# only added a stray "None" to the cell output
df_title_genres.info()

duplicates : 0 
 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 153333 entries, 0 to 82085
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   tconst  153333 non-null  object
 1   genres  153333 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB


None

In [9]:
# Preview the exploded frame: one row per title/genre pair, index repeated from the source row
df_title_genres.head()

Unnamed: 0,tconst,genres
0,tt0035423,Comedy
0,tt0035423,Fantasy
0,tt0035423,Romance
1,tt0062336,Drama
2,tt0069049,Drama


In [10]:
# Normalize the genres field: look each genre name up in genres_map to get its integer id
genre_ids = df_title_genres['genres'].map(genres_map)
df_title_genres['genre_id'] = genre_ids

In [11]:
# genre_id now carries the genre, so the text column can go.
# Reassigning with errors='ignore' (instead of inplace=True) lets the cell be re-run
# without a KeyError once the column is already gone.
df_title_genres = df_title_genres.drop(columns='genres', errors='ignore')
# info() prints directly and returns None; display() around it only echoed "None"
df_title_genres.info()
df_title_genres.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 153333 entries, 0 to 82085
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   tconst    153333 non-null  object
 1   genre_id  153333 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.5+ MB


None

Unnamed: 0,tconst,genre_id
0,tt0035423,5
0,tt0035423,9
0,tt0035423,18
1,tt0062336,7
2,tt0069049,7


## Create the genres table from the genres_map

In [12]:
# Turn the name -> id mapping into a two-column lookup table, id first
df_genres = pd.DataFrame(
    [(genre_id, genre_name) for genre_name, genre_id in genres_map.items()],
    columns=['genre_id', 'genre_name'],
)
df_genres.head()

Unnamed: 0,genre_id,genre_name
0,0,Action
1,1,Adult
2,2,Adventure
3,3,Animation
4,4,Biography


## Bring in the rest of the needed data

In [13]:
# Drop the columns listed in columns_drop_basics.
# Reassigning with errors='ignore' keeps the cell re-runnable: the in-place drop
# raised KeyError on a second run because the columns were already gone.
df_title_basics = df_title_basics.drop(columns=columns_drop_basics, errors='ignore')
df_title_basics.head()

Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes
0,tt0035423,Kate & Leopold,2001,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020,70
2,tt0069049,The Other Side of the Wind,2018,122
3,tt0088751,The Naked Monster,2005,100
4,tt0094859,Chief Zabu,2016,74


In [14]:
# Read the compressed title ratings file and preview the first rows
title_ratings_path = data_dir + data_ratings
df_title_ratings = pd.read_csv(title_ratings_path)
df_title_ratings.head()


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.9,256
2,tt0000005,6.2,2517
3,tt0000006,5.2,173
4,tt0000007,5.4,783


In [3]:
# Load the combined TMDB results with every column and expose imdb_id under the name
# tconst, the key column name used by the other frames.
# NOTE: a column subset ['imdb_id','budget','revenue','certification'] was tried here and
# disabled, so all columns are kept.
df_tmbd_data = pd.read_csv(data_dir+data_tmbd,low_memory=False)
df_tmbd_data = df_tmbd_data.rename(columns={'imdb_id':'tconst'})

In [11]:
%%time
dfp = pd.DataFrame()
x = 0
while x <= df_tmdb['production_companies'].shape[0]:
    data = eval(df_tmdb['production_companies'].fillna('[{}]')[x])
    df = pd.json_normalize(data)
    dfp = pd.concat([dfp,df],axis=0,ignore_index=True)
    x+=1
dfp.shape

KeyboardInterrupt: 

In [10]:
# Show the production-company rows accumulated in dfp
dfp

Unnamed: 0,id,logo_path,name,origin_country
0,60.0,/2eqFolQI0NLL7ExZts5MnLLaPwX.png,United Artists,US
1,51207.0,,Sullivan Street Productions,
2,51208.0,,Michael Ritchie Productions,
3,51209.0,,"Radmin Company, The",
4,925.0,/dIb9hjXNOkgxu4kBWdIdK8nM4w.png,Nu Image,US
...,...,...,...,...
49621,114644.0,,Lu Film,KR
49622,25009.0,,Bulldozerfilms,
49623,19146.0,/5Ff25ornzVNhm5skuAvMAR556NB.png,Dharma Productions,IN
49624,170389.0,,PogieJoe Productions,


In [16]:
%%time
df_production = pd.DataFrame()
for rows in df_tmbd_data['production_companies'].fillna('[{}]'):
    x=1
    while x < len(eval(rows)) and len(eval(rows)) >= 0 :
        df= pd.json_normalize(eval(rows)[x])
        df_production = pd.concat([df_production,df],axis=0)
        x+=1

#len((eval(df_tmbd_data['production_companies'][69250]))

Wall time: 3min 29s


In [93]:
# (rows, columns) of the df_production frame
df_production.shape

(56748, 4)

In [19]:
%%time
df_productions = pd.DataFrame()
for index, rows in df_tmbd_data[['tconst','production_companies']].iterrows():
    try:
        x=0
        while x <= len(eval(rows['production_companies'])) and len(eval(rows['production_companies'])) > 1 :
            df= pd.json_normalize(eval(rows['production_companies'])[x])
            df['tconst'] = rows['tconst']
            df_productions = pd.concat([df_productions,df],axis=0)
            x+=1
    except:
        pass

Wall time: 4min 55s


In [22]:
# Replace both missing values and empty-list strings with the '[{}]' placeholder
companies_raw = df_tmbd_data['production_companies']
companies_filled = companies_raw.replace('[]','[{}]').fillna('[{}]')
df_tmbd_data['production_companies'] = companies_filled

In [20]:
# (rows, columns) of the df_productions frame
df_productions.shape

(79745, 5)

In [21]:
# Number of rows per company name in df_productions, most frequent first
df_productions['name'].value_counts()

Canal+                    588
Warner Bros. Pictures     375
Universal Pictures        349
CNC                       314
France 2 Cinéma           265
                         ... 
Xiao Xiang Film Group       1
Blinker Filmproduktion      1
TILT Production             1
Alinfilmproduktion          1
Scopic                      1
Name: name, Length: 28400, dtype: int64

In [4]:
%%time
df_production_companies = pd.DataFrame()
for index, row in df_tmbd_data[['tconst','production_companies']].iterrows():
    try:
        df = pd.json_normalize(eval(row['production_companies']))#['name'].values.flatten().tolist()
        df['tconst'] = row['tconst']
        df_production_companies = pd.concat([df_production_companies,df],axis=0,ignore_index=True)
    except:
        pass

Wall time: 3min 8s


In [26]:
# Preview the first five company rows together with their tconst
df_production_companies.head(5)

Unnamed: 0,id,logo_path,name,origin_country,tconst
0,60.0,/2eqFolQI0NLL7ExZts5MnLLaPwX.png,United Artists,US,tt0113026
1,51207.0,,Sullivan Street Productions,,tt0113026
2,51208.0,,Michael Ritchie Productions,,tt0113026
3,51209.0,,"Radmin Company, The",,tt0113026
4,925.0,/dIb9hjXNOkgxu4kBWdIdK8nM4w.png,Nu Image,US,tt0113092


In [25]:
# Number of rows per company name in df_production_companies, most frequent first
df_production_companies['name'].value_counts()

Canal+                        592
Warner Bros. Pictures         390
Universal Pictures            370
CNC                           317
Columbia Pictures             272
                             ... 
Kukku Films                     1
Lawford County Productions      1
Motorfilm                       1
Miramonte Film                  1
Scopic                          1
Name: name, Length: 35528, dtype: int64

In [102]:
# Number of rows per company name in df_production, most frequent first
df_production['name'].value_counts()

Canal+                   407
CNC                      266
Warner Bros. Pictures    260
Ciné+                    219
StudioCanal              176
                        ... 
Atomo Films                1
MMKA                       1
HvD Productions            1
L&G Hungary                1
Scopic                     1
Name: name, Length: 22915, dtype: int64

In [89]:
# Look at the raw production_companies value stored under index label 3
# (wrapping this in len(eval(...)) would give the number of entries instead)
df_tmbd_data['production_companies'][3]

'[]'

# Create movies database in MySQL

In [99]:
# Create the database named in the connection string unless it is already there
if not database_exists(connection):
    create_database(connection)
else:
    print('The database already exists.')

# Final check; the cell output shows whether the database exists now
database_exists(connection)

True

In [100]:
# Maps the variable name of each dataframe to the column used as its primary key in MySQL;
# an empty string marks a table that gets no primary key.
df_list = {
    'df_title_basics': 'tconst',
    'df_genres': 'genre_id',
    'df_title_genres': '',
    'df_title_ratings': 'tconst',
    'df_tmbd_data': 'tconst',
}
# The inspector is used to find out whether a table already exists in the schema
insp = inspect(engine)

In [101]:
%%time
for x in df_list:
    #create table from empty datafrome to set primary key in later statement
    if insp.has_table(x.replace('df_','')) == False:
        locals()[x].head(0).to_sql(x.replace('df_',''), engine, if_exists = 'replace',index=False)
        #checks is primary key listed in df_list
        if df_list[x] != '':
            #set object key length to 256
            if locals()[x][df_list[x]].dtype == 'O': 
                max_len = locals()[x][df_list[x]].apply(len).max()+1
                key_len = f'({max_len})'
            else:
                key_len = ''
            engine.execute(f"alter table {x.replace('df_','')} add primary key({df_list[x]}{key_len})")
    #overwrite table if no primary key. will look into adding composite key later
    if df_list[x] != '':
        locals()[x].to_sql(x.replace('df_',''), engine, if_exists = 'append',index=False)
    else:
        locals()[x].to_sql(x.replace('df_',''), engine, if_exists = 'replace',index=False)
    #display first 5 rows of each table
    print(f"table : {schema}.{x.replace('df_','')}")
    q = f"""
    select * from {schema}.{x.replace('df_','')} limit 5;
    """
    display(pd.read_sql(q,engine))
    print('\n')
    



table : movies.title_basics


Unnamed: 0,tconst,primaryTitle,startYear,runtimeMinutes
0,tt0035423,Kate & Leopold,2001,118
1,tt0062336,The Tango of the Widower and Its Distorting Mi...,2020,70
2,tt0069049,The Other Side of the Wind,2018,122
3,tt0088751,The Naked Monster,2005,100
4,tt0094859,Chief Zabu,2016,74




table : movies.genres


Unnamed: 0,genre_id,genre_name
0,0,Action
1,1,Adult
2,2,Adventure
3,3,Animation
4,4,Biography




table : movies.title_genres


Unnamed: 0,tconst,genre_id
0,tt0035423,5
1,tt0035423,9
2,tt0035423,18
3,tt0062336,7
4,tt0069049,7




table : movies.title_ratings


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.9,256
2,tt0000005,6.2,2517
3,tt0000006,5.2,173
4,tt0000007,5.4,783




table : movies.tmbd_data


Unnamed: 0,tconst,budget,revenue,certification
0,0,,,
1,tt0035423,48000000.0,76019048.0,PG-13
2,tt0062336,0.0,0.0,
3,tt0069049,12000000.0,0.0,R
4,tt0088751,350000.0,0.0,




Wall time: 1min 6s


In [103]:
# List every table that exists in the connected schema
q = """
SHOW TABLES;
"""
tables_in_schema = pd.read_sql(q,engine)
display(tables_in_schema)

Unnamed: 0,Tables_in_movies
0,genres
1,title_basics
2,title_genres
3,title_ratings
4,tmbd_data
