In [11]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm_notebook
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['df_2000_2001.csv.gz',
 'df_2000_2001_budget_revenue_title_cert.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_combined.csv.gz']

In [91]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"

In [92]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)

In [93]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [15]:
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [16]:
ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)

In [17]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1907
1,tt0000002,5.8,256
2,tt0000003,6.5,1707
3,tt0000004,5.6,168
4,tt0000005,6.2,2520


In [18]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

In [19]:
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)

In [20]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [21]:
basics = basics.replace({'\\N':np.nan})

In [22]:
akas = akas.replace({'\\N':np.nan})

In [23]:
ratings = ratings.replace({'\\N':np.nan})

In [24]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9193618 entries, 0 to 9193617
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 631.3+ MB


In [25]:
basics['runtimeMinutes'].value_counts()

30                         131172
60                         102763
22                          92421
44                          69224
45                          58456
                            ...  
569                             1
670                             1
924                             1
Animation,Comedy,Family         1
2088                            1
Name: runtimeMinutes, Length: 873, dtype: int64

In [26]:
basics.duplicated().sum()

0

In [27]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1224122
endYear           9097970
runtimeMinutes    6726926
genres             426946
dtype: int64

In [28]:
basics.dropna(subset=['runtimeMinutes'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear           37680
endYear           2419966
runtimeMinutes          0
genres              67367
dtype: int64

In [29]:
basics.dropna(subset=['genres'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           36339
endYear           2354164
runtimeMinutes          0
genres                  0
dtype: int64

In [30]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           30797
endYear           1983271
runtimeMinutes          0
genres                  0
dtype: int64

In [31]:
basics['titleType'].value_counts()

tvEpisode       954532
short           484367
movie           279745
video           139737
tvSeries         74122
tvMovie          56118
tvSpecial        13881
tvMiniSeries     11641
tvShort           7013
videoGame          284
Name: titleType, dtype: int64

In [32]:
basics = basics[basics["titleType"].str.contains("movie")==True]
basics['titleType'].value_counts()

movie    279745
Name: titleType, dtype: int64

In [33]:
basics['startYear'].value_counts()

2018    9580
2017    9394
2019    9309
2016    8996
2015    8544
        ... 
1906       1
1903       1
1908       1
2027       1
1894       1
Name: startYear, Length: 124, dtype: int64

In [34]:
basics.dropna(subset=['startYear'], inplace=True)
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           275466
runtimeMinutes         0
genres                 0
dtype: int64

In [35]:
basics = basics[basics['startYear'] >= '2000']
basics['startYear'].value_counts()

2018    9580
2017    9394
2019    9309
2016    8996
2015    8544
2014    8144
2021    7996
2013    7761
2020    7481
2012    7271
2011    6740
2010    6343
2009    5951
2022    5765
2008    5182
2007    4603
2006    4370
2005    3880
2004    3506
2003    3216
2002    2971
2001    2843
2000    2717
2023     282
2024      28
2025       6
2026       2
2027       1
Name: startYear, dtype: int64

In [36]:
basics = basics[basics['startYear'] <= '2021']
basics['startYear'].value_counts()

2018    9580
2017    9394
2019    9309
2016    8996
2015    8544
2014    8144
2021    7996
2013    7761
2020    7481
2012    7271
2011    6740
2010    6343
2009    5951
2008    5182
2007    4603
2006    4370
2005    3880
2004    3506
2003    3216
2002    2971
2001    2843
2000    2717
Name: startYear, dtype: int64

In [37]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

34791      True
61091      True
67637      True
77931      True
86768      True
           ... 
9193290    True
9193299    True
9193338    True
9193383    True
9193467    True
Name: tconst, Length: 136798, dtype: bool

In [38]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34791,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61091,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67637,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77931,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86768,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9193290,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9193299,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9193338,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9193383,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [39]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [40]:
akas['region'].value_counts()

JP    3946404
FR    3946379
DE    3929512
IN    3874808
ES    3869437
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 247, dtype: int64

In [41]:
akas = akas[akas["region"].str.contains("US")==True]
akas['region'].value_counts()

US    1346076
Name: region, dtype: int64

In [42]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1346076 entries, 5 to 33050551
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1346076 non-null  object
 1   ordering         1346076 non-null  int64 
 2   title            1346076 non-null  object
 3   region           1346076 non-null  object
 4   language         3694 non-null     object
 5   types            963758 non-null   object
 6   attributes       44843 non-null    object
 7   isOriginalTitle  1344701 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.4+ MB


In [43]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136188 entries, 34791 to 9193467
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          136188 non-null  object
 1   titleType       136188 non-null  object
 2   primaryTitle    136188 non-null  object
 3   originalTitle   136188 non-null  object
 4   isAdult         136188 non-null  object
 5   startYear       136188 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  136188 non-null  object
 8   genres          136188 non-null  object
dtypes: object(9)
memory usage: 10.4+ MB


In [44]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258682 entries, 0 to 1258681
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1258682 non-null  object 
 1   averageRating  1258682 non-null  float64
 2   numVotes       1258682 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


In [45]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['df_2000_2001.csv.gz',
 'df_2000_2001_budget_revenue_title_cert.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_combined.csv.gz']

In [46]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [47]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [48]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [49]:
import json
with open('C:/Users/fid24/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key-tmdb'])

In [50]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key-tmdb']

In [51]:
YEARS_TO_GET = [2000,2001]

In [52]:
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # save the .info .releases dictionaries
    movie_info = movie.info()
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
            ## save a "certification" key in the info dict with the certification
            movie_info['certification'] = c['certification']
    return movie_info

In [53]:
 def write_json(new_data, filename): 
    """Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""    
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [54]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):
    # If it does not exist: create it
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if file_exists == False:    
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
    # Load in the dataframe from project part 1 as basics:
    basics = pd.read_csv('Data/title_basics.csv.gz')
    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids to list
    movie_ids = df['tconst'].copy()#.to_list()
    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)
    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve then data for the movie id
        try:
            temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        # If it fails,  make a dict with just the id and None for certification.
        except Exception as e:
            continue

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/528 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/587 [00:00<?, ?it/s]

In [55]:
with open('C:/Users/fid24/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key-tmdb'])

In [56]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key-tmdb']

In [57]:
JSON_FILE_2000 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2000.csv.gz"
JSON_FILE_2000

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2000.csv.gz'

In [58]:
df_2000 = pd.read_csv(JSON_FILE_2000)
df_2000.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.0,0.0,
4,tt0116748,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,579396.0,hi,Karobaar,...,0.0,180.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,The Business of Love,Karobaar,0.0,5.5,2.0,


In [59]:
JSON_FILE_2001 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2001.csv.gz"
JSON_FILE_2001

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2001.csv.gz'

In [60]:
df_2001 = pd.read_csv(JSON_FILE_2001)
df_2001.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0035423,0.0,/ab5yL8zgRotrICzGbEl10z24N71.jpg,,48000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,If they lived in the same century they'd be pe...,Kate & Leopold,0.0,6.3,1137.0,PG-13
2,tt0079644,0.0,/79axmuH1UGkB7m72jjB9rPff9om.jpg,,0.0,"[{'id': 10752, 'name': 'War'}]",,285529.0,id,November 1828,...,0.0,140.0,"[{'english_name': 'Indonesian', 'iso_639_1': '...",Released,,November 1828,0.0,0.0,0.0,
3,tt0089067,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,210258.0,es,El día de los albañiles 2,...,0.0,90.0,"[{'english_name': 'Spanish', 'iso_639_1': 'es'...",Released,The laborers are back full of love and laughs.,El día de los albañiles 2,0.0,7.2,71.0,
4,tt0114447,0.0,,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007.0,en,The Silent Force,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,0.0,5.0,3.0,


In [61]:
df = pd.concat([df_2000, df_2001], ignore_index=True)
df

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.0,0.0,
4,tt0116748,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,579396.0,hi,Karobaar,...,0.0,180.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,The Business of Love,Karobaar,0.0,5.5,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4437,tt9071078,0.0,,,0.0,"[{'id': 28, 'name': 'Action'}]",http://www.hkcinemagic.com/en/movie.asp?id=6627,201706.0,cn,致命密函,...,0.0,90.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,,Chinese Heroes,0.0,3.0,2.0,
4438,tt9099724,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,616033.0,ja,Rokushukan Private Moment,...,0.0,102.0,"[{'english_name': 'Japanese', 'iso_639_1': 'ja...",Released,,Rokushukan Private Moment,0.0,0.0,0.0,
4439,tt9798698,0.0,,,0.0,[],,580269.0,en,Gay holocaust,...,0.0,0.0,[],Released,,Gay holocaust,0.0,0.0,0.0,
4440,tt0286560,0.0,/l2NTN8FJq0jTLSMSjEekL4nvfTj.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,240270.0,pt,Copacabana,...,0.0,90.0,"[{'english_name': 'Portuguese', 'iso_639_1': '...",Released,,Copacabana,0.0,8.3,3.0,


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4442 entries, 0 to 4441
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                4442 non-null   object 
 1   adult                  4440 non-null   float64
 2   backdrop_path          1908 non-null   object 
 3   belongs_to_collection  248 non-null    object 
 4   budget                 4440 non-null   float64
 5   genres                 4440 non-null   object 
 6   homepage               229 non-null    object 
 7   id                     4440 non-null   float64
 8   original_language      4440 non-null   object 
 9   original_title         4440 non-null   object 
 10  overview               4072 non-null   object 
 11  popularity             4440 non-null   float64
 12  poster_path            3725 non-null   object 
 13  production_companies   4440 non-null   object 
 14  production_countries   4440 non-null   object 
 15  rele

In [63]:
df.to_csv("Data/df_2000_2001.csv.gz",compression='gzip',index=False)

In [64]:
df.drop(columns='imdb_id', inplace=True)
df.drop(columns='adult', inplace=True)
df.drop(columns='backdrop_path', inplace=True)
df.drop(columns='belongs_to_collection', inplace=True)
df.drop(columns='genres', inplace=True)
df.drop(columns='homepage', inplace=True)
df.drop(columns='id', inplace=True)
df.drop(columns='original_language', inplace=True)
df.drop(columns='overview', inplace=True)
df.drop(columns='popularity', inplace=True)
df.drop(columns='poster_path', inplace=True)
df.drop(columns='production_companies', inplace=True)
df.drop(columns='production_countries', inplace=True)
df.drop(columns='release_date', inplace=True)
df.drop(columns='runtime', inplace=True)
df.drop(columns='spoken_languages', inplace=True)
df.drop(columns='status', inplace=True)
df.drop(columns='tagline', inplace=True)
df.drop(columns='video', inplace=True)
df.drop(columns='vote_average', inplace=True)
df.drop(columns='vote_count', inplace=True)
df.drop(columns='original_title', inplace=True)
df

Unnamed: 0,budget,revenue,title,certification
0,,,,
1,10000000.0,0.0,The Fantasticks,
2,0.0,0.0,For the Cause,
3,0.0,0.0,Gang,
4,0.0,0.0,Karobaar,
...,...,...,...,...
4437,0.0,0.0,Chinese Heroes,
4438,0.0,0.0,Rokushukan Private Moment,
4439,0.0,0.0,Gay holocaust,
4440,0.0,0.0,Copacabana,


In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4442 entries, 0 to 4441
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   budget         4440 non-null   float64
 1   revenue        4440 non-null   float64
 2   title          4440 non-null   object 
 3   certification  816 non-null    object 
dtypes: float64(2), object(2)
memory usage: 138.9+ KB


In [66]:
test_notebook = get_movie_with_rating("tt0332280")
test_notebook

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 73.78,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/iaYpEp3LQmb8AfAtmTvpqd4149c.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 260

In [67]:
test_avengers = get_movie_with_rating("tt0848228")
test_avengers

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 225.297,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [94]:
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
9190999,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
9191000,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
9191001,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
9191002,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [95]:
title_genres = basics.drop(columns=['titleType','originalTitle','isAdult','genres'])
title_genres

Unnamed: 0,tconst,primaryTitle,startYear,endYear,runtimeMinutes
0,tt0000001,Carmencita,1894,\N,1
1,tt0000002,Le clown et ses chiens,1892,\N,5
2,tt0000003,Pauvre Pierrot,1892,\N,4
3,tt0000004,Un bon bock,1892,\N,12
4,tt0000005,Blacksmith Scene,1893,\N,1
...,...,...,...,...,...
9190999,tt9916848,Episode #3.17,2010,\N,\N
9191000,tt9916850,Episode #3.19,2010,\N,\N
9191001,tt9916852,Episode #3.20,2010,\N,\N
9191002,tt9916856,The Wind,2015,\N,27


In [68]:
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists
# Create connection string using credentials following this format
# connection = "dialect+driver://username:password@host:port/database"
connection = "mysql+pymysql://root:root@localhost/project_3"
engine = create_engine(connection)

In [69]:
if database_exists(connection) == False: create_database(connection)
else: print('The database already exists.')

The database already exists.


In [71]:
df.to_sql('movies', engine, if_exists = 'replace')

4442

In [96]:
df.to_sql('title_genres', engine, if_exists = 'replace')

4442

In [None]:
df.to_sql('ratings', engine, if_exists = 'replace')

In [97]:
q = """SELECT * FROM movies LIMIT 5;"""
pd.read_sql(q, engine)

Unnamed: 0,index,budget,revenue,title,certification
0,0,,,,
1,1,10000000.0,0.0,The Fantasticks,
2,2,0.0,0.0,For the Cause,
3,3,0.0,0.0,Gang,
4,4,0.0,0.0,Karobaar,
