In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm_notebook
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['df_2000_2001.csv.gz',
 'df_2000_2001_budget_revenue_title_cert.csv.gz',
 'df_clean_basics.csv.gz',
 'final_basics_df.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_combined.csv.gz']

In [22]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"

In [23]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)

In [24]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [6]:
ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)

In [7]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1907
1,tt0000002,5.8,256
2,tt0000003,6.5,1707
3,tt0000004,5.6,168
4,tt0000005,6.2,2520


In [8]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

In [9]:
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)

In [10]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [11]:
basics = basics.replace({'\\N':np.nan})

In [12]:
akas = akas.replace({'\\N':np.nan})

In [13]:
ratings = ratings.replace({'\\N':np.nan})

In [25]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9196049 entries, 0 to 9196048
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 631.4+ MB


In [26]:
basics['runtimeMinutes'].value_counts()

\N                         6728889
30                          131190
60                          102774
22                           92425
44                           69240
                            ...   
569                              1
670                              1
924                              1
Animation,Comedy,Family          1
2088                             1
Name: runtimeMinutes, Length: 874, dtype: int64

In [27]:
basics.duplicated().sum()

0

In [28]:
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [29]:
basics.dropna(subset=['runtimeMinutes'], inplace=True)
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [30]:
basics.dropna(subset=['genres'], inplace=True)
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres             0
dtype: int64

In [31]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      10
originalTitle     10
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres             0
dtype: int64

In [32]:
basics['titleType'].value_counts()

tvEpisode       6438328
short            753416
movie            504537
video            223457
tvSeries         203189
tvMovie           93371
tvMiniSeries      35838
tvSpecial         34361
videoGame         31987
tvShort            8018
tvPilot               2
Name: titleType, dtype: int64

In [33]:
basics = basics[basics["titleType"].str.contains("movie")==True]
basics['titleType'].value_counts()

movie    504537
Name: titleType, dtype: int64

In [34]:
basics['startYear'].value_counts()

\N      75401
2018    12986
2017    12613
2019    12587
2021    12454
        ...  
1906        8
2028        7
2029        4
1903        1
1894        1
Name: startYear, Length: 128, dtype: int64

In [35]:
basics.dropna(subset=['startYear'], inplace=True)
basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

In [36]:
basics = basics[basics['startYear'] >= '2000']
basics['startYear'].value_counts()

\N      75401
2018    12986
2017    12613
2019    12587
2021    12454
2016    12230
2022    11644
2015    11138
2020    10681
2014    10443
2013     9778
2012     9296
2011     8683
2010     8029
2009     7692
2008     6787
2007     6020
2006     5569
2005     5215
2004     4589
2002     4390
2003     4389
2001     4303
2000     4099
2023     1940
2024      314
2025       66
2026       19
2027       14
2028        7
2029        4
Name: startYear, dtype: int64

In [37]:
basics = basics[basics['startYear'] <= '2021']
basics['startYear'].value_counts()

2018    12986
2017    12613
2019    12587
2021    12454
2016    12230
2015    11138
2020    10681
2014    10443
2013     9778
2012     9296
2011     8683
2010     8029
2009     7692
2008     6787
2007     6020
2006     5569
2005     5215
2004     4589
2002     4390
2003     4389
2001     4303
2000     4099
Name: startYear, dtype: int64

In [38]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

11636      True
15174      True
34791      True
61091      True
67637      True
           ... 
9195814    True
9195846    True
9195898    True
9195978    True
9195989    True
Name: tconst, Length: 183971, dtype: bool

In [39]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
11636,tt0011801,movie,Tötet nicht mehr,Tötet nicht mehr,0,2019,\N,\N,"Action,Crime"
15174,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,\N,60,\N
34791,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
61091,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,\N,70,Drama
67637,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,\N,122,Drama
...,...,...,...,...,...,...,...,...,...
9195814,tt9916362,movie,Coven,Akelarre,0,2020,\N,92,"Drama,History"
9195846,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019,\N,\N,"Adventure,History,War"
9195898,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,\N,123,Drama
9195978,tt9916706,movie,Dankyavar Danka,Dankyavar Danka,0,2013,\N,\N,Comedy


In [40]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [41]:
akas['region'].value_counts()

JP    3947737
FR    3947708
DE    3930843
IN    3876152
ES    3870774
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 247, dtype: int64

In [42]:
akas = akas[akas["region"].str.contains("US")==True]
akas['region'].value_counts()

US    1346280
Name: region, dtype: int64

In [43]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1346280 entries, 5 to 33060654
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1346280 non-null  object
 1   ordering         1346280 non-null  int64 
 2   title            1346280 non-null  object
 3   region           1346280 non-null  object
 4   language         3697 non-null     object
 5   types            963762 non-null   object
 6   attributes       44846 non-null    object
 7   isOriginalTitle  1344905 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.4+ MB


In [44]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180043 entries, 11636 to 9195989
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          180043 non-null  object
 1   titleType       180043 non-null  object
 2   primaryTitle    180043 non-null  object
 3   originalTitle   180043 non-null  object
 4   isAdult         180043 non-null  object
 5   startYear       180043 non-null  object
 6   endYear         180043 non-null  object
 7   runtimeMinutes  180043 non-null  object
 8   genres          180043 non-null  object
dtypes: object(9)
memory usage: 13.7+ MB


In [45]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258682 entries, 0 to 1258681
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1258682 non-null  object 
 1   averageRating  1258682 non-null  float64
 2   numVotes       1258682 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


In [46]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['df_2000_2001.csv.gz',
 'df_2000_2001_budget_revenue_title_cert.csv.gz',
 'df_clean_basics.csv.gz',
 'final_basics_df.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_combined.csv.gz']

In [47]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [48]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [49]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [50]:
import json
with open('C:/Users/fid24/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key-tmdb'])

In [51]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key-tmdb']

In [52]:
YEARS_TO_GET = [2000,2001]

In [53]:
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # save the .info .releases dictionaries
    movie_info = movie.info()
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
            ## save a "certification" key in the info dict with the certification
            movie_info['certification'] = c['certification']
    return movie_info

In [54]:
 def write_json(new_data, filename): 
    """Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""    
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [55]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):
    # If it does not exist: create it
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if file_exists == False:    
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
    # Load in the dataframe from project part 1 as basics:
    basics = pd.read_csv('Data/title_basics.csv.gz')
    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids to list
    movie_ids = df['tconst'].copy()#.to_list()
    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)
    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve then data for the movie id
        try:
            temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        # If it fails,  make a dict with just the id and None for certification.
        except Exception as e:
            continue

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1864 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/2021 [00:00<?, ?it/s]

In [56]:
with open('C:/Users/fid24/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key-tmdb'])

In [57]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key-tmdb']

In [58]:
JSON_FILE_2000 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2000.csv.gz"
JSON_FILE_2000

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2000.csv.gz'

In [59]:
df_2000 = pd.read_csv(JSON_FILE_2000)
df_2000.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.0,0.0,
4,tt0116748,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,579396.0,hi,Karobaar,...,0.0,180.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,The Business of Love,Karobaar,0.0,5.5,2.0,


In [60]:
JSON_FILE_2001 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2001.csv.gz"
JSON_FILE_2001

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2001.csv.gz'

In [61]:
df_2001 = pd.read_csv(JSON_FILE_2001)
df_2001.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0035423,0.0,/ab5yL8zgRotrICzGbEl10z24N71.jpg,,48000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 14, 'nam...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,If they lived in the same century they'd be pe...,Kate & Leopold,0.0,6.3,1137.0,PG-13
2,tt0079644,0.0,/79axmuH1UGkB7m72jjB9rPff9om.jpg,,0.0,"[{'id': 10752, 'name': 'War'}]",,285529.0,id,November 1828,...,0.0,140.0,"[{'english_name': 'Indonesian', 'iso_639_1': '...",Released,,November 1828,0.0,0.0,0.0,
3,tt0089067,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,210258.0,es,El día de los albañiles 2,...,0.0,90.0,"[{'english_name': 'Spanish', 'iso_639_1': 'es'...",Released,The laborers are back full of love and laughs.,El día de los albañiles 2,0.0,7.2,71.0,
4,tt0114447,0.0,,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007.0,en,The Silent Force,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,0.0,5.0,3.0,


In [62]:
df = pd.concat([df_2000, df_2001], ignore_index=True)
df

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,0.0,0.0,
4,tt0116748,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,579396.0,hi,Karobaar,...,0.0,180.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,The Business of Love,Karobaar,0.0,5.5,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5052,tt7464254,0.0,,,0.0,"[{'id': 10751, 'name': 'Family'}, {'id': 35, '...",,694810.0,ta,Sigamani Ramamani,...,0.0,138.0,"[{'english_name': 'Tamil', 'iso_639_1': 'ta', ...",Released,,Sigamani Ramamani,0.0,0.0,0.0,
5053,tt8032774,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,876076.0,ta,Looty,...,0.0,145.0,"[{'english_name': 'Tamil', 'iso_639_1': 'ta', ...",Released,,Looty,0.0,0.0,0.0,
5054,tt8188914,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,740851.0,en,Nayak,...,0.0,100.0,"[{'english_name': 'Assamese', 'iso_639_1': 'as...",Released,,Nayak,0.0,0.0,0.0,
5055,tt8228424,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,740593.0,en,Daag,...,0.0,140.0,[],Released,,Daag,0.0,0.0,0.0,


In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5057 entries, 0 to 5056
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                5057 non-null   object 
 1   adult                  5055 non-null   float64
 2   backdrop_path          1980 non-null   object 
 3   belongs_to_collection  256 non-null    object 
 4   budget                 5055 non-null   float64
 5   genres                 5055 non-null   object 
 6   homepage               251 non-null    object 
 7   id                     5055 non-null   float64
 8   original_language      5055 non-null   object 
 9   original_title         5055 non-null   object 
 10  overview               4570 non-null   object 
 11  popularity             5055 non-null   float64
 12  poster_path            4102 non-null   object 
 13  production_companies   5055 non-null   object 
 14  production_countries   5055 non-null   object 
 15  rele

In [64]:
df.to_csv("Data/df_2000_2001.csv.gz",compression='gzip',index=False)

In [65]:
df.drop(columns='imdb_id', inplace=True)
df.drop(columns='adult', inplace=True)
df.drop(columns='backdrop_path', inplace=True)
df.drop(columns='belongs_to_collection', inplace=True)
df.drop(columns='genres', inplace=True)
df.drop(columns='homepage', inplace=True)
df.drop(columns='id', inplace=True)
df.drop(columns='original_language', inplace=True)
df.drop(columns='overview', inplace=True)
df.drop(columns='popularity', inplace=True)
df.drop(columns='poster_path', inplace=True)
df.drop(columns='production_companies', inplace=True)
df.drop(columns='production_countries', inplace=True)
df.drop(columns='release_date', inplace=True)
df.drop(columns='runtime', inplace=True)
df.drop(columns='spoken_languages', inplace=True)
df.drop(columns='status', inplace=True)
df.drop(columns='tagline', inplace=True)
df.drop(columns='video', inplace=True)
df.drop(columns='vote_average', inplace=True)
df.drop(columns='vote_count', inplace=True)
df.drop(columns='original_title', inplace=True)
df

Unnamed: 0,budget,revenue,title,certification
0,,,,
1,10000000.0,0.0,The Fantasticks,
2,0.0,0.0,For the Cause,
3,0.0,0.0,Gang,
4,0.0,0.0,Karobaar,
...,...,...,...,...
5052,0.0,0.0,Sigamani Ramamani,
5053,0.0,0.0,Looty,
5054,0.0,0.0,Nayak,
5055,0.0,0.0,Daag,


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5057 entries, 0 to 5056
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   budget         5055 non-null   float64
 1   revenue        5055 non-null   float64
 2   title          5055 non-null   object 
 3   certification  826 non-null    object 
dtypes: float64(2), object(2)
memory usage: 158.2+ KB


In [67]:
test_notebook = get_movie_with_rating("tt0332280")
test_notebook

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 70.168,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/iaYpEp3LQmb8AfAtmTvpqd4149c.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 26

In [68]:
test_avengers = get_movie_with_rating("tt0848228")
test_avengers

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 204.949,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [70]:
basics.to_csv("Data/df_clean_basics.csv.gz",compression='gzip',index=False)

In [None]:
q = """SELECT * FROM movies LIMIT 5;"""
pd.read_sql(q, engine)