In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
#example making new folder with os
import os
os.makedirs('Data/',exist_ok=True)

In [3]:
os.listdir('Data/')

['title-ratings.csv',
 'title.ratings.tsv',
 'Project 3 Part 3A.ipynb',
 'title.basics.tsv.gz',
 'movies.sql',
 'tmdb_api.json',
 'title-akas-us-only.csv',
 '.ipynb_checkpoints',
 'Data',
 'title-basics.csv']

In [4]:
#title_basics
basics = pd.read_csv('Data/title.basics.tsv.gz', sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
#title_ratings
ratings = pd.read_csv('Data/title.ratings.tsv', sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


In [6]:
#title_akas
akas = pd.read_csv('Data/title-akas-us-only.csv', low_memory = False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [7]:
#filtering basics based on akas
filter_us_titles = basics['tconst'].isin(akas['titleId'])
basics = basics[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


In [8]:
#replacing \Ns in basics
basics = basics.replace({'\\N':np.nan})

In [9]:
#replacing \Ns in akas
akas = akas.replace({'\\N':np.nan})

In [10]:
#replacing \Ns in ratings
ratings = ratings.replace({'\\N':np.nan})

In [11]:
#Viewing null values in basics
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 0
startYear           98665
endYear           1328513
runtimeMinutes     503119
genres              28616
dtype: int64

In [12]:
#Convert startyear to float
basics['startYear'] = basics['startYear'].astype(float)

In [13]:
#keep startYear 2000-2022
basics = basics[(basics['startYear']>=2000)&(basics['startYear']<=2022)]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33802,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001.0,,20,Short
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
39544,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021.0,,6,Short
43546,tt0044326,short,Abstronic,Abstronic,0,2021.0,,6,Short
49493,tt0050396,short,Final Curtain,Final Curtain,0,2012.0,,20,"Horror,Short"


In [14]:
#keep only titleType == movie
basics = basics.loc[ basics['titleType']=='movie']
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 144330 entries, 34802 to 10016809
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          144330 non-null  object 
 1   titleType       144330 non-null  object 
 2   primaryTitle    144330 non-null  object 
 3   originalTitle   144330 non-null  object 
 4   isAdult         144330 non-null  object 
 5   startYear       144330 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  122694 non-null  object 
 8   genres          141195 non-null  object 
dtypes: float64(1), object(8)
memory usage: 11.0+ MB


In [15]:
#Eliminate movies that are null for runtimeMinute and genres
basics = basics.dropna(subset=['runtimeMinutes','genres'])
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           121127
runtimeMinutes         0
genres                 0
dtype: int64

In [16]:
#Filtering out movies that are documentaries
filter_documentaries = basics['genres'].str.contains('documentary',case=False)

In [17]:
#Using documentaries filter
basics = basics[~filter_documentaries]

In [18]:
#Filtering ratings based on basics
filter_basics = ratings['tconst'].isin(basics['tconst'])
ratings = ratings[filter_basics]
ratings

Unnamed: 0,tconst,averageRating,numVotes
17961,tt0035423,6.4,87153
40764,tt0062336,6.4,175
46645,tt0069049,6.7,7754
63640,tt0088751,5.2,336
69953,tt0096056,5.6,846
...,...,...,...
1331411,tt9914942,6.6,178
1331437,tt9915872,6.4,9
1331450,tt9916170,7.0,7
1331451,tt9916190,3.7,243


In [19]:
#Viewing info of basics
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 86979 entries, 34802 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          86979 non-null  object 
 1   titleType       86979 non-null  object 
 2   primaryTitle    86979 non-null  object 
 3   originalTitle   86979 non-null  object 
 4   isAdult         86979 non-null  object 
 5   startYear       86979 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  86979 non-null  object 
 8   genres          86979 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.6+ MB


In [20]:
#Viewing info of akas
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         4018 non-null     object
 5   types            981678 non-null   object
 6   attributes       47016 non-null    object
 7   isOriginalTitle  1451222 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [21]:
basics.to_csv("Data/title-basics.csv", index=False)

In [22]:
ratings.to_csv("Data/title-ratings.csv", index=False)

Project 2 Part 3A

In [25]:
# Install tmdbsimple (only need to run once)
!pip install tmdbsimple



In [31]:
# Load API Credentials
with open('/Users/deidrehunt/.secret/tdmi_api.json', 'r') as f:
    login = json.load(f)
    login.keys()


In [32]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key']

In [33]:
# Import packages
import os, time, json
import tmdbsimple as tmdb 
import pandas as pd
from tqdm.notebook import tqdm_notebook
# Create the folder for saving files (if it doesn't exist)
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['title-ratings.csv',
 'title.ratings.tsv',
 'Project 3 Part 3A.ipynb',
 'title.basics.tsv.gz',
 'movies.sql',
 'tmdb_api.json',
 'title-akas-us-only.csv',
 '.ipynb_checkpoints',
 'Data',
 'title-basics.csv']

In [36]:
#title_basics
basics = pd.read_csv('Data/title.basics.tsv.gz', sep='\t', low_memory=False)
basics.head()


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [66]:
## make a movie object using the .Movies function from tmdb
movie = tmdb.Movies(603)


In [67]:
## movie objects have a .info dictionary 
info = movie.info()
info


{'adult': False,
 'backdrop_path': '/ncEsesgOJDNrTUED89hYbA117wo.jpg',
 'belongs_to_collection': {'id': 2344,
  'name': 'The Matrix Collection',
  'poster_path': '/bV9qTVHTVf0gkW0j7p7M0ILD4pG.jpg',
  'backdrop_path': '/bRm2DEgUiYciDw3myHuYFInD7la.jpg'},
 'budget': 63000000,
 'genres': [{'id': 28, 'name': 'Action'},
  {'id': 878, 'name': 'Science Fiction'}],
 'homepage': 'http://www.warnerbros.com/matrix',
 'id': 603,
 'imdb_id': 'tt0133093',
 'original_language': 'en',
 'original_title': 'The Matrix',
 'overview': 'Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.',
 'popularity': 88.589,
 'poster_path': '/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg',
 'production_companies': [{'id': 79,
   'logo_path': '/at4uYdwAAgNRKhZuuFX8ShKSybw.png',
   'name': 'Village Roadshow Pictures',
   'origin_country': 'US'},
  {'id': 372,
   'logo_path': None,
   'name': 'Groucho II Film

In [68]:
info['budget']

63000000

In [69]:
info['revenue']

463517383

In [70]:
movie = tmdb.Movies('tt1361336')
info = movie.info()
info['budget']

50000000

In [71]:
# example from package README
# source = https://github.com/celiao/tmdbsimple
releases = movie.releases()
for c in releases['countries']:
    if c['iso_3166_1'] == 'US':
        print(c['certification'])

PG
PG
PG


In [83]:
# Loop through movie_ids_to_get with a tqdm progress bar
for movie_id in tqdm_notebook(tt0848228, f"Movies from {2010}"):


SyntaxError: incomplete input (1650146892.py, line 2)

In [84]:
#Get index and movie id from list
try:
    # Retrieve then data for the movie id
    temp = get_movie_with_rating(tt0848228 )  
    # Append/extend results to existing file using a pre-made function
    write_json(temp,JSON_FILE)
    # Short 20 ms sleep to prevent overwhelming server
    time.sleep(0.02)
    
except Exception as e:
    errors.append([tt0332280, e])


NameError: name 'tt0332280' is not defined

In [85]:
# Get the movie object for the current id
movie = tmdb.Movies('tt1361336')
# save the .info .releases dictionaries
info = movie.info()
releases = movie.releases()
# Loop through countries in releases
for c in releases['countries']:
    # if the country abbreviation==US
    if c['iso_3166_1' ] =='US':
        ## save a "certification" key in the info dict with the certification
       info['certification'] = c['certification']

In [86]:
import glob
# Use glob to get all filepaths that match the pattern (*=wildcard)
tmdb_files = sorted(glob.glob("Data/final_tmdb_data*.csv.gz"))
tmdb_files


[]

In [76]:
# Use read_csv in a list comprehension and combine with concat to load all files
df = pd.concat([pd.read_csv(f) for f in tmdb_files] )
df

ValueError: No objects to concatenate

In [77]:
# Adding lineterminator arg to get around error
df = pd.concat([pd.read_csv(f, lineterminator='\n') for f in files] )

NameError: name 'files' is not defined