In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# read in JSON as DataFrame
df = pd.read_json('tmdb_movie_all_unclean.json')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800000 entries, 0 to 799999
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   adult                  545815 non-null  float64
 1   backdrop_path          107987 non-null  object 
 2   belongs_to_collection  13666 non-null   object 
 3   budget                 545815 non-null  float64
 4   genres                 545815 non-null  object 
 5   homepage               337756 non-null  object 
 6   id                     545815 non-null  float64
 7   imdb_id                423291 non-null  object 
 8   original_language      545815 non-null  object 
 9   original_title         545815 non-null  object 
 10  overview               545815 non-null  object 
 11  popularity             545815 non-null  float64
 12  poster_path            299801 non-null  object 
 13  production_companies   545815 non-null  object 
 14  production_countries   545815 non-nu

In [3]:
# look at DataFrame
df.head().transpose()

Unnamed: 0,0,1,2,3,4
adult,,0,0,,0
backdrop_path,,/kpuTCMw3v2AuKjqGS7383uWbc8V.jpg,/jMmHFm0TcjiN9QDICXY2tJcQsDl.jpg,,/xvjGhJHsArVjCWXb5OARi0PiqvB.jpg
belongs_to_collection,,,,,
budget,,0,0,,4e+06
genres,,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...","[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name..."
homepage,,,,,
id,,2,3,,5
imdb_id,,tt0094675,tt0092149,,tt0113101
original_language,,fi,fi,,en
original_title,,Ariel,Varjoja paratiisissa,,Four Rooms


In [4]:
# store number of original entries
STARTING_ROWS = df.shape[0]
STARTING_ROWS

800000

In [5]:
before_dropped = 0
def rows_dropped():
    """Returns the number of rows dropped and rows left since last call"""
    global before_dropped
    just_dropped = STARTING_ROWS - before_dropped - df.shape[0]
    # adjusts number of row dropped since last call
    before_dropped += just_dropped
    return (f'{just_dropped} row(s) just dropped.'\
        f' {df.shape[0]} row(s) left.')

> **EMPTY ROWS**

In [6]:
# drop all rows with no values
df.dropna(how='all', inplace=True)
rows_dropped()

'254185 row(s) just dropped. 545815 row(s) left.'

> **ADULT**

In [7]:
# check data types
df['adult'].apply(type).value_counts()

<class 'float'>    545815
Name: adult, dtype: int64

The values are all floats.

In [8]:
# look at the adult labels
df.adult.value_counts()

0.0    511489
1.0     34326
Name: adult, dtype: int64

This column looks clean. Let's keep it "clean".

In [9]:
# drop rows for adult movies
df.drop(df[df['adult'] == True].index, inplace=True)
rows_dropped()

'34326 row(s) just dropped. 511489 row(s) left.'

> **ORIGINAL LANGUAGE**

In [10]:
# check data types
df['original_language'].apply(type).value_counts()

<class 'str'>    511489
Name: original_language, dtype: int64

The values are all strings.

In [11]:
# look at the original language labels
df.original_language.value_counts()

en    292137
fr     30949
de     30876
es     23360
ja     15894
       ...  
gv         1
tt         1
os         1
ii         1
oj         1
Name: original_language, Length: 159, dtype: int64

In [12]:
# only keep the English language movies
df.drop(df[df['original_language'] != 'en'].index, inplace=True)
rows_dropped()

'219352 row(s) just dropped. 292137 row(s) left.'

In [13]:
# trust but verify
df.original_language.value_counts()

en    292137
Name: original_language, dtype: int64

> **STATUS**

In [14]:
# check data types
df['status'].apply(type).value_counts()

<class 'str'>    292137
Name: status, dtype: int64

The values are all floats.

In [15]:
# look at the status labels
df.status.value_counts()

Released           288801
Planned              1431
In Production         880
Post Production       805
Rumored               132
Canceled               88
Name: status, dtype: int64

In [16]:
# only keep the movies that have been released
df.drop(df[df['status'] != 'Released'].index, inplace=True)
rows_dropped()

'3336 row(s) just dropped. 288801 row(s) left.'

In [17]:
# trust but verify
df.status.value_counts()

Released    288801
Name: status, dtype: int64

> **FEATURE SELECTION**

Now that the adult, foreign language, and unreleased movies are gone, I don't need those feature. I'll drop a few others, also.

In [18]:
# drop features not needed
df.drop(['adult', 'backdrop_path', 'belongs_to_collection', 'homepage', 'original_language', 
         'original_title', 'poster_path', 'production_companies', 'production_countries', 
         'spoken_languages', 'status', 'video'], axis=1, inplace=True)
df.shape

(288801, 15)

> **EMPTY STRINGS**

In [19]:
# check if missing data is disguised as '' (omitting the columns of lists and dicts)
df.drop(['genres', 'credits', 'reviews'], axis=1).isin(['']).sum()

budget               0
id                   0
imdb_id          24263
overview          4556
popularity           0
release_date     29758
revenue              0
runtime              0
tagline         227982
title                0
vote_average         0
vote_count           0
dtype: int64

In [20]:
# convert '' values to NaN's
df.replace(r'^\s*$', np.NaN, regex=True, inplace=True)
df.drop(['genres', 'credits', 'reviews'], axis=1).isin(['']).sum()

budget          0
id              0
imdb_id         0
overview        0
popularity      0
release_date    0
revenue         0
runtime         0
tagline         0
title           0
vote_average    0
vote_count      0
dtype: int64

> **ZERO VALUES**

In [21]:
# check if missing data is disguised as 0 (omitting the columns of lists and dicts)
df.drop(['genres', 'credits', 'reviews'], axis=1).isin([0]).sum()

budget          271172
id                   0
imdb_id              0
overview             0
popularity         114
release_date         0
revenue         280097
runtime          21391
tagline              0
title                0
vote_average    171036
vote_count      170814
dtype: int64

In [22]:
# convert 0 values to NaN's
df = df.replace(0, np.nan)
df.drop(['genres', 'credits', 'reviews'], axis=1).isin([0]).sum()

budget          0
id              0
imdb_id         0
overview        0
popularity      0
release_date    0
revenue         0
runtime         0
tagline         0
title           0
vote_average    0
vote_count      0
dtype: int64

In [23]:
# look for anything odd in numeric data
df.drop(['genres', 'credits', 'reviews'], axis=1).describe()

Unnamed: 0,budget,id,popularity,revenue,runtime,vote_average,vote_count
count,17629.0,288801.0,288687.0,8704.0,237865.0,117765.0,117987.0
mean,12856330.0,353922.394625,1.636013,65908270.0,487.7006,5.995865,93.225932
std,85248030.0,201400.890836,3.059695,154902900.0,145990.7,1.847679,661.708424
min,1.0,5.0,0.6,-12.0,1.0,0.5,1.0
25%,8144.0,193034.0,0.6,759848.2,18.0,5.0,1.0
50%,500000.0,351697.0,0.6,11522750.0,70.0,6.0,3.0
75%,10000000.0,520800.0,1.4,58402750.0,92.0,7.0,9.0
max,10000000000.0,701729.0,448.572,2797801000.0,50505050.0,10.0,25603.0


There are some low values in the budget and revenue data.<br>
I will filter by dollar amount on the final DataFrame, because these will have to be adjusted for inflation first.<br>
The runtimes will have to be limited in range. I will do this later.

In [24]:
# look at what's left
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 288801 entries, 4 to 701728
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   budget        17629 non-null   float64
 1   genres        288801 non-null  object 
 2   id            288801 non-null  float64
 3   imdb_id       204092 non-null  object 
 4   overview      284188 non-null  object 
 5   popularity    288687 non-null  float64
 6   release_date  259043 non-null  object 
 7   revenue       8704 non-null    float64
 8   runtime       237865 non-null  float64
 9   tagline       60811 non-null   object 
 10  title         288801 non-null  object 
 11  vote_average  117765 non-null  float64
 12  vote_count    117987 non-null  float64
 13  credits       288801 non-null  object 
 14  reviews       288800 non-null  object 
dtypes: float64(7), object(8)
memory usage: 35.3+ MB


> **BUDGET**

In [25]:
# check data types
df['budget'].apply(type).value_counts()

<class 'float'>    288801
Name: budget, dtype: int64

The values are all floats.

This feature, along with revenue, has the most missing values. I will keep them all for now,

> **GENRES**

In [26]:
# check data types
df['genres'].apply(type).value_counts()

<class 'list'>    288801
Name: genres, dtype: int64

The values are all lists.

These are lists of dicts representing genres.<br>
Each list may have multiple genres associated with the movie, resulting in multiple dicts per list.<br>
I'm going to extract the string that identifies each genre and leave the numerical key behind, along with the dicts.<br>
The result will be a column with lists of strings, each string representing the one of the 19 TMDb genre labels.<br>

In [27]:
def get_genre_id():
    """Returns Series of genre names grouped by movie from genres column"""
    # generate outer list
    all_vals = []
    for row in df['genres']:
        # generate inner list
        val_lst = []
        for d in row:
            # add each genre name to inner list for each movie
            val_lst.append(d.get('name'))
        # add each inner list to outer list
        all_vals.append(val_lst)
    # convert outer list to Series
    return pd.Series(all_vals)

In [28]:
# look at original form of genre values
df['genres']

4         [{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...
5         [{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...
7                       [{'id': 99, 'name': 'Documentary'}]
10        [{'id': 12, 'name': 'Adventure'}, {'id': 28, '...
11        [{'id': 16, 'name': 'Animation'}, {'id': 10751...
                                ...                        
701719    [{'id': 99, 'name': 'Documentary'}, {'id': 16,...
701722    [{'id': 99, 'name': 'Documentary'}, {'id': 16,...
701723    [{'id': 16, 'name': 'Animation'}, {'id': 12, '...
701726                                                   []
701728                                                   []
Name: genres, Length: 288801, dtype: object

In [29]:
# replace genre column with new column
df = df.reset_index().assign(genres=pd.DataFrame(get_genre_id(), columns=['genres'])['genres'])
df['genres']

0                              [Crime, Comedy]
1                    [Action, Thriller, Crime]
2                                [Documentary]
3         [Adventure, Action, Science Fiction]
4                          [Animation, Family]
                          ...                 
288796                [Documentary, Animation]
288797                [Documentary, Animation]
288798                  [Animation, Adventure]
288799                                      []
288800                                      []
Name: genres, Length: 288801, dtype: object

In [30]:
# count empty lists
df.genres.str.len().eq(0).sum()

118356

I will unpack these lists in the final DataFrame.

> **ID**

In [31]:
# check data types
df['id'].apply(type).value_counts()

<class 'float'>    288801
Name: id, dtype: int64

The values are all floats.

In [32]:
# looking for duplicate values
df['id'].is_unique

True

This will be an optional key to concatenate on with other DataFrames derived from TMDb.

> **IMDB ID**

In [33]:
# check data types
df['imdb_id'].apply(type).value_counts()

<class 'str'>         204092
<class 'NoneType'>     60446
<class 'float'>        24263
Name: imdb_id, dtype: int64

In [34]:
# convert None types to NaN
df['imdb_id'] = df['imdb_id'].replace([None], np.NaN)
df['imdb_id'].apply(type).value_counts()

<class 'str'>      204092
<class 'float'>     84709
Name: imdb_id, dtype: int64

In [35]:
# check that number of NaN's matches number of floats
df['imdb_id'].isnull().sum()

84709

The values are all strings.

In [36]:
# look for duplicate values that are not NaN
df[df.duplicated(subset=['imdb_id'],keep=False)]['imdb_id'].dropna().sum()

0

This will be an optional key to concatenate on with other DataFrames derived from IMBd.

> **OVERVIEW**

In [37]:
# check data types
df['overview'].apply(type).value_counts()

<class 'str'>      284188
<class 'float'>      4613
Name: overview, dtype: int64

In [38]:
# check that number of NaN's matches number of floats
df['overview'].isnull().sum()

4613

The values are all strings.

> **POPULARITY**

In [39]:
# check data types
df['popularity'].apply(type).value_counts()

<class 'float'>    288801
Name: popularity, dtype: int64

The values are all floats.

In [40]:
# check values
df.popularity.sort_values(ascending=False)

175140    448.572
225881    235.989
201136    205.280
189812    180.871
138521    172.548
           ...   
288796        NaN
288797        NaN
288798        NaN
288799        NaN
288800        NaN
Name: popularity, Length: 288801, dtype: float64

The scale begins at 0.6 and has no limit.

> **RELEASE DATE**

In [41]:
# check data types
df['release_date'].apply(type).value_counts()

<class 'str'>      259043
<class 'float'>     29758
Name: release_date, dtype: int64

In [42]:
# check that number of NaN's matches number of floats
df['release_date'].isnull().sum()

29758

The values are all strings.

In [43]:
# convert release dates to datetime objects
df['release_date'] = pd.to_datetime(df['release_date'])
df['release_date'].apply(type).value_counts()

<class 'pandas._libs.tslibs.timestamps.Timestamp'>    259043
<class 'pandas._libs.tslibs.nattype.NaTType'>          29758
Name: release_date, dtype: int64

> **REVENUE**

In [44]:
# check data types
df['revenue'].apply(type).value_counts()

<class 'float'>    288801
Name: revenue, dtype: int64

The values are all floats.

> **RUNTIME**

In [45]:
# check data types
df['runtime'].apply(type).value_counts()

<class 'float'>    288801
Name: runtime, dtype: int64

The values are all floats.

I don't want movies that are under 75 minutes, which is the cut off length for a feature film according to the Screen Actors Guild.

In [46]:
# remove films too short to be feature length
df.drop(df.loc[df['runtime'] < 70].index, inplace=True)
rows_dropped()

'117555 row(s) just dropped. 171246 row(s) left.'

> **TAGLINE**

In [47]:
# check data types
df['tagline'].apply(type).value_counts()

<class 'float'>    125123
<class 'str'>       46123
Name: tagline, dtype: int64

In [48]:
# check that number of NaN's matches number of floats
df['tagline'].isnull().sum()

125123

The values are all strings.

> **TITLE**

In [49]:
# check data types
df['title'].apply(type).value_counts()

<class 'str'>    171246
Name: title, dtype: int64

The values are all strings.

> **VOTE AVERAGE**

In [50]:
# check data types
df['vote_average'].apply(type).value_counts()

<class 'float'>    171246
Name: vote_average, dtype: int64

The values are all floats.

> **VOTE COUNT**

In [51]:
# check data types
df['vote_count'].apply(type).value_counts()

<class 'float'>    171246
Name: vote_count, dtype: int64

The values are all floats.

> **CREDITS**

In [52]:
# check data types
df['credits'].apply(type).value_counts()

<class 'dict'>    171246
Name: credits, dtype: int64

The values are all dicts.

In [53]:
# count empty dicts
df.credits.str.len().eq(0).sum()

0

The dicts each contain 2 lists. The first one holds the cast data. The second one has the crew data.

In [54]:
# extract cast and crew lists from credits column
df = pd.concat([df.reset_index(drop=True), pd.json_normalize(df['credits'])], axis=1).drop(['index', 'credits'], 
                                                                                           axis=1)
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,171236,171237,171238,171239,171240,171241,171242,171243,171244,171245
budget,4e+06,2.1e+07,42000,1.1e+07,9.4e+07,5.5e+07,1.5e+07,839727,1.28e+07,,...,,,,,,,,,,
genres,"[Crime, Comedy]","[Action, Thriller, Crime]",[Documentary],"[Adventure, Action, Science Fiction]","[Animation, Family]","[Comedy, Drama, Romance]",[Drama],"[Mystery, Drama]","[Drama, Crime]","[Horror, Thriller, Mystery]",...,[Documentary],[],[],[],[],[Music],[],[Documentary],[],[]
id,5,6,8,11,12,13,14,15,16,17,...,701689,701692,701695,701702,701705,701706,701707,701710,701714,701729
imdb_id,tt0113101,tt0107286,tt0825671,tt0076759,tt0266543,tt0109830,tt0169547,tt0033467,tt0168629,tt0411267,...,,,,,,,,tt1821695,,
overview,It's Ted the Bellhop's first night on the job....,"While racing to a boxing match, Frank, Mike, J...",Timo Novotny labels his new project an experim...,Princess Leia is captured and held hostage by ...,"Nemo, an adventurous young clownfish, is unexp...",A man with a low IQ has accomplished great thi...,"Lester Burnham, a depressed suburban father in...","Newspaper magnate, Charles Foster Kane is take...","Selma, a Czech immigrant on the verge of blind...",Adèle and her daughter Sarah are traveling on ...,...,Set at CREST (the Centre for Research and Educ...,Tom and Jerry is an American animated franchis...,This full concert broadcast comes from our 201...,"Shakespeare's heartbreaking tale, Nureyev's bl...",Courage the Cowardly Dog is an American animat...,"Radiohead's set from June 30, 2017 at the Rock...",‘Rising Silence’ is a journey of relationships...,There are approximately 5.7 million people in ...,"SUM 41 performs live at The House of Blues, Cl...",A tantalising trio of cult lesbian movies from...
popularity,13.532,11.817,2.629,90.645,33.313,33.781,23.513,18.872,16.037,6.766,...,,,,,,,,,,
release_date,1995-12-09 00:00:00,1993-10-15 00:00:00,2006-01-01 00:00:00,1977-05-25 00:00:00,2003-05-30 00:00:00,1994-07-06 00:00:00,1999-09-15 00:00:00,1941-04-30 00:00:00,2000-05-17 00:00:00,2005-09-28 00:00:00,...,2019-12-31 00:00:00,1940-02-10 00:00:00,2017-06-01 00:00:00,NaT,NaT,2017-06-30 00:00:00,NaT,NaT,NaT,2010-10-04 00:00:00
revenue,4.25735e+06,1.21369e+07,,7.75398e+08,9.40336e+08,6.77388e+08,3.56297e+08,2.32177e+07,4.00319e+07,,...,,,,,,,,,,
runtime,98,110,80,121,100,142,122,119,141,87,...,80,,,144,,129,75,85,75,140
tagline,Twelve outrageous guests. Four scandalous requ...,Don't move. Don't whisper. Don't even breathe.,A Megacities remix.,"A long time ago in a galaxy far, far away...",There are 3.7 trillion fish in the ocean. They...,Life is like a box of chocolates...you never k...,Look closer.,It's terrific!,You don't need eyes to see.,One of the living for one of the dead.,...,,,,"Shakespeare's heartbreaking tale, Nureyev's bl...",,,,Bipolar and Living,"concert, punk rock, live",


> **CAST**

In [55]:
# check data types
df['cast'].apply(type).value_counts()

<class 'list'>    171246
Name: cast, dtype: int64

The values are all lists.

In [56]:
# count empty lists
df['cast'].str.len().eq(0).sum()

53867

> **CREW**

In [57]:
# check data types
df['crew'].apply(type).value_counts()

<class 'list'>    171246
Name: crew, dtype: int64

In [58]:
# count empty lists
df['crew'].str.len().eq(0).sum()

48286

> **REVIEWS**

In [59]:
# check data types
df['reviews'].apply(type).value_counts()

<class 'dict'>    171246
Name: reviews, dtype: int64

The values are all dicts.

In [60]:
# count empty dicts
df['reviews'].str.len().eq(0).sum()

0

I only want the text of the review, which is in the results key.

In [61]:
# extract results lists from reviews column
df = pd.concat([df, pd.json_normalize(df['reviews'])['results']], 
          axis=1).drop(['reviews'], axis=1).rename(columns={'results':'reviews'})
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,171236,171237,171238,171239,171240,171241,171242,171243,171244,171245
budget,4e+06,2.1e+07,42000,1.1e+07,9.4e+07,5.5e+07,1.5e+07,839727,1.28e+07,,...,,,,,,,,,,
genres,"[Crime, Comedy]","[Action, Thriller, Crime]",[Documentary],"[Adventure, Action, Science Fiction]","[Animation, Family]","[Comedy, Drama, Romance]",[Drama],"[Mystery, Drama]","[Drama, Crime]","[Horror, Thriller, Mystery]",...,[Documentary],[],[],[],[],[Music],[],[Documentary],[],[]
id,5,6,8,11,12,13,14,15,16,17,...,701689,701692,701695,701702,701705,701706,701707,701710,701714,701729
imdb_id,tt0113101,tt0107286,tt0825671,tt0076759,tt0266543,tt0109830,tt0169547,tt0033467,tt0168629,tt0411267,...,,,,,,,,tt1821695,,
overview,It's Ted the Bellhop's first night on the job....,"While racing to a boxing match, Frank, Mike, J...",Timo Novotny labels his new project an experim...,Princess Leia is captured and held hostage by ...,"Nemo, an adventurous young clownfish, is unexp...",A man with a low IQ has accomplished great thi...,"Lester Burnham, a depressed suburban father in...","Newspaper magnate, Charles Foster Kane is take...","Selma, a Czech immigrant on the verge of blind...",Adèle and her daughter Sarah are traveling on ...,...,Set at CREST (the Centre for Research and Educ...,Tom and Jerry is an American animated franchis...,This full concert broadcast comes from our 201...,"Shakespeare's heartbreaking tale, Nureyev's bl...",Courage the Cowardly Dog is an American animat...,"Radiohead's set from June 30, 2017 at the Rock...",‘Rising Silence’ is a journey of relationships...,There are approximately 5.7 million people in ...,"SUM 41 performs live at The House of Blues, Cl...",A tantalising trio of cult lesbian movies from...
popularity,13.532,11.817,2.629,90.645,33.313,33.781,23.513,18.872,16.037,6.766,...,,,,,,,,,,
release_date,1995-12-09 00:00:00,1993-10-15 00:00:00,2006-01-01 00:00:00,1977-05-25 00:00:00,2003-05-30 00:00:00,1994-07-06 00:00:00,1999-09-15 00:00:00,1941-04-30 00:00:00,2000-05-17 00:00:00,2005-09-28 00:00:00,...,2019-12-31 00:00:00,1940-02-10 00:00:00,2017-06-01 00:00:00,NaT,NaT,2017-06-30 00:00:00,NaT,NaT,NaT,2010-10-04 00:00:00
revenue,4.25735e+06,1.21369e+07,,7.75398e+08,9.40336e+08,6.77388e+08,3.56297e+08,2.32177e+07,4.00319e+07,,...,,,,,,,,,,
runtime,98,110,80,121,100,142,122,119,141,87,...,80,,,144,,129,75,85,75,140
tagline,Twelve outrageous guests. Four scandalous requ...,Don't move. Don't whisper. Don't even breathe.,A Megacities remix.,"A long time ago in a galaxy far, far away...",There are 3.7 trillion fish in the ocean. They...,Life is like a box of chocolates...you never k...,Look closer.,It's terrific!,You don't need eyes to see.,One of the living for one of the dead.,...,,,,"Shakespeare's heartbreaking tale, Nureyev's bl...",,,,Bipolar and Living,"concert, punk rock, live",


In [62]:
# count empty lists
df['reviews'].str.len().eq(0).sum()

164479

In [63]:
# inspect to verify DataFrame is clean
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 171246 entries, 0 to 171245
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   budget        13635 non-null   float64       
 1   genres        171246 non-null  object        
 2   id            171246 non-null  float64       
 3   imdb_id       129045 non-null  object        
 4   overview      167715 non-null  object        
 5   popularity    171198 non-null  float64       
 6   release_date  152368 non-null  datetime64[ns]
 7   revenue       8309 non-null    float64       
 8   runtime       120310 non-null  float64       
 9   tagline       46123 non-null   object        
 10  title         171246 non-null  object        
 11  vote_average  81872 non-null   float64       
 12  vote_count    82029 non-null   float64       
 13  cast          171246 non-null  object        
 14  crew          171246 non-null  object        
 15  reviews       171

The only two categories without missing values are id and title, which I can use to match with other datasets.<br>
I may need to use data in all of the other columns.<br>
I will attempt to fill some of those missing values by leveraging other datasets in a separate notebook.<br>
If I need to drop them later, I will do so. For now, I'll accept fewer observations for maximaizing features.

In [64]:
# store clean data in JSON
df.to_json('tmdb_movie_clean.json')