In [1]:
# widen the notebook container so wide outputs are easier to read
from IPython.display import HTML, clear_output, display

display(HTML('<style>.container { width: 930px !important; }</style>'))

In [2]:
# import ML tools
import pandas as pd
import numpy as np
import json

In [3]:
# load the raw, uncleaned TMDB movie JSON into a DataFrame and summarise it
df = pd.read_json(
    '../../movie_actor_linear_regression/data/raw_api_requests/'
    'concat_data/tmdb_movie_all_unclean.json'
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800000 entries, 0 to 799999
Data columns (total 27 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   adult                  545815 non-null  float64
 1   backdrop_path          107987 non-null  object 
 2   belongs_to_collection  13666 non-null   object 
 3   budget                 545815 non-null  float64
 4   genres                 545815 non-null  object 
 5   homepage               337756 non-null  object 
 6   id                     545815 non-null  float64
 7   imdb_id                423291 non-null  object 
 8   original_language      545815 non-null  object 
 9   original_title         545815 non-null  object 
 10  overview               545815 non-null  object 
 11  popularity             545815 non-null  float64
 12  poster_path            299801 non-null  object 
 13  production_companies   545815 non-null  object 
 14  production_countries   545815 non-nu

In [4]:
# Keep only the target and feature columns. An explicit copy is taken so the
# cleaning cells below operate on an independent frame rather than on a slice
# of the raw data (in-place edits of a slice trigger SettingWithCopyWarning).
df = df[['genres', 'tagline']].copy()
df

Unnamed: 0,genres,tagline
0,,
1,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",
2,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",
3,,
4,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",Twelve outrageous guests. Four scandalous requ...
...,...,...
799995,,
799996,,
799997,,
799998,,


In [5]:
# count values that are '' rather than NaN, i.e. disguised missing data
# (the genres column holds lists, so it is left out of the check)
df.drop(columns='genres').isin(['']).sum()

tagline    467097
dtype: int64

In [6]:
# Drop rows in which every column is missing. The result is reassigned instead
# of using inplace=True: df is a column subset of the raw frame at this point,
# and mutating such a slice in place raises SettingWithCopyWarning.
df = df.dropna(how='all')
df

Unnamed: 0,genres,tagline
1,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",
2,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",
4,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",Twelve outrageous guests. Four scandalous requ...
5,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",Don't move. Don't whisper. Don't even breathe.
7,"[{'id': 99, 'name': 'Documentary'}]",A Megacities remix.
...,...,...
701726,[],WELCOME TO THE OTHER SIDE
701727,[],Rocco Czechs In...Way In!!
701728,[],
701729,[],


In [7]:
# Convert empty or whitespace-only strings to NaN so they count as missing.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and the
# result is reassigned rather than mutating the frame in place.
df = df.replace(r'^\s*$', np.nan, regex=True)
df

Unnamed: 0,genres,tagline
1,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",
2,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",
4,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",Twelve outrageous guests. Four scandalous requ...
5,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",Don't move. Don't whisper. Don't even breathe.
7,"[{'id': 99, 'name': 'Documentary'}]",A Megacities remix.
...,...,...
701726,[],WELCOME TO THE OTHER SIDE
701727,[],Rocco Czechs In...Way In!!
701728,[],
701729,[],


In [8]:
# confirm that no '' taglines remain (this only counts them; nothing is dropped)
df.drop(columns='genres').isin(['']).sum()

tagline    0
dtype: int64

In [9]:
# observe data: entry count, non-null count per column, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 545815 entries, 1 to 701730
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   genres   545815 non-null  object
 1   tagline  78700 non-null   object
dtypes: object(2)
memory usage: 12.5+ MB


In [10]:
def get_genre_id(genres=None):
    """Return a Series of genre-name lists, one list per movie.

    Despite the function's name, the value collected from each genre dict is
    its ``'name'`` entry, not its id.

    Parameters
    ----------
    genres : iterable, optional
        One entry per movie, each a list of dicts carrying a ``'name'`` key.
        When omitted, the notebook-level ``df['genres']`` column is used, so
        existing zero-argument calls keep working.

    Returns
    -------
    pandas.Series
        Lists of genre names in input order on a fresh 0..n-1 index (the
        index of the input is not carried over). An entry that is not
        list-like (e.g. NaN for a missing value) yields an empty list instead
        of raising.
    """
    if genres is None:
        # fall back to the frame defined at notebook level
        genres = df['genres']
    all_vals = [
        # d.get() returns None when a genre dict has no 'name' key
        [d.get('name') for d in row] if pd.api.types.is_list_like(row) else []
        for row in genres
    ]
    # convert the list of lists to a Series
    return pd.Series(all_vals)

In [11]:
# Replace the raw genre dicts with plain lists of genre names. The index is
# reset first so the frame lines up row-for-row with the 0..n-1 index of the
# Series returned by get_genre_id(); the Series is passed to assign() directly
# rather than being wrapped in a temporary DataFrame.
df = df.reset_index(drop=True).assign(genres=get_genre_id())
df['genres']

0            [Drama, Crime, Comedy]
1                   [Drama, Comedy]
2                   [Crime, Comedy]
3         [Action, Thriller, Crime]
4                     [Documentary]
                    ...            
545810                           []
545811                           []
545812                           []
545813                           []
545814                           []
Name: genres, Length: 545815, dtype: object

In [12]:
# number of movies whose genre list has no entries
(df['genres'].str.len() == 0).sum()

207619

In [13]:
# keep only the movies that have at least one genre
df = df[df['genres'].map(len).gt(0)]
df

Unnamed: 0,genres,tagline
0,"[Drama, Crime, Comedy]",
1,"[Drama, Comedy]",
2,"[Crime, Comedy]",Twelve outrageous guests. Four scandalous requ...
3,"[Action, Thriller, Crime]",Don't move. Don't whisper. Don't even breathe.
4,[Documentary],A Megacities remix.
...,...,...
545801,[Documentary],
545803,"[Documentary, Animation]",
545805,[Drama],
545806,"[Documentary, Animation]",


In [14]:
# Drop every row that still has a missing value in any column. The result is
# reassigned instead of using inplace=True: df is a boolean-mask slice at this
# point, and mutating such a slice in place raises SettingWithCopyWarning.
df = df.dropna(how='any')
df

Unnamed: 0,genres,tagline
2,"[Crime, Comedy]",Twelve outrageous guests. Four scandalous requ...
3,"[Action, Thriller, Crime]",Don't move. Don't whisper. Don't even breathe.
4,[Documentary],A Megacities remix.
6,"[Adventure, Action, Science Fiction]","A long time ago in a galaxy far, far away..."
7,"[Animation, Family]",There are 3.7 trillion fish in the ocean. They...
...,...,...
545667,[Documentary],The One and Only June Foray
545704,"[History, Drama]",Vera Komissarzhevskaya
545742,[Comedy],Stop Running You Ass-Hole
545761,"[Comedy, Western, Horror, Music]",Senior Slump never looked so good.


In [15]:
# observe data: entry count, non-null count per column, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64710 entries, 2 to 545797
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   genres   64710 non-null  object
 1   tagline  64710 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [16]:
# write the cleaned genres/tagline frame to disk as JSON
df.to_json(path_or_buf='../data/genres_taglines_ds.json')