In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# read in JSON as DataFrame
df = pd.read_json('tmdb_person_all_unclean.json')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2700000 entries, 0 to 2699999
Data columns (total 14 columns):
 #   Column                Dtype  
---  ------                -----  
 0   birthday              object 
 1   known_for_department  object 
 2   deathday              object 
 3   id                    float64
 4   name                  object 
 5   also_known_as         object 
 6   gender                float64
 7   biography             object 
 8   popularity            float64
 9   place_of_birth        object 
 10  profile_path          object 
 11  adult                 float64
 12  imdb_id               object 
 13  homepage              object 
dtypes: float64(4), object(10)
memory usage: 309.0+ MB


In [3]:
# look at DataFrame
df.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,1944-05-14,1951-09-25,1942-07-13,1956-10-21,1913-05-26
known_for_department,Directing,Acting,Acting,Acting,Acting
deathday,,,,2016-12-27,1994-08-11
id,1,2,3,4,5
name,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Peter Cushing
also_known_as,"[George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...","[Mark Hamil, Mark Richard Hamill, Марк Хэмилл,...","[Гаррісон Форд, Харрисон Форд, هاريسون فورد, 해...","[Carrie Frances Fisher , Кэрри Фишер, Кэрри Фр...",[Peter Wilton Cushing]
gender,2,2,2,1,2
biography,"George Walton Lucas Jr. (born May 14, 1944) is...","Mark Richard Hamill (born September 25, 1951) ...",Legendary Hollywood Icon Harrison Ford was bor...,Carrie Frances Fisher (21 October 1956 - 27 De...,"Peter Wilton Cushing, OBE (26 May 1913 – 11 A..."
popularity,6.761,10.497,20.349,7.228,3.902
place_of_birth,"Modesto, California, USA","Concord, California, USA","Chicago, Illinois, USA","Beverly Hills, Los Angeles, California, USA","Kenley, Surrey, England, UK"


In [4]:
# store number of original entries
STARTING_ROWS = df.shape[0]
STARTING_ROWS

2700000

In [5]:
before_dropped = 0
def rows_dropped():
    """Returns the number of rows dropped and rows left since last call"""
    global before_dropped
    just_dropped = STARTING_ROWS - before_dropped - df.shape[0]
    # adjusts number of row dropped since last call
    before_dropped += just_dropped
    return (f'{just_dropped} row(s) just dropped.'\
        f' {df.shape[0]} row(s) left.')

> **EMPTY ROWS**

In [6]:
# drop all rows with no values
df.dropna(how='all', inplace=True)
rows_dropped()

'972444 row(s) just dropped. 1727556 row(s) left.'

> **ADULT**

In [7]:
# check data types
df['adult'].apply(type).value_counts()

<class 'float'>    1727556
Name: adult, dtype: int64

The values are all floats.

In [8]:
# look at the adult labels
df.adult.value_counts()

0.0    1677573
1.0      49983
Name: adult, dtype: int64

This column looks clean. Let's keep it "clean".

In [9]:
# drop rows for adult movies
df.drop(df[df['adult'] == True].index, inplace=True)
rows_dropped()

'49983 row(s) just dropped. 1677573 row(s) left.'

> **FEATURE SELECTION**

Now that the adult movies are gone, I don't need that feature. I'll drop a few others, also.

In [10]:
# drop features not needed
df.drop(['deathday', 'also_known_as', 'profile_path', 'adult', 'homepage'], axis=1, inplace=True)
df.shape

(1677573, 9)

> **ZERO VALUES**

In [11]:
# check if missing data is disguised as 0
df.isin([0]).sum()

birthday                      0
known_for_department          0
id                            0
name                          0
gender                  1397829
biography                     0
popularity                    0
place_of_birth                0
imdb_id                       0
dtype: int64

This looks good, except for gender. I'll look at that later.

> **BIRTHDAY**

In [12]:
# check data types
df['birthday'].apply(type).value_counts()

<class 'NoneType'>    1524804
<class 'str'>          152769
Name: birthday, dtype: int64

In [13]:
# convert None types to NaN
df['birthday'] = df['birthday'].replace([None], np.NaN)
df['birthday'].apply(type).value_counts()

<class 'float'>    1524804
<class 'str'>       152769
Name: birthday, dtype: int64

The values are all strings.

In [14]:
# convert birthday to datetime objects
df['birthday'] = pd.to_datetime(df['birthday'], errors='coerce')
df['birthday'].apply(type).value_counts()

<class 'pandas._libs.tslibs.nattype.NaTType'>         1524815
<class 'pandas._libs.tslibs.timestamps.Timestamp'>     152758
Name: birthday, dtype: int64

Birthdays that were out of range were converted to NaT.<br>
The values are all datetime objects.

> **KNOWN FOR DEPARTMENT**

In [15]:
# check data types
df['known_for_department'].apply(type).value_counts()

<class 'str'>         1677060
<class 'NoneType'>        513
Name: known_for_department, dtype: int64

All values are all strings

In [16]:
# look at the department labels
df.known_for_department.value_counts()

Acting               941186
Directing            143417
Production           133449
Writing              111972
Sound                 75535
Crew                  56361
Camera                47947
Art                   45811
Editing               38162
Costume & Make-Up     37098
Visual Effects        27080
Lighting              12803
Creator                6232
Actors                    7
Name: known_for_department, dtype: int64

In [17]:
# find rows where department is Actors
df[df['known_for_department'] == 'Actors']

Unnamed: 0,birthday,known_for_department,id,name,gender,biography,popularity,place_of_birth,imdb_id
7343,NaT,Actors,7344.0,Martha Crawford,0.0,,0.6,,nm0186895
14444,NaT,Actors,14445.0,Leigh Walsh,0.0,,0.6,,nm0909741
16688,NaT,Actors,16689.0,Diana Wiersma,0.0,,0.6,,nm0927560
121452,NaT,Actors,121453.0,Miguel Castro,2.0,,0.6,,nm1008574
130441,NaT,Actors,130442.0,Jimmy Adler,0.0,,0.6,,
130442,NaT,Actors,130443.0,Kimberly Kensington,0.0,,0.6,,
139277,NaT,Actors,139278.0,Matthew Wiatt,0.0,,0.6,,nm2980195


In [18]:
# change Actors to Acting
df.loc[df['known_for_department'] == 'Actors'] = 'Acting'

In [19]:
# trust but verify
df[df['known_for_department'] == 'Actors']

Unnamed: 0,birthday,known_for_department,id,name,gender,biography,popularity,place_of_birth,imdb_id


In [20]:
# drop all rows without department data
df.dropna(subset=['known_for_department'], inplace=True)
rows_dropped()

'513 row(s) just dropped. 1677060 row(s) left.'

> **id**

In [21]:
# check data types
df['id'].apply(type).value_counts()

<class 'float'>    1677053
<class 'str'>            7
Name: id, dtype: int64

There's a few suspicious rows here.

In [22]:
# look at rows with strings
df[df['id'].apply(type) == str]

Unnamed: 0,birthday,known_for_department,id,name,gender,biography,popularity,place_of_birth,imdb_id
7343,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting
14444,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting
16688,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting
121452,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting
130441,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting
130442,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting
139277,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting


In [23]:
# drop nonsensical rows
df.drop(df[df.id.apply(type) == str].index, inplace=True)
rows_dropped()

'7 row(s) just dropped. 1677053 row(s) left.'

In [24]:
# check data types
df['id'].apply(type).value_counts()

<class 'float'>    1677053
Name: id, dtype: int64

The values are all floats.

> **NAME**

In [25]:
# check data types
df['name'].apply(type).value_counts()

<class 'str'>    1677053
Name: name, dtype: int64

The values are all strings.

> **GENDER**

In [26]:
# check data types
df['gender'].apply(type).value_counts()

<class 'float'>    1677053
Name: gender, dtype: int64

All values are all floats.

In [27]:
# check values
df.gender.value_counts()

0.0    1397403
2.0     177491
1.0     102136
3.0         23
Name: gender, dtype: int64

Gender type 1 is female and 2 is male. All others are missing data.

In [28]:
# convert missing values to NaN
df['gender'] = df['gender'].where((df['gender'] == 1) | (df['gender'] == 2))
df.gender.value_counts()

2.0    177491
1.0    102136
Name: gender, dtype: int64

These look good now.

> **BIOGRAPHY**

In [29]:
# check data types
df['biography'].apply(type).value_counts()

<class 'str'>    1677053
Name: biography, dtype: int64

All values are all strings.

> **POPULARITY**

In [30]:
# check data types
df['popularity'].apply(type).value_counts()

<class 'float'>    1677053
Name: popularity, dtype: int64

All vaues are all floats.

In [31]:
# check values
df.popularity.sort_values(ascending=False)

1910847    132.816
2472212     88.179
1642788     51.328
208224      48.423
1244        42.005
            ...   
2085725        0.6
2085727        0.6
2085728        0.6
2085729        0.6
1782238        0.6
Name: popularity, Length: 1677053, dtype: object

The scale begins at 0.6 and has no limit.

> **PLACE OF BIRTH**

In [32]:
# check data types
df['place_of_birth'].apply(type).value_counts()

<class 'NoneType'>    1547792
<class 'str'>          129261
Name: place_of_birth, dtype: int64

In [33]:
# convert None types to NaN
df['place_of_birth'] = df['place_of_birth'].replace([None], np.NaN)
df['place_of_birth'].apply(type).value_counts()

<class 'float'>    1547792
<class 'str'>       129261
Name: place_of_birth, dtype: int64

All values are all strings.

> **IMBD ID**

In [34]:
# check data types
df['imdb_id'].apply(type).value_counts()

<class 'str'>         1677050
<class 'NoneType'>          3
Name: imdb_id, dtype: int64

In [35]:
# convert None types to NaN
df['imdb_id'] = df['imdb_id'].replace([None], np.NaN)
df['imdb_id'].apply(type).value_counts()

<class 'str'>      1677050
<class 'float'>          3
Name: imdb_id, dtype: int64

All values are all strings.

In [36]:
# inspect to verify DataFrame is clean
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1677053 entries, 0 to 2699999
Data columns (total 9 columns):
 #   Column                Non-Null Count    Dtype 
---  ------                --------------    ----- 
 0   birthday              152712 non-null   object
 1   known_for_department  1677053 non-null  object
 2   id                    1677053 non-null  object
 3   name                  1677053 non-null  object
 4   gender                279627 non-null   object
 5   biography             1677053 non-null  object
 6   popularity            1677053 non-null  object
 7   place_of_birth        129261 non-null   object
 8   imdb_id               1677050 non-null  object
dtypes: object(9)
memory usage: 127.9+ MB


The four categories with missing values are birthday, gender, place of birth, and IMDb ID.<br>
I may need to use data in the first three as actor metadata. The IMDb ID only has three missing values.<br>
I will attempt to fill some of those missing values by leveraging other datasets in a separate notebook.<br>
If I need to drop them later, I will do so. For now, I'll accept fewer observations for maximaizing features.

In [37]:
# look at final shape
df.shape

(1677053, 9)

In [38]:
# store clean data in JSON
df.to_json('tmdb_person_clean.json')