In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# read in JSON as DataFrame
df = pd.read_json('tmdb_person_100k.json')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              36541 non-null  object 
 1   known_for_department  84356 non-null  object 
 2   deathday              12317 non-null  object 
 3   id                    84367 non-null  float64
 4   name                  84367 non-null  object 
 5   also_known_as         84367 non-null  object 
 6   gender                84367 non-null  float64
 7   biography             84367 non-null  object 
 8   popularity            84367 non-null  float64
 9   place_of_birth        30895 non-null  object 
 10  profile_path          20826 non-null  object 
 11  adult                 84367 non-null  float64
 12  imdb_id               84367 non-null  object 
 13  homepage              3667 non-null   object 
dtypes: float64(4), object(10)
memory usage: 11.4+ MB


In [3]:
# look at DataFrame
df.head().transpose()

Unnamed: 0,0,1,2,3,4
birthday,1944-05-14,1951-09-25,1942-07-13,1956-10-21,1913-05-26
known_for_department,Directing,Acting,Acting,Acting,Acting
deathday,,,,2016-12-27,1994-08-11
id,1,2,3,4,5
name,George Lucas,Mark Hamill,Harrison Ford,Carrie Fisher,Peter Cushing
also_known_as,"[George Walton Lucas Jr. , 乔治·卢卡斯, Джордж Лука...","[Mark Hamil, Mark Richard Hamill, Марк Хэмилл,...","[Гаррісон Форд, Харрисон Форд, هاريسون فورد, 해...","[Carrie Frances Fisher , Кэрри Фишер, Кэрри Фр...",[Peter Wilton Cushing]
gender,2,2,2,1,2
biography,"George Walton Lucas Jr. (born May 14, 1944) is...","Mark Richard Hamill (born September 25, 1951) ...",Legendary Hollywood Icon Harrison Ford was bor...,Carrie Frances Fisher (21 October 1956 - 27 De...,"Peter Wilton Cushing, OBE (26 May 1913 – 11 A..."
popularity,6.761,10.497,20.349,7.228,3.902
place_of_birth,"Modesto, California, USA","Concord, California, USA","Chicago, Illinois, USA","Beverly Hills, Los Angeles, California, USA","Kenley, Surrey, England, UK"


In [4]:
# store number of original entries
STARTING_ROWS = df.shape[0]
STARTING_ROWS

100000

In [5]:
before_dropped = 0
def rows_dropped():
    """Returns the number of rows dropped and rows left since last call"""
    global before_dropped
    just_dropped = STARTING_ROWS - before_dropped - df.shape[0]
    # adjusts number of row dropped since last call
    before_dropped += just_dropped
    return (f'{just_dropped} row(s) just dropped.'\
        f' {df.shape[0]} row(s) left.')

> **EMPTY ROWS**

In [6]:
# drop all rows with no values
df.dropna(how='all', inplace=True)
rows_dropped()

'15633 row(s) just dropped. 84367 row(s) left.'

> **ADULT**

In [7]:
# check data types
df['adult'].apply(type).value_counts()

<class 'float'>    84367
Name: adult, dtype: int64

These are the same data type.

In [8]:
# look at the adult labels
df.adult.value_counts()

0.0    83758
1.0      609
Name: adult, dtype: int64

This column looks clean. Let's keep it "clean".

In [9]:
# drop rows for adult movies
df.drop(df[df['adult'] == True].index, inplace=True)
rows_dropped()

'609 row(s) just dropped. 83758 row(s) left.'

> **FEATURE SELECTION**

Now that the adult movies are gone, I don't need that feature. I'll drop a few others, also.

In [10]:
# drop features not needed
df.drop(['deathday', 'also_known_as', 'profile_path', 'adult', 'homepage'], axis=1, inplace=True)
df.shape

(83758, 9)

> **ZERO VALUES**

In [11]:
# check if missing data is disguised as 0
df.isin([0]).sum()

birthday                    0
known_for_department        0
id                          0
name                        0
gender                  36961
biography                   0
popularity                  0
place_of_birth              0
imdb_id                     0
dtype: int64

This looks good, except for gender. I'll look at that later.

> **BIRTHDAY**

In [12]:
# check data types
df['birthday'].apply(type).value_counts()

<class 'NoneType'>    47497
<class 'str'>         36261
Name: birthday, dtype: int64

In [13]:
# convert None types to NaN
df['birthday'] = df['birthday'].replace([None], np.NaN)
df['birthday'].apply(type).value_counts()

<class 'float'>    47497
<class 'str'>      36261
Name: birthday, dtype: int64

All values are strings.

In [14]:
# convert birthday to datetime objects
df['birthday'] = pd.to_datetime(df['birthday'], errors='coerce')
df['birthday'].apply(type).value_counts()

<class 'pandas._libs.tslibs.nattype.NaTType'>         47500
<class 'pandas._libs.tslibs.timestamps.Timestamp'>    36258
Name: birthday, dtype: int64

Birthdays that were out of range were converted to NaT.

> **KNOWN FOR DEPARTMENT**

In [15]:
# check data types
df['known_for_department'].apply(type).value_counts()

<class 'str'>         83747
<class 'NoneType'>       11
Name: known_for_department, dtype: int64

All values are strings

In [16]:
# look at the department labels
df.known_for_department.value_counts()

Acting               47198
Production            8600
Writing               7512
Directing             5090
Sound                 3701
Editing               2728
Art                   2695
Camera                2469
Costume & Make-Up     1831
Crew                  1223
Visual Effects         486
Lighting               169
Creator                 42
Actors                   3
Name: known_for_department, dtype: int64

In [17]:
# find rows where department is Actors
df[df['known_for_department'] == 'Actors']

Unnamed: 0,birthday,known_for_department,id,name,gender,biography,popularity,place_of_birth,imdb_id
7343,NaT,Actors,7344.0,Martha Crawford,0.0,,0.6,,nm0186895
14444,NaT,Actors,14445.0,Leigh Walsh,0.0,,0.6,,nm0909741
16688,NaT,Actors,16689.0,Diana Wiersma,0.0,,0.6,,nm0927560


In [18]:
# change Actors to Acting
df.loc[df['known_for_department'] == 'Actors'] = 'Acting'

In [19]:
# trust but verify
df[df['known_for_department'] == 'Actors']

Unnamed: 0,birthday,known_for_department,id,name,gender,biography,popularity,place_of_birth,imdb_id


In [20]:
# drop all rows without department data
df.dropna(subset=['known_for_department'], inplace=True)
rows_dropped()

'11 row(s) just dropped. 83747 row(s) left.'

> **id**

In [21]:
# check data types
df['id'].apply(type).value_counts()

<class 'float'>    83744
<class 'str'>          3
Name: id, dtype: int64

There's a few suspicious rows here.

In [22]:
# look at rows with strings
df[df['id'].apply(type) == str]

Unnamed: 0,birthday,known_for_department,id,name,gender,biography,popularity,place_of_birth,imdb_id
7343,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting
14444,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting
16688,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting,Acting


In [23]:
# drop nonsensical rows
df.drop(df[df.id.apply(type) == str].index, inplace=True)
rows_dropped()

'3 row(s) just dropped. 83744 row(s) left.'

> **NAME**

In [24]:
# check data types
df['name'].apply(type).value_counts()

<class 'str'>    83744
Name: name, dtype: int64

This looks good.

> **GENDER**

In [25]:
# check data types
df['gender'].apply(type).value_counts()

<class 'float'>    83744
Name: gender, dtype: int64

All values are floats.

In [26]:
# check values
df.gender.value_counts()

0.0    36951
2.0    33282
1.0    13511
Name: gender, dtype: int64

Gender type 0 is missing data, 1 is female, and 2 is male.

In [27]:
# convert 0 values to NaN
df['gender'] = df['gender'].replace(0, np.NaN)
df.gender.value_counts()

2.0    33282
1.0    13511
Name: gender, dtype: int64

These look good now.

> **BIOGRAPHY**

In [28]:
# check data types
df['biography'].apply(type).value_counts()

<class 'str'>    83744
Name: biography, dtype: int64

All values are strings.

> **POPULARITY**

In [29]:
# check data types
df['popularity'].apply(type).value_counts()

<class 'float'>    83744
Name: popularity, dtype: int64

All vaues are floats.

In [30]:
# check values
df.popularity.sort_values(ascending=False)

1244     42.005
74567     37.53
3222     36.953
9826     29.495
5008     24.557
          ...  
58705       0.6
58704       0.6
58703       0.6
58702       0.6
99999       0.6
Name: popularity, Length: 83744, dtype: object

The scale is from 0.6 to 100.

> **PLACE OF BIRTH**

In [31]:
# check data types
df['place_of_birth'].apply(type).value_counts()

<class 'NoneType'>    53100
<class 'str'>         30644
Name: place_of_birth, dtype: int64

All values are strings.

In [32]:
# convert None types to NaN
df['place_of_birth'] = df['place_of_birth'].replace([None], np.NaN)
df['place_of_birth'].apply(type).value_counts()

<class 'float'>    53100
<class 'str'>      30644
Name: place_of_birth, dtype: int64

> **IMBD ID**

In [33]:
# check data types
df['imdb_id'].apply(type).value_counts()

<class 'str'>    83744
Name: imdb_id, dtype: int64

All values are strings.

In [34]:
# inspect to verify DataFrame is clean
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 83744 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   birthday              36257 non-null  object 
 1   known_for_department  83744 non-null  object 
 2   id                    83744 non-null  object 
 3   name                  83744 non-null  object 
 4   gender                46793 non-null  float64
 5   biography             83744 non-null  object 
 6   popularity            83744 non-null  object 
 7   place_of_birth        30644 non-null  object 
 8   imdb_id               83744 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.4+ MB


The three categories with missing values are birthday, gender, and place of birth.<br>
I may need those categories as metadata. I will attempt to fill some of those missing values by leveraging other datasets in a separate notebook.
<br>If I need to drop them later, I will do so. For now, I'll accept fewer observations for maximaizing features.

In [35]:
# look at final DataFrame
df.head(8)

Unnamed: 0,birthday,known_for_department,id,name,gender,biography,popularity,place_of_birth,imdb_id
0,1944-05-14 00:00:00,Directing,1,George Lucas,2.0,"George Walton Lucas Jr. (born May 14, 1944) is...",6.761,"Modesto, California, USA",nm0000184
1,1951-09-25 00:00:00,Acting,2,Mark Hamill,2.0,"Mark Richard Hamill (born September 25, 1951) ...",10.497,"Concord, California, USA",nm0000434
2,1942-07-13 00:00:00,Acting,3,Harrison Ford,2.0,Legendary Hollywood Icon Harrison Ford was bor...,20.349,"Chicago, Illinois, USA",nm0000148
3,1956-10-21 00:00:00,Acting,4,Carrie Fisher,1.0,Carrie Frances Fisher (21 October 1956 - 27 De...,7.228,"Beverly Hills, Los Angeles, California, USA",nm0000402
4,1913-05-26 00:00:00,Acting,5,Peter Cushing,2.0,"Peter Wilton Cushing, OBE (26 May 1913 – 11 A...",3.902,"Kenley, Surrey, England, UK",nm0001088
5,1946-02-21 00:00:00,Acting,6,Anthony Daniels,2.0,Anthony Daniels (born 21 February 1946) is an ...,4.822,"Salisbury, Wiltshire, England, UK",nm0000355
6,1965-12-03 00:00:00,Writing,7,Andrew Stanton,2.0,"Andrew Stanton (born December 3, 1965) is an A...",5.203,"Boston, Massachusetts, USA",nm0004056
7,1967-08-08 00:00:00,Directing,8,Lee Unkrich,2.0,Lee Unkrich is an American director and film e...,2.332,"Cleveland, Ohio, USA",nm0881279


In [36]:
# store clean data in JSON
#df.to_json('tmdb_person_1.json')