In [1]:
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True) # Suppress scientific notation where possible

import json
# from textblob import TextBlob
import pycld2 as cld2

In [2]:
import regex

In [3]:
# Widen the pandas display limits so long/wide frames can be inspected in full.
# Caution: max_columns=None removes the column cap entirely - be careful when displaying big DataFrames.

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)

## 1) These 2 Datasets might be useful later for a Collaborative Recommendation Model:

In [4]:
# Load the per-user movie ratings table.
ratings = pd.read_csv('datasets/ratings.csv')

# DataFrame.info() prints its report itself and returns None, so call it
# directly; wrapping it in print() emits a stray "None" line.
ratings.info()
print(ratings.shape)
ratings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10747027 entries, 0 to 10747026
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 328.0 MB
None
(10747027, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the rating values.
ratings['rating'].describe()

count    1.074703e+07
mean     3.530689e+00
std      1.069138e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [6]:
# Load the free-text tags that users attached to movies.
tags = pd.read_csv('datasets/tags.csv')

# DataFrame.info() prints its report itself and returns None, so call it
# directly; wrapping it in print() emits a stray "None" line.
tags.info()
print(tags.shape)
tags.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1108997 entries, 0 to 1108996
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   userId     1108997 non-null  int64 
 1   movieId    1108997 non-null  int64 
 2   tag        1108981 non-null  object
 3   timestamp  1108997 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 33.8+ MB
None
(1108997, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,14,110,epic,1443148538
1,14,110,Medieval,1443148532
2,14,260,sci-fi,1442169410
3,14,260,space action,1442169421
4,14,318,imdb top 250,1442615195


## 2) These Datasets will be the basis for a Content-Based Recommendation Model

In [7]:
# Load the movie catalogue (movieId, title, pipe-separated genres).
movies = pd.read_csv('datasets/movies.csv')

# DataFrame.info() prints its report itself and returns None, so call it
# directly; wrapping it in print() emits a stray "None" line.
movies.info()
print(movies.shape)
movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  int64 
 1   title    58098 non-null  object
 2   genres   58098 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB
None
(58098, 3)


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Load the movieId -> imdbId / tmdbId cross-reference table.
links = pd.read_csv('datasets/links.csv')

# DataFrame.info() prints its report itself and returns None, so call it
# directly; wrapping it in print() emits a stray "None" line.
links.info()
print(links.shape)
links.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  int64  
 1   imdbId   58098 non-null  int64  
 2   tmdbId   57917 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.3 MB
None
(58098, 3)


Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [9]:
# Keep only the links that actually carry a TMDB id.
links = links.dropna(subset=['tmdbId'])

# Attach the external ids to each movie, discard movies without genre
# information, and store tmdbId as an integer. Doing this as a single chain
# avoids assigning a column on a boolean-filtered slice of the frame.
movies = (
    movies[['movieId', 'genres']]
    .merge(links, on='movieId', how='inner')
    .loc[lambda df: df['genres'] != '(no genres listed)']
    .astype({'tmdbId': 'int'})
    .reset_index(drop=True)
)

# info() prints its own report and returns None, so it is not wrapped in print().
movies.info()
movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53652 entries, 0 to 53651
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  53652 non-null  int64 
 1   genres   53652 non-null  object
 2   imdbId   53652 non-null  int64 
 3   tmdbId   53652 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.6+ MB
None


Unnamed: 0,movieId,genres,imdbId,tmdbId
0,1,Adventure|Animation|Children|Comedy|Fantasy,114709,862
1,2,Adventure|Children|Fantasy,113497,8844
2,3,Comedy|Romance,113228,15602
3,4,Comedy|Drama|Romance,114885,31357
4,5,Comedy,113041,11862


In [10]:
# Load the movie details and keep only the fields used downstream, renaming
# the key column so it lines up with `tmdbId` in the movies frame.
details = (
    pd.read_json('datasets/details.json')
    .loc[:, ['id', 'title', 'overview']]
    .rename(columns={'id': 'tmdbId'})
)

# info() prints its own report and returns None, so it is not wrapped in print().
details.info()
print(details.shape)
details.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57160 entries, 0 to 57159
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tmdbId    57160 non-null  int64 
 1   title     57160 non-null  object
 2   overview  57160 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB
None
(57160, 3)


Unnamed: 0,tmdbId,title,overview
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [11]:
# Join title/overview onto the movies by TMDB id and fix the column order.
# `details` only contributes `title` and `overview`, so the merge produces no
# suffixed columns and no rename is required afterwards.
movies = (
    movies.merge(details, on='tmdbId', how='inner')
    .loc[:, ['movieId', 'tmdbId', 'imdbId', 'title', 'genres', 'overview']]
    .reset_index(drop=True)
)

# info() prints its own report and returns None, so it is not wrapped in print().
movies.info()
print(movies.shape)
movies.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53064 entries, 0 to 53063
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movieId   53064 non-null  int64 
 1   tmdbId    53064 non-null  int64 
 2   imdbId    53064 non-null  int64 
 3   title     53064 non-null  object
 4   genres    53064 non-null  object
 5   overview  53064 non-null  object
dtypes: int64(3), object(3)
memory usage: 2.4+ MB
None
(53064, 6)


Unnamed: 0,movieId,tmdbId,imdbId,title,genres,overview
0,1,862,114709,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"Led by Woody, Andy's toys live happily in his ..."
1,2,8844,113497,Jumanji,Adventure|Children|Fantasy,When siblings Judy and Peter discover an encha...
2,3,15602,113228,Grumpier Old Men,Comedy|Romance,A family wedding reignites the ancient feud be...
3,4,31357,114885,Waiting to Exhale,Comedy|Drama|Romance,"Cheated on, mistreated and stepped on, the wom..."
4,5,11862,113041,Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...


In [12]:
# Load the cast/crew ids per movie: the first four billed actors and the
# first director. Rows missing any of those ids are dropped.
PEOPLE_COLUMNS = ['id', 'actor1_id', 'actor2_id', 'actor3_id', 'actor4_id', 'director1_id']

people = (
    pd.read_json('datasets/people.json')
    .loc[:, PEOPLE_COLUMNS]
    .rename(columns={'id': 'tmdbId'})
    .dropna()
    .reset_index(drop=True)
)

# info() prints its own report and returns None, so it is not wrapped in print().
people.info()
print(people.shape)
people.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51362 entries, 0 to 51361
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   tmdbId        51362 non-null  int64  
 1   actor1_id     51362 non-null  float64
 2   actor2_id     51362 non-null  float64
 3   actor3_id     51362 non-null  float64
 4   actor4_id     51362 non-null  float64
 5   director1_id  51362 non-null  float64
dtypes: float64(5), int64(1)
memory usage: 2.4 MB
None
(51362, 6)


Unnamed: 0,tmdbId,actor1_id,actor2_id,actor3_id,actor4_id,director1_id
0,862,31.0,12898.0,7167.0,12899.0,7879.0
1,8844,2157.0,205.0,145151.0,5149.0,4945.0
2,15602,6837.0,3151.0,13567.0,16757.0,26502.0
3,31357,8851.0,9780.0,18284.0,51359.0,2178.0
4,11862,67773.0,3092.0,519.0,70696.0,56106.0


In [13]:
# Load the actor id -> name table and normalise every name into a single
# lowercase token (spaces become underscores), e.g. "Tom Hanks" -> "tom_hanks".
actors_df = (
    pd.read_json('datasets/actors.json')
    .loc[:, ['id', 'name']]
    .assign(name=lambda df: df['name'].map(lambda raw: str(raw).replace(' ', '_').lower()))
)

# info() prints its own report and returns None, so it is not wrapped in print().
actors_df.info()
print(actors_df.shape)
actors_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93346 entries, 0 to 93345
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      93344 non-null  float64
 1   name    93346 non-null  object 
dtypes: float64(1), object(1)
memory usage: 1.4+ MB
None
(93346, 2)


Unnamed: 0,id,name
0,1.0,george_lucas
1,2.0,mark_hamill
2,3.0,harrison_ford
3,4.0,carrie_fisher
4,5.0,peter_cushing


In [14]:
# Load the director id -> name table and normalise every name into a single
# lowercase token (spaces become underscores), e.g. "John Ford" -> "john_ford".
directors_df = (
    pd.read_json('datasets/directors.json')
    .loc[:, ['id', 'name']]
    .assign(name=lambda df: df['name'].map(lambda raw: str(raw).replace(' ', '_').lower()))
)

# info() prints its own report and returns None, so it is not wrapped in print().
directors_df.info()
print(directors_df.shape)
directors_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24223 entries, 0 to 24222
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      24223 non-null  int64 
 1   name    24223 non-null  object
dtypes: int64(1), object(1)
memory usage: 378.6+ KB
None
(24223, 2)


Unnamed: 0,id,name
0,1,george_lucas
1,2,mark_hamill
2,7,andrew_stanton
3,8,lee_unkrich
4,13,albert_brooks


In [15]:
# Build each id -> name lookup once instead of re-indexing the frame for
# every column that needs translating.
actor_names = actors_df.set_index('id')['name']
director_names = directors_df.set_index('id')['name']

# Translate the id columns into name columns.
# NOTE(review): Series.replace leaves values with no matching key unchanged, so
# an id missing from the lookup would presumably stay numeric here - confirm
# that every id resolves.
for slot in ('actor1', 'actor2', 'actor3', 'actor4'):
    people[slot] = people[f'{slot}_id'].replace(actor_names)
people['director'] = people['director1_id'].replace(director_names)

# Collapse the four actor columns into one list-valued column per movie.
people['actors'] = people[['actor1', 'actor2', 'actor3', 'actor4']].values.tolist()
people = people[['tmdbId', 'director', 'actors']]

# info() prints its own report and returns None, so it is not wrapped in print().
people.info()
print(people.shape)
people.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51362 entries, 0 to 51361
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   tmdbId    51362 non-null  int64 
 1   director  51362 non-null  object
 2   actors    51362 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB
None
(51362, 3)


Unnamed: 0,tmdbId,director,actors
0,862,john_lasseter,"[tom_hanks, tim_allen, don_rickles, jim_varney]"
1,8844,joe_johnston,"[robin_williams, kirsten_dunst, bradley_pierce..."
2,15602,howard_deutch,"[walter_matthau, jack_lemmon, ann-margret, sop..."
3,31357,forest_whitaker,"[whitney_houston, angela_bassett, loretta_devi..."
4,11862,charles_shyer,"[steve_martin, diane_keaton, martin_short, kim..."


In [16]:
# Per-column check: True would mean the column still contains missing values.
people.isna().any()

tmdbId      False
director    False
actors      False
dtype: bool

In [17]:
# Per-column check: True would mean the column still contains missing values.
movies.isna().any()

movieId     False
tmdbId      False
imdbId      False
title       False
genres      False
overview    False
dtype: bool

In [18]:
# Attach director/actors to every movie. reset_index returns a new frame, so
# its result has to be assigned - a bare `movies.reset_index(drop=True)`
# statement changes nothing.
movies = movies.merge(people, on='tmdbId').reset_index(drop=True)

# Turn the pipe-separated genre string into a list of genre names.
movies['genres'] = movies['genres'].map(lambda raw: str(raw).split('|'))

# info() prints its own report and returns None, so it is not wrapped in print().
movies.info()
print(movies.shape)
movies.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48267 entries, 0 to 48266
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movieId   48267 non-null  int64 
 1   tmdbId    48267 non-null  int64 
 2   imdbId    48267 non-null  int64 
 3   title     48267 non-null  object
 4   genres    48267 non-null  object
 5   overview  48267 non-null  object
 6   director  48267 non-null  object
 7   actors    48267 non-null  object
dtypes: int64(3), object(5)
memory usage: 3.3+ MB
None
(48267, 8)


Unnamed: 0,movieId,tmdbId,imdbId,title,genres,overview,director,actors
0,1,862,114709,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]","Led by Woody, Andy's toys live happily in his ...",john_lasseter,"[tom_hanks, tim_allen, don_rickles, jim_varney]"
1,2,8844,113497,Jumanji,"[Adventure, Children, Fantasy]",When siblings Judy and Peter discover an encha...,joe_johnston,"[robin_williams, kirsten_dunst, bradley_pierce..."
2,3,15602,113228,Grumpier Old Men,"[Comedy, Romance]",A family wedding reignites the ancient feud be...,howard_deutch,"[walter_matthau, jack_lemmon, ann-margret, sop..."
3,4,31357,114885,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",forest_whitaker,"[whitney_houston, angela_bassett, loretta_devi..."
4,5,11862,113041,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,charles_shyer,"[steve_martin, diane_keaton, martin_short, kim..."


### Need to establish the minimum length of a valid/non-trivial 'overview'

In [20]:
# How many movies have an empty overview?
movies.loc[movies['overview'].str.len() == 0].shape

(403, 8)

In [21]:
# Shortest non-empty overview length.
movies['overview'].str.len().loc[lambda n: n != 0].min()

11

In [22]:
# Overviews that are exactly 11 characters long.
movies.loc[movies['overview'].str.len() == 11, 'overview']

46391    Documentary
Name: overview, dtype: object

In [23]:
# Next-shortest overview length above 11 characters.
movies['overview'].str.len().loc[lambda n: n > 11].min()

13

In [24]:
# Overviews that are exactly 13 characters long.
movies.loc[movies['overview'].str.len() == 13, 'overview']

708      German Comedy
28758    A heist movie
34931    Eurospy movie
Name: overview, dtype: object

In [25]:
# Next-shortest overview length above 13 characters.
movies['overview'].str.len().loc[lambda n: n > 13].min()

14

In [26]:
# Overviews that are exactly 14 characters long.
movies.loc[movies['overview'].str.len() == 14, 'overview']

33847    Italian comedy
Name: overview, dtype: object

In [27]:
# Next-shortest overview length above 14 characters.
movies['overview'].str.len().loc[lambda n: n > 14].min()

16

In [28]:
# Overviews that are exactly 16 characters long.
movies.loc[movies['overview'].str.len() == 16, 'overview']

17264    Return of Django
20515    No overview yet.
Name: overview, dtype: object

In [29]:
# Next-shortest overview length above 16 characters.
movies['overview'].str.len().loc[lambda n: n > 16].min()

17

In [30]:
# Overviews that are exactly 17 characters long.
movies.loc[movies['overview'].str.len() == 17, 'overview']

44556    Independent movie
45971    Never at the sea.
Name: overview, dtype: object

In [31]:
# Next-shortest overview length above 17 characters.
movies['overview'].str.len().loc[lambda n: n > 17].min()

18

In [32]:
# Overviews that are exactly 18 characters long.
movies.loc[movies['overview'].str.len() == 18, 'overview']

104      No overview found.
947      No overview found.
12682    No overview found.
12759    No overview found.
15826    No overview found.
16302    No overview found.
17944    Film by Juha Rosma
19149    No overview found.
19817    No overview found.
20251    No overview found.
22587    No overview found.
23033    No overview found.
24181    No overview found.
24817    No overview found.
26746    No overview found.
27928    No overview found.
27954    No overview found.
28493    No overview found.
29707    No overview found.
30260    No overview found.
31884    No overview found.
34157    No overview found.
34167    No overview found.
34252    No overview found.
35071    No overview found.
35266    No overview found.
35482    No overview found.
35654    No overview found.
35656    No overview found.
35658    No overview found.
35661    No overview found.
36220    No overview found.
36222    No overview found.
37318    No overview found.
37800    No overview found.
38675    No overview

In [33]:
# Shortest overview length above 24 characters.
movies['overview'].str.len().loc[lambda n: n > 24].min()

25

In [34]:
# Overviews that are exactly 25 characters long.
movies.loc[movies['overview'].str.len() == 25, 'overview']

17585    A film by Juho Kuosmanen.
21175    Finnish soft erotic movie
34185    Directed by  Özcan Deniz.
37157    Prohibition-era thriller.
38179    Fourth Break Blade Movie.
Name: overview, dtype: object

In [35]:
# How many movies would remain if only overviews longer than 25 characters were kept?
movies.loc[movies['overview'].str.len() > 25].shape

(47794, 8)

In [36]:
# Eyeball the overviews that are 26 to 50 characters long (both ends inclusive).
movies.loc[movies['overview'].str.len().between(26, 50), 'overview'].values

array(['A comedy about a couple who cannot conceive a baby',
       'A surfer becomes the head of a major company.',
       'The story of Johann Strauss the elder and younger.',
       'Interracial love story set in Detroit.',
       'An alien creature stalks human prey.',
       'Domestic robots fall in love and run off together.',
       'A young boy trades the family cow for magic beans.',
       'A criminal holds a wealthy family hostage.',
       "Documentary on the director's meeting with Castro.",
       'A black female TV producer struggles in Hollywood.',
       'A mysterious girl inspires a struggling artist.',
       'An extremely nice guy falls for a really bad girl',
       "A simple funeral turns a man's world upside down.",
       'A con man tries to blackmail a Mexican gangster.',
       'Giant mutant rabbits terrorize the southwest!!',
       'A safecracker turns double agent during WWII.',
       'Vampires terrorize a city in Norrbotten, Sweden.',
       'A 15-episode

### Looks like >25 characters is a good floor for the minimum length of meaningful plot summaries

In [37]:
# Keep only the movies whose overview is longer than 25 characters, then
# confirm the new row count and the shortest remaining overview length.
movies = movies.loc[movies['overview'].str.len() > 25].reset_index(drop=True)
print(movies.shape)
movies['overview'].str.len().min()

(47794, 8)


26

### Need to detect and remove rows where the 'overview' is not in English

#### The workaround helper below strips "bad" UTF-8 characters that cause pycld2 to crash; it comes from this issue thread:
#### https://github.com/aboSamoor/polyglot/issues/71

In [39]:
# Matches any code point in the Unicode "control" (Cc) or "surrogate" (Cs)
# general categories.
RE_BAD_CHARS = regex.compile(r"\p{Cc}|\p{Cs}")


def remove_bad_chars(text):
    """Return ``text`` with every control/surrogate code point removed."""
    cleaned = RE_BAD_CHARS.sub("", text)
    return cleaned


# Sanity check: "\x96" belongs to the Cc category, so it should be stripped.
remove_bad_chars("A\x96 bad char")

'A bad char'

In [40]:
def lang_det(x):
    """Return the name of the first language pycld2 reports for the text ``x``.

    The text is passed through ``remove_bad_chars`` before detection.
    """
    cleaned = remove_bad_chars(x)
    detection = cld2.detect(cleaned, returnVectors=True)
    # The third element holds the per-language detection details; the first
    # field of its first entry is the language name.
    # NOTE(review): presumably entries are ordered best match first - confirm
    # against the pycld2 docs.
    first_match = detection[2][0]
    return first_match[0]

movies['overview_lang'] = movies['overview'].apply(lang_det)

In [41]:
# Tally how many overviews were assigned to each detected language.
movies['overview_lang'].value_counts()

ENGLISH       47708
FRENCH           28
Unknown          15
SPANISH           6
ITALIAN           5
POLISH            5
DUTCH             5
RUSSIAN           4
TURKISH           4
CZECH             3
FINNISH           3
SWEDISH           2
GERMAN            2
HUNGARIAN         1
PORTUGUESE        1
GREEK             1
Japanese          1
Name: overview_lang, dtype: int64

In [42]:
# Inspect the rows whose overview language could not be determined.
movies.loc[movies['overview_lang'].eq('Unknown')]

Unnamed: 0,movieId,tmdbId,imdbId,title,genres,overview,director,actors,overview_lang
1776,1950,10633,61811,In the Heat of the Night,"[Drama, Mystery]",An African American detective is asked to inve...,norman_jewison,"[sidney_poitier, rod_steiger, warren_oates, le...",Unknown
8081,9011,32558,40705,Portrait of Jennie,"[Drama, Fantasy, Mystery, Romance]",A mysterious girl inspires a struggling artist.,william_dieterle,"[jennifer_jones, joseph_cotten, ethel_barrymor...",Unknown
10586,44663,31289,429727,The Caiman,"[Comedy, Drama, Romance]",A skewering of Italian Prime Minister Silvio B...,nanni_moretti,"[silvio_orlando, margherita_buy, jasmine_trinc...",Unknown
11406,52224,297853,250790,Turn of Faith,"[Crime, Drama]","Mob drama starring Ray 'Boom Boom' Mancini, Mi...",charles_jarrott,"[ray_mancini, mia_sara, costas_mandylor, alan_...",Unknown
11792,55462,13319,454457,Frostbitten,"[Comedy, Horror]","Vampires terrorize a city in Norrbotten, Sweden.",anders_banke,"[petra_nielsen, carl-åke_eriksson, grete_havne...",Unknown
16442,86902,63858,45492,All I Desire,"[Drama, Romance]","In 1910, a wayward mother re-visits the family...",douglas_sirk,"[barbara_stanwyck, richard_carlson, lyle_bettg...",Unknown
26044,124803,157343,24090,Hard to Handle,"[Comedy, Romance]",A hustling public relations man promotes a ser...,mervyn_leroy,"[james_cagney, mary_brian, allen_jenkins, ruth...",Unknown
28633,133622,271561,52638,The Blue Angel,"[Drama, Romance]",Remake of Josef von Sternberg's 1930 classic.,edward_dmytryk,"[curd_jürgens, may_britt, theodore_bikel, john...",Unknown
29002,134710,279960,3883282,I Am Road Comic,[Documentary],"Interviews with T.J. Miller, Pete Holmes, Marc...",jordan_brady,"[maria_bamford, w._kamau_bell, wayne_federman,...",Unknown
31884,143609,262976,54304,Blood Feud,"[Crime, Drama]",A drama directed by Damiano Damiani.,damiano_damiani,"[belinda_lee, sylva_koscina, sergio_fantoni, a...",Unknown


#### Despite being labelled 'Unknown', the 15 movies above appear to have English overviews, so keep them together with the 'ENGLISH' rows and drop only the entries detected as another language:

In [43]:
# Keep the rows detected as English or left undetermined, then discard the
# helper language column and renumber the rows.
movies = (
    movies.loc[movies['overview_lang'].isin(['ENGLISH', 'Unknown'])]
    .drop(columns='overview_lang')
    .reset_index(drop=True)
)

print(movies.shape)
movies.head()

(47723, 8)


Unnamed: 0,movieId,tmdbId,imdbId,title,genres,overview,director,actors
0,1,862,114709,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]","Led by Woody, Andy's toys live happily in his ...",john_lasseter,"[tom_hanks, tim_allen, don_rickles, jim_varney]"
1,2,8844,113497,Jumanji,"[Adventure, Children, Fantasy]",When siblings Judy and Peter discover an encha...,joe_johnston,"[robin_williams, kirsten_dunst, bradley_pierce..."
2,3,15602,113228,Grumpier Old Men,"[Comedy, Romance]",A family wedding reignites the ancient feud be...,howard_deutch,"[walter_matthau, jack_lemmon, ann-margret, sop..."
3,4,31357,114885,Waiting to Exhale,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",forest_whitaker,"[whitney_houston, angela_bassett, loretta_devi..."
4,5,11862,113041,Father of the Bride Part II,[Comedy],Just when George Banks has recovered from his ...,charles_shyer,"[steve_martin, diane_keaton, martin_short, kim..."


### Save this dataset to file:

In [45]:
# Persist the cleaned frame; index=True also writes the row index as a leading column.
# NOTE(review): `genres` and `actors` hold Python lists, which CSV presumably stores as
# their string representation - confirm how downstream readers parse them back.
movies.to_csv('final_dataset.csv', index=True)

In [46]:
# Final schema check of the saved frame (row count, dtypes, non-null counts).
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47723 entries, 0 to 47722
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movieId   47723 non-null  int64 
 1   tmdbId    47723 non-null  int64 
 2   imdbId    47723 non-null  int64 
 3   title     47723 non-null  object
 4   genres    47723 non-null  object
 5   overview  47723 non-null  object
 6   director  47723 non-null  object
 7   actors    47723 non-null  object
dtypes: int64(3), object(5)
memory usage: 2.9+ MB


In [47]:
# Number of movies per director value, most frequent first.
movies['director'].value_counts()

michael_curtiz      72
john_ford           65
richard_thorpe      53
alfred_hitchcock    53
henry_hathaway      50
                    ..
filip_tegstedt       1
walter_boholst       1
roger_holzberg       1
wayne_holloway       1
rodrigo_salomón      1
Name: director, Length: 18010, dtype: int64