In [617]:
from ast import literal_eval
import pandas as pd

In [618]:
# Load only the two columns used in this notebook; a movie can carry several genres.
df = pd.read_csv(
    "dataset/movies_metadata.csv",
    usecols=['genres', 'overview'],
)


def _genre_names(raw):
    """Parse the stringified genre entries and return their names, sorted."""
    return sorted(entry['name'] for entry in literal_eval(raw))


# Replace the raw genres string with a sorted list of genre names.
df['genres'] = df['genres'].apply(_genre_names)
df.head()

Unnamed: 0,genres,overview
0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ..."
1,"[Adventure, Family, Fantasy]",When siblings Judy and Peter discover an encha...
2,"[Comedy, Romance]",A family wedding reignites the ancient feud be...
3,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom..."
4,[Comedy],Just when George Banks has recovered from his ...


In [619]:
# Report the dataset size before any cleaning step is applied.
n_rows = len(df)
print(f"There are {n_rows} rows in the dataset.")

There are 45466 rows in the dataset.


In [620]:
# Lists are not hashable, so keep a space-joined string version of each genre list
# for the cleaning steps that need a hashable key.
df['genres_str'] = df['genres'].map(lambda names: " ".join(map(str, names)))
df

Unnamed: 0,genres,overview,genres_str
0,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family
1,"[Adventure, Family, Fantasy]",When siblings Judy and Peter discover an encha...,Adventure Family Fantasy
2,"[Comedy, Romance]",A family wedding reignites the ancient feud be...,Comedy Romance
3,"[Comedy, Drama, Romance]","Cheated on, mistreated and stepped on, the wom...",Comedy Drama Romance
4,[Comedy],Just when George Banks has recovered from his ...,Comedy
...,...,...,...
45461,"[Drama, Family]",Rising and falling between a man and woman.,Drama Family
45462,[Drama],An artist struggles to finish his work while a...,Drama
45463,"[Action, Drama, Thriller]","When one of her hits goes wrong, a professiona...",Action Drama Thriller
45464,[],"In a small town live two brothers, one a minis...",


In [621]:
# Count the rows whose overview is missing, then drop them and renumber the index.
total_no_overview = df['overview'].isna().sum()
print(f"There are {total_no_overview} movies with no overview.")

df = df[df['overview'].notna()].reset_index(drop=True)
print(f"There are {len(df)} rows in the dataset after removing movies with no overview.")

There are 954 movies with no overview.
There are 44512 rows in the dataset after removing movies with no overview.


In [622]:
# Count movies whose genre list is empty
genres_per_movie = df['genres'].map(len)
total_no_label = int((genres_per_movie == 0).sum())
print(f"There are {total_no_label} movies with no genres.")

# Keep only the movies that have at least one genre
df = df[genres_per_movie > 0]
print(f"There are {len(df)} rows in the dataset after removing movies with no genres.")

There are 2185 movies with no genres.
There are 42327 rows in the dataset after removing movies with no genres.


In [623]:
# Rows that share both the overview text and the genre combination are exact duplicates:
# count them, then keep only the first occurrence of each.
# (Overviews that repeat with *different* labels are not counted here.)
duplicate_key = ["overview", "genres_str"]
total_exact_duplicates = int(df.duplicated(subset=duplicate_key).sum())
print(f"There are {total_exact_duplicates} rows that duplicate both the overview and the labels of an earlier row")
df = df.drop_duplicates(subset=duplicate_key)
print(f"There are {len(df)} rows in the dataset without duplicates.")

There are 113 duplicate overviews and labels, which means that some duplicate overviews have different labels
There are 42214 rows in the dataset without duplicates.


In [624]:
# Flag overviews that repeat an earlier row's overview text (first occurrences are not flagged).
repeated_overview = df["overview"].duplicated()
print(f'There are still {sum(repeated_overview)} duplicate overviews which means that some duplicate overviews have different labels')

# Tally the repeated texts; each count covers only the second and later occurrences.
print('Most recurring duplicates')
(
    df.loc[repeated_overview, "overview"]
    .value_counts()
    .rename_axis('unique_values')
    .reset_index(name='counts')
)

There are still 65 duplicate overviews which means that some duplicate overviews have different labels
Most recurring duplicates


Unnamed: 0,unique_values,counts
0,No overview found.,46
1,No movie overview available.,2
2,Released,2
3,,2
4,"Poor but happy, young Nello and his grandfathe...",1
5,"With friends like these, who needs enemies? Th...",1
6,When four women move into an old house left by...,1
7,No Overview,1
8,A Russian engineer Petr Garin possesses a uniq...,1
9,"A group of travelers, including a monk, stay i...",1


In [625]:
# Drop every copy of an overview that still appears more than once
# (keep=False marks all occurrences, not just the later ones).
conflicting_overview = df["overview"].duplicated(keep=False)
print(f'There are {sum(conflicting_overview)} total duplicate overviews with different labels')

df = df[~conflicting_overview]
print(f"There are {len(df)} rows in the dataset without duplicate overviews with different labels.")

There are 82 total duplicate overviews with different labels
There are 42132 rows in the dataset without duplicate overviews with different labels.


In [626]:
# Shortest overviews ordered by character count (first 30 distinct texts)
by_char_length = df.sort_values(by="overview", key=lambda col: col.str.len())
pd.DataFrame(by_char_length["overview"].unique()).head(30)

Unnamed: 0,0
0,x
1,...
2,Documentary
3,French Film
4,Feature film.
5,German Comedy
6,Not Available
7,japanese movie
8,Italian comedy
9,Netflix Special


In [627]:
# Shortest overviews ordered by whitespace-separated word count (first 30 distinct texts)
by_word_count = df.sort_values(by="overview", key=lambda col: col.str.split().apply(len))
pd.DataFrame(by_word_count["overview"].unique()).head(30)

Unnamed: 0,0
0,...
1,x
2,Documentary
3,Netflix Special
4,japanese movie
5,Not Available
6,Italian comedy
7,Feature film.
8,French Film
9,Finnish Documentary


In [628]:
# Based on the tables above, drop overviews that are too short to be informative.
MAX_SHORT_WORDS = 3   # overviews with this many words or fewer are dropped
MAX_SHORT_CHARS = 20  # overviews with this many characters or fewer are dropped

# Build the mask once and reuse it for both the count and the filter.
# Words are counted with a whitespace split (str.split() without a separator) so that
# repeated spaces, tabs and newlines do not distort the count; this matches the word
# count used when listing the shortest overviews by word count.
word_counts = df["overview"].str.split().str.len()
char_counts = df["overview"].str.len()
is_short = (word_counts <= MAX_SHORT_WORDS) | (char_counts <= MAX_SHORT_CHARS)

total_short = int(is_short.sum())
print(f"There are {total_short} overviews with <= {MAX_SHORT_WORDS} words or <= {MAX_SHORT_CHARS} characters.")

df = df[~is_short]
print(f"There are {len(df)} rows in the dataset after removing movies with short overviews.")

There are 25 overviews with <= 3 words or <= 20 characters.
There are 42107 rows in the dataset after removing movies with short overviews.


In [629]:
# Frequency of every genre combination
combo_counts = df["genres_str"].value_counts()

# Some combinations are seen only a single time
total_single_occurence_genres_combations = int((combo_counts == 1).sum())
print(f"There are {total_single_occurence_genres_combations} genres combinations that occur only once")

# Number of distinct genre combinations
total_unique_genres_combinations = df["genres_str"].nunique()
print(f"There are {total_unique_genres_combinations} unique genres combinations")

There are 847 genres combinations that occur only once
There are 1908 unique genres combinations


In [630]:
# 847 of the 42107 rows carry a genre combination that occurs only once. For simplicity
# those rows are dropped so the train/test split can be stratified on the genre combination.
# A more thorough alternative is multi-label data stratification (http://scikit.ml/stratification.html).
# duplicated(keep=False) is True for every row whose combination appears more than once.
df = df[df["genres_str"].duplicated(keep=False)]

In [631]:
# Binarize the genre lists to see how often each individual genre occurs
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(df['genres'])
classes = mlb.classes_

# One indicator column per genre; the column sums give the per-genre frequencies
label_df = pd.DataFrame(data=labels, columns=classes)
labels_count = label_df.sum(axis=0).sort_values(ascending=False)
labels_count

Drama              19540
Comedy             12424
Thriller            7293
Romance             6417
Action              6217
Horror              4481
Crime               4085
Documentary         3811
Adventure           3203
Science Fiction     2801
Family              2548
Mystery             2255
Fantasy             2054
Animation           1745
Music               1473
Foreign             1435
History             1273
War                 1228
Western              946
TV Movie             646
dtype: int64

In [632]:
# Train and test split
from sklearn.model_selection import train_test_split

test_split = 0.1
# Fixed seed so that re-running the notebook reproduces exactly the same split.
split_seed = 42

train_df, test_df = train_test_split(
    df,
    test_size=test_split,
    # Stratify on the genre-combination string.
    stratify=df["genres_str"].values,
    random_state=split_seed,
)

In [633]:
# Occurrence count of each genre combination as a two-column frame
vdf = (
    df["genres_str"]
    .value_counts()
    .rename_axis('unique_values')
    .reset_index(name='counts')
)

In [634]:
# Genre combinations that occur exactly twice
vdf.loc[vdf['counts'] == 2, 'unique_values'].to_list()

['Adventure Comedy Crime Science Fiction',
 'Action Horror Science Fiction TV Movie',
 'Action Adventure Fantasy Foreign Science Fiction',
 'Adventure Drama Science Fiction Thriller',
 'Drama Fantasy Music',
 'Action Comedy Crime Horror',
 'Action Crime Drama History',
 'Action Adventure Fantasy Horror Science Fiction',
 'Drama TV Movie Western',
 'Comedy Drama Family Fantasy Romance',
 'Action Foreign Horror Science Fiction',
 'Family Music TV Movie',
 'Adventure Drama Horror Thriller',
 'Crime Drama Romance Western',
 'Action Comedy Drama War',
 'Comedy Documentary History',
 'History Thriller',
 'Action Drama Fantasy Horror Thriller',
 'Comedy Fantasy Horror Thriller',
 'Animation Documentary History',
 'Adventure Crime Drama Mystery Thriller',
 'Action Animation Crime Thriller',
 'Action Animation Romance',
 'Crime Drama Horror Mystery',
 'Comedy Family Romance Science Fiction',
 'Comedy Drama Fantasy Thriller',
 'Adventure Comedy Thriller',
 'Action Adventure Animation Comedy Scie

In [635]:
# Number of rows in each split whose genre combination occurs exactly `cnt` times overall
cnt = 2
combos_with_cnt = vdf.loc[vdf['counts'] == cnt, 'unique_values'].to_list()
print(len(train_df[train_df["genres_str"].isin(combos_with_cnt)]))
print(len(test_df[test_df["genres_str"].isin(combos_with_cnt)]))

536
0


In [636]:
# Check whether any genre combination occurs only once in the train data;
# if none does, the validation split can still be stratified on it.
train_combo_counts = train_df["genres_str"].value_counts()
total_single_occurence_genres_combations = int((train_combo_counts == 1).sum())
print(f"There are {total_single_occurence_genres_combations} genres combinations that occur only once")

# Number of distinct genre combinations in the train data
total_unique_genres_combinations = train_df["genres_str"].nunique()
print(f"There are {total_unique_genres_combinations} unique genres combinations")

There are 0 genres combinations that occur only once
There are 1061 unique genres combinations


In [637]:
# Persist both splits. to_csv is called with its defaults, so the index is written
# as the first (unnamed) column of each file.
for split_frame, csv_path in (
    (train_df, "dataset/train_data.csv"),
    (test_df, "dataset/test_data.csv"),
):
    split_frame.to_csv(csv_path)