<div style="text-align: center;">
<h1>Reel Realities: How Gender and Age Shape Success Across Box Office and Streaming Platforms</h1>
</div>

### <u>Imports</u>

In [55]:
import pandas as pd
import matplotlib.pyplot as plt
import ast
import re
import numpy as np

### 1. <u>Data cleaning and pre-processing</u>

#### 1.1 CMU Dataset

We will use three files of the CMU dataset:
1. "plot_summaries.txt" gives us the plots of the movies.
2. "movie.metadata.tsv" gives us information about the languages, countries, and genres of the movies.
3. "character.metadata.tsv" gives us 

In [56]:
plot_summaries_df = pd.read_csv("data/CMU/plot_summaries.txt", delimiter="\t", names = ["Wikipedia_movie_ID", "Plot Summaries"])

print(f"The plot summaries dataframe has {len(plot_summaries_df):,} values.")
plot_summaries_df.head()

The plot summaries dataframe has 42,303 values.


Unnamed: 0,Wikipedia_movie_ID,Plot Summaries
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [57]:
movie_metadata_df = pd.read_csv("data/CMU/movie.metadata.tsv", delimiter='\t', names = ["Wikipedia_movie_ID","Freebase_movie_ID","Movie_name","Movie_release_date","Movie_box_office_revenue","Movie_runtime","Movie_languages","Movie_countries","Movie_genres"])
print(f"The movie metadata dataframe has {len(movie_metadata_df):,} values.")
movie_metadata_df.head()

The movie metadata dataframe has 81,741 values.


Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [58]:
movie_metadata_df["Movie_name"] = (movie_metadata_df["Movie_name"]
                    .str.strip()  #Remove leading/trailing whitespace
                    .str.lower()  #Convert to lowercase
                    .replace("", np.nan)  #Replace empty strings with NaN
                   )

In [59]:
#Movie languages, countries, and genres are formatted as a dictionary, but they are actually a string. We convert them to a list.
movie_metadata_df["Movie_languages"] = movie_metadata_df["Movie_languages"].apply(ast.literal_eval)
movie_metadata_df["Movie_languages"] = movie_metadata_df["Movie_languages"].apply(lambda x: list(x.values()))

movie_metadata_df["Movie_countries"] = movie_metadata_df["Movie_countries"].apply(ast.literal_eval)
movie_metadata_df["Movie_countries"] = movie_metadata_df["Movie_countries"].apply(lambda x: list(x.values()))

movie_metadata_df["Movie_genres"] = movie_metadata_df["Movie_genres"].apply(ast.literal_eval)
movie_metadata_df["Movie_genres"] = movie_metadata_df["Movie_genres"].apply(lambda x: list(x.values()))

In [60]:
#Cleaning
def clean_string_list(lst):
    return [s.strip().lower() if isinstance(s, str) and s.strip() != "" else np.nan for s in lst]

movie_metadata_df["Movie_languages"] = movie_metadata_df["Movie_languages"].apply(clean_string_list)
movie_metadata_df["Movie_countries"] = movie_metadata_df["Movie_countries"].apply(clean_string_list)
movie_metadata_df["Movie_genres"] = movie_metadata_df["Movie_genres"].apply(clean_string_list)

In [61]:
movie_metadata_df.head()

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,975900,/m/03vyhn,ghosts of mars,2001-08-24,14010832.0,98.0,[english language],[united states of america],"[thriller, science fiction, horror, adventure,..."
1,3196793,/m/08yl5d,getting away with murder: the jonbenét ramsey ...,2000-02-16,,95.0,[english language],[united states of america],"[mystery, biographical film, drama, crime drama]"
2,28463795,/m/0crgdbh,brun bitter,1988,,83.0,[norwegian language],[norway],"[crime fiction, drama]"
3,9363483,/m/0285_cd,white of the eye,1987,,110.0,[english language],[united kingdom],"[thriller, erotic thriller, psychological thri..."
4,261236,/m/01mrr1,a woman in flames,1983,,106.0,[german language],[germany],[drama]


In [62]:
#There are inconsistencies in the date format: some rows have the date as year only, others as year-month, and some as year-month-day. We convert them all to the year format only since the month and day are not relevant to our analysis.
full_date_pattern = r'^\d{4}-\d{2}-\d{2}$' #Matches YYYY-MM-DD
year_month_pattern = r'^\d{4}-\d{2}$' #Matches YYYY-MM
year_only_pattern = r'^\d{4}$' #Matches YYYY

def identify_pattern(date):
    if pd.isna(date):
        return "Missing"
    elif re.match(full_date_pattern, date):
        return "Full Date (YYYY-MM-DD)"
    elif re.match(year_month_pattern, date):
        return "Year & Month Date (YYYY-MM)"
    elif re.match(year_only_pattern, date):
        return "Year Only (YYYY)"
    else:
        return "Other"

movie_metadata_df["Pattern"] = movie_metadata_df["Movie_release_date"].apply(identify_pattern)
pattern_summary = movie_metadata_df.groupby("Pattern").size().reset_index(name="Count")

print("Pattern Summary:")
print(pattern_summary)

Pattern Summary:
                       Pattern  Count
0       Full Date (YYYY-MM-DD)  39373
1                      Missing   6902
2  Year & Month Date (YYYY-MM)   3294
3             Year Only (YYYY)  32172


In [63]:
movie_metadata_df["Movie_release_date"] = movie_metadata_df["Movie_release_date"].apply(lambda x: str(x)[:4] if pd.notnull(x) else None)
movie_metadata_df = movie_metadata_df.drop(columns=["Pattern"])

In [64]:
n_null_movie_metadata = ((movie_metadata_df.isnull().sum() / len(movie_metadata_df)) * 100).apply(lambda x: f"{x:,.2f}%")

print(
    f"Percentage of null values per column:\n"
    f"movie_metadata_df:\n{n_null_movie_metadata}"
)

Percentage of null values per column:
movie_metadata_df:
Wikipedia_movie_ID           0.00%
Freebase_movie_ID            0.00%
Movie_name                   0.00%
Movie_release_date           8.44%
Movie_box_office_revenue    89.72%
Movie_runtime               25.02%
Movie_languages              0.00%
Movie_countries              0.00%
Movie_genres                 0.00%
dtype: object


In [65]:
movie_metadata_df.head()

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres
0,975900,/m/03vyhn,ghosts of mars,2001,14010832.0,98.0,[english language],[united states of america],"[thriller, science fiction, horror, adventure,..."
1,3196793,/m/08yl5d,getting away with murder: the jonbenét ramsey ...,2000,,95.0,[english language],[united states of america],"[mystery, biographical film, drama, crime drama]"
2,28463795,/m/0crgdbh,brun bitter,1988,,83.0,[norwegian language],[norway],"[crime fiction, drama]"
3,9363483,/m/0285_cd,white of the eye,1987,,110.0,[english language],[united kingdom],"[thriller, erotic thriller, psychological thri..."
4,261236,/m/01mrr1,a woman in flames,1983,,106.0,[german language],[germany],[drama]


In [66]:
character_metadata = pd.read_csv("./data/CMU/character.metadata.tsv", delimiter="\t", header = None)
character_metadata.columns = [
    'Wikipedia_movie_ID', 
    'Freebase_movie_ID', 
    'Movie_release_date', 
    'Character_name', 
    'Actor_date_of_birth', 
    'Actor_gender', 
    'Actor_height', 
    'Actor_ethnicity', 
    'Actor_name', 
    'Actor_age_at_movie_release', 
    'Freebase_character_actor_map_ID', 
    'Freebase_character_ID', 
    'Freebase_actor_ID'
]

In [67]:
character_metadata['Actor_name']= character_metadata['Actor_name'].fillna('unknown')
character_metadata["Actor_name"] = character_metadata["Actor_name"].str.lower()

# Convert 'Movie_release_date' and 'Actor_date_of_birth' columns to datetime and display only the date part
# ensures that the date columns are correctly interpreted as datetime objects in pandas. 
character_metadata['Movie_release_date'] = pd.to_datetime(character_metadata['Movie_release_date'], errors='coerce').dt.date
character_metadata['Actor_date_of_birth'] = pd.to_datetime(character_metadata['Actor_date_of_birth'], errors='coerce').dt.date

# Drop rows where 'Movie_release_date' or 'Actor_date_of_birth' is NaT
character_metadata = character_metadata.dropna(subset=['Movie_release_date', 'Actor_date_of_birth'])

# Calculate Actor_age_at_movie_release if missing and data is available
character_metadata['Actor_age_at_movie_release'] = character_metadata.apply(
    lambda row: (row['Movie_release_date'] - row['Actor_date_of_birth']).days // 365 if pd.isnull(row['Actor_age_at_movie_release']) else row['Actor_age_at_movie_release'],
    axis=1
)

#Drop unnecessary columns , heere the actors' heights
character_metadata.drop(columns=['Actor_height', 'Actor_ethnicity', 'Character_name'], inplace=True)


In [68]:
print(f"Movies without actor gender data: {character_metadata['Actor_gender'].isnull().sum()}") 

Movies without actor gender data: 892


We therefore drop the mvies with no information the gender of the actors.

In [69]:
character_metadata.dropna(subset=['Actor_gender'])

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_release_date,Actor_date_of_birth,Actor_gender,Actor_name,Actor_age_at_movie_release,Freebase_character_actor_map_ID,Freebase_character_ID,Freebase_actor_ID
0,975900,/m/03vyhn,2001-08-24,1958-08-26,F,wanda de jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,1974-08-15,F,natasha henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,1969-06-15,M,ice cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,1967-09-12,M,jason statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,1977-09-25,F,clea duvall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg
...,...,...,...,...,...,...,...,...,...,...
450654,913762,/m/03pcrp,1992-05-21,1966-12-20,F,yumi tōma,25.0,/m/0kr40cd,/m/0kr40ch,/m/08g3fb
450655,913762,/m/03pcrp,1992-05-21,1939-10-02,M,yoshisada sakaguchi,52.0,/m/0kr406x,/m/0kr406_,/m/04gq373
450657,913762,/m/03pcrp,1992-05-21,1960-04-28,M,steven blum,32.0,/m/0kr40d9,/m/0kr40df,/m/044_7j
450658,913762,/m/03pcrp,1992-05-21,1960-04-28,M,steven blum,32.0,/m/0kr408g,/m/0kr408l,/m/044_7j


Now, let's groupe the actors together by creating a *Cast* column on the *movie_metadata_df*. 

In [70]:
cast_per_movie = character_metadata.groupby('Freebase_movie_ID')['Actor_name'].apply(lambda x: ', '.join(x)).reset_index()
cast_per_movie = cast_per_movie.rename(columns={'Actor_name': 'Cast'})

movie_metadata_df = movie_metadata_df.merge(cast_per_movie, on='Freebase_movie_ID', how='left')

In [71]:
missing_cast_count = movie_metadata_df["Cast"].isna().sum()
total_movies = len(movie_metadata_df)
print(f"Movies without actor data: {missing_cast_count} out of {total_movies} ({missing_cast_count / total_movies * 100:.2f}%)")

Movies without actor data: 48518 out of 81741 (59.36%)


We decide to drop the movies without actor data since it is one of the most important data of ur study. 

In [72]:
movie_metadata_df = movie_metadata_df.dropna(subset = ['Cast'])

In [73]:
# Create columns for male, female actor counts
actor_counts = character_metadata.groupby('Freebase_movie_ID')['Actor_gender'].value_counts().unstack(fill_value=0)
actor_counts = actor_counts.rename(columns={'M': 'Male_actors', 'F': 'Female_actors'})

# Ensure all columns are present (set missing columns to 0 if necessary)
for col in ['Male_actors', 'Female_actors']:
    if col not in actor_counts.columns:
        actor_counts[col] = 0

actor_counts = actor_counts.reset_index()

# Merge actor counts with the movie_metadata_df DataFrame
movie_metadata_df = movie_metadata_df.merge(actor_counts, on='Freebase_movie_ID', how='left')

# Fill NaN values with 0 
movie_metadata_df[['Male_actors', 'Female_actors']] = movie_metadata_df[['Male_actors', 'Female_actors']].fillna(0)

In [74]:
# Calculate the percentage of female actors in each movie and round to two decimal places
movie_metadata_df['Female_actor_percentage'] = (
    (movie_metadata_df['Female_actors'] / 
     (movie_metadata_df['Female_actors'] + movie_metadata_df['Male_actors'])) * 100
).round(2)

In [75]:
movie_metadata_df.head()

Unnamed: 0,Wikipedia_movie_ID,Freebase_movie_ID,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,Movie_languages,Movie_countries,Movie_genres,Cast,Female_actors,Male_actors,Female_actor_percentage
0,975900,/m/03vyhn,ghosts of mars,2001,14010832.0,98.0,[english language],[united states of america],"[thriller, science fiction, horror, adventure,...","wanda de jesus, natasha henstridge, ice cube, ...",6.0,7.0,46.15
1,3196793,/m/08yl5d,getting away with murder: the jonbenét ramsey ...,2000,,95.0,[english language],[united states of america],"[mystery, biographical film, drama, crime drama]","alice barrett, robert catrini, cliff deyoung, ...",2.0,5.0,28.57
2,13696889,/m/03cfc81,the gangsters,1913,,35.0,"[silent film, english language]",[united states of america],"[short film, silent film, indie, black-and-whi...",roscoe arbuckle,0.0,1.0,0.0
3,10408933,/m/02qc0j7,alexander's ragtime band,1938,3600000.0,106.0,[english language],[united states of america],"[musical, comedy, black-and-white]","ethel merman, tyrone power, alice faye, don am...",2.0,2.0,50.0
4,6631279,/m/0gffwj,little city,1997,,93.0,[english language],[united states of america],"[romantic comedy, ensemble film, comedy-drama,...","josh charles, penelope ann miller, annabella s...",4.0,2.0,66.67


#### 1.2 IMDB Dataset

We will use two IMDB datasets to decribe movies:
1. "title.ratings.tsv" gives us the ratings of the movies as voted by viewers. 
2. "title.basics.tsv", indexes into "title.ratings.tsv" using a alphanumeric unique identifier of the title. It gives general information about the movie such as runtime, release date and adult rating.
3. "title.crew.tsv", indexes into the previous two using the same alphanumeric unique identifier of the title. It gives information on the directors and writers of the movie.

Reference:
Internet Movie Database. (2024). IMDb non-commercial datasets. Retrieved from https://developer.imdb.com/non-commercial-datasets/

In [76]:
# Loading the datasets. Null values are represented using "\N".
IMDB_ratings_df = pd.read_csv("data/IMDB/title.ratings.tsv", delimiter="\t", na_values="\\N")
IMDB_basics_df = pd.read_csv("data/IMDB/title.basics.tsv", delimiter="\t", na_values="\\N", low_memory=False)
IMDB_crew_df = pd.read_csv("data/IMDB/title.crew.tsv", delimiter="\t", na_values="\\N", low_memory=False)

In [77]:
IMDB_ratings_df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2100
1,tt0000002,5.6,282
2,tt0000003,6.5,2119
3,tt0000004,5.4,182
4,tt0000005,6.2,2850


In [78]:
IMDB_basics_df.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0.0,1892.0,,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1,"Comedy,Short"


In [79]:
IMDB_crew_df.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,


In [80]:
# Checking the lengths of the datasets
print(f"Length of IMDB_ratings_df: {len(IMDB_ratings_df):,}\n"
      f"Length of IMDB_basics_df: {len(IMDB_basics_df):,}\n"
      f"Length of IMDB_crew_df: {len(IMDB_crew_df):,}")

Length of IMDB_ratings_df: 1,498,615
Length of IMDB_basics_df: 11,235,767
Length of IMDB_crew_df: 10,571,536


Before dealing with the null values we will merge the dataframes together using the alphanumeric unique identifier.

In [81]:
# Merging all three datasets.
IMDB_merged_df = pd.merge(IMDB_ratings_df, IMDB_basics_df, how="inner", left_on="tconst", right_on="tconst")
IMDB_merged_df = pd.merge(IMDB_merged_df, IMDB_crew_df, how="inner", on="tconst")

print(f"The resulting merged dataframe has length: {len(IMDB_merged_df):,}.")
print(f"{len(IMDB_ratings_df)-len(IMDB_merged_df):,} rows were lost in the merging process.")
IMDB_merged_df.head()

The resulting merged dataframe has length: 1,484,729.
13,886 rows were lost in the merging process.


Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,directors,writers
0,tt0000001,5.7,2100,short,Carmencita,Carmencita,0.0,1894.0,,1,"Documentary,Short",nm0005690,
1,tt0000002,5.6,282,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5,"Animation,Short",nm0721526,
2,tt0000003,6.5,2119,short,Poor Pierrot,Pauvre Pierrot,0.0,1892.0,,5,"Animation,Comedy,Romance",nm0721526,
3,tt0000004,5.4,182,short,Un bon bock,Un bon bock,0.0,1892.0,,12,"Animation,Short",nm0721526,
4,tt0000005,6.2,2850,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1,"Comedy,Short",nm0005690,


We can see we do not lose a lot of rows with respect to the IMDB_ratings_df dataframe.

Next, we look at titleType. These dataframes do not only have movies but also short movies, tv shows, episodes. The next step is thus to filter only movies.

In [82]:
# Filtering movies from the list of titles.
IMDB_merged_df = IMDB_merged_df[IMDB_merged_df["titleType"] == "movie"]

print(f"There are {len(IMDB_merged_df):,} movies in the resulting dataframe.")
IMDB_merged_df.head()

There are 319,293 movies in the resulting dataframe.


Unnamed: 0,tconst,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,directors,writers
8,tt0000009,5.4,216,movie,Miss Jerry,Miss Jerry,0.0,1894.0,,45,Romance,nm0085156,nm0085156
143,tt0000147,5.2,540,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0.0,1897.0,,100,"Documentary,News,Sport",nm0714557,
337,tt0000502,4.1,19,movie,Bohemios,Bohemios,0.0,1905.0,,100,,nm0063413,"nm0063413,nm0657268,nm0675388"
372,tt0000574,6.0,938,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0.0,1906.0,,70,"Action,Adventure,Biography",nm0846879,nm0846879
382,tt0000591,5.7,28,movie,The Prodigal Son,L'enfant prodigue,0.0,1907.0,,90,Drama,nm0141150,nm0141150


We can now look at null values in the merged IMDB dataframe.

In [83]:
# Checking the percentage of null values in the dataset.
n_null_IMDB = ((IMDB_merged_df.isnull().sum() / len(IMDB_merged_df)) * 100).apply(lambda x: f"{x:,.2f}%")

print(
    f"Percentage of null values per column:\n"
    f"IMDB_ratings_df:\n{n_null_IMDB}"
)

Percentage of null values per column:
IMDB_ratings_df:
tconst              0.00%
averageRating       0.00%
numVotes            0.00%
titleType           0.00%
primaryTitle        0.00%
originalTitle       0.00%
isAdult             0.00%
startYear           0.01%
endYear           100.00%
runtimeMinutes      9.98%
genres              3.26%
directors           0.98%
writers            12.30%
dtype: object


The end year is always missing. Other than that the proportion of missing values is very small (< 12%). End year does not have any useful information for our intended analysis and can thus be dropped. We can also drop the titleType column since we know they are all movies after the filtering that was done above.

In [84]:
# Dropping unnecessary columns.
IMDB_merged_df = IMDB_merged_df.drop(columns=["endYear", "titleType"], axis=1)

print(f"The resulting dataframe has {len(IMDB_merged_df):,} rows.")
IMDB_merged_df.head()

The resulting dataframe has 319,293 rows.


Unnamed: 0,tconst,averageRating,numVotes,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres,directors,writers
8,tt0000009,5.4,216,Miss Jerry,Miss Jerry,0.0,1894.0,45,Romance,nm0085156,nm0085156
143,tt0000147,5.2,540,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0.0,1897.0,100,"Documentary,News,Sport",nm0714557,
337,tt0000502,4.1,19,Bohemios,Bohemios,0.0,1905.0,100,,nm0063413,"nm0063413,nm0657268,nm0675388"
372,tt0000574,6.0,938,The Story of the Kelly Gang,The Story of the Kelly Gang,0.0,1906.0,70,"Action,Adventure,Biography",nm0846879,nm0846879
382,tt0000591,5.7,28,The Prodigal Son,L'enfant prodigue,0.0,1907.0,90,Drama,nm0141150,nm0141150


We will use ratings from the IMDB dataset during our study. However, these ratings are based on viewer votes. Initially we thought of discarding rows with too few votes. However, there could be a link between number of votes and number of views of a movie (although definitely not a direct one). We thus decided to keep all rows for the analysis.

#### 1.3 Merging the datasets

##### 1.3.1 Merging IMDB and CMU Movies

In [85]:
# Merging on the original title.
merge1 = pd.merge(IMDB_merged_df, movie_metadata_df, how="inner", left_on="originalTitle", right_on="Movie_name")
# Merging on the secondary title.
merge2 = pd.merge(IMDB_merged_df, movie_metadata_df, how="inner", left_on="primaryTitle", right_on="Movie_name")

# Concatenating and dropping duplicates that appear from movies with the same originalTitle and primaryTitle.
movie_df = pd.concat([merge1, merge2]).drop_duplicates().reset_index(drop=True)

print(f"The resulting dataframe has {len(movie_df):,} rows.")
movie_df.head()

TypeError: unhashable type: 'list'

Some columns appear twice. Let's take a look at the proportion of null values in each duplicate column.

In [None]:
n_null_movie = (movie_df.isnull().sum()/len(movie_df)).apply(lambda x:f"{x:.2%}")

print(f"Percentage of null values per column:\n{n_null_movie}")

We can see:
- runtimeMinutes_x and runtimeMinutes_y have 6.02% and 11.58% missing values respectively. We will combine the non null values from both these columns into a new column called runtimeMinutes and then drop the previous two columns. 
- genres_x has 1.54% missing values against 0.00% missing values for genres_y. Furthermore genres_y is from the CMU dataset and seems more complete. We will thus drop the genres_x column.

In [None]:
# Combining all non-null values from runtimeMinutes_x and runtimeMinutes_y into runtimeMinutes.
movie_df["runtimeMinutes"] = movie_df["runtimeMinutes_x"].combine_first(movie_df["runtimeMinutes_y"])

# Dropping the unnecessary columns.
movie_df.drop(columns=["runtimeMinutes_x", "runtimeMinutes_y", "genres_x"], inplace=True)

# Renaming the column to Genres.
movie_df.rename(columns={"genres_y":"Genres"}, inplace=True)
movie_df.head()

Let's now see if Movie_name, originalTitle and primaryTitle are all necessary or if there are any redundancies.

In [None]:
test1 = movie_df["Movie_name"] == movie_df["primaryTitle"]  
test2 = movie_df["Movie_name"] == movie_df["originalTitle"]

# Checking if there are any movies for which Movie_name is not either in primaryTitle or originalTitle
print(f"There are {(~(test1 | test2)).sum().item()} movies for which Movie_name is in neither primaryTitle ot originalTitle.")

We can see the Movie_name column is redundant as its information is either in primaryTitle or in originalTitle. We can thus drop this column.

In [None]:
movie_df.drop(columns="Movie_name", inplace=True)

##### 1.3.2 Adding Plot Summaries when possible

In [None]:
movie_df = pd.merge(movie_df, plot_summaries_df, how="left", on="Wikipedia_movie_ID")

# Checking how many movies have plot summaries
n_movie_plots = (1 - (movie_df["Plot Summaries"].isnull().sum() / len(movie_df))) * len(movie_df)
print(f"{int(n_movie_plots):,} movies from our final dataset have plot summaries.")

We can now also drop movie identifier columns (as everything is already indexed): tconst, Freebase_movie_ID and Wikipedia_movie_ID.

In [None]:
movie_df.drop(columns=["Wikipedia_movie_ID", "tconst", "Freebase_movie_ID"], inplace=True)

This gives us our final cleaned dataset for our study:

In [None]:
movie_df.head()

### 2. <u>Our success metric</u>

### 3. <u>Gender and age vs success BLABLABLA</u>

Dependent variables:
- Ratings
- Success metric
- Profit ratio

Independent variables:
- Gender
- Age
- Genre
- isAdult?
- Movie country
- Movie language
- Release date

Look at adding starpower

### 4. <u>How does it compare to streaming platforms? Are movies made for these platforms different? Have box office movies adapted since the rise of streaming?</u>

### 5. <u>What are the social reasons behind the presence of female characters in movies? Is it due to sexualization or genuine equality of representation?</u>