In [6]:
# imports
# processing data
import pandas as pd
import numpy as np
from collections import defaultdict
# analysis
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from scipy.stats import entropy

# plot
import matplotlib.pyplot as plt
from d3blocks import D3Blocks
import seaborn as sns
import plotly.express as px


# Section 1: preprocessing data.

In [5]:
# Please add all preprocessing code here.

# IMDb actor information (name.basics table).
actors_raw = pd.read_csv('data/name.basics.tsv/data.tsv', sep='\t')

# Drop rows whose birth year is missing: the file marks missing values with
# the literal string "\N", and the birth year is crucial for the analysis.
has_birth_year = actors_raw["birthYear"].ne(r"\N")
actors_raw = actors_raw[has_birth_year]

# IMDb principals (title.principals table).
principals_raw = pd.read_csv('data/title.principals.tsv/data.tsv', sep='\t')

## Filter for leading actors and add column for gender
Filter for leading actors using IMDb's principals data, which lists the most important people for each movie (i.e., actors, actresses, directors, etc.). The importance is assigned by IMDb and stored in the 'ordering' column.

In [4]:
# Keep only the acting credits (actors and actresses) from the principals table.
principals = principals_raw[principals_raw["category"].isin(["actor", "actress"])]

# Add a gender column (0=male, 1=female) derived from the credit category.
# `assign` builds a new frame instead of writing a column into a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
principals = principals.assign(
    gender=principals["category"].eq("actress").astype("int64")
)

# Select only the important columns of both data sets and left-join them.
# deathYear is not joined in: none of the preprocessing steps visible here use it.
principal_actors = principals[["tconst", "ordering", "nconst", "gender"]].merge(
    actors_raw[["nconst", "primaryName", "birthYear"]], how="left", on="nconst"
)

# Distinct actor ids overall and per gender.
unique_actors = principal_actors["nconst"].unique()
male_actors = principal_actors.loc[principal_actors["gender"] == 0, "nconst"].unique()
female_actors = principal_actors.loc[principal_actors["gender"] == 1, "nconst"].unique()

print("There's a total of", len(unique_actors), "unique actors.")

print("There's a total of", len(male_actors), "male actors.")
print("There's a total of", len(female_actors), "female actors.")

NameError: name 'principals_raw' is not defined

In [None]:
# Per-actor count of non-null values in every column. This is computed BEFORE
# the dropna below, so credits that are about to be discarded still count.
total_actor_experience = principal_actors.groupby(["nconst"]).count()

# Map actor id -> number of credited titles (non-null tconst rows).
# Taken straight from the grouped column rather than looping over rows.
actor_to_experience = total_actor_experience["tconst"].to_dict()

# Discard credits with any missing value (the left join with the actor table
# leaves NaN for actors that were filtered out there).
principal_actors = principal_actors.dropna()

## Add column for the actor's age at the start of the movie

In [None]:
# Load the title table and keep only the relevant columns:
# tconst, startYear and genres.
movies_raw = pd.read_csv("data/title.basics.tsv/data.tsv", sep="\t")[["tconst", "startYear", "genres"]]

# Filter out titles without a start year (missing values are stored as "\N");
# the year is crucial for the analysis.
movies_raw = movies_raw[movies_raw["startYear"] != r"\N"]

# Join movie data onto the actor credits. This is a left join, so credits
# whose title was filtered out above keep NaN in the movie columns.
actor_movie_combi = principal_actors.merge(movies_raw, how="left", on="tconst")

# Cast the year columns to numeric so they can be subtracted.
actor_movie_combi["startYear"] = pd.to_numeric(actor_movie_combi["startYear"])
actor_movie_combi["birthYear"] = pd.to_numeric(actor_movie_combi["birthYear"])

# Add the age_at_movie_start column.
actor_movie_combi["age_at_movie_start"] = actor_movie_combi["startYear"] - actor_movie_combi["birthYear"]

# len() is the number of rows (one per actor/movie combination);
# DataFrame.size would report rows * columns instead.
print("Number of movie actor combinations:", len(actor_movie_combi))


## Add column for imdb ratings

In [None]:
# Read the IMDb ratings table.
ratings_raw = pd.read_csv('data/title.ratings.tsv/data.tsv', sep='\t')

# Attach the rating columns to every actor/movie row. The join is a left join
# on the title id, so rows without a matching rating are kept (with NaN).
actor_movie_combi = pd.merge(actor_movie_combi, ratings_raw, how="left", on="tconst")


## Add column for number of movies an actor has been in before start of respective movie

In [None]:
# Order the credits chronologically so that "so far" means "in earlier rows".
actor_movie_combi = actor_movie_combi.sort_values(by="startYear", ascending=True)

# exp_so_far = number of earlier rows of the same actor (0 for the first one).
# cumcount() numbers the rows inside each actor group in their current order,
# which replaces the per-row Python loop with a single vectorized call.
actor_movie_combi["exp_so_far"] = actor_movie_combi.groupby("nconst").cumcount()

# Total number of rows per actor, kept under the same name and type as the
# counter dict the loop-based version left behind.
exp_so_far = defaultdict(int, actor_movie_combi["nconst"].value_counts().to_dict())

# Drop rows with NA values (after the experience count, so discarded credits
# still count as experience) and save to CSV without the index column.
actor_movie_combi = actor_movie_combi.dropna()
actor_movie_combi.to_csv("data/actor_movie_combi.csv", index=False)

# Defining functions to:
* Filter out movies from unwanted movie genres = ["Animation", "Biography", "Documentary", "Short"]
    * Animation - Only voice actors, meaning that casting is heavily dictated by voice and language
    * Biography, Documentary - Heavily dependent on the subject
    * Short - We aim to study larger-scale movies
* Study the genre density of the movie dataset
* Calculate the experience of actors per genre at the time of each movie release

In [None]:
# Turn off pandas' chained-assignment warning (the pandas default is 'warn').
pd.options.mode.chained_assignment = None

# Broad range of genres taken into consideration.
genres_major_add = [
    "Drama",
    "Action",
    "Romance",
    "Comedy",
    "Adventure",
    "Crime",
    "War",
    "Family",
    "Mystery",
]

# Genres to remove: they do not have a strict relation to actor features in general.
genres_remove = [
    "Animation",
    "Biography",
    "Documentary",
    "Short",
]

def genre_exp(dataset, actor_keyword = "nconst", genre_keyword = "genres", genres=None):
    """Add one running "<genre>_exp_so_far" column per genre to `dataset`.

    For every row, the new column holds how many rows of the same actor, up to
    AND INCLUDING the current row (in the frame's current row order), list the
    genre. Sort the frame chronologically before calling if the count should
    mean "experience at release time".

    Parameters
    ----------
    dataset : pandas.DataFrame
        One row per actor/movie combination. Modified in place.
    actor_keyword : str
        Name of the column identifying the actor.
    genre_keyword : str
        Name of the column holding the comma-separated genre string.
        Rows where this value is not a string count as having no genre.
    genres : iterable of str, optional
        Genres to build columns for. Defaults to the module-level
        `genres_major_add` list.

    Returns
    -------
    pandas.DataFrame
        The same `dataset` object, with the added columns.
    """
    if genres is None:
        genres = genres_major_add

    # Split the genre strings once instead of once per genre and per row.
    genre_lists = dataset[genre_keyword].str.split(",")
    # Plain array of actor ids: grouping by position avoids any index
    # alignment, so frames with repeated index labels are handled row by row.
    actor_ids = dataset[actor_keyword].to_numpy()

    for genre in genres:
        # Exact membership in the split list (not a substring match).
        has_genre = genre_lists.apply(
            lambda parts: isinstance(parts, list) and genre in parts
        )
        running_count = has_genre.astype("int64").groupby(actor_ids).cumsum()
        dataset[genre + "_exp_so_far"] = running_count.to_numpy()

    return dataset


def extract_genres(dataset, genres_major=genres_major_add, genres_remove=genres_remove, genre_keyword ='genres', year_keyword='startYear', others_threshold=50000):
    """Filter the data set by genre and plot the resulting genre distribution.

    Rows whose genre string matches any entry of `genres_remove` are dropped.
    Of the remaining rows, only those matching at least one entry of
    `genres_major` are kept. The kept rows are sorted by year, and the
    individual genres they list are counted and drawn as a bar chart and a
    pie chart.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data with a comma-separated genre column and a year column.
    genres_major : list of str
        Genres of interest; a row is kept if its genre string contains any.
    genres_remove : list of str
        Genres to exclude; a row is dropped if its genre string contains any.
    genre_keyword : str
        Name of the genre column.
    year_keyword : str
        Name of the column used to sort the returned rows.
    others_threshold : int
        Genres with a count below this value are merged into a single
        'Others' slice of the pie chart.

    Returns
    -------
    genres_all : numpy.ndarray
        Sorted unique genres found in the kept rows.
    counts : numpy.ndarray
        Number of occurrences of each entry of `genres_all`.
    movies_major : pandas.DataFrame
        The kept rows, sorted by `year_keyword`.
    """
    # Filter out the genres that are not considered. The joined list is used
    # as a regex alternation; rows with a missing genre never match.
    remove_pattern = '|'.join(genres_remove)
    dataset_filtered = dataset[~dataset[genre_keyword].str.contains(remove_pattern, na=False)]

    # Keep only rows that belong to at least one of the major genres.
    major_pattern = '|'.join(genres_major)
    movies_major = dataset_filtered[dataset_filtered[genre_keyword].str.contains(major_pattern, na=False)]

    # sort_values returns a new frame, so the result has to be kept. A stable
    # sort leaves rows that are already in year order exactly where they are.
    movies_major = movies_major.sort_values(by=year_keyword, ascending=True, kind="stable")

    # Count every individual genre listed in the kept rows.
    genre_tokens = movies_major[genre_keyword].str.split(",").explode().tolist()
    genres_all, counts = np.unique(genre_tokens, return_counts=True)

    genre_df = (
        pd.DataFrame({"genre": genres_all, "count": counts})
        .sort_values("count", ascending=False)
        .reset_index(drop=True)
    )

    # Bar chart of the genre counts, drawn on an explicit axes so that no
    # empty extra figure is left behind.
    fig_bar, ax_bar = plt.subplots(figsize=(13, 4))
    genre_df.set_index('genre').plot(kind='bar', ax=ax_bar)
    ax_bar.set(xlabel="genre", ylabel="count")

    # Pie chart: merge rare genres into 'Others', largest slice first.
    df_draw = genre_df.copy()
    df_draw.loc[df_draw['count'] < others_threshold, 'genre'] = 'Others'
    df_draw = df_draw.groupby('genre')['count'].sum().reset_index()
    df_draw = df_draw.sort_values('count', ascending=False)

    # Pull the first five (largest) slices slightly out of the pie.
    explode_offsets = np.zeros(len(df_draw))
    explode_offsets[0:5] = 0.1

    fig_pie, ax_pie = plt.subplots(figsize=(9, 9))
    ax_pie.pie(df_draw['count'], labels=df_draw['genre'], autopct='%.0f%%', explode=explode_offsets)
    plt.show()

    return genres_all, counts, movies_major

# Section 2: genre analysis

- 1. Feature engineering


- 2. Genre pattern evaluation.

In [22]:
# Load the per-genre experience table and preview its first five rows
# (five is the default row count of head()).
movie_data = pd.read_csv("data/genre_experience_alive.csv")
movie_data.head()

Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,startYear,genres,age_at_movie_start,averageRating,...,exp_so_far,Drama_exp_so_far,Action_exp_so_far,Romance_exp_so_far,Comedy_exp_so_far,Adventure_exp_so_far,Crime_exp_so_far,War_exp_so_far,Family_exp_so_far,Mystery_exp_so_far
0,tt0000009,2,nm0183823,0,William Courtenay,1875,1894.0,Romance,19.0,5.3,...,0,0,0,1,0,0,0,0,0,0
1,tt0000009,3,nm1309758,0,Chauncey Depew,1834,1894.0,Romance,60.0,5.3,...,0,0,0,1,0,0,0,0,0,0
2,tt0000009,1,nm0063086,1,Blanche Bayliss,1878,1894.0,Romance,16.0,5.3,...,0,0,0,1,0,0,0,0,0,0
3,tt1666737,1,nm0525908,0,Auguste Lumière,1862,1896.0,"Action,Comedy,Family",34.0,5.3,...,1,0,1,0,2,0,0,0,1,0
4,tt0000211,2,nm0617588,0,Georges Méliès,1861,1898.0,"Comedy,Fantasy,Horror",37.0,7.4,...,19,0,0,0,5,0,0,0,0,0


In [None]:
%%capture
# pair-wise genre analysis
# we are interested in popular genres here.
target_genres = ["Drama", "Comedy", "Crime", "Action", "Romance", "Family", "Adventure", "Mystery", "War", "Musical", "Thriller"]
corr_matrix = np.zeros((len(target_genres), len(target_genres)))

# filter out movies with single genre.
multiple_genres_movie = movie_data[movie_data["genres"].str.contains(",")].reset_index()

# count pair-wise movie genres.
for i in range(len(multiple_genres_movie)):
    for index1, first_genre in enumerate(target_genres):
        for index2, second_genre in enumerate(target_genres):
            if (first_genre != second_genre) and (first_genre in multiple_genres_movie["genres"][i]) and (second_genre in multiple_genres_movie["genres"][i]):
                corr_matrix[index1, index2] += 1

# build the matrix for plot.
weight_matrix = [list(corr_matrix[genre_index, :]) for genre_index in range(len(target_genres))]
rows = []
for index1 in range(len(target_genres)):
    for index2 in range(index1, len(target_genres)):
        row = {
            "source": target_genres[index1],
            "target": target_genres[index2],
            "weight": weight_matrix[index1][index2],
        }
        row = pd.DataFrame(data=row, index=[0])
        rows.append(row)

matrix = pd.concat(rows)

# Initialize plot
d3 = D3Blocks()

# chord plot
d3.chord(matrix);


- 3. Correlation matrix between actors' backgrounds (i.e., their experience in each genre) and the genre of the movie they were filming.

In [14]:
# Load the per-genre experience table again and preview it.
movie_data = pd.read_csv("data/genre_experience_alive.csv")
# Everything except the last ten rows (the notebook truncates the display).
movie_data.iloc[:-10]

Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,startYear,genres,age_at_movie_start,averageRating,...,exp_so_far,Drama_exp_so_far,Action_exp_so_far,Romance_exp_so_far,Comedy_exp_so_far,Adventure_exp_so_far,Crime_exp_so_far,War_exp_so_far,Family_exp_so_far,Mystery_exp_so_far
0,tt0000009,2,nm0183823,0,William Courtenay,1875,1894.0,Romance,19.0,5.3,...,0,0,0,1,0,0,0,0,0,0
1,tt0000009,3,nm1309758,0,Chauncey Depew,1834,1894.0,Romance,60.0,5.3,...,0,0,0,1,0,0,0,0,0,0
2,tt0000009,1,nm0063086,1,Blanche Bayliss,1878,1894.0,Romance,16.0,5.3,...,0,0,0,1,0,0,0,0,0,0
3,tt1666737,1,nm0525908,0,Auguste Lumière,1862,1896.0,"Action,Comedy,Family",34.0,5.3,...,1,0,1,0,2,0,0,0,1,0
4,tt0000211,2,nm0617588,0,Georges Méliès,1861,1898.0,"Comedy,Fantasy,Horror",37.0,7.4,...,19,0,0,0,5,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1976488,tt13760918,3,nm1589340,1,Isha Talwar,1987,2022.0,"Drama,Thriller",35.0,5.5,...,15,13,1,4,8,0,1,0,4,0
1976489,tt16374130,2,nm4609670,0,Lou Ferrigno Jr.,1984,2022.0,"Drama,Romance",38.0,5.4,...,15,9,6,2,0,1,0,4,0,2
1976490,tt19394272,8,nm9462038,1,Kimya Gökçe Aytaç,1990,2022.0,"Comedy,Drama,Romance",32.0,7.0,...,3,1,1,2,4,0,1,0,0,0
1976491,tt11318364,2,nm0154169,1,Swatilekha Sengupta,1950,2022.0,"Drama,Family",72.0,7.1,...,2,3,0,0,0,0,0,0,2,0


In [17]:
# we focus on popular genres.
genres = ["Drama", "Action", "Romance", "Comedy", "Crime", "War", "Family", "Mystery", "Adventure"]

# One "<genre>_exp_so_far" column per genre, in the same order as `genres`.
exp_columns = [f"{name}_exp_so_far" for name in genres]

# For each genre: take the rows whose genre string matches it, correlate the
# experience columns with each other, and keep the correlation row belonging
# to that genre's own experience column.
corr_before_ongoing_genres = []
for genre in genres:
    is_genre = movie_data["genres"].str.contains(genre)
    movie_data_par_genre = movie_data[is_genre]
    corr = movie_data_par_genre[exp_columns].corr()
    own_experience_row = corr.loc[f"{genre}_exp_so_far"]
    corr_before_ongoing_genres.append(list(own_experience_row))

In [18]:
# plot.
# Tick labels shared by both axes, in the order of the correlation rows/columns.
genre_axis = ['Drama', 'Action', 'Romance', 'Comedy', 'Crime', 'War', 'Family', 'Mystery', 'Adventure']

fig = px.imshow(
    corr_before_ongoing_genres,
    labels={
        "x": "Actor's experience in each genre",
        "y": "Genre of the movie",
        "color": "correlation",
    },
    x=list(genre_axis),
    y=list(genre_axis),
    color_continuous_scale=px.colors.sequential.Cividis_r,
)
# Put the x-axis labels above the heatmap.
fig.update_xaxes(side="top")
fig.show()
# fig.write_html("./data/correlation_map.html")