In [1]:
# imports
# processing data
import pandas as pd
import numpy as np

# analysis
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# plot
import matplotlib.pyplot as plt
from d3blocks import D3Blocks
import seaborn as sns
import plotly.express as px



# Section 1: preprocessing data.

In [2]:
# Please add all preprocessing code here.

# Section 2: genre analysis

- 1. Feature engineering


- 2. Genre pattern evaluation.

In [3]:
# load data.
movie_data = pd.read_csv("data/actor_movie_combi.csv")
movie_data.head(-10)

Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,deathYear,startYear,genres,deatYear,age_at_movie_start,averageRating,numVotes,exp_so_far,drama_exp_so_far,action_exp_so_far,romance_exp_so_far,comedy_exp_so_far
0,tt7816420,1,nm1155956,0,Eadweard Muybridge,1830.0,1904,1881.0,"Documentary,Short",1904.0,51.0,5.2,462.0,0,0,0,0,0
1,tt1758563,1,nm1796515,0,Adolphe Le Prince,1872.0,1901,1888.0,"Documentary,Short",1901.0,16.0,5.5,1333.0,0,0,0,0,0
2,tt0361921,1,nm1362928,0,Giuseppe Sacco Albanese,1872.0,1943,1890.0,"Documentary,Short",1943.0,18.0,5.0,1508.0,0,0,0,0,0
3,tt0416047,1,nm1362928,0,Giuseppe Sacco Albanese,1872.0,1943,1890.0,Short,1943.0,18.0,4.2,425.0,1,0,0,0,0
4,tt0416046,1,nm1362928,0,Giuseppe Sacco Albanese,1872.0,1943,1890.0,Short,1943.0,18.0,4.8,1106.0,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698931,tt7728792,2,nm6074154,0,Harry Hains,1992.0,2020,2022.0,Drama,2020.0,30.0,8.2,19.0,5,4,0,2,0
698932,tt21045916,3,nm0582378,0,Sombat Metanee,1937.0,2022,2022.0,"Drama,Romance",2022.0,85.0,10.0,5.0,16,2,9,2,4
698933,tt21045922,3,nm0582378,0,Sombat Metanee,1937.0,2022,2022.0,"Drama,Romance",2022.0,85.0,10.0,5.0,17,3,9,3,4
698934,tt21048302,2,nm7670169,0,Papangkorn Lerkchaleampote,1996.0,2022,2022.0,"Adventure,Drama,Thriller",2022.0,26.0,8.0,57.0,13,13,0,1,1


In [None]:
%%capture
# pair-wise genre analysis
# we are interested in popular genres here.
target_genres = ["Drama", "Comedy", "Crime", "Action", "Romance", "Family", "Adventure", "Mystery", "War", "Musical", "Thriller"]
corr_matrix = np.zeros((len(target_genres), len(target_genres)))

# filter out movies with single genre.
multiple_genres_movie = movie_data[movie_data["genres"].str.contains(",")].reset_index()
for i in range(len(multiple_genres_movie)):
    for index1, first_genre in enumerate(target_genres):
        for index2, second_genre in enumerate(target_genres):
            if (first_genre != second_genre) and (first_genre in multiple_genres_movie["genres"][i]) and (second_genre in multiple_genres_movie["genres"][i]):
                corr_matrix[index1, index2] += 1

weight_matrix = [list(corr_matrix[genre_index, :]) for genre_index in range(len(target_genres))]
rows = []
for index1 in range(len(target_genres)):
    for index2 in range(index1, len(target_genres)):
        row = {
            "source": target_genres[index1],
            "target": target_genres[index2],
            "weight": weight_matrix[index1][index2],
        }
        row = pd.DataFrame(data=row, index=[0])
        rows.append(row)

matrix = pd.concat(rows)

# Initialize plot
d3 = D3Blocks()

# chord plot
d3.chord(matrix);


- 3. Correlation matrix of the background of actors (i.e., experience in each genre) to the genre of the movie he was filming.

In [9]:
# load data
movie_data = pd.read_csv("data/genre_experience.csv")
movie_data.head(-10)

Unnamed: 0.1,Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,deathYear,startYear,genres,...,comedy_exp_so_far,adventure_exp_so_far,crime_exp_so_far,war_exp_so_far,family_exp_so_far,mystery_exp_so_far,Drama,Action,Romance,Comedy
0,0,tt0000009,3,nm1309758,0,Chauncey Depew,1834.0,1928,1894.0,Romance,...,0,0,0,0,0,0,0,0,1,0
1,1,tt0000009,1,nm0063086,1,Blanche Bayliss,1878.0,1951,1894.0,Romance,...,0,0,0,0,0,0,0,0,1,0
2,2,tt0000009,2,nm0183823,0,William Courtenay,1875.0,1933,1894.0,Romance,...,0,0,0,0,0,0,0,0,1,0
3,3,tt1666737,1,nm0525908,0,Auguste Lumière,1862.0,1954,1896.0,"Action,Comedy,Family",...,2,0,0,0,1,0,0,1,0,1
4,4,tt0000211,1,nm0194945,1,Jehanne d'Alcy,1865.0,1956,1898.0,"Comedy,Fantasy,Horror",...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468542,468542,tt18351130,3,nm0430074,0,Leslie Jordan,1955.0,2022,2022.0,Comedy,...,99,4,18,0,1,0,0,0,0,1
468543,468543,tt18561180,3,nm0430074,0,Leslie Jordan,1955.0,2022,2022.0,Comedy,...,100,4,18,0,1,0,0,0,0,1
468544,468544,tt18568804,3,nm0430074,0,Leslie Jordan,1955.0,2022,2022.0,Comedy,...,101,4,18,0,1,0,0,0,0,1
468545,468545,tt7728792,2,nm6074154,0,Harry Hains,1992.0,2020,2022.0,Drama,...,0,0,0,0,0,0,1,0,0,0


In [10]:
genres = ["Drama", "Action", "Romance", "Comedy", "Crime", "War", "Family", "Mystery", "Adventure"]
corr_before_ongoing_genres = []
for genre in genres:
    movie_data_par_genre = movie_data[movie_data["genres"].str.contains(genre)]
    corr = movie_data_par_genre[["drama_exp_so_far", "action_exp_so_far", "romance_exp_so_far", "comedy_exp_so_far", "crime_exp_so_far", "war_exp_so_far", "family_exp_so_far", "mystery_exp_so_far", "adventure_exp_so_far"]].corr()
    corr_before_ongoing_genres.append(list(corr.loc[f"{genre.lower()}_exp_so_far"]))

In [12]:
# plot.
fig = px.imshow(corr_before_ongoing_genres,
                labels=dict(x="Actor's experience in each genre", y="Genre of the movie", color="correlation"),
                x=['Drama', 'Action', 'Romance', 'Comedy', 'Crime', 'War', 'Family', 'Mystery', 'Adventure'],
                y=['Drama', 'Action', 'Romance', 'Comedy', 'Crime', 'War', 'Family', 'Mystery', 'Adventure'],
                color_continuous_scale=px.colors.sequential.Cividis_r
               )
fig.update_xaxes(side="top")
fig.show()
# fig.write_html("./data/correlation_map.html")

# Section 3: Analysis of actors' features over time

In [None]:
# import movie and actor data
# TODO get experience_so_far_per_genre into the movie-actor combi data set
movie_actor_combi = pd.read_csv("./data/actor_movie_combi.csv", index_col=0) #note: index_col=0 to solve unnamed colunm issue
movie_actor_combi # note: only actors with importance to the respective movieare considered. The importance is determined by IMDB itself and stored in "ordering" column

In [None]:
# add startDecade column to prepare data over time
def getGeneration(movieReleaseYear):
    if movieReleaseYear <= 1924:
        return "greatest"
    else:
        if movieReleaseYear >= 1925 and movieReleaseYear <= 1945:
            return "silent"
        else:
            if movieReleaseYear >= 1946 and movieReleaseYear <= 1964:
                return "baby_boomer"
            else:
                if movieReleaseYear >= 1965 and movieReleaseYear <= 1985:
                    return "boomer"
                else:
                    if movieReleaseYear >= 1986 and movieReleaseYear <= 1996:
                        return "millennial"
                    else:
                        if movieReleaseYear >= 1997 and movieReleaseYear <= 2012:
                            return "gen_z"
                        else:
                            return "gen_x"   

movie_actor_combi["generation"] = movie_actor_combi.apply(lambda x: getGeneration(x["startYear"]), axis=1)
movie_actor_combi

In [None]:
# filter for selected genres
selected_genres = ["Drama", "Action", "Comedy", "Adventure", "Romance"]
movie_actor_combi["genres"] = movie_actor_combi["genres"].values #converting genres column from series to np.array

# add binary column for each selected genre which indicates if movie is part of respective selected genre 
for genre in selected_genres:
    movie_actor_combi[genre] = [1 if genre in genres else 0 for genres in movie_actor_combi["genres"]] # 0 = movie not part of genre

# filter for movie-actor combies whose movies are part of at least on of the selected genres
selected_movie_actor_combi = movie_actor_combi[movie_actor_combi[selected_genres].sum(axis = 1) == 1] # aka filter out rows which only have zeros in colums of selected genres
selected_movie_actor_combi # TODO add column for experience so far in selected genre 

## Analyse movies per selected genre overtime

In [None]:
# count movies of selected genres for each generation
count_selected_genres_in_generation = selected_movie_actor_combi.drop_duplicates(subset=["tconst"]).groupby("generation").sum().reset_index()[selected_genres + ["generation"]]

# get total amount of movies aired in each generation
count_selected_genres_in_generation["total_movie_amount"] = movie_actor_combi.drop_duplicates(subset=["tconst"]).groupby("generation").count().reset_index()["tconst"]

# get number of movies of other genres aired in each generation
count_selected_genres_in_generation["other_genres"] = count_selected_genres_in_generation["total_movie_amount"] - count_selected_genres_in_generation[selected_genres].sum(axis=1)

# sort by generation
def sortByGeneration(df):
    generationOrder = {"greatest":0, "silent":1, "baby_boomer":2, "boomer":3, "millennial":4, "gen_z":5, "gen_x":6}     
    df = df.sort_values(by=["generation"], key=lambda x: x.map(generationOrder))
    return df
count_selected_genres_in_generation = sortByGeneration(count_selected_genres_in_generation)
count_selected_genres_in_generation

In [None]:
# staked plot of movies per selected genre for each generation as mosaik
y = []

for genre in selected_genres + ["other_genres"]:
    y.append(count_selected_genres_in_generation[genre]/count_selected_genres_in_generation["total_movie_amount"])

plt.stackplot(count_selected_genres_in_generation["generation"], y, labels=selected_genres + ["other_genres"])
plt.ylabel("Movies' share of total")
plt.xlabel("Generation")
plt.legend(title = "Genre", loc='center right', bbox_to_anchor=(1.3, 0.5))
plt.title("Genre composition over generations")
plt.xticks(rotation=90)
plt.show()

## Creating weights based on different success metrics for movies for actor profile analysis
To incooperate success of movies into actor profile analysis and thus making it interesting for producers who seek a cast which create successfull movies.

Used weights:
1. no weight
2. numVotes (= popularity/trendingness)
3. imdb score (= opinion)
4. 0.8 * numVotes + 0.2 imdb score (= considering opinion of viewers and overall popularity with an emphasis on opinion)


In [None]:
# creating a column for "opinion and popularity" success metric
selected_movie_actor_combi["opinion_and_popularity"] = 0.8 * selected_movie_actor_combi["averageRating"] + 0.2 * selected_movie_actor_combi["numVotes"]


In [None]:
## Analysis of (successfull) actor profiles over time using different weights of movie success
selected_features = ["gender", "age_at_movie_start", "exp_so_far"]
# analyse selected features of actors in selected genres over generations (using average or weighted averages)

# helper variables and functions
weights = ["none", "numVotes", "averageRating", "opinion_and_popularity"]

def weighted_average(x, values, weights):
    return sum(x[weights] * x[values]) / x[weights].sum()

# analyse for each selected genre
for genre in selected_genres:

    # filter for genre in selected genres
    movie_actor_combi_filtered_by_genre = selected_movie_actor_combi[selected_movie_actor_combi[genre] == 1]

    # run feature analysis over generation for genre 
    for feature in selected_features:

        # analyse using different weights
        for weight in weights:
            if "none" == weight:
                avg_movie_actor_combi_per_generation = movie_actor_combi_filtered_by_genre.groupby("generation").mean().reset_index()
            else:
                avg_movie_actor_combi_per_generation = movie_actor_combi_filtered_by_genre.groupby("generation").apply(lambda x : weighted_average(x, feature, weight)).to_frame(name=feature).reset_index()
            
            avg_movie_actor_combi_per_generation = sortByGeneration(avg_movie_actor_combi_per_generation)
            plt.plot(avg_movie_actor_combi_per_generation["generation"], avg_movie_actor_combi_per_generation[feature], label=weight) 
            
        plt.title(feature+"' s change in '"+genre+"' over generations")
        plt.ylabel(feature)
        plt.xlabel("Era")
        plt.xticks(rotation=90)
        plt.legend()
        plt.show()

### Interpretation and concerns
1. Interpretation

2. Concerns
    - age always compounds up --> there has to be an error
    - non-weighted average and average weighted with imdb score more or less the same --> why?
    - opininon_and_popularity should be in the middle of numVotes and averageRating. However, it follows numVotes more (although it was weighted down) --> que?
    - concern about how gender is analysed:

## Visualise actor feature change over generations for each genre
Since weighting the average with any success metrics does not really matter, we are not considering a movie success metric for the following visuals. This means that we cannot say that those are the profiles of successfull actors.

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
for feature in selected_features:   

    if len(selected_genres) % 2 == 1:
        fig, axs = plt.subplots(int(len(selected_genres)/2)+1, 2, sharex=True, sharey=True)
        fig.delaxes(axs[int(len(selected_genres)/2), 1])
    else:
        fig, axs = plt.subplots(len(selected_genres)/2, 2, sharex=True, sharey=True)

    x_len = 0
    y_len = 0
        
    for idx, genre in enumerate(selected_genres):   
        avg_selected_movie_actor_combi_by_genre = selected_movie_actor_combi.groupby("generation").apply(lambda x: x[x[genre] == 1].mean()).reset_index()
        axs[y_len, x_len].plot(avg_selected_movie_actor_combi_by_genre["generation"], avg_selected_movie_actor_combi_by_genre[feature])
        axs[y_len, x_len].set_title(genre)
        axs[y_len, x_len].tick_params(axis='x', rotation= 90)

        if idx%2 == 0:
            x_len += 1
        else:
            y_len += 1
            x_len = 0

    # Hide x labels and tick labels for top plots and y ticks for right plots.
    for ax in axs.flat:
        ax.label_outer()

    # add title
    fig.tight_layout(pad=1.5) # improve spacing
    fig.suptitle("Analysis of feature '"+feature+"'", y=1.05)

## Visualise change in features of actors and actresses over generations for each genre