In [1]:
# imports
# processing data
import pandas as pd
import numpy as np

# analysis
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# plot
import matplotlib.pyplot as plt
from d3blocks import D3Blocks
import seaborn as sns
import plotly.express as px



# Section 1: preprocessing data.

In [2]:
# Please add all preprocessing code here.

# Section 2: genre analysis

- 1. Feature engineering


- 2. Genre pattern evaluation.

In [22]:
# Read the genre/experience table from disk and preview its first five rows.
movie_data = pd.read_csv("data/genre_experience_alive.csv")
movie_data.head()

Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,startYear,genres,age_at_movie_start,averageRating,...,exp_so_far,Drama_exp_so_far,Action_exp_so_far,Romance_exp_so_far,Comedy_exp_so_far,Adventure_exp_so_far,Crime_exp_so_far,War_exp_so_far,Family_exp_so_far,Mystery_exp_so_far
0,tt0000009,2,nm0183823,0,William Courtenay,1875,1894.0,Romance,19.0,5.3,...,0,0,0,1,0,0,0,0,0,0
1,tt0000009,3,nm1309758,0,Chauncey Depew,1834,1894.0,Romance,60.0,5.3,...,0,0,0,1,0,0,0,0,0,0
2,tt0000009,1,nm0063086,1,Blanche Bayliss,1878,1894.0,Romance,16.0,5.3,...,0,0,0,1,0,0,0,0,0,0
3,tt1666737,1,nm0525908,0,Auguste Lumière,1862,1896.0,"Action,Comedy,Family",34.0,5.3,...,1,0,1,0,2,0,0,0,1,0
4,tt0000211,2,nm0617588,0,Georges Méliès,1861,1898.0,"Comedy,Fantasy,Horror",37.0,7.4,...,19,0,0,0,5,0,0,0,0,0


In [None]:
%%capture
# pair-wise genre analysis
# we are interested in popular genres here.
target_genres = ["Drama", "Comedy", "Crime", "Action", "Romance", "Family", "Adventure", "Mystery", "War", "Musical", "Thriller"]
corr_matrix = np.zeros((len(target_genres), len(target_genres)))

# filter out movies with single genre.
multiple_genres_movie = movie_data[movie_data["genres"].str.contains(",")].reset_index()

# count pair-wise movie genres.
for i in range(len(multiple_genres_movie)):
    for index1, first_genre in enumerate(target_genres):
        for index2, second_genre in enumerate(target_genres):
            if (first_genre != second_genre) and (first_genre in multiple_genres_movie["genres"][i]) and (second_genre in multiple_genres_movie["genres"][i]):
                corr_matrix[index1, index2] += 1

# build the matrix for plot.
weight_matrix = [list(corr_matrix[genre_index, :]) for genre_index in range(len(target_genres))]
rows = []
for index1 in range(len(target_genres)):
    for index2 in range(index1, len(target_genres)):
        row = {
            "source": target_genres[index1],
            "target": target_genres[index2],
            "weight": weight_matrix[index1][index2],
        }
        row = pd.DataFrame(data=row, index=[0])
        rows.append(row)

matrix = pd.concat(rows)

# Initialize plot
d3 = D3Blocks()

# chord plot
d3.chord(matrix);


- 3. Correlation between actors' backgrounds (i.e., their prior experience in each genre) and the genre of the movie they were filming.

In [14]:
# Read the genre/experience table again so this section starts from the file on disk,
# then show every row except the last ten (the notebook abbreviates the rendering).
movie_data = pd.read_csv("data/genre_experience_alive.csv")
movie_data.iloc[:-10]

Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,startYear,genres,age_at_movie_start,averageRating,...,exp_so_far,Drama_exp_so_far,Action_exp_so_far,Romance_exp_so_far,Comedy_exp_so_far,Adventure_exp_so_far,Crime_exp_so_far,War_exp_so_far,Family_exp_so_far,Mystery_exp_so_far
0,tt0000009,2,nm0183823,0,William Courtenay,1875,1894.0,Romance,19.0,5.3,...,0,0,0,1,0,0,0,0,0,0
1,tt0000009,3,nm1309758,0,Chauncey Depew,1834,1894.0,Romance,60.0,5.3,...,0,0,0,1,0,0,0,0,0,0
2,tt0000009,1,nm0063086,1,Blanche Bayliss,1878,1894.0,Romance,16.0,5.3,...,0,0,0,1,0,0,0,0,0,0
3,tt1666737,1,nm0525908,0,Auguste Lumière,1862,1896.0,"Action,Comedy,Family",34.0,5.3,...,1,0,1,0,2,0,0,0,1,0
4,tt0000211,2,nm0617588,0,Georges Méliès,1861,1898.0,"Comedy,Fantasy,Horror",37.0,7.4,...,19,0,0,0,5,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1976488,tt13760918,3,nm1589340,1,Isha Talwar,1987,2022.0,"Drama,Thriller",35.0,5.5,...,15,13,1,4,8,0,1,0,4,0
1976489,tt16374130,2,nm4609670,0,Lou Ferrigno Jr.,1984,2022.0,"Drama,Romance",38.0,5.4,...,15,9,6,2,0,1,0,4,0,2
1976490,tt19394272,8,nm9462038,1,Kimya Gökçe Aytaç,1990,2022.0,"Comedy,Drama,Romance",32.0,7.0,...,3,1,1,2,4,0,1,0,0,0
1976491,tt11318364,2,nm0154169,1,Swatilekha Sengupta,1950,2022.0,"Drama,Family",72.0,7.1,...,2,3,0,0,0,0,0,0,2,0


In [17]:
# Restrict the study to the popular genres.
genres = ["Drama", "Action", "Romance", "Comedy", "Crime", "War", "Family", "Mystery", "Adventure"]

# Experience columns, listed in the same order as `genres`.
experience_columns = [f"{name}_exp_so_far" for name in genres]

# For each genre: select the rows whose genre string mentions it, correlate the
# experience columns with one another on that subset, and keep the row that
# belongs to the genre's own experience column.
corr_before_ongoing_genres = []
for genre in genres:
    mentions_genre = movie_data["genres"].str.contains(genre)
    movie_data_par_genre = movie_data[mentions_genre]
    corr = movie_data_par_genre[experience_columns].corr()
    corr_before_ongoing_genres.append(list(corr.loc[f"{genre}_exp_so_far"]))

In [18]:
# Heatmap of the per-genre correlation rows: one row per movie genre, one column
# per experience genre.
# The tick labels are taken from `genres`, the list the matrix rows were computed
# from (its columns are in the same order), rather than from separate hard-coded
# copies that could drift from the data.
fig = px.imshow(
    corr_before_ongoing_genres,
    labels=dict(x="Actor's experience in each genre", y="Genre of the movie", color="correlation"),
    x=list(genres),
    y=list(genres),
    color_continuous_scale=px.colors.sequential.Cividis_r,
)
# Put the x-axis labels above the matrix.
fig.update_xaxes(side="top")
fig.show()
# fig.write_html("./data/correlation_map.html")