In [1]:
# imports
# processing data
import pandas as pd
import numpy as np

# analysis
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# plot
import matplotlib.pyplot as plt
from d3blocks import D3Blocks
import seaborn as sns
import plotly.express as px



# Section 1: preprocessing data.

In [2]:
# Please add all preprocessing code here.

# Section 2: genre analysis

- 1. Feature engineering


- 2. Genre pattern evaluation.

In [3]:
# Read the actor/movie combination table used by the genre co-occurrence analysis.
movie_data = pd.read_csv("data/actor_movie_combi.csv")
# Preview: every row except the final 10 (the notebook truncates the rendering).
movie_data.iloc[:-10]

Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,deathYear,startYear,genres,deatYear,age_at_movie_start,averageRating,numVotes,exp_so_far,drama_exp_so_far,action_exp_so_far,romance_exp_so_far,comedy_exp_so_far
0,tt7816420,1,nm1155956,0,Eadweard Muybridge,1830.0,1904,1881.0,"Documentary,Short",1904.0,51.0,5.2,462.0,0,0,0,0,0
1,tt1758563,1,nm1796515,0,Adolphe Le Prince,1872.0,1901,1888.0,"Documentary,Short",1901.0,16.0,5.5,1333.0,0,0,0,0,0
2,tt0361921,1,nm1362928,0,Giuseppe Sacco Albanese,1872.0,1943,1890.0,"Documentary,Short",1943.0,18.0,5.0,1508.0,0,0,0,0,0
3,tt0416047,1,nm1362928,0,Giuseppe Sacco Albanese,1872.0,1943,1890.0,Short,1943.0,18.0,4.2,425.0,1,0,0,0,0
4,tt0416046,1,nm1362928,0,Giuseppe Sacco Albanese,1872.0,1943,1890.0,Short,1943.0,18.0,4.8,1106.0,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698931,tt7728792,2,nm6074154,0,Harry Hains,1992.0,2020,2022.0,Drama,2020.0,30.0,8.2,19.0,5,4,0,2,0
698932,tt21045916,3,nm0582378,0,Sombat Metanee,1937.0,2022,2022.0,"Drama,Romance",2022.0,85.0,10.0,5.0,16,2,9,2,4
698933,tt21045922,3,nm0582378,0,Sombat Metanee,1937.0,2022,2022.0,"Drama,Romance",2022.0,85.0,10.0,5.0,17,3,9,3,4
698934,tt21048302,2,nm7670169,0,Papangkorn Lerkchaleampote,1996.0,2022,2022.0,"Adventure,Drama,Thriller",2022.0,26.0,8.0,57.0,13,13,0,1,1


In [None]:
%%capture
# pair-wise genre analysis
# Count, for every pair of popular genres, how many rows of `movie_data` list
# both genres, then draw the counts as a chord diagram.
target_genres = ["Drama", "Comedy", "Crime", "Action", "Romance", "Family", "Adventure", "Mystery", "War", "Musical", "Thriller"]

# Keep only rows that list more than one genre (comma-separated string).
# `na=False` treats a missing genre string as "not multi-genre"; without it the
# mask would contain NaN and pandas refuses to index with such a mask.
# NOTE(review): movie_data appears to hold one row per (actor, movie) pair, so a
# movie is counted once per listed actor -- confirm this weighting is intended,
# otherwise de-duplicate on `tconst` first.
has_several_genres = movie_data["genres"].str.contains(",", regex=False, na=False)
multiple_genres_movie = movie_data[has_several_genres].reset_index()

# One 0/1 column per target genre: 1 when the genre name occurs in the row's
# genre string (plain substring match, not a regular expression).
genre_flags = np.column_stack([
    multiple_genres_movie["genres"].str.contains(genre, regex=False, na=False).to_numpy(dtype=float)
    for genre in target_genres
])

# Despite its name this holds co-occurrence counts, not correlations:
# corr_matrix[a, b] = number of rows containing both target_genres[a] and
# target_genres[b]. A genre is never paired with itself, so the diagonal is 0.
corr_matrix = genre_flags.T @ genre_flags
np.fill_diagonal(corr_matrix, 0)

weight_matrix = corr_matrix.tolist()

# The counts are symmetric, so each unordered pair is emitted once
# (upper triangle only). Self-pairs are skipped: their weight is always 0.
genre_count = len(target_genres)
rows = [
    {
        "source": target_genres[index1],
        "target": target_genres[index2],
        "weight": weight_matrix[index1][index2],
    }
    for index1 in range(genre_count)
    for index2 in range(index1 + 1, genre_count)
]

# Build the link table in one call so every row gets its own index label.
matrix = pd.DataFrame(rows, columns=["source", "target", "weight"])

# Initialize plot
d3 = D3Blocks()

# chord plot
d3.chord(matrix);


- 3. Correlation between actors' backgrounds (i.e., their prior experience in each genre) and the genre of the movie they were filming.

In [9]:
# Read the table that adds per-genre experience counters to each row.
movie_data = pd.read_csv("data/genre_experience.csv")
# Preview: every row except the final 10 (the notebook truncates the rendering).
movie_data.iloc[:-10]

Unnamed: 0.1,Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,deathYear,startYear,genres,...,comedy_exp_so_far,adventure_exp_so_far,crime_exp_so_far,war_exp_so_far,family_exp_so_far,mystery_exp_so_far,Drama,Action,Romance,Comedy
0,0,tt0000009,3,nm1309758,0,Chauncey Depew,1834.0,1928,1894.0,Romance,...,0,0,0,0,0,0,0,0,1,0
1,1,tt0000009,1,nm0063086,1,Blanche Bayliss,1878.0,1951,1894.0,Romance,...,0,0,0,0,0,0,0,0,1,0
2,2,tt0000009,2,nm0183823,0,William Courtenay,1875.0,1933,1894.0,Romance,...,0,0,0,0,0,0,0,0,1,0
3,3,tt1666737,1,nm0525908,0,Auguste Lumière,1862.0,1954,1896.0,"Action,Comedy,Family",...,2,0,0,0,1,0,0,1,0,1
4,4,tt0000211,1,nm0194945,1,Jehanne d'Alcy,1865.0,1956,1898.0,"Comedy,Fantasy,Horror",...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
468542,468542,tt18351130,3,nm0430074,0,Leslie Jordan,1955.0,2022,2022.0,Comedy,...,99,4,18,0,1,0,0,0,0,1
468543,468543,tt18561180,3,nm0430074,0,Leslie Jordan,1955.0,2022,2022.0,Comedy,...,100,4,18,0,1,0,0,0,0,1
468544,468544,tt18568804,3,nm0430074,0,Leslie Jordan,1955.0,2022,2022.0,Comedy,...,101,4,18,0,1,0,0,0,0,1
468545,468545,tt7728792,2,nm6074154,0,Harry Hains,1992.0,2020,2022.0,Drama,...,0,0,0,0,0,0,1,0,0,0


In [10]:
# For each genre, restrict to rows whose genre string mentions it and record how
# that genre's experience counter correlates with every experience counter.
genres = ["Drama", "Action", "Romance", "Comedy", "Crime", "War", "Family", "Mystery", "Adventure"]

# Experience columns derived from `genres`, so column order always matches the
# genre order used for the rows of the result.
experience_columns = [f"{genre.lower()}_exp_so_far" for genre in genres]

corr_before_ongoing_genres = []
for genre in genres:
    # Plain substring match; `na=False` drops rows with a missing genre string
    # instead of producing a NaN mask entry that cannot be used for filtering.
    in_genre = movie_data["genres"].str.contains(genre, regex=False, na=False)
    movie_data_par_genre = movie_data[in_genre]
    corr = movie_data_par_genre[experience_columns].corr()
    # Keep only the row for this genre's own experience column.
    corr_before_ongoing_genres.append(list(corr.loc[f"{genre.lower()}_exp_so_far"]))

In [12]:
# plot.
# Heat map of corr_before_ongoing_genres: one row per movie genre, one column
# per experience genre. Both axes use the same genre labels.
genre_labels = ['Drama', 'Action', 'Romance', 'Comedy', 'Crime', 'War', 'Family', 'Mystery', 'Adventure']
axis_titles = {
    "x": "Actor's experience in each genre",
    "y": "Genre of the movie",
    "color": "correlation",
}
fig = px.imshow(
    corr_before_ongoing_genres,
    labels=axis_titles,
    x=genre_labels,
    y=genre_labels,
    color_continuous_scale=px.colors.sequential.Cividis_r,
)
# Put the x-axis labels above the matrix.
fig.update_xaxes(side="top")
fig.show()
# Optional export of the interactive figure:
# fig.write_html("./data/correlation_map.html")