In [72]:
# processing data
import pandas as pd
import numpy as np
import itertools

# analysis
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import rfpimp

# plot
import matplotlib.pyplot as plt
from d3blocks import D3Blocks
import seaborn as sns
import plotly.express as px



In [35]:
# Load the actor-per-movie table and preview it.
movie_data = pd.read_csv(filepath_or_buffer="data/actor_movie_combi2.csv")
# Everything except the last 10 rows (the notebook display truncates the middle).
movie_data.iloc[:-10]

Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,deathYear,startYear,genres,deatYear,age_at_movie_start,averageRating,numVotes,exp_so_far,drama_exp_so_far,action_exp_so_far,romance_exp_so_far,comedy_exp_so_far
0,tt7816420,1,nm1155956,0,Eadweard Muybridge,1830.0,1904,1881.0,"Documentary,Short",1904.0,51.0,5.2,462.0,0,0,0,0,0
1,tt1758563,1,nm1796515,0,Adolphe Le Prince,1872.0,1901,1888.0,"Documentary,Short",1901.0,16.0,5.5,1333.0,0,0,0,0,0
2,tt0361921,1,nm1362928,0,Giuseppe Sacco Albanese,1872.0,1943,1890.0,"Documentary,Short",1943.0,18.0,5.0,1508.0,0,0,0,0,0
3,tt0416047,1,nm1362928,0,Giuseppe Sacco Albanese,1872.0,1943,1890.0,Short,1943.0,18.0,4.2,425.0,1,0,0,0,0
4,tt0416046,1,nm1362928,0,Giuseppe Sacco Albanese,1872.0,1943,1890.0,Short,1943.0,18.0,4.8,1106.0,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
698931,tt7728792,2,nm6074154,0,Harry Hains,1992.0,2020,2022.0,Drama,2020.0,30.0,8.2,19.0,5,4,0,2,0
698932,tt21045916,3,nm0582378,0,Sombat Metanee,1937.0,2022,2022.0,"Drama,Romance",2022.0,85.0,10.0,5.0,16,2,9,2,4
698933,tt21045922,3,nm0582378,0,Sombat Metanee,1937.0,2022,2022.0,"Drama,Romance",2022.0,85.0,10.0,5.0,17,3,9,3,4
698934,tt21048302,2,nm7670169,0,Papangkorn Lerkchaleampote,1996.0,2022,2022.0,"Adventure,Drama,Thriller",2022.0,26.0,8.0,57.0,13,13,0,1,1


# Credibility
- 1. genre pair pattern

In [36]:
# Genres whose pairwise co-occurrence is drawn in the chord diagram.
target_genres = ["Drama", "Comedy", "Crime", "Action", "Romance", "Family", "Adventure", "Mystery", "War", "Musical", "Thriller"]

# filter out movies with single genre.
# `genres` is a comma-separated string, so a comma means at least two genres.
# na=False makes a missing `genres` value count as "no comma" instead of
# breaking the boolean mask.
multiple_genres_movie = movie_data[movie_data["genres"].str.contains(",", na=False)].reset_index()

# One 0/1 column per target genre: does the genre name occur in the row's
# `genres` string?  regex=False is the same plain substring test as `in`.
genre_flags = np.column_stack([
    multiple_genres_movie["genres"].str.contains(genre, regex=False).to_numpy(dtype=float)
    for genre in target_genres
])

# Entry [i, j] counts the rows that mention both genre i and genre j.
# This replaces the per-row Python loop over all 11 x 11 genre pairs.
genre_matrix = genre_flags.T @ genre_flags
# A genre is never paired with itself, so the diagonal stays at zero.
np.fill_diagonal(genre_matrix, 0)

# Edge list for the chord diagram: one row per unordered genre pair.
# index2 starts at index1, so every genre also gets a self-pair row whose
# weight is always 0 (diagonal zeroed above).
# Building the frame in one go gives a unique 0..n-1 index; the previous
# concat of one-row frames left every row labelled 0.
plot_matrix = pd.DataFrame(
    [
        {
            "source": target_genres[index1],
            "target": target_genres[index2],
            "weight": genre_matrix[index1, index2],
        }
        for index1 in range(len(target_genres))
        for index2 in range(index1, len(target_genres))
    ],
    columns=["source", "target", "weight"],
)

# Initialize
d3 = D3Blocks()

# chord plot
d3.chord(plot_matrix)

[d3blocks] >INFO> Cleaning edge_properties and config parameters..
[d3blocks] >INFO> Cleaning edge_properties and config parameters..
[d3blocks] >INFO> Initializing [Chord]
[d3blocks] >INFO> Convert to Frame.
[d3blocks] >INFO> Node properties are set.
[d3blocks] >INFO> Set edge-opacity based on the [source] node-opacity.
[d3blocks] >INFO> Set edge-colors based on the [source] node-color.
[d3blocks] >INFO> Edge properties are set.
[d3blocks] >INFO> File already exists and will be overwritten: [/var/folders/hc/l31vrq9j7_s_2l_yt65xnntr0000gp/T/d3blocks/chord.html]
[d3blocks] >INFO> File not found: [file:////var/folders/hc/l31vrq9j7_s_2l_yt65xnntr0000gp/T/d3blocks/chord.html]


- 2. What background do you need to get a movie of a particular genre?

In [49]:
# Replace `movie_data` with the genre-experience table and preview it.
movie_data = pd.read_csv(filepath_or_buffer="data/genre_experience.csv")
# Everything except the last 10 rows (the notebook display truncates the middle).
movie_data.iloc[:-10]

Unnamed: 0.1,Unnamed: 0,tconst,ordering,nconst,gender,primaryName,birthYear,deathYear,startYear,genres,...,comedy_exp_so_far,crime_exp_so_far,war_exp_so_far,family_exp_so_far,mystery_exp_so_far,Drama,Action,Adventure,Romance,Comedy
0,15,tt0000009,3,nm1309758,0,Chauncey Depew,1834.0,1928,1894.0,Romance,...,0,0,0,0,0,0,0,0,1,0
1,16,tt0000009,1,nm0063086,1,Blanche Bayliss,1878.0,1951,1894.0,Romance,...,0,0,0,0,0,0,0,0,1,0
2,22,tt0000009,2,nm0183823,0,William Courtenay,1875.0,1933,1894.0,Romance,...,0,0,0,0,0,0,0,0,1,0
3,74,tt1666737,1,nm0525908,0,Auguste Lumière,1862.0,1954,1896.0,"Action,Comedy,Family",...,2,0,0,1,0,0,1,0,0,1
4,142,tt0000211,1,nm0194945,1,Jehanne d'Alcy,1865.0,1956,1898.0,"Comedy,Fantasy,Horror",...,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486530,698923,tt18351130,3,nm0430074,0,Leslie Jordan,1955.0,2022,2022.0,Comedy,...,99,18,0,1,0,0,0,0,0,1
486531,698926,tt18561180,3,nm0430074,0,Leslie Jordan,1955.0,2022,2022.0,Comedy,...,100,18,0,1,0,0,0,0,0,1
486532,698927,tt18568804,3,nm0430074,0,Leslie Jordan,1955.0,2022,2022.0,Comedy,...,101,18,0,1,0,0,0,0,0,1
486533,698931,tt7728792,2,nm6074154,0,Harry Hains,1992.0,2020,2022.0,Drama,...,0,0,0,0,0,1,0,0,0,0


In [56]:
# Show the column labels of the table currently bound to `movie_data`.
movie_data.columns

Index(['Unnamed: 0', 'tconst', 'ordering', 'nconst', 'gender', 'primaryName',
       'birthYear', 'deathYear', 'startYear', 'genres', 'deatYear',
       'age_at_movie_start', 'averageRating', 'numVotes', 'exp_so_far',
       'drama_exp_so_far', 'action_exp_so_far', 'romance_exp_so_far',
       'comedy_exp_so_far', 'crime_exp_so_far', 'war_exp_so_far',
       'family_exp_so_far', 'mystery_exp_so_far', 'Drama', 'Action',
       'Adventure', 'Romance', 'Comedy'],
      dtype='object')

In [73]:
genres = ["Drama", "Action", "Romance", "Comedy", "Crime", "War", "Family", "Mystery"]
# Per-genre experience counters, in the same order as `genres`.
exp_columns = [f"{name.lower()}_exp_so_far" for name in genres]

# One row per current-movie genre: correlation of that genre's own experience
# counter with every genre's experience counter, over movies of that genre.
corr_before_ongoing_genres = []
for genre in genres:
    # Rows whose `genres` string mentions this genre.
    is_current_genre = movie_data["genres"].str.contains(genre)
    movie_data_par_genre = movie_data[is_current_genre]
    corr = movie_data_par_genre[exp_columns].corr()
    own_experience_row = corr.loc[f"{genre.lower()}_exp_so_far"]
    corr_before_ongoing_genres.append(list(own_experience_row))


[[1.0,
  0.33122421638616023,
  0.3475828366072777,
  0.4148992625300694,
  0.6604587195636393,
  0.14133254010769872,
  0.33219610937447064,
  0.5027843235441387],
 [0.4741118747804988,
  1.0,
  0.01053863903995987,
  0.23217147187192408,
  0.5987133526364763,
  0.12570959619981595,
  0.12328133616806136,
  0.11397715346952726],
 [0.8236204855536398,
  0.11494900241236616,
  1.0,
  0.632892937004162,
  0.1359947922914056,
  -0.0034836157456953044,
  0.3676120963548429,
  0.07908826734728817],
 [0.3730353223014378,
  0.11369746914139921,
  0.2401741930318785,
  1.0,
  0.15588788511153553,
  0.13275332471803414,
  0.6801294674909604,
  0.06305040564199375],
 [0.8833215613458237,
  0.4447523386694388,
  -0.00601413384575488,
  0.22911398909314215,
  1.0,
  0.031202789585442037,
  0.10042053582611443,
  0.6801953937932703],
 [0.36911828654637924,
  0.2954932856416001,
  -0.05694774981349347,
  0.8459789160094907,
  0.2180725236245068,
  1.0,
  0.055508209227943896,
  0.25277941030643275],

In [83]:
# Heatmap of the per-genre correlation rows: each row is the genre of the
# current movie, each column a genre of past experience.
genre_labels = ['Drama', 'Action', 'Romance', 'Comedy', 'Crime', 'War', 'Family', 'Mystery']
axis_titles = dict(x="Genres experience in the past", y="Genre of the current movie", color="correlation")

fig = px.imshow(
    corr_before_ongoing_genres,
    labels=axis_titles,
    x=genre_labels,
    y=genre_labels,
    color_continuous_scale=px.colors.sequential.Cividis_r,
)
# Put the column labels above the matrix.
fig.update_xaxes(side="top")
fig.show()
# Also save a standalone HTML copy of the figure.
fig.write_html("./data/correlation_map.html")

In [48]:
# Restrict to highly rated, recent movies: averageRating strictly above 8.0
# and startYear from 1997 onwards.
# .copy() gives an independent frame, so adding a column below does not write
# into a slice of `movie_data` (avoids SettingWithCopyWarning).
imp_movie_data = movie_data[
    (movie_data["averageRating"] > 8.0) & (movie_data["startYear"] >= 1997.0)
].copy()

# Combined experience in the four genres summed here.
imp_movie_data["imp_genres_exp"] = (
    imp_movie_data["drama_exp_so_far"]
    + imp_movie_data["action_exp_so_far"]
    + imp_movie_data["romance_exp_so_far"]
    + imp_movie_data["comedy_exp_so_far"]
)

# Pairwise correlations between actor features, rating and experience counters.
corr = imp_movie_data[["age_at_movie_start", "averageRating", "numVotes", "exp_so_far", "drama_exp_so_far", "action_exp_so_far", "romance_exp_so_far", "comedy_exp_so_far", "imp_genres_exp"]].corr()

# Styler.set_precision is deprecated (removed in pandas 2.0);
# Styler.format(precision=...) is its replacement.
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

  """


Unnamed: 0,age_at_movie_start,averageRating,numVotes,exp_so_far,drama_exp_so_far,action_exp_so_far,romance_exp_so_far,comedy_exp_so_far,imp_genres_exp
age_at_movie_start,1.0,-0.01,0.01,0.13,0.09,0.02,0.08,0.13,0.12
averageRating,-0.01,1.0,0.0,-0.09,-0.06,-0.04,-0.01,-0.09,-0.08
numVotes,0.01,0.0,1.0,0.01,0.01,0.0,0.01,0.01,0.01
exp_so_far,0.13,-0.09,0.01,1.0,0.67,0.67,0.17,0.64,0.95
drama_exp_so_far,0.09,-0.06,0.01,0.67,1.0,0.24,0.21,0.27,0.73
action_exp_so_far,0.02,-0.04,0.0,0.67,0.24,1.0,0.02,0.18,0.68
romance_exp_so_far,0.08,-0.01,0.01,0.17,0.21,0.02,1.0,0.13,0.26
comedy_exp_so_far,0.13,-0.09,0.01,0.64,0.27,0.18,0.13,1.0,0.68
imp_genres_exp,0.12,-0.08,0.01,0.95,0.73,0.68,0.26,0.68,1.0


In [42]:
# How many rows have an average rating strictly above 7.0?
rated_above_7 = movie_data["averageRating"] > 7.0
imp_movie_data = movie_data.loc[rated_above_7]
imp_movie_data.shape[0]

305465

# Rating
In this section, we explore the relationship between the average rating and actor features (i.e., age_at_movie_start, exp_so_far, and gender).

In [None]:
# Ordinary least squares: rating explained by the actor's age, experience so
# far and gender (gender entered as a categorical term).
rating_formula = 'averageRating ~ age_at_movie_start + exp_so_far + C(gender)'
mod = smf.ols(data=movie_data, formula=rating_formula)

# Fit the model and print the regression table.
res = mod.fit()
regression_table = res.summary()
print(regression_table)

In [None]:
# compute feature importance
# Fixed seed so the split, the forest and therefore the importances are the
# same on every run.
RANDOM_STATE = 42

# split data
features = ['age_at_movie_start', 'exp_so_far', 'gender', 'averageRating']
df_train, df_test = train_test_split(movie_data[features], test_size=0.20, random_state=RANDOM_STATE)

# averageRating is the target; the remaining columns are the predictors.
X_train, y_train = df_train.drop('averageRating', axis=1), df_train['averageRating']
X_test, y_test = df_test.drop('averageRating', axis=1), df_test['averageRating']

# train
rf = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Permutation feature importance
# NOTE(review): rfpimp's column shuffling presumably draws from NumPy's global
# RNG, so it is seeded here as well -- confirm against the rfpimp source.
np.random.seed(RANDOM_STATE)
imp = rfpimp.importances(rf, X_test, y_test)

# plot
fig, ax = plt.subplots(figsize=(6, 3))

ax.barh(imp.index, imp['Importance'], height=0.8, facecolor='grey', alpha=0.8, edgecolor='k')
ax.set_xlabel('Importance score')
ax.set_title('Permutation feature importance')
# Flip the y axis so the first row of `imp` is drawn at the top; uses the
# explicit axes instead of the pyplot "current axes".
ax.invert_yaxis()

fig.tight_layout()

In [None]:
# Ordinary least squares: same rating model, with the number of votes added
# as a further predictor (gender entered as a categorical term).
rating_formula = 'averageRating ~ age_at_movie_start + exp_so_far + numVotes + C(gender)'
mod = smf.ols(data=movie_data, formula=rating_formula)

# Fit the model and print the regression table.
res = mod.fit()
regression_table = res.summary()
print(regression_table)

In [None]:
# compute feature importance
# Fixed seed so the split, the forest and therefore the importances are the
# same on every run.
RANDOM_STATE = 42

# split data
features = ['age_at_movie_start', 'numVotes', 'exp_so_far', 'gender', 'averageRating']
df_train, df_test = train_test_split(movie_data[features], test_size=0.20, random_state=RANDOM_STATE)

# averageRating is the target; the remaining columns are the predictors.
X_train, y_train = df_train.drop('averageRating', axis=1), df_train['averageRating']
X_test, y_test = df_test.drop('averageRating', axis=1), df_test['averageRating']

# train
rf = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=RANDOM_STATE)
rf.fit(X_train, y_train)

# Permutation feature importance
# NOTE(review): rfpimp's column shuffling presumably draws from NumPy's global
# RNG, so it is seeded here as well -- confirm against the rfpimp source.
np.random.seed(RANDOM_STATE)
imp = rfpimp.importances(rf, X_test, y_test)

# plot
fig, ax = plt.subplots(figsize=(6, 3))

ax.barh(imp.index, imp['Importance'], height=0.8, facecolor='grey', alpha=0.8, edgecolor='k')
ax.set_xlabel('Importance score')
ax.set_title('Permutation feature importance')
# Flip the y axis so the first row of `imp` is drawn at the top; uses the
# explicit axes instead of the pyplot "current axes".
ax.invert_yaxis()

fig.tight_layout()

In [24]:
# Sanity check of the chord chart on d3blocks' 'energy' example dataset.
# D3Blocks is already imported in the notebook's import cell, so it is not
# re-imported here.

# Initialize
d3 = D3Blocks()

# Load example data
df = d3.import_example('energy')

# Plot (rendered once; the second, identical call never passed an output path
# despite its comment and only re-rendered the same chart)
d3.chord(df)

[d3blocks] >INFO> Cleaning edge_properties and config parameters..
[d3blocks] >INFO> Extracting files..
[d3blocks] >INFO> Import dataset: [energy]
[d3blocks] >INFO> Cleaning edge_properties and config parameters..
[d3blocks] >INFO> Initializing [Chord]
[d3blocks] >INFO> Convert to Frame.
[d3blocks] >INFO> Node properties are set.
[d3blocks] >INFO> Set edge-opacity based on the [source] node-opacity.
[d3blocks] >INFO> Set edge-colors based on the [source] node-color.
[d3blocks] >INFO> Edge properties are set.
[d3blocks] >INFO> File already exists and will be overwritten: [/var/folders/hc/l31vrq9j7_s_2l_yt65xnntr0000gp/T/d3blocks/chord.html]
[d3blocks] >INFO> File not found: [file:////var/folders/hc/l31vrq9j7_s_2l_yt65xnntr0000gp/T/d3blocks/chord.html]
[d3blocks] >INFO> Cleaning edge_properties and config parameters..
[d3blocks] >INFO> Initializing [Chord]
[d3blocks] >INFO> Convert to Frame.
[d3blocks] >INFO> Node properties are set.
[d3blocks] >INFO> Set edge-opacity based on the [sourc

- 1. Correlation map computed on the dataset filtered to particular genres.
- 2. Create hierarchical edge bundling based on the `genres` column.
- 3. Repeat the same correlation-map procedure as before, but filter out low-quality movies, i.e. those with a low rating.