# Data exploration
This notebook contains exploratory tests of the data import steps and a few data-visualisation helpers.

In [1]:
# Input folders
DATA_FOLDER = './data/'
MOVIES_FOLDER = f'{DATA_FOLDER}movies_summaries/'
PLOT_SUMMARY_FOLDER = f'{DATA_FOLDER}corenlp_plot_summaries'

# Paths under ./gen/
REPORT_FOLDER = './gen/reports/'
ETHNICITY_FILE = './gen/ethnicities.tsv'

# Individual files inside MOVIES_FOLDER
CHARACTERS_FILE = f'{MOVIES_FOLDER}character.metadata.tsv'
MOVIES_FILE = f'{MOVIES_FOLDER}movie.metadata.tsv'
PLOT_SUMMARIES_FILE = f'{MOVIES_FOLDER}plot_summaries.txt'
TROPES_FILE = f'{MOVIES_FOLDER}tvtropes.clusters.txt'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast

from pandas_profiling import ProfileReport
%matplotlib inline

# Characters

In [None]:
# Column names applied to the character metadata TSV, in file order
character_columns = [
    'wiki_movie_id', 'freebase_movie_id', 'm_release_date', 'name',
    'a_dob', 'a_gender', 'a_height', 'a_ethnicity_freebase_id', 'a_name',
    'a_age_at_release', 'freebase_char/a_map', 'freebase_char_id', 'freebase_a_id',
]

# Values that do not match '%Y-%m-%d' are turned into NaT by errors='coerce'
characters = pd.read_csv(CHARACTERS_FILE, sep='\t', names=character_columns, index_col=False).assign(
    m_release_date=lambda frame: pd.to_datetime(frame['m_release_date'], format='%Y-%m-%d', errors='coerce')
)

print(characters.shape)
characters.head()

In [None]:
# generate profiling report
# profile_characters = ProfileReport(characters, title='Characters report')
# profile_characters.to_file(REPORT_FOLDER + "character_report_0.html")

# Import ethnicities

In [None]:
# Two-column TSV without a header row: Freebase ethnicity id -> ethnicity name
ethnicities = pd.read_csv(
    ETHNICITY_FILE,
    sep='\t',
    header=None,
    names=['freebase_ethnicity_id', 'ethnicity_name'],
)
ethnicities

In [None]:
# Attach the ethnicity name to every character; the left join keeps characters without a match
characters = pd.merge(
    left=characters,
    right=ethnicities,
    left_on='a_ethnicity_freebase_id',
    right_on='freebase_ethnicity_id',
    how='left',
)
# The right-hand join key is redundant with 'a_ethnicity_freebase_id' on matched rows
characters = characters.drop(columns=['freebase_ethnicity_id'])
# NOTE: a former `characters.rename({...})` call was removed here. Without `columns=` it
# targeted the row index, so it never renamed anything. The id column therefore keeps the
# name 'a_ethnicity_freebase_id', which the final column selection of `df` relies on.
characters

# Import tvtropes
Import the TV Tropes clusters so that a trope can be assigned to characters.

In [None]:
# Each line holds a trope name and a dict-like string of properties
tropes = pd.read_csv(
    TROPES_FILE,
    sep='\t',
    header=None,
    names=['trope', 'props'],
)
tropes

In [None]:
def split_tropes_props(x):
    """Flatten one tropes row into a single dict.

    `x` must expose a 'trope' entry and a 'props' entry; 'props' is a string holding a
    Python-literal dict, parsed with `ast.literal_eval`. The returned dict starts with
    the 'trope' key, followed by every key of the parsed properties (a 'trope' key
    inside the properties overrides the row value).
    """
    return {'trope': x['trope'], **ast.literal_eval(x['props'])}

# One column per property; 'id' is renamed so that it matches the join key used in `characters`
tropes = (
    tropes
    .apply(split_tropes_props, axis=1, result_type='expand')
    .rename(columns={'id': 'freebase_char/a_map'})
)
tropes

In [None]:
# problem: we only have 501 characters tropes
# The inner join keeps only the characters that have a trope entry
characters_tropes = pd.merge(
    left=characters,
    right=tropes,
    how='inner',
    on='freebase_char/a_map',
)
characters_tropes

# Movies

In [None]:
# Column names applied to the movie metadata TSV, in file order
movies_columns = [
    'wiki_movie_id', 'freebase_movie_id', 'name', 'release_date',
    'box_office_revenue', 'runtime', 'languages', 'countries', 'genres',
]
movies = pd.read_csv(MOVIES_FILE, names=movies_columns, sep='\t')

print(movies.shape)
movies.head()

In [None]:
# generate profiling report
# profile_movies = ProfileReport(movies, title='Movies report')
# profile_movies.to_file(REPORT_FOLDER + "movies_report_0.html")

In [None]:
# Clean dates
# Remember that one movie contains 1010 date. We need to check what to do with it. Movie : Hunting Season, changes by hand at the moment
# Note: use .dt.date to recover the date only (not the datetime part)
# The count below is taken before parsing, so it does not include values that
# errors='coerce' turns into NaT because they do not match '%Y-%m-%d'
nb_missing_release_dates = movies['release_date'].isna().sum()
print("Number of nan movies release dates: ", nb_missing_release_dates)

parsed_release_dates = pd.to_datetime(movies['release_date'], format='%Y-%m-%d', errors='coerce')
movies['release_date'] = parsed_release_dates
movies.head()

In [None]:
# Share of movies for which a box office revenue is available
nb_movies = movies.shape[0]
nb_with_revenue = nb_movies - movies['box_office_revenue'].isna().sum()
print("Number of movies with box office revenue indicated: {} ({:.2%})".format(nb_with_revenue, nb_with_revenue / nb_movies))

# keep only movies with box office revenue
has_revenue = movies['box_office_revenue'].notna()
movies_without_na = movies[has_revenue]
print(movies_without_na.shape)

In [None]:
# Number of movies per release year: all movies, and only those with a box office revenue
all_release_years = movies['release_date'].dt.year
movies_by_year = movies.groupby(all_release_years)['release_date'].count()

revenue_release_years = movies_without_na['release_date'].dt.year
movies_by_year_without_na = movies_without_na.groupby(revenue_release_years)['release_date'].count()

# plot values: one panel per series, same axis labels on both
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
panels = [
    ('Movies by year', movies_by_year),
    ('Movies by year (without NA)', movies_by_year_without_na),
]
for panel_ax, (panel_title, yearly_counts) in zip(ax, panels):
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Year')
    panel_ax.set_ylabel('Number of movies')
    panel_ax.bar(yearly_counts.index, yearly_counts.values)
plt.show()

In [None]:
# Get box office revenue by year
# From here on, `movies` only contains the movies with a known box office revenue.
# Work on an explicit copy: `movies_without_na` is a filtered selection of the original
# frame, and assigning a column to it directly triggers pandas' SettingWithCopyWarning.
movies = movies_without_na.copy()
# Fixed 64-bit width so the cast does not depend on the platform's default integer size
movies['box_office_revenue'] = movies['box_office_revenue'].astype('int64')

In [None]:
def sort_by_box_office_revenue(df, k=10):
    """Return the first `k` rows of `df` once sorted by decreasing 'box_office_revenue'."""
    ranked = df.sort_values(by='box_office_revenue', ascending=False)
    return ranked.iloc[:k]

# Highest-revenue movies of each release year (default k of sort_by_box_office_revenue)
top_revenue_release_year = movies['release_date'].dt.year
movies_by_year = movies.groupby(top_revenue_release_year).apply(sort_by_box_office_revenue)
movies_by_year

# Import summaries

In [None]:
# The first column of the file becomes the index and is labelled with the movie id
summaries = pd.read_csv(
    PLOT_SUMMARIES_FILE,
    sep='\t',
    index_col=0,
    names=['plot'],
).rename_axis('wiki_movie_id')
summaries.head()

# Merge dataframes

In [None]:
# Movies and characters
# Here, we do a left join in order to keep all characters
# For the columns available on both sides (release date and Freebase movie id) we keep the
# character-side value: it happens that characters have this info but we don't have any
# other info about the movie. Keeping the movie-side id would leave it empty for every
# character without a matching movie row.
df = pd.merge(left=characters, right=movies, on='wiki_movie_id', how='left', suffixes=('_c', '_m'))
duplicate_columns = ['freebase_movie_id_m', 'release_date']
df = df.drop(duplicate_columns, axis=1)
df = df.rename(columns={
    'freebase_movie_id_c': 'freebase_movie_id',
    'name_c': 'char_name',
    'name_m': 'movie_name',
    'ethnicity_name': 'a_ethnicity',
    'm_release_date': 'release_date',
})

# Merge movies, characters and summaries
# Here, we do a left join too, because we may want to have info relating the plots and the character, even without having info on the movie
# However, if other information is required, it could be better to join movies and summaries before merging with characters, in order to select only plots for which we have movies info
df = pd.merge(left=df, right=summaries, on='wiki_movie_id', how='left')

In [None]:
# Clean features by removing freebase id if they're not useful
def extract_feature(x):
    """Turn a dict-like string into a comma-separated string of its values.

    `x` is either a missing value or a string holding a Python-literal dict whose values
    are strings (parsed with `ast.literal_eval`).

    Returns `np.nan` for a missing value, otherwise the dict values joined by ', '.
    """
    # pd.isna recognises every missing marker (any NaN float, None, pd.NA), whereas an
    # identity check against np.nan only matches that exact object
    if pd.isna(x):
        return np.nan
    return ', '.join(ast.literal_eval(x).values())

# Replace the dict-like strings with plain comma-separated names
for feature_column in ['genres', 'languages', 'countries']:
    df[feature_column] = df[feature_column].apply(extract_feature)

# change order of columns
column_order = [
    'wiki_movie_id', 'freebase_movie_id', 'movie_name', 'release_date',
    'box_office_revenue', 'runtime', 'genres', 'languages', 'countries', 'plot',
    'char_name', 'a_name', 'a_gender', 'a_ethnicity', 'a_dob', 'a_age_at_release',
    'a_height', 'freebase_char/a_map', 'freebase_char_id', 'freebase_a_id',
    'a_ethnicity_freebase_id',
]
df = df[column_order]

# print all columns and more rows
pd.set_option('display.max_columns', None)
df

## Display top 10 ethnicities by decade

In [None]:
# plot the number of characters of each ethnicity (only ethnicities with more than 100 characters)
ethnicity_count = df['a_ethnicity'].value_counts()
ethnicity_count = ethnicity_count.loc[ethnicity_count > 100]

fig, ax = plt.subplots(figsize=(20, 5))
ax.set_title('Number of characters by ethnicity')
sns.barplot(
    x=ethnicity_count.index,
    y=ethnicity_count,
    color='lightblue',
    ax=ax,
)
ax.set_xlabel('Ethnicity')
plt.xticks(rotation=90)
ax.grid(axis='y')  # horizontal grid lines only
plt.show()

In [None]:
# count instances of ethnicities for each decade (characters without a known ethnicity are ignored)
df1 = characters[~characters['ethnicity_name'].isna()]

# from https://stackoverflow.com/questions/17764619/pandas-dataframe-group-year-index-by-decade
ethnicity_count = df1.groupby([df1['m_release_date'].dt.year // 10 * 10, df1['ethnicity_name']]).count()[['wiki_movie_id']]
# gen a "pretty" and usable dataset
ethnicity_count = ethnicity_count.rename(columns={'wiki_movie_id': 'count'})
ethnicity_count = ethnicity_count.reset_index()
ethnicity_count = ethnicity_count.rename(columns={'m_release_date': 'm_release_decade'})
ethnicity_count['m_release_decade'] = ethnicity_count['m_release_decade'].astype(int)

# get top k ethnicities for each decade
top_k = 10
decades = ethnicity_count['m_release_decade'].unique()

fig, ax = plt.subplots(2, 6, figsize=(20, 10), sharey=True)
ax = ax.flatten()

# One colour per ethnicity, shared by all panels. The palette is built once, with 40
# distinct hues: the default palette only holds 10 colours and repeats them.
palette = sns.color_palette('husl', n_colors=40)
colors = {}

for i, decade in enumerate(decades[1:]): # TODO the 1890 movies are not displayed
    # get top k ethnicities; copy so that the new column is added to an independent frame
    df_top_k = (
        ethnicity_count[ethnicity_count['m_release_decade'] == decade]
        .sort_values(by='count', ascending=False)
        .head(top_k)
        .copy()
    )

    # percentage within the top k (not within the whole decade)
    total = df_top_k['count'].sum()
    df_top_k['percentage'] = df_top_k['count'] / total * 100

    for ethnicity in df_top_k['ethnicity_name'].values:
        if ethnicity not in colors:
            # next unused colour, starting at index 0; wrap around instead of failing
            # if more ethnicities than colours show up
            colors[ethnicity] = palette[len(colors) % len(palette)]

    # plot distribution
    sns.barplot(x=df_top_k['ethnicity_name'], y=df_top_k['percentage'], ax=ax[i], palette=[colors[ethnicity] for ethnicity in df_top_k['ethnicity_name'].values])

    ax[i].set_title('{}s'.format(int(decade)))
    ax[i].set_xlabel('Ethnicity')
    ax[i].set_ylabel('Percentage')

    ax[i].set_xticklabels(ax[i].get_xticklabels(), rotation=90)

# Figure-level title: plt.title would replace the title of the last panel only
fig.suptitle("Ethnicity representation in movies")
plt.tight_layout()
plt.show()

In [None]:
# Preview the first rows of the merged frame `df`
df.head()

# Men/women ratio analysis

In [None]:
# At the moment, set the ratio to one if the denominator is zero
def compute_men_women_ratio(x):
    """Compute the gender statistics of one group of character rows.

    `x` is a DataFrame with at least the columns 'a_gender' and 'a_name'.

    Returns a Series with:
    - nb_actors: number of non-missing 'a_name' values
    - nb_men / nb_female: number of rows whose gender is 'M' / 'F'
    - nb_nan_gender: number of rows with a missing gender
    - m_ratio / f_ratio: share of men / women among rows with a known gender (0 if none)
    - M_F_ratio: men divided by women; 1 when there are men but no women, 0 when neither
    - nan_ratio: share of rows with a missing gender among all rows of the group
    """
    genders = x['a_gender']
    nb_actors = x['a_name'].count()
    nb_male = (genders == 'M').sum()
    nb_female = (genders == 'F').sum()
    nb_nan_gender = genders.isna().sum()

    # TODO: if we add the NAN, then change the current line by: nb_known_gender=nb_actors
    nb_known_gender = nb_male + nb_female

    if nb_female > 0:
        m_f_ratio = nb_male / nb_female
    elif nb_male > 0:
        m_f_ratio = 1
    else:
        m_f_ratio = 0

    if nb_known_gender > 0:
        m_ratio = nb_male / nb_known_gender
        f_ratio = nb_female / nb_known_gender
    else:
        m_ratio = 0
        f_ratio = 0

    nan_ratio = nb_nan_gender / x.shape[0]

    stats = {
        'nb_actors': nb_actors,
        'nb_men': nb_male,
        'nb_female': nb_female,
        'nb_nan_gender': nb_nan_gender,
        'm_ratio': m_ratio,
        'f_ratio': f_ratio,
        'M_F_ratio': m_f_ratio,
        'nan_ratio': nan_ratio,
    }
    return pd.Series(index=list(stats.keys()), data=list(stats.values()))


# One row of gender statistics per movie
df_gender = df.groupby('wiki_movie_id').apply(compute_men_women_ratio)

# The counts come back as floats from the mixed Series; cast them back to integers
count_columns = ['nb_actors', 'nb_men', 'nb_female', 'nb_nan_gender']
df_gender = df_gender.astype({count_column: int for count_column in count_columns})
df_gender

In [None]:
# Sum of the three ratios. m_ratio and f_ratio are shares of the rows with a known gender,
# while nan_ratio is a share of all rows, so this total is not expected to be 1
df_gender['total'] = df_gender['m_ratio'] + df_gender['f_ratio'] + df_gender['nan_ratio']
df_gender

In [None]:
# get the info of only one movie: attach the release date, then keep a single row per movie
movie_release_dates = df[['wiki_movie_id', 'release_date']]
df_gender = (
    pd.merge(left=movie_release_dates, right=df_gender, how='inner', left_on='wiki_movie_id', right_index=True)
    .drop_duplicates('wiki_movie_id')
    .assign(m_release_decade=lambda per_movie: per_movie['release_date'].dt.year // 10 * 10)
)
df_gender

In [None]:
# ratio over the whole dataset (every character row weighs the same)
nb_male_rows = df[df['a_gender'] == 'M'].shape[0]
nb_female_rows = df[df['a_gender'] == 'F'].shape[0]
nb_unknown_rows = df[df['a_gender'].isna()].shape[0]

print("Male / Female ratio over the whole dataset:", nb_male_rows / nb_female_rows)
print("Nan ratio over the whole dataset:", nb_unknown_rows / df.shape[0])

In [None]:
# Mean of the per-movie ratios: every movie weighs the same, whatever the size of its cast.
# The labels say so explicitly, since these values differ from the dataset-wide ratios.
print("Mean per-movie Male / Female ratio:", df_gender['M_F_ratio'].mean())
print("Mean per-movie Nan ratio:", df_gender['nan_ratio'].mean())

In [None]:
# Release decades present in df_gender, sorted, without the missing one
decades = np.sort(df_gender['m_release_decade'].unique())
decades = decades[~np.isnan(decades)]

fig, ax = plt.subplots(2, 7, figsize=(12, 7), sharey=True, sharex=True)
ax = ax.flatten()

gender_labels = ['M', 'F', 'Nan']
ratio_columns = ['m_ratio', 'f_ratio', 'nan_ratio']

for i, decade in enumerate(decades):
    df_decade = df_gender[df_gender['m_release_decade'] == decade]

    # mean per-movie ratios of the decade, as percentages
    x = gender_labels
    y = [df_decade[ratio_column].mean()*100 for ratio_column in ratio_columns]

    # plot distribution
    panel = ax[i]
    sns.barplot(x=x, y=y, ax=panel)
    panel.set_title('{}s'.format(int(decade)))

    # x label on the second row only, y label on the first column only
    if i >= 7:
        panel.set_xlabel('Gender')
    if i % 7 == 0:
        panel.set_ylabel('Percentage')

    panel.set_xticklabels(panel.get_xticklabels(), rotation=90)

plt.tight_layout()
plt.show()

# TODO add confidence intervals

In [None]:
# Release decades present in df_gender, sorted, without the missing one
decades = np.sort(df_gender['m_release_decade'].unique())
decades = decades[~np.isnan(decades)]

fig, ax = plt.subplots(2, 7, figsize=(10, 7))
ax = ax.flatten()

for i, decade in enumerate(decades):
    df_decade = df_gender[df_gender['m_release_decade'] == decade]

    # NOTE(review): m_ratio and f_ratio are shares of the rows with a known gender while
    # nan_ratio is a share of all rows; the pie rescales their sum to 100%, so the printed
    # percentages are relative to that sum. Confirm this is the intended reading.
    x = ['M', 'F', 'Nan']
    y = [df_decade['m_ratio'].mean()*100, df_decade['f_ratio'].mean()*100, df_decade['nan_ratio'].mean()*100]

    # plot distribution
    ax[i].pie(y, labels=x, autopct='%.0f%%')
    ax[i].set_title('{}s'.format(int(decade)))
    # The 'Gender' / 'Percentage' axis labels and the tick-label rotation of the bar-chart
    # version are not repeated here: the wedge labels already carry that information.

# Hide the panels of the grid that did not receive a decade
for unused_ax in ax[len(decades):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# TODO improve how decades are treated
decades = np.sort(df_gender['m_release_decade'].unique())
decades = decades[~np.isnan(decades)]

# decade -> mean per-movie share of men / women, as percentages
gender_dict = {
    decade: {
        'M': df_gender.loc[df_gender['m_release_decade'] == decade, 'm_ratio'].mean()*100,
        'F': df_gender.loc[df_gender['m_release_decade'] == decade, 'f_ratio'].mean()*100,
        # 'Nan': nan_ratio is left out on purpose for now
    }
    for decade in decades
}

# One row per decade (labelled e.g. '1990s'), one column per gender
gender_ratio_df = pd.DataFrame(gender_dict)
gender_ratio_df.columns = ['{}s'.format(int(decade_key)) for decade_key in gender_ratio_df.columns]
gender_ratio_df = gender_ratio_df.T
gender_ratio_df

In [None]:
# Long format: one row per (decade, gender) pair
plot_df = (
    gender_ratio_df
    .reset_index()
    .rename(columns={'index': 'decade'})
    .melt(id_vars='decade', value_vars=['M', 'F'])
    .rename(columns={'variable': 'gender', 'value': 'ratio'})
)

# Grouped bars: one group per decade, one bar per gender
sns.barplot(data=plot_df, x='decade', y='ratio', hue='gender')
plt.title('Gender ratio evolution over the decades')
plt.xticks(rotation=45)
plt.tight_layout()
# legend placed outside of the plotting area, on the right
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

# Age analysis

In [None]:
df2 = df.copy()
# here, we suppose that negative ages are just a mistake
df2['a_age_at_release'] = df2['a_age_at_release'].abs()
df2['age_decade_at_release'] = df2['a_age_at_release'] // 10 * 10
df2['m_release_decade'] = df2['release_date'].dt.year // 10 * 10
# df['a_age_at_release'].isna().sum()
# df['a_dob'].isna().sum() # TODO: more rows have dob than age at release. We can maybe compute age at release by ourselves

# age decades
# Implausible values (930, 960 and 7890 were observed) are excluded with a threshold
# instead of a hard-coded list of labels, which raises a KeyError as soon as one of
# them is absent from the data.
MAX_AGE_DECADE = 100
age_decades = df2['age_decade_at_release'].dropna().unique()
age_decades = np.sort(age_decades[age_decades <= MAX_AGE_DECADE])
age_decades

In [None]:
# Here, we put the 70s+ together
df2['age_decade_at_release'] = df2['age_decade_at_release'].replace(to_replace=[80.,  90., 100.], value=70.)
# Select by value rather than by position (`[:-3]`): the result no longer depends on
# which decades happen to be last, and running the cell again does not strip three
# more decades.
age_decades = age_decades[age_decades <= 70.]

In [None]:
# Release decades present in df2, sorted, without the missing one
decades = np.sort(df2['m_release_decade'].unique())
decades = decades[~np.isnan(decades)]

# The label logic below is derived from the grid shape (it previously assumed 7 columns)
n_rows, n_cols = 2, 6
fig, ax = plt.subplots(n_rows, n_cols, figsize=(12, 7), sharey=True, sharex=True)
ax = ax.flatten()

for i, decade in enumerate(decades[3:]): # TODO 1880 and 1890 and 1900 have been removed because they contain almost no data
    # rows of the decade for which the age decade is known
    df_decade = df2[df2['m_release_decade'] == decade]
    df_decade = df_decade[~df_decade['age_decade_at_release'].isna()]
    nb_known_ages = df_decade.shape[0]

    x = []
    y = []
    for age_decade in age_decades:
        if age_decade == 70:
            x.append('70+')
        else:
            x.append('{}s'.format(int(age_decade)))
        nb_in_bin = df_decade[df_decade['age_decade_at_release'] == age_decade].shape[0]
        # a decade without any known age yields 0% everywhere instead of a division by zero
        y.append(nb_in_bin / nb_known_ages * 100 if nb_known_ages > 0 else 0)

    # plot distribution
    sns.barplot(x=x, y=y, ax=ax[i], palette='tab10')

    ax[i].set_title('{}s'.format(int(decade)))

    # x label on the last row only, y label on the first column only
    if i // n_cols == n_rows - 1:
        ax[i].set_xlabel('Age decade')

    if i % n_cols == 0:
        ax[i].set_ylabel('Percentage')

    ax[i].set_xticklabels(ax[i].get_xticklabels(), rotation=90)

plt.tight_layout()
plt.show()

# TODO add confidence intervals

In [None]:
# release decade -> {age-decade label -> percentage of the rows with a known age}
age_decades_dict = {}

for decade in decades[3:]: # TODO 1880 and 1890 and 1900 have been removed because they contain almost no data
    # rows of the decade for which the age decade is known
    df_decade = df2[df2['m_release_decade'] == decade]
    df_decade = df_decade[~df_decade['age_decade_at_release'].isna()]
    nb_known_ages = df_decade.shape[0]

    row = {}
    for age_decade in age_decades:
        if age_decade == 70:
            col_name = '70s+'
        else:
            col_name = '{}s'.format(int(age_decade))

        nb_in_bin = df_decade[df_decade['age_decade_at_release'] == age_decade].shape[0]
        # a decade without any known age yields 0% instead of a division by zero
        share = nb_in_bin / nb_known_ages * 100 if nb_known_ages > 0 else 0

        # Sum the shares of the age decades that map to the same label. The former
        # `row[col_name].update(...)` branch would have called `.update` on a float.
        row[col_name] = row.get(col_name, 0) + share

    age_decades_dict[decade] = row

In [None]:
# One row per release decade (labelled e.g. '1990s'), one column per age decade
age_decades_df = (
    pd.DataFrame(age_decades_dict)
    .rename(columns=lambda release_decade: '{}s'.format(int(release_decade)))
    .T
)
age_decades_df

In [None]:
# Potentially interesting: area, bar, pie (but for each of them independently)
# Stacked bars: one bar per release decade, one segment per age decade
age_ax = age_decades_df.plot(kind='bar', stacked=True)

age_ax.set_xlabel('Decade')
age_ax.set_ylabel('Percentage')
age_ax.set_title('Actor age distribution over the decades')
# legend placed outside of the plotting area, on the right
age_ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
# final tick rotation (an earlier 90-degree setting was overridden by this one)
plt.xticks(rotation=45)
plt.show()

# As time advances, we can see more and more older actors.
# Maybe this is due to successful actors who keep performing for a long time. It can also be due to the increase in life expectancy.
# Other ideas ?
# It would be good to do the same for the ethnicities. For the gender, an area plot may be better.