In [None]:
import pandas as pd

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import PercentFormatter
import matplotlib.ticker as mtick
from matplotlib.lines import Line2D

#Calculating libraries
from scipy.stats import bootstrap
import numpy as np
from scipy import stats
import scipy as sp
import matplotlib.dates as md

# NOTE(review): os.stat is a filesystem call, not a statistics helper - this import looks accidental
from os import stat

# Others
from functools import partial
from ast import literal_eval

In [None]:
# Load the five raw tab-separated files from ./data. None of them is read with
# a header row, so every frame starts with integer column labels.
read_tsv = partial(pd.read_csv, sep="\t", header=None)

character = read_tsv('./data/character.metadata.tsv')
movies = read_tsv('./data/movie.metadata.tsv')
names = read_tsv('./data/name.clusters.txt')
summaries = read_tsv('./data/plot_summaries.txt')
tvtropes = read_tsv('./data/tvtropes.clusters.txt')

In [None]:
# Give every raw table readable column names. Integer labels that do not appear
# in a mapping (for example label 2 of `character`) are left unchanged.
character_columns = {
    0: 'wikipedia_movie_id',
    1: 'Freebase_movie_ID',
    3: 'character_name',
    4: 'Actor_DOB',
    5: 'Actor_gender',
    6: 'actor_height',
    7: 'Actor_etnicity',
    8: 'Actor_name',
    9: 'Actor_age_at_movie_release',
    10: 'Freebase_character_map',
}
movie_columns = {
    0: 'wikipedia_movie_id',
    1: 'Freebase_movie_ID',
    2: 'Movie_name',
    3: 'Movie_release',
    4: 'Box_office_revenue',
    5: 'Movie_runtime',
    6: 'Movie_language',
    7: 'Movie_country',
    8: 'Movie_genre',
}

character = character.rename(columns=character_columns)
movies = movies.rename(columns=movie_columns)
names = names.rename(columns={0: 'Character_names', 1: 'Freebase_ID'})
tvtropes = tvtropes.rename(columns={0: 'Character_type', 1: 'Freebase_ID'})
summaries = summaries.rename(columns={0: 'wikipedia_movie_id', 1: 'Plot_summaries'})

In [None]:
# Each Freebase_ID cell holds the text of a Python literal; parse it and spread
# the parsed value over new columns appended to the right of `tvtropes`.
parsed_ids = tvtropes["Freebase_ID"].map(literal_eval)
tvtropes = pd.concat([tvtropes, parsed_ids.apply(pd.Series)], axis=1)

In [None]:
def _columns_with_null(frame):
    """Return the labels of the columns of ``frame`` holding at least one null.

    Args:
        frame (pd.DataFrame): table to inspect.

    Returns:
        list: column labels, in the frame's column order, whose null count is > 0.
    """
    null_counts = frame.isna().sum()
    return list(null_counts[null_counts > 0].index)


# Report, table by table, which columns contain missing values. The keys are
# the words used in the printed headings.
frames_to_check = {
    "character": character,
    "movie": movies,
    "name": names,
    "tvtropes": tvtropes,
    "summaries": summaries,
}

for table_label, frame in frames_to_check.items():
    columns_with_null = _columns_with_null(frame)
    print(f"\nhere are the {table_label} features with null values:")
    for column in columns_with_null:
        print(column)

In [None]:
import json

from collections import defaultdict

# Parse release dates; values pandas cannot parse become NaT (errors='coerce')
# and those rows are removed by the dropna below.
# NOTE(review): confirm that no valid release dates are lost to the coercion.
movies['Movie_release'] = pd.to_datetime(movies['Movie_release'], errors='coerce')
sorted_movies = movies.sort_values("Movie_release", ascending=False).dropna(subset=['Movie_release'])

# Take an explicit copy: assigning a column into a bare column selection of
# `sorted_movies` is the pattern that triggers pandas' SettingWithCopyWarning.
genres_per_year = sorted_movies[['Movie_release', 'Movie_name', 'Movie_genre']].copy()

# Movie_genre holds JSON text; decode it and keep only the mapping's values.
genres_per_year['Movie_genre'] = genres_per_year['Movie_genre'].apply(
    lambda raw: list(json.loads(raw).values())
)

occur = genres_per_year.copy()

occur.head(20)

In [None]:
from utils.genres import genre_wordsets

def extract_genres(all_genres: list, wordsets: dict = None) -> list:
    """Map a movie's raw genre labels onto a list of major genres.

    'Black-and-white' and 'Science Fiction' are matched as whole labels and
    taken out before tokenising. The remaining labels are lower-cased, '/' and
    '-' are turned into spaces, and the text is split on single spaces. A major
    genre is selected when at least one of its words equals one of the tokens.

    Args:
        all_genres (list): raw genre labels of one movie; it is not modified.
        wordsets (dict, optional): mapping of major genre -> iterable of
            lower-case words. Defaults to the module-level ``genre_wordsets``.

    Returns:
        list: the distinct major genres in first-match order, or ``["Other"]``
        when nothing matched.
    """
    if wordsets is None:
        wordsets = genre_wordsets

    genre_list = []
    genres = list(all_genres)  # work on a copy so the caller's list is untouched

    # Labels handled as a whole: (raw label, major genre to report).
    whole_labels = (
        ("Black-and-white", "Black & White"),
        ("Science Fiction", "Science Fiction"),
    )
    for raw_label, major_genre in whole_labels:
        if raw_label in genres:
            genre_list.append(major_genre)
            genres.remove(raw_label)

    genre_string = " ".join(str(g) for g in genres).lower()
    genre_string = genre_string.replace('/', ' ').replace('-', ' ')

    # Tokenise once instead of re-splitting for every candidate word.
    tokens = set(genre_string.split(" "))

    for major_genre, sub_genres in wordsets.items():
        if any(sub_genre in tokens for sub_genre in sub_genres):
            genre_list.append(major_genre)

    if not genre_list:
        genre_list.append("Other")

    # dict.fromkeys removes duplicates while keeping a reproducible order
    # (a plain set() would give an order that can change between kernels).
    return list(dict.fromkeys(genre_list))

In [None]:
# Derive each movie's major genres from its raw genre list. The function is
# passed directly: wrapping it in partial() with no bound arguments adds nothing.
raw_genre_lists = genres_per_year["Movie_genre"]
genres_per_year["Major_genres"] = raw_genre_lists.apply(extract_genres)

In [None]:
# Go from one row per movie to one row per (movie, genre) pair:
# `genres_per_year` is split on the major genres, `occur` on the raw genres.
genres_per_year = genres_per_year.explode(column='Major_genres')
occur = occur.explode(column='Movie_genre')

In [None]:
occur_major = genres_per_year.copy()

# Number of rows per (release year, raw genre).
release_year = pd.PeriodIndex(occur['Movie_release'], freq="Y")
time_genres = occur.groupby(release_year)['Movie_genre'].value_counts().reset_index(name='Genres_per_year')

# `occur` has one row per (film, genre), so counting its rows per year would
# count genre tags, not films. explode() repeats the original index label for
# every row of the same film, so keeping the first row per label gives exactly
# one row per film.
films = occur[~occur.index.duplicated()]
f_per_year = films.groupby(pd.PeriodIndex(films['Movie_release'], freq="Y")).size().reset_index(name='Films_per_year')
f_per_year.index = f_per_year['Movie_release']

# Genre_proportion = share of that year's films carrying the genre.
time_genres['Films_per_year'] = time_genres['Movie_release'].map(f_per_year['Films_per_year'])
time_genres['Genre_proportion'] = time_genres['Genres_per_year']/time_genres['Films_per_year']

time_genres.head(5)

In [None]:
# Number of rows per (release year, major genre).
release_year = pd.PeriodIndex(occur_major['Movie_release'], freq="Y")
time_genres = occur_major.groupby(release_year)['Major_genres'].value_counts().reset_index(name='Genres_per_year')

# `occur_major` has one row per (film, major genre), so counting its rows per
# year would count genre tags, not films. explode() repeats the original index
# label for every row of the same film, so keeping the first row per label
# gives exactly one row per film.
films = occur_major[~occur_major.index.duplicated()]
f_per_year = films.groupby(pd.PeriodIndex(films['Movie_release'], freq="Y")).size().reset_index(name='Films_per_year')
f_per_year.index = f_per_year['Movie_release']

# Genre_proportion = share of that year's films carrying the major genre.
time_genres['Films_per_year'] = time_genres['Movie_release'].map(f_per_year['Films_per_year'])
time_genres['Genre_proportion'] = time_genres['Genres_per_year']/time_genres['Films_per_year']

time_genres.head(30)

In [None]:
# Year_start is the text "<release year>-01-01 00:00:00.000"; date_index is the
# same value parsed into timestamps and is used to filter rows by date.
year_labels = time_genres['Movie_release'].dt.strftime('%Y')
time_genres['Year_start'] = year_labels + '-01-01 00:00:00.000'

date_index = pd.to_datetime(time_genres['Year_start'], format="%Y-%m-%d %H:%M:%S.%f")

In [None]:
# Show the rows whose year-start timestamp is exactly 1 January 2007.
display_date = pd.Timestamp(year=2007, month=1, day=1, hour=0)

is_display_year = date_index.eq(display_date)
df_date = time_genres[is_display_year]
df_date.head(10)

In [None]:
# Line plot of Films_per_year over the selected period.
start_date = pd.Timestamp(year=1913, month=1, day=1, hour=0)
end_date = pd.Timestamp(year=2012, month=1, day=2, hour=0)
tick_display_period = 5

# Both bounds are strict: a row stamped exactly start_date or end_date is left out.
in_period = date_index.gt(start_date) & date_index.lt(end_date)
df_period_1 = time_genres[in_period]

release_years = df_period_1['Movie_release'].astype(str)
timeplot = sns.lineplot(data=df_period_1, x=release_years, y="Films_per_year")

# Keep only every `tick_display_period`-th x tick label visible.
for i, label in enumerate(timeplot.xaxis.get_ticklabels(), start=1):
    label.set_visible(i % tick_display_period == 0)

In [None]:
# Line plot of Genre_proportion over the selected period, one line per major genre.
start_date = pd.Timestamp(year=1913, month=1, day=1, hour=0)
end_date = pd.Timestamp(year=2012, month=1, day=2, hour=0)
tick_display_period = 5

# Both bounds are strict: a row stamped exactly start_date or end_date is left out.
in_period = date_index.gt(start_date) & date_index.lt(end_date)
df_period_1 = time_genres[in_period]

sns.set(rc={'figure.figsize': (11.7, 8.27)})

release_years = df_period_1['Movie_release'].astype(str)
timeplot = sns.lineplot(
    data=df_period_1,
    x=release_years,
    y="Genre_proportion",
    hue="Major_genres",
)

# Keep only every `tick_display_period`-th x tick label visible.
for i, label in enumerate(timeplot.xaxis.get_ticklabels(), start=1):
    label.set_visible(i % tick_display_period == 0)

plt.show()

In [None]:
# Same proportion plot, restricted to the major genres listed in `options`.
options = ['War', 'Propaganda']

selected_rows = df_period_1[df_period_1["Major_genres"].isin(options)]

# NOTE(review): x is taken from the unfiltered frame while `data` is filtered;
# presumably seaborn matches the two on the index - confirm before changing.
all_release_years = df_period_1['Movie_release'].astype(str)

timeplot = sns.lineplot(
    data=selected_rows,
    x=all_release_years,
    y="Genre_proportion",
    hue="Major_genres",
)

# Keep only every `tick_display_period`-th x tick label visible.
for i, label in enumerate(timeplot.xaxis.get_ticklabels(), start=1):
    label.set_visible(i % tick_display_period == 0)

plt.show()

In [None]:
# Preview the first 10 rows of `occur`.
occur.head(10)

In [None]:
# Movies ordered by release date (most recent first), without the rows whose
# release date is missing, joined to their plot summaries; the inner join keeps
# only the movies present in both tables.
movies_by_release = movies.sort_values("Movie_release", ascending=False)
reduced_movies = movies_by_release.dropna(subset=['Movie_release'])
combined = reduced_movies.merge(summaries, how='inner', on='wikipedia_movie_id')

# Widen the displayed column width so more of each summary is shown.
pd.set_option('max_colwidth', 130)
combined[["Movie_release", "Movie_name", 'Plot_summaries']].head(30)

In [None]:
# Percentage of character rows per Actor_gender value, computed over the rows
# where Actor_gender is present.
gender_known = character.dropna(subset=['Actor_gender'])
rows_per_gender = gender_known.groupby(['Actor_gender'])['Actor_gender'].count()
rows_with_gender = gender_known['Actor_gender'].count()

genre_percent = (rows_per_gender / rows_with_gender * 100).round(2)
print(genre_percent)

In [None]:
# Count and percentage of character rows per Actor_etnicity value, computed
# over the rows where Actor_etnicity is present, largest groups first.
genre_characters = character.dropna(subset=['Actor_etnicity'])
rows_per_ethnicity = genre_characters.groupby(['Actor_etnicity'])['Actor_etnicity'].count()

x = rows_per_ethnicity.reset_index(name='Count').sort_values(['Count'], ascending=False)
y = x['Count'].sum()
x['percent'] = (x['Count'] / y * 100).round(2)

print(x.head(20))