In [1]:
# Libraries to import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import pickle

In [2]:
#Settings for the plots 
sns.set_style("ticks")
colors= sns.color_palette('colorblind')
plt.rc('xtick', labelsize=14) 
plt.rc('ytick', labelsize=14) 
plt.rc('axes', titlesize=20)
plt.rc('axes', labelsize=18)
plt.rcParams['ytick.major.size'] = 7
plt.rcParams['ytick.minor.size'] = 6

sns.set_style("darkgrid", {'axes.grid' : False, 'ytick.left': True, 'xtick.bottom': True})

In [13]:
DATA_PATH = './data/'
DATA_PATH2020 = DATA_PATH + 'data_2020/'
FILE2020 = DATA_PATH2020 + 'M3_df_2020_final.pkl'


DATA_PATH = './data/'

DATA_PATH2015 = DATA_PATH + 'data2015/'
DATA_PATH2016 = DATA_PATH + 'data2016/'
DATA_PATH2017 = DATA_PATH + 'data2017/'
DATA_PATH2018 = DATA_PATH + 'data2018/'
DATA_PATH2019 = DATA_PATH + 'data2019/'
DATA_PATH2020 = DATA_PATH + 'data2020/'

FILE2015 = DATA_PATH2015 + 'M3_df_2015_final.pkl'
FILE2016 = DATA_PATH2016 + 'M3_df_2016_final.pkl'
FILE2017 = DATA_PATH2017 + 'M3_df_2017_final.pkl'
FILE2018 = DATA_PATH2018 + 'M3_df_2018_final.pkl'
FILE2019 = DATA_PATH2019 + 'M3_df_2019_final.pkl'
FILE2020 = DATA_PATH2020 + 'M3_df_2020_final.pkl'

FILES = [FILE2015, FILE2016, FILE2017, FILE2018, FILE2019, FILE2020]

In [8]:
with open(FILE2015, 'rb') as input_file:
    df_2015_base = pickle.load(input_file)

In [9]:
with open(FILE2016, 'rb') as input_file:
    df_2016_base = pickle.load(input_file)

In [10]:
with open(FILE2017, 'rb') as input_file:
    df_2017_base = pickle.load(input_file)

In [14]:
with open(FILE2018, 'rb') as input_file:
    df_2018_base = pickle.load(input_file)

KeyboardInterrupt: 

In [None]:
with open(FILE2019, 'rb') as input_file:
    df_2019_base = pickle.load(input_file)

In [None]:
with open(FILE2020, 'rb') as input_file:
    df_2020_base = pickle.load(input_file)

In [15]:
df_2015 = df_2015_base.copy()

In [16]:
df_2016 = df_2016_base.copy()

In [17]:
df_2017 = df_2017_base.copy()

In [None]:
df_2018 = df_2018_base.copy()

In [None]:
df_2019 = df_2019_base.copy()

In [None]:
df_2020 = df_2020_base.copy()

## Number of quotes depending on gender

In [24]:
#change with num ocourrences instead of count 
def count_by_gender(df):
    """
        Function compute the number of quotes depending on `gender`
    :param df: dataframe 
    :return gender_count: dataframe of the number of quotes for one year
    """
    year = 2015
    #year = df['quoteID'][0][0:4]
    gender_count = df.groupby(by=['gender'])['numOcurrences'].sum().to_frame(name=year).T
    return gender_count

In [27]:
count_by_gender(df_2017)

gender,Female,Male
2015,1865085,9274759


In [28]:
def gather_all_years_to_one_df(df_list):
    """
        Function to merge all the years in one dataframe
    :param df: list of Dataframes 
    :return gender_count_all_years: dataframe of the number of quotes for all the years 
    :return year_list: list of years (integer format)
    """
    gender_list = []
    year_list = []
    for df in df_list:
        year = df['quoteID'][0][0:4]
        gender_count = count_by_gender(df)
        gender_list.append(gender_count)
        year_list.append(int(year))
    gender_count_all_years = pd.concat(gender_list)
    return gender_count_all_years, year_list

In [None]:
def gender_all_years_extension(df_list):
    """
        Function to add columns with the relative number of quotes for both gender for all the Dataframes in `df_list`
    :param df_list: list of Dataframes 
    :return gender_all_years: DataFrame with added columns `% Female/Male` and the `year`
    """
    gender_all_years, year_list = gather_all_years_to_one_df(df_list)
    gender_all_years['percentage_female'] = gender_all_years['Female']/(gender_all_years['Female'] + gender_all_years['Male'])
    gender_all_years['percentage_male'] = gender_all_years['Male']/(gender_all_years['Female'] + gender_all_years['Male'])
    gender_all_years['year'] = year_list
    return gender_all_years

In [None]:
def plot_gender_all_years(gender_all_years):
    """
        Function to plot the number of quotes depending on `gender`
    :param gender_all_years: data frame with percentage of males/females
    """       
    fig1 = (gender_all_years[['Male','Female']]/1000000).plot(kind='bar', title='Number of quotations depending on the gender in absolute value for each year', rot=0, xlabel='Years', ylabel='Number of quotations \n [in Millions]', figsize=(16,6))
    fig2 = gender_all_years[['percentage_male','percentage_female']].plot(kind='bar', title='Number of quotations depending on the gender in % for each year', rot=0, xlabel='Years', ylabel='% of quotations', figsize=(16,6))

## Number of quotes per age

In [None]:
def create_df_agerange(df):
    df_agerange = df_2020.groupby(['gender', 'age_range'])['numOcurrences'].sum().sort_values(ascending=False).to_frame(name='count').reset_index()
    return df_agerange

In [None]:
def plot_quotes_age(df, age_threshold):
    """
        Function to plot the number of quotes depending on `age_range` and `gender`
    :param df: dataframe
    :param age_threshold: consider only speakers with an age smaller than `age_threshold`
    """
    f = plt.figure(figsize=(16,6))
    ax = sns.barplot(data=df, x='age_range',y='count', hue='gender')
    plt.xlabel('Age intervals')
    plt.ylabel('Number of quotes')
    #year = df['quoteID'][0][0:4]
    #plt.title('Number of quotes depending on age and gender for the year '+ year)

In [None]:
df_agerange = create_df_agerange(df_2020)

In [None]:
plot_quotes_age(df_agerange, age_threshold=100)

## Number of quotes per country of citizenship

In [29]:
def create_df_citizenship(df):
    df_citizenship = df.groupby(['gender', 'citizenship'])['numOcurrences'].sum().sort_values(ascending=False).to_frame(name='count').reset_index()
    return df_citizenship

In [31]:
df_2015_citizenship = create_df_citizenship(df_2015)

In [35]:
df_2015_citizenship[df_2015_citizenship["gender"]=="Female"]

Unnamed: 0,gender,citizenship,count
2,Female,United States of America,472100
7,Female,United Kingdom,119823
10,Female,Australia,72700
16,Female,Canada,37852
19,Female,India,29790
...,...,...,...
591,Female,Liang dynasty,1
592,Female,French Indochina,1
593,Female,Transnistria,1
594,Female,Holland,1


In [None]:
def plot_quotes_country(df_citizenship, threshold_nber):
    """
        Function to plot the number of quotes depending on `citizenship` and `gender`
    :param df: dataframe
    :param threshold_nber: consider only countries of citizenship for which there is at least `threshold_nber` of quotes
    """
    f = plt.figure(figsize=(18,6))
    ax = sns.barplot(data=df_citizenship[df_citizenship['count']>threshold_nber], x='citizenship',y='count', hue='gender')
    plt.xlabel('Citizenship')
    plt.ylabel('Number of quotes')
    plt.legend(loc = 'upper right')
    #year = df_citizenship['quoteID'][0][0:4]
    #plt.title('Number of quotes above '+ str(threshold_nber) +' depending on gender and citizenship for the year '+year)
    locs, labels = plt.xticks()
    plt.setp(labels, rotation=90)
    ax.set_yscale('log')

In [None]:
df_citizenship = create_df_citizenship(df_2020)

In [None]:
plot_quotes_country(df_citizenship, threshold_nber=300)

## Numer of quotes per continent

In [None]:
def create_df_continent(df):
    df_continent = df_2020.groupby(['gender', 'Continent'])['numOcurrences'].sum().sort_values(ascending=False).to_frame(name='count').reset_index()
    return df_continent

In [None]:
def plot_quotes_continent(df):
    """
        Function to plot the number of quotes depending on `continent` and `gender`
    :param df: dataframe
    """
    f = plt.figure(figsize=(12,6))
    ax = sns.barplot(data=df, x='Continent', y='count', hue='gender')
    plt.xlabel('Continent')
    plt.ylabel('Number of quotes')
    #year = df['quoteID'][0][0:4]
    #plt.title('Number of quotes depending on gender and continent for the year '+year)

In [None]:
df_continent = create_df_continent(df_2020)

In [None]:
plot_quotes_continent(df_continent)

## Number of quotes depending on length of quotes

In [None]:
### Plot length of quotes
def plot_quotes_distribution(df):
    """
        Function to plot the distribution of the quotes length depending on `gender`
    :param df: dataframe
    """ 
    f = plt.figure(figsize=(14,6))
    f = sns.histplot(data = df, y='quotation_length', hue='gender', bins=200, alpha=0.8, log_scale = [True,False], 
                     hue_order = ['Female', 'Male'], palette=[colors[1], colors[0]])
    plt.ylabel('Quotation length')
    #year = df['quoteID'][0][0:4]
    #plt.title('Quotation length distribution per gender for the year '+year)
    plt.show()
    
def plot_avg_quotes_length(df, conf_int):
    """
        Function to plot average length of quotes and confidence intervals depending on `gender`
    :param df: dataframe 
    :param conf_int: confidence interval for the plot
    """
    f = plt.figure(figsize=(8,10))
    sns.catplot(data = df, x='gender', y='quotation_length', kind='bar', height=5, aspect=0.8, ci=conf_int)
    #year = df['quoteID'][0][0:4]
    #plt.title('Average quotation length depending on gender for the year '+ year)
    plt.ylabel('Quotation length')
    plt.tight_layout()
    #plt.ylim(113,126)
    plt.show()

In [None]:
plot_quotes_distribution(df_2020)

In [None]:
plot_avg_quotes_length(df_2020, conf_int=95)

## Number of quotes per media

In [36]:
def create_df_media(df):
    df_sitenames_explode = df.explode("sitenames")
    df_sitenames = df_sitenames_explode.groupby(["sitenames","gender"]).quoteID.count().to_frame(name="count").sort_values(['count'],ascending=False).reset_index()
    return df_sitenames

In [37]:
create_df_media(df_2015)

Unnamed: 0,sitenames,gender,count
0,yahoo,Male,684832
1,wikia,Male,318873
2,msn,Male,258520
3,theguardian,Male,228170
4,reuters,Male,197001
...,...,...,...
182,riotimesonline,Female,104
183,manoramaonline,Male,50
184,baltictimes,Female,31
185,sny,Female,23


In [None]:
def plot_quotes_media(df_sitenames, threshold_nber): 
    """
        Function to plot the number of quotes depending on `gender` and `media_country`
    :param df: dataframe 
    """
    f = plt.figure(figsize=(18,6))
    ax = sns.barplot(data=df_sitenames[df_sitenames['count']>threshold_nber], x='sitenames',y='count', hue='gender')
    locs, labels = plt.xticks();
    plt.setp(labels, rotation=90);
    plt.xlabel("Famous media")
    plt.ylabel('Number of quotes')
    #year = df['quoteID'][0][0:4]
    #plt.title("Number of quotes depending on gender and media's country for the year "+year)

In [None]:
df_sitenames = create_df_media(df_2020)

In [None]:
plot_quotes_media(df_sitenames, threshold_nber=1000)

## Number of quotes per category

In [None]:
def create_df_categories(df):
    df_tags_explode = df.explode("tags").explode("tags")
    df_tags = df_tags_explode.groupby(["tags","gender"]).quoteID.count().to_frame(name="count").sort_values(['count'],ascending=False).reset_index()
    return df_tags

In [None]:
def plot_quotes_categories(df_tags):
    """
        Function to plot the number of quotes depending on `tags` and `gender`
    :param df: dataframe 
    """
    #create plot
    f = plt.figure(figsize=(18,6))
    ax = sns.barplot(data=df_tags, x='tags',y='count', hue='gender')
    plt.xlabel('Category')
    plt.ylabel('Number of quotes')
    ax.set_yscale('log')
    #year = df['quoteID'][0][0:4]
    #plt.title('Number of quotes depending on gender and media for the year '+year)
    locs, labels = plt.xticks()
    plt.setp(labels, rotation=45,  horizontalalignment='right')

In [None]:
df_tags = create_df_categories(df_2020)

In [None]:
plot_quotes_categories(df_tags)

#### Number of quotes per media's country location

In [None]:
df_2020

In [None]:
def create_df_media_country(df):
    df_media_country_explode = df.explode("media_country")
    df_media_country = df_media_country_explode.groupby(["media_country","gender"]).quoteID.count().to_frame(name="count").sort_values(['count'],ascending=False).reset_index()
    return df_media_country

In [None]:
def plot_quotes_media_country(df_media_country):
    """
        Function to plot the number of quotes depending on `gender` and `media_country`
    :param df: dataframe 
    """    
    f = plt.figure(figsize=(12,6))
    ax = sns.barplot(data=df_media_country, x='media_country',y='count', hue='gender')
    plt.xlabel("Top media's country")
    plt.ylabel('Number of quotes')
    #year = df['quoteID'][0][0:4]
    #plt.title("Number of quotes depending on gender and media's country for the year "+year)
    locs, labels = plt.xticks()
    plt.setp(labels, rotation=45,  horizontalalignment='right')

In [None]:
df_media_country = create_df_media_country(df_2020)

In [None]:
plot_quotes_media_country(df_media_country)

print the name of the media in label also?