In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from ast import literal_eval

In [2]:
# Register tqdm's `progress_apply` on pandas objects; clean_data below calls it.
tqdm.pandas()
def clean_data(data, noisy_domains=('gamereactor', 'duke')):
    """Parse the ``domains`` column and drop rows that mention a noisy domain.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``domains`` column whose cells are the string form of
        a Python literal; each cell is parsed with ``ast.literal_eval``.
    noisy_domains : iterable of str, optional
        A row is dropped when any of these names is found ``in`` its parsed
        ``domains`` value. Defaults to the two names this notebook filters.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the kept rows, with ``domains`` replaced by
        the parsed values. ``data`` itself is not modified, so the function
        can be called again on the same raw frame.

    Notes
    -----
    Progress is reported through ``progress_apply``, so ``tqdm.pandas()``
    must have been called beforehand.
    """
    noisy_domains = tuple(noisy_domains)

    # Parse into a separate Series rather than writing back into the caller's
    # frame: a second call on the same frame would otherwise hand
    # already-parsed values to literal_eval and fail.
    parsed = data['domains'].progress_apply(literal_eval)

    # A single pass over the rows checks every noisy name.
    keep = parsed.progress_apply(
        lambda names: all(noisy not in names for noisy in noisy_domains)
    ).astype(bool)

    return data[keep].assign(domains=parsed[keep])

In [3]:
# This function selects the rows that match each subgroup and splits them by gender.
# For each gender, it sums the occurrences per calendar month.
def gender_bias_time_series(data, groups, groupby, year):
    """Sum monthly occurrences per gender for each subgroup and save them as CSV.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``date``, ``gender``, ``numOccurrences`` and the
        column named by ``groupby``. It is not modified.
    groups : iterable
        Subgroup labels to report. The special label ``'Total'`` selects
        every row of ``data``.
    groupby : str
        Column used to select a subgroup. For ``'domains'`` a row matches when
        the label is contained ``in`` the cell; for any other column the cell
        must equal the label.
    year : any
        Only used to build the output file name.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (monthly period), ``numOccurrences`` (monthly sum),
        ``gender`` and ``<groupby>`` (the subgroup label). For each subgroup
        the ``'male'`` rows come first, then the ``'female'`` rows. The same
        frame is written to
        ``new_grouped_gender/<groupby>_gender_gap_<year>.csv``.

    Raises
    ------
    ValueError
        From ``pandas.concat`` when ``groups`` is empty.
    """
    import os  # local import: only needed to prepare the output directory

    out_dir = 'new_grouped_gender/'
    # Create the folder up front; otherwise a missing directory only surfaces
    # at the final to_csv, after all the aggregation work has been done.
    os.makedirs(out_dir, exist_ok=True)

    monthly_frames = []
    for subgroup in groups:
        print(subgroup)
        if subgroup == 'Total':
            subgroup_data = data
        elif groupby == 'domains':
            subgroup_data = data[data[groupby].apply(lambda names: subgroup in names)]
        else:
            subgroup_data = data[data[groupby] == subgroup]

        # No explicit sort is needed: groupby orders its keys, and the monthly
        # sum does not depend on row order.
        months = pd.to_datetime(subgroup_data['date']).dt.to_period("M")

        for gender in ('male', 'female'):
            is_gender = subgroup_data['gender'] == gender
            monthly = (
                subgroup_data.loc[is_gender, 'numOccurrences']
                .groupby(months[is_gender])
                .sum()
                .reset_index()
            )
            monthly['gender'] = gender
            monthly[groupby] = subgroup
            monthly_frames.append(monthly)

    total_occu = pd.concat(monthly_frames)
    total_occu.to_csv(out_dir + groupby + "_gender_gap_{}.csv".format(year))
    return total_occu

In [None]:
# NOTE: machine-specific absolute path; the single-year cell further down reuses `path`.
path = "E:\\PycharmProject\\ada_project\\data\\"
years = [2015, 2016, 2017, 2018, 2019, 2020]

# The subgroup lists do not depend on the year, so they are defined once.
party_groups = ['Republican Party', 'Democratic Party', 'Conservative Party',
                'Labour Party', 'Liberal Democrats', 'Bharatiya Janata Party',
                'Indian National Congress']
domain_groups = ['nytimes', 'cnn', 'foxnews', 'bbc', 'nbcnews', 'time', 'cnbc']
nationality_groups = ['United States of America', 'United Kingdom',
                      'Australia', 'Canada', 'France', 'Germany', 'India']

for year in years:
    print(year)
    data = pd.read_csv(path + 'processed-quotes-{}.csv'.format(year))
    data = clean_data(data)

    gender_bias_time_series(data, groups=party_groups, groupby='party', year=year)
    gender_bias_time_series(data, groups=domain_groups, groupby='domains', year=year)
    # Nationalities are matched against the 'nationality' column. Passing
    # 'domains' here looked for country names among news domains and
    # overwrote the domains CSV written by the call just above.
    # NOTE(review): assumes 'nationality' cells are plain strings, like 'party' - confirm.
    gender_bias_time_series(data, groups=nationality_groups, groupby='nationality', year=year)

In [9]:
# This cell finds the top-k most frequently quoted speakers for a specific year.
def select_topk_freq_speaker(data, topk=None):
    """Rank speakers by their summed ``numOccurrences``, highest first.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``speaker`` and ``numOccurrences``.
    topk : int or None, optional
        Number of leading rows to keep. ``None`` returns the full ranking.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``speaker`` with a single ``numOccurrences`` column
        holding the per-speaker total, sorted in descending order.
    """
    ranking = (
        data.groupby('speaker')[['numOccurrences']]
        .sum()
        .sort_values(by='numOccurrences', ascending=False)
    )
    return ranking if topk is None else ranking.iloc[:topk]
    
def search_speaker_info(data, speakers):
    """Attach nationality, gender and party to a ranked speaker table.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``speaker``, ``nationality``, ``gender`` and
        ``party``. The first row found for each speaker supplies the
        attributes.
    speakers : pandas.DataFrame
        Indexed by speaker, with a ``numOccurrences`` column holding the
        total to report. Every index label must occur in
        ``data['speaker']``.

    Returns
    -------
    pandas.DataFrame
        In the order of ``speakers``, with exactly the columns
        ``nationality``, ``gender``, ``party`` and ``numOccurrences``.

    Raises
    ------
    KeyError
        If a speaker in ``speakers.index`` is missing from ``data``.
    """
    attribute_columns = ['nationality', 'gender', 'party']
    # Take only the attribute columns from `data`. Its own per-quote
    # `numOccurrences` would otherwise sit next to the total from `speakers`
    # under the same label, and the result would carry both.
    attributes = (
        data.drop_duplicates('speaker')
        .set_index('speaker')
        .loc[speakers.index, attribute_columns]
    )
    return pd.concat([attributes, speakers[['numOccurrences']]], axis=1)

def search_topk_speaker(data, topk, year):
    """Write the ``topk`` most-quoted male and female speakers to a CSV file.

    For each gender, the matching rows of ``data`` are ranked with
    ``select_topk_freq_speaker`` and enriched with ``search_speaker_info``.
    The male table is stacked above the female one and saved as
    ``<year>-topk_speaker.csv`` in the working directory. Nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Quote table with a ``gender`` column plus whatever the two helper
        functions require.
    topk : int or None
        Passed through to ``select_topk_freq_speaker``.
    year : any
        Only used to build the output file name.
    """
    per_gender = []
    for label in ('male', 'female'):
        rows = data[data.gender == label]
        ranking = select_topk_freq_speaker(rows, topk)
        detailed = search_speaker_info(rows, ranking).reset_index()
        detailed['gender'] = label
        per_gender.append(detailed)

    pd.concat(per_gender).to_csv('{}-topk_speaker.csv'.format(year))

# Export the top-10 speakers per gender for one year.
# NOTE: `path` is assigned in the yearly-loop cell above, so that assignment
# must have been executed before this runs.
year = 2019
data = pd.read_csv(path + 'processed-quotes-{}.csv'.format(year))
data = clean_data(data)
search_topk_speaker(data, 10, year)

  0%|          | 0/14183294 [00:00<?, ?it/s]

  0%|          | 0/14183294 [00:00<?, ?it/s]

  0%|          | 0/14181225 [00:00<?, ?it/s]