# Milestone 3

## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.QuoteBankData import QuoteBankData
from src.Deaths import Deaths
import src.utilities.quotebank_preprocessing_utils as utils
from src.CONSTS import *
from src.utilities.synonym_utils import add_new_synonyms
from src.utilities.utils import *
from src.utilities.plotting import *
from src.utilities.countries_utils import *
import math
import pygal
from pygal.style import Style
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jurri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Quotebank preprocessing

Most of the code used for the Quotebank preprocessing lives in the classes QuoteBankData and Keyword, together with helper functions in separate utility files.

Initialize quotebank
- read keyword and synonyms for death causes
- create folders and file names for quotes that matches keywords or synonyms
- match keywords and synonyms with quotes

Since the last step takes a lot of time, we ran it separately in the file run_quotebank_search.py. In this notebook we load the results produced there and analyze them further.
run_quotebank_search.py matches quotes to topics using synonyms generated with src/utilities/synonym_utils.py. After keyword matching, seven topics are filtered further with clustering, which is implemented in src/utilities/clustering.py. The topics to cluster were chosen based on random samples we took from the matched quotes.

load database of speakers


In [3]:
# Set up the Quotebank container and its on-disk layout. The slow keyword matching
# itself is not run here (per the notes above it lives in run_quotebank_search.py).
quotebank = QuoteBankData("Asymmetry of News", [])
# Load the death-cause keywords (and their synonyms, per the notes above).
quotebank.read_keywords_from_file()
utils.create_directories_for_every_year()
# Presumably fills each keyword's `output_filenames`, which the loading step
# below indexes by year — confirm in QuoteBankData.
quotebank.create_json_dumps_filenames_for_each_keyword()

In [4]:
import warnings
import collections

# Silences every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings from pandas/matplotlib.
warnings.filterwarnings("ignore")

def save_meta_data_quotes():
    """Aggregate the per-keyword quote dumps into yearly totals and country tallies.

    For every year in ``BEGIN_YEAR..END_YEAR`` and every keyword in
    ``quotebank.keywords`` the matching JSON-lines dump is read and:

    * the sum of ``n_appearances`` is stored in ``quotebank.quotes_occurrences_df``
      (one row per year; a ``Year`` column plus one integer column per keyword,
      0 when the dump is empty);
    * ``key.country_url_occurences[(country, year)]`` is incremented once for
      every entry in a quote's ``country_urls`` ("Great Britain" is counted as
      "United Kingdom");
    * ``key.country_speaker_occurences[(country, year)]`` is incremented once
      for every quote with a truthy ``country_speaker`` (the most common entry
      is used when it is a list).

    The function works purely through side effects and returns nothing.

    NOTE(review): the country tallies are only ever incremented, so calling
    this twice in one session doubles them — confirm whether they should be
    reset here.
    """
    # Collect one dict per year and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    yearly_rows = []
    for year in range(BEGIN_YEAR, END_YEAR + 1):
        print("Creating meta data for year:", year, "...")
        dict_df_row = {"Year": year}

        for key in quotebank.keywords:
            file = key.output_filenames[year-BEGIN_YEAR]
            # pandas opens the path itself; the former open(file, 'rU') handle
            # was unused and the 'U' mode raises ValueError on Python >= 3.11.
            quotes = pd.read_json(file, lines=True)

            dict_df_row[key.name] = 0
            if len(quotes) <= 0:
                continue

            # Save total number of quotes per year per topic
            dict_df_row[key.name] = quotes["n_appearances"].sum()

            for _, line_quotes in quotes.iterrows():
                # Save number of quotes per topic per year per country of url
                for country in line_quotes['country_urls']:
                    if country == "Great Britain":
                        country = "United Kingdom"
                    url_key = (country, year)
                    key.country_url_occurences[url_key] = key.country_url_occurences.get(url_key, 0) + 1

                # Save number of quotes per topic per year per country of speaker
                country_speaker = line_quotes['country_speaker']
                if country_speaker:
                    if isinstance(country_speaker, list):        # if multiple countries, take most occurring
                        country_speaker = collections.Counter(country_speaker).most_common(1)[0][0]

                    speaker_key = (country_speaker, year)
                    key.country_speaker_occurences[speaker_key] = key.country_speaker_occurences.get(speaker_key, 0) + 1

        yearly_rows.append(dict_df_row)

    # Every row carries the same keys ("Year" plus all keyword names), so the
    # frame has no missing cells and can be cast to int as a whole.
    quotebank.quotes_occurrences_df = pd.DataFrame(yearly_rows).astype(int)

### Load the results of the quote finding

In [None]:
# Fills quotebank.quotes_occurrences_df and the per-keyword country tallies.
save_meta_data_quotes()

Creating meta data for year: 2008 ...
Creating meta data for year: 2009 ...
Creating meta data for year: 2010 ...
Creating meta data for year: 2011 ...
Creating meta data for year: 2012 ...
Creating meta data for year: 2013 ...
Creating meta data for year: 2014 ...


In [None]:
# One row per year: a "Year" column plus one quote-count column per keyword.
quotebank.quotes_occurrences_df

In [None]:
# Presumably converts the counts into each topic's share of the yearly total
# over the TOPICS columns — confirm in percentage_of_total_count.
quotebank.quotes_percentage_df = percentage_of_total_count(quotebank.quotes_occurrences_df, TOPICS)

In [None]:
# Display the per-topic result of percentage_of_total_count computed above.
quotebank.quotes_percentage_df

In [None]:
# One pie chart of the quote shares per year in BEGIN_YEAR..END_YEAR.
# NOTE(review): the meaning of the literal 7 is not visible here (possibly the
# number of slices shown) — confirm in cause_pie_chart_plot.
for y in range(BEGIN_YEAR, END_YEAR + 1):
    cause_pie_chart_plot(quotebank.quotes_percentage_df, y, 7, "_quotes_")

In [None]:
# Presumably derives quotebank.cat_quotes_occurrences_df (read by the next
# cells) from the per-cause counts — confirm in QuoteBankData.
quotebank.map_df_causes_to_categories()

In [None]:
# Display the per-category quote counts.
quotebank.cat_quotes_occurrences_df

In [None]:
# Same conversion as for the topics, applied to the CATEGORIES columns.
quotebank.cat_quotes_percentage_df = percentage_of_total_count(quotebank.cat_quotes_occurrences_df, CATEGORIES)

In [None]:
# Display the per-category result of percentage_of_total_count computed above.
quotebank.cat_quotes_percentage_df

# Country preprocessing

#### First, determine for how many quotes a country could be identified (for the URL and for the speaker)

In [None]:
# Tally how many country attributions were recorded across all keywords,
# separately for the URL country and for the speaker country.
num_url_country = 0
num_speaker_country = 0
for keyword in quotebank.keywords:
    num_url_country += sum(keyword.country_url_occurences.values())
    num_speaker_country += sum(keyword.country_speaker_occurences.values())

# "Year" is an identifier column, not a count: summing it with the keyword
# columns would inflate the denominator by the sum of all years.
total_quotes = quotebank.quotes_occurrences_df.drop(columns=["Year"], errors="ignore").sum().sum()
# NOTE(review): the values printed are fractions (0..1 scale), not percentages.
# The tallies count one per quote row (or per URL country), while the total
# sums `n_appearances` — confirm that this is the comparison intended.
print("Percentage of country of url found: {:.4f}".format(num_url_country/total_quotes))
print("Percentage of country of speaker found: {:.4f}".format(num_speaker_country/total_quotes))

#### Create maps to show where the most quotes are from.

In [None]:
# Create world maps for every year, once for the URL country and once for the speaker
# country, and save them in data/images/year_(url/speaker)_country_occurences.svg
url_speaker = ["url","speaker"]
for target in url_speaker:
    for year in range(2008,2017):
        # NOTE(review): this call does not depend on `year`; it could be hoisted out of
        # the inner loop if the helper is side-effect free — confirm before moving it.
        totals_per_country = quotebank.get_quote_occurences_per_country_year(target)
        mapping = map_countries_according_to_mag_order(totals_per_country, year)
        create_world_map(mapping, str(year) + "_" + target + "_country_occurences", year)

#### Show the distribution of topics for the quotes in certain countries where enough quotes were found



First for url

In [None]:

# Countries with enough quotes to be shown individually.
countries = [
    'United Kingdom', 'United States of America', 'Canada', 'Australia',
    'India', 'Pakistan', 'France', 'South Africa',
]

# Accumulate the per-country counts (by URL country) over all years.
totals_per_country = None
for year in range(2008,2017):
    yearly_counts = quotebank.get_country_per_year_count('url', countries, year)
    if totals_per_country is None:
        totals_per_country = yearly_counts
    else:
        totals_per_country = totals_per_country + yearly_counts

df_reduce = reduce_df(totals_per_country, 5, return_percentage = True)

# The plot gets one extra label for everything outside the listed countries.
countries.append('Rest of the World')
stacked_barplot(
    df_reduce,
    x_labels=countries,
    y_label= "Percentage",
    title=None,
    width = 0.35,
    safe_name='country_specific_quote_url_percentage',
    log_y=False,
)

In [None]:
# Absolute counts on a log scale for the same countries as the percentage plot.
df_reduce = reduce_df(totals_per_country, 5, return_percentage = False)
# The percentage cell has already appended this label to `countries`; appending it
# unconditionally again would leave one more x label than there are bars.
if 'Rest of the World' not in countries:
    countries.append('Rest of the World')
stacked_barplot(df_reduce, x_labels=countries, y_label= "Occurences (log)", title=None,
                width = 0.35, safe_name='country_specific_quote_url_log', log_y=True)

Now for the speaker

In [None]:
# Countries with enough quotes to be shown individually.
countries = ['United Kingdom', 'United States of America','Canada', 'Australia', 'India', 'Pakistan', 'France','South Africa']

# Accumulate the per-country counts (by speaker country) over all years.
# get_country_per_year_count is called on `quotebank`, matching the URL cell;
# the bare function name used here before is not defined in this notebook.
totals_per_country = None
for year in range(2008,2017):
    yearly_counts = quotebank.get_country_per_year_count('speaker', countries, year)
    if totals_per_country is None:
        totals_per_country = yearly_counts
    else:
        totals_per_country = totals_per_country + yearly_counts

df_reduce = reduce_df(totals_per_country, 5, return_percentage = True)
# The plot gets one extra label for everything outside the listed countries.
countries.append('Rest of the World')
stacked_barplot(df_reduce, x_labels=countries, y_label= "Percentage", title=None,
                width = 0.35, safe_name='country_specific_quote_speaker_percentage', log_y=False)

In [None]:
# Absolute counts on a log scale for the same countries as the percentage plot.
df_reduce = reduce_df(totals_per_country, 5, return_percentage = False)
# The percentage cell has already appended this label to `countries`; appending it
# unconditionally again would leave one more x label than there are bars.
if 'Rest of the World' not in countries:
    countries.append('Rest of the World')
stacked_barplot(df_reduce, x_labels=countries, y_label= "Occurences (log)", title=None,
                width = 0.35, safe_name='country_specific_quote_speaker_log', log_y=True)

## Show real death causes for regions in the world

In [None]:
# Build one deaths-by-cause frame per main region by adding up its sub-regions.
# Per the original note: NaN is printed when values are missing and, where possible,
# missing values are replaced by a linear-regression estimate (done inside Deaths).
deaths = {}
for region in REGIONS:
    df = Deaths(
        "deaths_by_cause",
        region,
        DEATH_BY_CAUSE_PATH,
        rename_cols = RENAME_CAUSE_COLS,
        drop_cols = DROP_CAUSE_COLS,
    ).df
    for main_region in MAIN_REGIONS:
        if region not in MAIN_REGIONS[main_region]:
            continue
        if main_region not in deaths:
            # First sub-region seen for this main region.
            deaths[main_region] = df.fillna(0)
            continue
        deaths[main_region] += df.fillna(0)
        # The addition also doubled the "Year" column; halve it to restore the year.
        deaths[main_region]["Year"] = deaths[main_region]["Year"]/2

In [None]:
# Sum the causes over all years.
# NOTE(review): this covers 2008..2015 while the quote cells use range(2008, 2017) —
# confirm that the shorter range is intended.
all_deaths = None
for year in range(2008,2016):
    deaths_year, _, labels = get_data_for_one_year(deaths, year)
    if all_deaths is None:
        all_deaths = deaths_year
    else:
        all_deaths = all_deaths + deaths_year

# Plot the relative number of deaths
all_deaths_reduced = reduce_df(all_deaths, percentage_treshold = 5, return_percentage = True)
stacked_barplot(all_deaths_reduced, x_labels = labels,
                y_label= "Percentage", safe_name="deaths_world_regions", log_y=False)

# Plot the absolute number of deaths. These are counts on a log axis, so the
# y label must not say "Percentage".
all_deaths_reduced = reduce_df(all_deaths, percentage_treshold = 5, return_percentage = False)
stacked_barplot(all_deaths_reduced, x_labels = labels,
                y_label="Deaths (log)", safe_name="absolute_deaths_world_regions", log_y=True)

# Our World in Data preprocessing

### Deaths by cause

In [None]:
# World-wide deaths per cause.
# NOTE(review): the regional cell above uses the DEATH_BY_CAUSE_PATH constant —
# confirm whether both point at the same file.
death_by_cause_path = DATA_PATH + "annual_number_of_deaths_by_cause.csv"
deaths_by_cause = Deaths("deaths_by_cause", "World", death_by_cause_path, rename_cols = RENAME_CAUSE_COLS, drop_cols = DROP_CAUSE_COLS)
deaths_by_cause.df

In [None]:
# Display the percentage frame that Deaths exposes for the per-cause data.
deaths_by_cause.percentage_df

### Deaths by category

In [None]:
# World-wide deaths per category, loaded from the pre-categorized CSV.
death_by_category_path = DATA_PATH + "categorized_annual_number_of_deaths.csv"
deaths_by_category = Deaths("deaths_by_category",'World', death_by_category_path, rename_cols = RENAME_CAT_COLS)
deaths_by_category.df

In [None]:
# Display the percentage frame that Deaths exposes for the per-category data.
deaths_by_category.percentage_df

## Plotting

### Deaths by cause

In [None]:
# Stacked-area view of the per-cause deaths (drawn by the Deaths class).
deaths_by_cause.plot_stacked_areas()

In [None]:
# One pie chart of the death shares per year, 2008 through 2016.
# NOTE(review): the quote pie charts iterate BEGIN_YEAR..END_YEAR instead of a
# hard-coded range — confirm that both cover the same years.
for y in range(2008, 2017):
    cause_pie_chart_plot(deaths_by_cause.percentage_df, y, 7, "_death_")

### Deaths by category

In [None]:
# Stacked-area view of the per-category deaths (drawn by the Deaths class).
deaths_by_category.plot_stacked_areas()

# RQ1

In [None]:
def get_summed_values(df, cols):
    """Return the column totals of ``df`` for ``cols`` as a NumPy array.

    Args:
        df: DataFrame holding the columns to total.
        cols: Iterable of column labels; the result keeps this order.

    Returns:
        np.ndarray with one sum per entry of ``cols``.
    """
    return np.array([df[label].sum() for label in cols])


def create_RQ1_df():
    """Build a one-row frame with the number of quotes per million deaths for each cause.

    Reads the notebook-level ``deaths_by_cause`` and ``quotebank`` objects: for every
    column in ``deaths_by_cause.quant_columns`` the quotes summed over all years are
    divided by the deaths summed over all years, expressed in millions.

    Returns:
        DataFrame with a single row (index 0) and one column per cause.
    """
    causes = deaths_by_cause.quant_columns

    death_totals = get_summed_values(deaths_by_cause.df, causes)
    quote_totals = get_summed_values(quotebank.quotes_occurrences_df, causes)

    # Scale the deaths to millions before dividing.
    quotes_per_death = np.divide(quote_totals, death_totals/1000000)

    df = pd.DataFrame(columns=causes)
    for cause, ratio in zip(causes, quotes_per_death):
        df.at[0, cause] = ratio

    return df


def plot_RQ1(log = True):
    """Draw and save a horizontal bar chart of quotes per million deaths per cause.

    The causes are sorted so that the largest value is at the top, every bar is
    annotated with its value, and the figure is saved through ``save_plt`` under
    the name "quotes_per_million_deaths" before being shown.

    Args:
        log: Passed to ``DataFrame.plot.barh``; True draws the value axis on a
            log scale. The axis limits are fixed to 10**2..10**6 either way.
    """
    df = create_RQ1_df()

    # One row per cause, sorted by the single value column (largest first).
    df = df.T
    value_col = list(df.columns)[0]
    df = df.sort_values([value_col], ascending=False)

    ax = plt.gca()
    df.plot.barh(figsize=(12,12), log=log, title=None, ax=ax, xlabel=None, ylabel=None, color="#006198", legend=None, xlim = (10**2,10**6))

    # Remove border top and right
    for line in ['top', 'right']:
        ax.spines[line].set_visible(False)

    # Remove y ticks
    ax.yaxis.set_ticks_position('none')

    # Add padding between axes and labels
    ax.xaxis.set_tick_params(pad = 5)
    ax.yaxis.set_tick_params(pad = 10)

    # Add x, y gridlines. The flag is passed positionally: the `b=` keyword was
    # renamed to `visible=` in Matplotlib 3.5 and no longer exists in 3.7+.
    ax.grid(True, color ='grey',
            linestyle ='-.', linewidth = 0.6,
            alpha = 0.2)

    # Show top values
    ax.invert_yaxis()

    # Add annotation to bars
    for bar in ax.patches:
        ax.text(bar.get_width()+0.2, bar.get_y()+0.4,
                ' ' + pretty_num_string(int(bar.get_width())),
                fontsize = 10, fontweight ='bold',
                color ="#777777")

    save_plt("quotes_per_million_deaths")
    plt.show()

In [None]:
# Log-scaled by default (log=True).
plot_RQ1()

# RQ2

In [None]:
from mpl_toolkits.axes_grid1 import host_subplot
# Eight-step palette as bare hex strings (no leading "#").
# The last entry was "FF600", a 5-digit typo for "FFA600".
COLORS = ["003F5C", "2F4B7C", "665191", "A05195", "D45087", "F95D6A", "FF7C43", "FFA600"]
# Matplotlib colormap used for the stacked bars in plot_RQ5.
COLOR_MAP = plt.get_cmap('tab20c')

def plot_actual_vs_quote_per_year(d_c_actual, df_a, df_q, x_label, y_label_1, y_label_2, year_list = list(range(BEGIN_YEAR, END_YEAR +1))):
    """Plot one column of two frames against the year on twin y axes.

    Args:
        d_c_actual: Column name to plot; "/" is replaced by "_" in the file name.
        df_a: Frame with a "Year" column, drawn on the right-hand axis.
        df_q: Frame with a "Year" column, drawn on the left-hand axis.
        x_label: Label of the shared x axis.
        y_label_1: Axis label and legend entry for the ``df_q`` line.
        y_label_2: Axis label and legend entry for the ``df_a`` line.
        year_list: Years to plot; also used as the x ticks.

    The figure is written to ./generated/images/double_line_<column>.png and shown.
    """
    def values_for(frame, year):
        # Values of the requested column in the rows belonging to `year`.
        return frame.loc[frame['Year'] == year][d_c_actual].values

    quote_occurences = [values_for(df_q, year) for year in year_list]
    actual_occurences = [values_for(df_a, year) for year in year_list]

    # Left axis carries the quotes, the twin right axis the actual deaths.
    host = host_subplot(111)
    par = host.twinx()

    host.set_xlabel(x_label)
    host.set_ylabel(y_label_1)
    par.set_ylabel(y_label_2)

    quote_line, = host.plot(year_list, quote_occurences, label=y_label_1)
    actual_line, = par.plot(year_list, actual_occurences, label=y_label_2)

    leg = plt.legend()

    # Colour each axis label and legend entry like the line it describes.
    for axis, legend_text, line in ((host, leg.texts[0], quote_line), (par, leg.texts[1], actual_line)):
        axis.yaxis.get_label().set_color(line.get_color())
        legend_text.set_color(line.get_color())

    plt.xticks(year_list)
    plt.savefig('./generated/images/double_line_' + d_c_actual.replace("/", "_") + '.png')
    plt.show()


In [None]:
# One twin-axis plot per topic: share of quotes vs. share of actual deaths per year.
for cause in TOPICS:
    print(cause)
    plot_actual_vs_quote_per_year(cause, deaths_by_cause.percentage_df, quotebank.quotes_percentage_df,
    "Year", "Quotebank", "Actual deaths")

In [None]:
# Same comparison as for the topics, aggregated to the death categories.
for category in CATEGORIES:
    plot_actual_vs_quote_per_year(category, deaths_by_category.percentage_df, quotebank.cat_quotes_percentage_df,
        "Year", "Quotebank", "Actual deaths")

# RQ5

In [None]:
def plot_RQ5():
    """Draw and save a stacked bar chart comparing quotes and deaths per category.

    For the three categories INJURIES, NON_COMMUNICABLE_DISEASES and
    COMMUNICABLE_DISEASES the yearly percentages of ``quotebank`` and of
    ``deaths_by_category`` are summed over all years and passed through
    ``percentage_of_total_count`` again. The result is drawn as one stacked bar
    for "Quotes" and one for "Deaths" and saved through ``save_plt`` as "RQ5".
    """
    index = ["Quotes", "Deaths"]
    plotted_categories = [INJURIES, NON_COMMUNICABLE_DISEASES, COMMUNICABLE_DISEASES]

    # Only the plotted categories are summed; the unused `width` local and the
    # sums for columns that were never plotted have been dropped.
    df = pd.DataFrame(
        {
            category: [quotebank.cat_quotes_percentage_df[category].sum(),
                       deaths_by_category.percentage_df[category].sum()]
            for category in plotted_categories
        },
        index=index,
    )
    df = percentage_of_total_count(df, CATEGORIES)

    ax = df.plot.bar(stacked=True, colormap=COLOR_MAP)

    # Reverse the legend so its order matches the stacking order, top to bottom.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1], loc='center left', bbox_to_anchor=(1, 0.5))

    ax.set_ylabel('')

    save_plt("RQ5")

    plt.show()

In [None]:
# Needs the category percentage frames computed earlier in the notebook.
plot_RQ5()