# Milestone 2

## Imports

# Quotebank preprocessing

Most of the code used for the Quotebank preprocessing lives in the `QuoteBankData` and `Keyword` classes, with additional helper functions in the `src/utilities` modules.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.QuoteBankData import QuoteBankData
from src.Deaths import Deaths
import src.utilities.quotebank_preprocessing_utils as utils
from src.CONSTS import DATA_PATH, CATEGORIES, BEGIN_YEAR, END_YEAR, CATEGORY_MAPPING, COLOR_MAP 
from src.utilities.synonym_utils import add_new_synonyms
from src.utilities.utils import *
from src.utilities.plotting import save_plt

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/andreaperozziello/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initialize quotebank
- read keyword and synonyms for death causes
- create folders and file names for quotes that match keywords or synonyms
- match keywords and synonyms with quotes

load database of speakers


In [2]:
# Create the project-level Quotebank wrapper. The second argument is an empty
# list here -- presumably the initial keyword list; confirm against QuoteBankData.
quotebank = QuoteBankData("Asymmetry of News", [])
# Load the keywords (and, per the notes above, their synonyms) from file.
quotebank.read_keywords_from_file()
# Presumably creates one output directory per year (inferred from the helper's
# name -- confirm in quotebank_preprocessing_utils).
utils.create_directories_for_every_year()
# Presumably fills each keyword's per-year dump filenames, which
# save_meta_data_quotes() later reads through key.output_filenames -- confirm.
quotebank.create_json_dumps_filenames_for_each_keyword()

In [3]:
import warnings
import collections

# NOTE(review): this silences *all* warnings for the rest of the kernel session,
# which also hides deprecation warnings from the libraries used below; consider
# narrowing the filter to the specific category/module that is noisy.
warnings.filterwarnings("ignore")

def save_meta_data_quotes():
    """Aggregate per-year quote statistics for every keyword into ``quotebank``.

    For each year in ``BEGIN_YEAR..END_YEAR`` (inclusive) and each keyword in
    ``quotebank.keywords``, the JSON-lines dump
    ``key.output_filenames[year - BEGIN_YEAR]`` is loaded and:

    * the sum of its ``n_appearances`` column is recorded under ``key.name``
      for that year (0 when the dump contains no quotes);
    * ``key.country_url_occurences`` is incremented once for every entry of
      each quote's ``country_urls``;
    * ``key.country_speaker_occurences`` is incremented once for every quote
      with a truthy ``country_speaker`` (when that value is a list, its most
      frequent entry is used).

    The result is stored in ``quotebank.quotes_occurrences_df`` with one row
    per year, a ``Year`` column and one integer column per keyword.

    Note:
        The two country counters are incremented on top of whatever they
        already contain, so calling this function twice on the same keyword
        objects double-counts the countries.

    Raises:
        FileNotFoundError: if one of the expected dump files does not exist.
    """
    # Collect plain dict rows and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and grew the frame quadratically.
    yearly_rows = []

    for year in range(BEGIN_YEAR, END_YEAR + 1):
        dict_df_row = {"Year": year}

        for key in quotebank.keywords:
            file = key.output_filenames[year - BEGIN_YEAR]

            # Open the file first so that a missing dump surfaces as
            # FileNotFoundError (as before) regardless of how the installed
            # pandas treats a non-existent path string. The former 'rU' mode
            # is rejected with ValueError on Python >= 3.11.
            with open(file, "rb"):
                pass
            # Pass the path (not the handle) so pandas can infer compression
            # from the file extension.
            quotes = pd.read_json(file, lines=True)

            if len(quotes) > 0:
                # Total number of quote appearances for this topic and year.
                num_of_quotes = quotes["n_appearances"].sum()

                url_counts = key.country_url_occurences
                speaker_counts = key.country_speaker_occurences

                for country_urls, country_speaker in zip(
                    quotes["country_urls"], quotes["country_speaker"]
                ):
                    # Count quotes per topic per country of the source URLs.
                    for country in country_urls:
                        url_counts[country] = url_counts.get(country, 0) + 1

                    # Count quotes per topic per country of the speaker.
                    if country_speaker:
                        if isinstance(country_speaker, list):
                            # Several countries listed: keep the most frequent one.
                            country_speaker = collections.Counter(
                                country_speaker
                            ).most_common(1)[0][0]
                        speaker_counts[country_speaker] = (
                            speaker_counts.get(country_speaker, 0) + 1
                        )
            else:
                num_of_quotes = 0

            dict_df_row[key.name] = num_of_quotes

        yearly_rows.append(dict_df_row)

    # Single construction + single integer cast instead of one per year.
    quotebank.quotes_occurrences_df = pd.DataFrame(yearly_rows).astype(int)

In [4]:
save_meta_data_quotes()

FileNotFoundError: [Errno 2] No such file or directory: './generated/2014/Meningitis-2014.json.bz2'

In [None]:
quotebank.quotes_occurrences_df

In [None]:
# Derive a percentage table from the raw per-keyword counts.
# percentage_of_total_count is not imported by name in this notebook; it
# presumably comes from the star import of src.utilities.utils -- confirm.
quotebank.quotes_percentage_df = percentage_of_total_count(quotebank.quotes_occurrences_df, quotebank.get_all_keyword_names())

In [None]:
quotebank.quotes_percentage_df

In [None]:
# Presumably aggregates the per-cause counts into per-category counts and stores
# them in quotebank.cat_quotes_occurrences_df (displayed in the next cell) --
# confirm against QuoteBankData.map_df_causes_to_categories.
quotebank.map_df_causes_to_categories()

In [None]:
quotebank.cat_quotes_occurrences_df

In [None]:
# Same percentage conversion as for the per-keyword table, applied to the
# per-category counts over the CATEGORIES columns.
quotebank.cat_quotes_percentages_df = percentage_of_total_count(quotebank.cat_quotes_occurrences_df, CATEGORIES)

In [None]:
quotebank.cat_quotes_percentages_df

# Our World in Data preprocessing

This part of the notebook is more exploratory. We wanted to preprocess and inspect the two main datasets from Our World in Data to make sure that the project is feasible.

## Preprocess death data

### Deaths by cause

In [None]:
# Load the per-cause death counts for the "World" entity, shortening two verbose
# column names and dropping the executions column.
death_by_cause_path = DATA_PATH + "annual_number_of_deaths_by_cause.csv"
deaths_by_cause = Deaths(
    "deaths_by_cause",
    "World",
    death_by_cause_path,
    rename_cols={
        'Terrorism (deaths)': 'Terrorism',
        'Deaths - Self-harm - Sex: Both - Age: All Ages (Number)': 'Suicide',
    },
    drop_cols=['Number of executions (Amnesty International)'],
)
deaths_by_cause.df

In [None]:
deaths_by_cause.df.columns

In [None]:
deaths_by_cause.percentage_df

### Deaths by category

In [None]:
# Load the categorised death counts for the "World" entity, shortening the
# verbose communicable-diseases column name.
death_by_category_path = DATA_PATH + "categorized_annual_number_of_deaths.csv"
deaths_by_category = Deaths(
    "deaths_by_category",
    'World',
    death_by_category_path,
    rename_cols={
        "Deaths - Communicable, maternal, neonatal, and nutritional diseases - Sex: Both - Age: All Ages (Number)": "Communicable diseases",
    },
)
deaths_by_category.df

In [None]:
deaths_by_category.quant_columns

In [None]:
deaths_by_category.percentage_df

## Plotting

### Deaths by cause

In [None]:
deaths_by_cause.plot_stacked_areas("", "Percentage of deaths by cause")

In [None]:
def pie_chart_plot(df, year, threshold, base_year=2008):
    """Plot and save a pie chart of one year's values, merging small slices into "Other".

    Args:
        df: Frame with one row per year, ordered so that row 0 corresponds to
            ``base_year``. The first three columns are skipped (presumably
            identifier columns such as entity/code/year -- confirm against the
            caller); every remaining column becomes a candidate pie slice.
        year: Year to plot; selects row ``year - base_year`` by position.
        threshold: Columns whose value is strictly below the
            ``threshold``-th largest value of the row are removed and their
            sum is shown as a single "Other" slice.
        base_year: Year represented by the first row of ``df``. Defaults to
            2008, the value that used to be hard-coded.

    Raises:
        IndexError: if ``year`` does not fall inside the rows of ``df`` (a year
            before ``base_year`` previously wrapped around silently to a row
            counted from the end), or if ``threshold`` exceeds the number of
            value columns.
    """
    row_pos = year - base_year
    if not 0 <= row_pos < len(df):
        raise IndexError(
            "year " + str(year) + " is outside the rows available from base year " + str(base_year)
        )

    # One-row frame with only the value columns. Nothing is dropped in place, so
    # the caller's frame is never touched and no chained-assignment warning arises.
    year_values = df.iloc[[row_pos], 3:]
    values = year_values.to_numpy()[0]

    # The threshold-th largest value is the cut-off; anything strictly below it
    # is folded into "Other".
    cutoff = np.sort(values)[-threshold]
    is_minor = values < cutoff

    pc_df = year_values.loc[:, ~is_minor].assign(Other=values[is_minor].sum())

    title = "Deaths in " + str(year)
    pc_df.T.plot.pie(colormap=COLOR_MAP, subplots=True, figsize=(20, 6), legend=None, ylabel = '', startangle=270, autopct='%1.1f%%', pctdistance=0.7)
    # The title is only handed to save_plt (project helper); it is not set on the axes here.
    save_plt(title)

In [None]:
# One pie chart per year. range() excludes its upper bound, so this covers 2008-2017.
# The last argument is the threshold: columns whose share is below the 7th-largest
# value of the year are merged into the "Other" slice.
for y in range(2008, 2018):
    pie_chart_plot(deaths_by_cause.percentage_df, y, 7)

### Deaths by category

In [None]:
deaths_by_category.plot_stacked_areas("Percentage deaths", "Percentage of deaths by category")

# RQ1

In [None]:
def plot_RQ1():
    """Placeholder for the RQ1 plot; nothing is drawn yet.

    The function previously had no body at all, which made the cell fail to
    compile. Until the plot is implemented it simply returns ``None`` so the
    notebook can still be run from top to bottom.
    """
    # TODO: implement the RQ1 visualisation.

In [None]:
quotebank.quotes_percentage_df.columns

In [None]:
plot_RQ1()

# RQ5

In [None]:
def group_quotebank_cols():
    """Placeholder: prints a smiley and performs no grouping yet."""
    smiley = ":)"
    print(smiley)

In [None]:
def plot_RQ5():
    """Show, for every year, two stacked bars comparing quote and death percentages.

    For each year in ``BEGIN_YEAR..END_YEAR`` (inclusive) a new figure is
    created with a "quotes" bar and a "deaths" bar. Each column listed in
    ``deaths_by_cause.quant_columns`` contributes one segment to both bars,
    taken from ``quotebank.quotes_percentage_df`` and
    ``deaths_by_cause.percentage_df`` respectively for the row whose ``Year``
    equals the current year. The year is printed before its figure is shown.
    """
    def _value_for(frame, year, cause):
        # .item() requires exactly one matching row for the year.
        return frame.loc[frame["Year"] == year, cause].item()

    bar_labels = ["quotes", "deaths"]
    width = 0.35

    for year in range(BEGIN_YEAR, END_YEAR + 1):
        print("Year:",  year)
        fig, ax = plt.subplots()

        # Running height of each bar, used as the base of the next segment.
        bottoms = [0, 0]
        for cause in deaths_by_cause.quant_columns:
            heights = [
                _value_for(quotebank.quotes_percentage_df, year, cause),
                _value_for(deaths_by_cause.percentage_df, year, cause),
            ]
            ax.bar(bar_labels, heights, width, bottom = bottoms, label=cause)
            bottoms = [base + height for base, height in zip(bottoms, heights)]

        ax.set_ylabel('Percentage')
        ax.set_title('Percentages of quotes and deaths')
        ax.legend()

        plt.show()

In [None]:
deaths_by_cause.quant_columns

In [None]:
plot_RQ5()