# Milestone 2

## Imports

# Quotebank preprocessing

Most of the code used for the quotebank preprocessing can be found in the classes QuotebankData and Keyword, in addition to in files for helper functions. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import random
import os
import bz2
from src.QuoteBankData import QuoteBankData
from src.Deaths import Deaths
import src.utilities.quotebank_preprocessing_utils as utils
from src.CONSTS import DATA_PATH, COLORS, KEYWORDS_JSON_FILE_PATH, KEYWORDS_FILE_PATH, BEGIN_YEAR, END_YEAR
from tqdm import tqdm
from src.utilities.synonym_utils import add_new_synonyms
from src.utilities.add_country import expand_line
from src.utilities.utils import *

[nltk_data] Downloading package wordnet to C:\Users\Henrik Øberg
[nltk_data]     Myhre\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initialize quotebank
- read keyword and synonyms for death causes
- create folders and file names for quotes that matches keywords or synonyms
- match keywords and synonyms with quotes

load database of speakers


In [2]:
quotebank = QuoteBankData("Asymmetry of News", [])
quotebank.read_keywords_from_file()
utils.create_directories_for_every_year()
quotebank.create_json_dumps_filenames_for_each_keyword()

In [3]:
#quotebank.print_pretty_keywords_filenames()

In [12]:
import warnings
import collections

warnings.filterwarnings("ignore")

def save_meta_data_quotes():
    quotebank.quotes_occurrences_df = pd.DataFrame()
    for year in range(BEGIN_YEAR, END_YEAR + 1):
        dict_df_row = {"Year": year}
        for key in quotebank.keywords:
            file = key.output_filenames[year-BEGIN_YEAR]
            with open(file, 'rU') as f:
                quotes = pd.read_json(file, lines=True)
            # Save total number of quotes per year per topic in df
            num_of_quotes = quotes["n_occurences"].sum()
            occurences.append(num_of_quotes)
            dict_df_row[key.name] = num_of_quotes
            
            for line_quotes in quotes:
                # Safe number of quotes per topic per year per country of url
                country_urls = line_quotes['country_urls']
                for country in country_urls:
                    if country in key.country_url_occurences:
                        key.country_url_occurences[country] += 1
                    else:
                        key.country_url_occurences[country] = 1
                
                # Safe number of quotes per topic per year per country of speaker
                country_speaker = line_quotes['country_speaker']
                if isinstance(country_speaker, list):        # if multiple countries, take most occuring
                    country_speaker = collections.Counter(country_speaker).most_common(1)[0][0]
                    
                if country_speaker in key.country_url_occurences:
                    key.country_url_occurences[country_speaker +=1
                else:
                    key.country_url_occurences[country_speaker = 1 
            
            
        quotebank.quotes_occurrences_df = quotebank.quotes_occurrences_df.append(dict_df_row, ignore_index = True)
        cols = quotebank.quotes_occurrences_df.columns
        quotebank.quotes_occurrences_df[cols] = quotebank.quotes_occurrences_df[cols].astype(int)

In [13]:
save_meta_data_quotes()

Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
Empty DataFrame
Columns: []
Index: []
             quoteID                                          quotation  \
0  2008-09-29-004287  as the number of older americans continues to ...   

         speaker         qids                date  numOccurrences  \
0  mary c. white  [Q43737912] 2008-09-29 19:06:53               3   

                                      probas  \
0  [[mary c. white, 0.7179], [None, 0.2821]]   

                                                urls phase  country_speaker  \
0  [http://reuters.com/article/healthNews/idUSTRE...     A              NaN   

                                      unique_urls  \
0  [us.rd.yahoo.com, 

KeyError: 'n_appearances'

In [None]:
quotebank.quotes_occurrences_df

To illustrate the results, we plot the number of quotes found for each issue in 2016

In [None]:
quotebank.quotes_percentage_df = percentage_of_total_count(quotebank.quotes_occurrences_df, quotebank.get_all_keyword_names())

In [None]:
quotebank.quotes_percentage_df

In [None]:
occurences = list()
x_labels = list()
x_number = list()
count = 0
for key in quotebank.keywords:
    with open(key.output_filenames[8], 'rU') as f:
        key.quotes = pd.read_json(key.output_filenames[8], lines=True)
    occurences.append(len(key.quotes))
    x_labels.append(key.name)
    x_number.append(count)
    count += 1

In [None]:
plt.bar(x_number, occurences, log = True)
plt.xticks(x_number, x_labels)
plt.xticks(rotation=90)
plt.title("Number of occurences per topic in 2016")
plt.xlabel("Topics")
plt.ylabel("Number of occurences")
plt.show()

In [None]:
def freq_per_month(df):
    freqs = np.zeros(12)
    for i in range(12):
        freqs[i] = np.sum(df["date"].dt.month == i+1)
    return freqs

In [None]:
all_freq = list()
for keys in quotebank.keywords:
    if not keys.quotes.empty:
        all_freq.append(freq_per_month(keys.quotes))
    else:
        all_freq.append(np.zeros(12))
df = pd.DataFrame(all_freq)

plt.plot(np.sum(df))
plt.title("Number of topic quotes found throughout the year 2016")
plt.show()

# Population data preprocessing
This is population data from the UN, which we might use in combination with the Our World in Data datasets to get a more realistic understanding of the relative deaths each year.

In [None]:
population_path = DATA_PATH + "annual_population_data.csv"
raw_population_df = pd.read_csv(population_path)
raw_population_df

Extracting world population in millions from 2008 to 2017

In [None]:
population_df = raw_population_df[(raw_population_df["Location"] == "World") & (raw_population_df["Time"] > 2007) & (raw_population_df["Time"] < 2018)]
population_df = population_df[["Time", "PopTotal"]].reset_index(drop=True)
population_df["PopTotal"] = population_df["PopTotal"].apply(lambda x: x/1000)
population_df = population_df.rename(columns = {"Time": "Year", "PopTotal": "World population millions"})
population_df

# Our World in Data preprocessing

This part of the notebook is more exploratory. We wanted to preprocess and look at the two main datasets form Our World in Data to be sure that the project is possible to do. 

## Preprocess death data

### Deaths by cause

In [None]:
death_by_cause_path = DATA_PATH + "annual_number_of_deaths_by_cause.csv"
deaths_by_cause = Deaths("deaths_by_cause", death_by_cause_path, rename_cols = {'Terrorism (deaths)': 'Terrorism', 'Self-harm': 'Suicide'}, drop_cols = ['Number of executions (Amnesty International)'])
deaths_by_cause.df

In [None]:
deaths_by_cause.df.columns

In [None]:
deaths_by_cause.percentage_df

### Deaths by category

In [None]:
death_by_category_path = DATA_PATH + "categorized_annual_number_of_deaths.csv"
deaths_by_category = Deaths("deaths_by_category", death_by_category_path)
deaths_by_category.df

In [None]:
deaths_by_category.percentage_df

## Plotting

### Deaths by cause

In [None]:
deaths_by_cause.plot_lines("Percentage deaths", "Percentage of deaths by cause")

### Deaths by category

In [None]:
deaths_by_category.plot_stacked_areas("Percentage deaths", "Percentage of deaths by category")

# RQ1

In [None]:
def plot_RQ1():
    

In [None]:
quotebank.quotes_percentage_df.columns

In [None]:
plot_RQ1()

# RQ5

In [None]:
def group_quotebank_cols():
    print(":)")

In [None]:
def plot_RQ5():
    bar_labels = ["quotes", "deaths"]
    width = 0.35 
    for year in range(BEGIN_YEAR, END_YEAR + 1):
        prev_values = [0, 0]
        print("Year:",  year)
        fig, ax = plt.subplots()

        for column in deaths_by_cause.quant_columns:
            values = [quotebank.quotes_percentage_df.loc[quotebank.quotes_percentage_df["Year"] == year, column].item(), 
                      deaths_by_cause.percentage_df.loc[deaths_by_cause.percentage_df["Year"] == year, column].item()]

            ax.bar(bar_labels, values, width, bottom = prev_values, label=column)
            prev_values[0] += values[0]
            prev_values[1] += values[1]

        ax.set_ylabel('Percentage')
        ax.set_title('Percentages of quotes and deaths')
        ax.legend()

        plt.show()

In [None]:
deaths_by_cause.quant_columns

In [None]:
plot_RQ5()