In [None]:
!pip install tld

In [None]:
import bz2
import json
import os

import numpy as np
import pandas as pd
import seaborn as sns
from tld import get_tld
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Root folder of the project data. Every other path is joined onto it with
# os.path.join, so switching between the local root ('') and the Colab root
# below yields well-formed paths in both cases.
#PATH_ROOT = '/content/drive/MyDrive/ADA'
PATH_ROOT = ''
PATH_PARQUET = os.path.join(PATH_ROOT, 'project_datasets')
PATH_QUOTEBANK = os.path.join(PATH_ROOT, 'Quotebank')
# Templates below are filled with .format(year=...) by the processing functions.
PATH_TO_QUOTES = os.path.join(PATH_QUOTEBANK, 'quotes-{year}.json.bz2')
PATH_TO_WORDS = os.path.join(PATH_ROOT, 'Data', 'environment_keywords.txt')
PATH_TO_OUT1 = os.path.join(PATH_ROOT, 'Data', 'quotes-{year}-labeled.json.bz2')
PATH_TO_OUT2 = os.path.join(PATH_ROOT, 'Data', 'quotes-{year}-filtered.json.bz2')
PATH_TO_OUT = os.path.join(PATH_ROOT, 'Data', 'time_series_{year}.json.bz2')

In [None]:
def get_domain(url):
    '''
    Extracts the domain of a url.
    :url: String with the url to parse.
    :return: The `domain` attribute of the object returned by get_tld(url, as_object=True)
             (presumably the registered name without its suffix, e.g. 'nytimes' -- confirm in the tld docs).
    '''
    return get_tld(url, as_object=True).domain


def labeled_data(years, topic_name):
    '''
    Creates the labeled dataset and the filtered (only topic related) dataset.

    Every quote read from PATH_TO_QUOTES gets a 'domains' field (one domain per entry of its
    'urls' field), loses its 'phase' field and gets a '<topic_name>_related' field: 1 when the
    quotation contains at least one keyword of PATH_TO_WORDS (case-sensitive substring match),
    0 otherwise. Every quote is saved to PATH_TO_OUT1; the ones labeled 1 are also saved to
    PATH_TO_OUT2.

    :years: Iterable with the years to process (one source file and two output files per year).
    :topic_name: Prefix of the label field, e.g. 'X' produces the field 'X_related'.
    :return: None.
    :raises FileExistsError: If an output file already exists (outputs are opened in 'xb' mode).
    '''
    # Reading txt file with topic related words -> 1 word/string per line.
    # Blank lines are skipped: an empty keyword is a substring of every quotation
    # and would label the whole dataset as topic related.
    with open(PATH_TO_WORDS) as file:
        keywords = [word for word in (line.rstrip() for line in file) if word]

    column_name = topic_name + "_related"                                   # Name of the label field

    for year in years:
        with bz2.open(PATH_TO_QUOTES.format(year=year), 'rb') as s_file, \
             bz2.open(PATH_TO_OUT1.format(year=year), 'xb') as d_file1, \
             bz2.open(PATH_TO_OUT2.format(year=year), 'xb') as d_file2:
            for raw_line in s_file:
                instance = json.loads(raw_line)                             # Loading a sample

                # Updating the sample with the domain name of every link
                instance['domains'] = [get_domain(url) for url in instance['urls']]
                instance.pop('phase', None)                                 # Drop phase (unused); tolerate samples without it

                # Adding the label column
                quotation = instance['quotation']
                is_related = any(word in quotation for word in keywords)
                instance[column_name] = 1 if is_related else 0

                record = (json.dumps(instance) + '\n').encode('utf-8')
                if is_related:
                    d_file2.write(record)                                   # Save filtered dataset (only topic related).
                d_file1.write(record)                                       # Save labeled dataset.



In [None]:
# labeled_data accepts a list of years, so the three years are processed in one call.
# The topic name spelling is left untouched: it determines the name of the label
# field ('Enviroment_related') stored in the output files.
labeled_data([2018, 2019, 2020], 'Enviroment')

In [None]:
def time_analysis_data(years, newspapers):
    '''
    Creates the time series dataset from the filtered (only topic related) dataset.

    For every quote read from PATH_TO_OUT2 and every entry of its 'domains' field that contains
    one of the newspaper names as a substring, a {'date', 'newspaper'} record is written to
    PATH_TO_OUT (one json object per line, the domain is stored as the newspaper).

    :years: Iterable with the years to process (one source file and one output file per year).
    :newspapers: List with the newspaper names (only quotes cited by any of these newspapers will be used).
    :return: None.
    '''
    for year in years:
        source_path = PATH_TO_OUT2.format(year=year)
        target_path = PATH_TO_OUT.format(year=year)

        with bz2.open(source_path, 'rb') as s_file, bz2.open(target_path, 'xb') as d_file:
            for raw_line in s_file:
                quote = json.loads(raw_line)                                # Loading a sample

                for domain in quote['domains']:
                    for newspaper in newspapers:
                        if newspaper not in domain:
                            continue
                        # First matching newspaper is enough: one record per matching domain
                        record = {'date': quote['date'], 'newspaper': domain}
                        d_file.write((json.dumps(record)+'\n').encode('utf-8'))
                        break

In [None]:
# Newspapers kept for the time analysis; each name is matched as a substring of the quote domains.
NEWSPAPERS = ['nytimes', 'washingtonpost', 'theguardian', 'wsj', 'bloomberg']

# time_analysis_data accepts a list of years, so the three years are processed in one call.
time_analysis_data([2018, 2019, 2020], NEWSPAPERS)