In [12]:
import os
import json
import bz2

from topic import Topics
from tqdm import tqdm

Constants

In [13]:
# Folder that holds the input quotes-<year>.json.bz2 files.
DATA_PATH = "./data/"
# Folder under which the per-year output directories are created.
GENERATED_PATH = "./generated/"
PATH_TO_QUOTEBANK_2020 = f"{DATA_PATH}quotes-2020.json.bz2"  # TODO delete this line

Code

In [14]:
def start_quotation_classification():
    """Process every yearly quote file in turn.

    For each path returned by ``compose_quotebank_filenames`` the file is
    scanned with ``elaborate_file_for_year``. Afterwards the global
    ``quotebank`` is asked to write the matches for that position in the list
    and then to drop the collected lines. Both are ``Topics`` methods defined
    outside this notebook, so their exact semantics are inferred from their
    names and should be confirmed there.
    """
    for position, path in enumerate(compose_quotebank_filenames()):
        print(f"Elaborating file: {path}")
        elaborate_file_for_year(path)
        quotebank.write_matching_quotes_to_file_for_year(position)
        quotebank.delete_json_line_for_all_keywords()

In [15]:
def elaborate_file_for_year(filename, max_lines=2000):
    """Scan one bz2-compressed quote file and collect the lines matching a keyword.

    Every line is parsed with ``extract_quotation``; the quotation is passed to
    ``quotebank.match_quotation_with_any_keyword`` and, when that returns a
    keyword object (anything other than ``None``), the raw line is appended to
    that object's ``json_lines`` list.

    Args:
        filename: Path of the ``.json.bz2`` file to read.
        max_lines: Maximum number of lines to process. Defaults to 2000, the
            limit that used to be hard-coded in the loop; pass ``None`` to
            read the whole file.

    An ``OSError`` raised while opening or reading the file is reported on
    stdout and ends processing of this file; lines appended before the error
    are left in place.
    """
    try:
        with bz2.open(filename, "rb") as file:
            for i, line in tqdm(enumerate(file)):
                # Sampling cap: stop once ``max_lines`` lines have been handled.
                if max_lines is not None and i >= max_lines:
                    break
                quotation = extract_quotation(line)
                found_keyword = quotebank.match_quotation_with_any_keyword(quotation)
                if found_keyword is not None:
                    # Keep the original serialized line, not the parsed quotation.
                    found_keyword.json_lines.append(line)
    except OSError:
        print("Could not open/read file:", filename)

In [16]:
# TODO put this func inside topic class
def create_json_dumps_filenames_for_each_keyword():
    """Register one output path per year on every keyword of ``quotebank``.

    For each keyword and each year from ``get_all_years`` the path
    ``<GENERATED_PATH><year>/<sanitized keyword name>-<year>.json.bz2`` is
    appended to the keyword's ``output_filenames`` list. Paths are appended,
    never reset, so calling this twice adds the same paths a second time.
    """
    all_years = get_all_years()

    for keyword in quotebank.keywords:
        for year in all_years:
            file_name = f"{format_filenames_nicely(keyword.name)}-{year}.json.bz2"
            keyword.output_filenames.append(f"{GENERATED_PATH}{year}/{file_name}")

def get_all_years() -> list:
    """Return the year strings produced for the numbers 8 through 17, in order."""
    return [create_year_string_from_number(n) for n in range(8, 18)]

def create_directories_for_every_year():
    """Create one output directory per year under ``GENERATED_PATH``.

    Directories that already exist are skipped silently. Missing parent
    directories, including ``GENERATED_PATH`` itself, are created as needed;
    previously a missing parent made every ``os.mkdir`` call fail and the
    error was swallowed. Any other failure is reported on stdout and the loop
    moves on to the next year instead of raising.
    """
    for i in range(8, 18):
        year = create_year_string_from_number(i)
        path = GENERATED_PATH + year + "/"

        if os.path.isdir(path):
            # Left over from an earlier run; nothing to do.
            continue

        try:
            # makedirs also creates GENERATED_PATH when it is missing.
            os.makedirs(path, exist_ok=True)
        except OSError as error:
            # Best effort: report the problem rather than hiding it.
            print("Could not create directory " + path + ": " + str(error))
            continue

        print("Created Directory at: " + path)

# TODO create class Utils
def format_filenames_nicely(filename) -> str:
    """Return ``filename`` made safe for use as a file name.

    Each of the characters ``!#$%^&*()/`` and the space is replaced by an
    underscore, and commas are removed entirely.
    """
    replaced_by_underscore = "!#$%^&*()/ "
    # Third argument of maketrans lists the characters to delete outright.
    table = str.maketrans(
        replaced_by_underscore, "_" * len(replaced_by_underscore), ","
    )
    return filename.translate(table)

In [17]:
def compose_quotebank_filenames() -> list:
    """Return the input paths ``<DATA_PATH>quotes-<year>.json.bz2``.

    One path is built per number in ``range(8, 18)``, using the year string
    from ``create_year_string_from_number``, in ascending order.
    """
    return [
        f"{DATA_PATH}quotes-{create_year_string_from_number(n)}.json.bz2"
        for n in range(8, 18)
    ]

def create_year_string_from_number(number) -> str:
    """Turn a short year number into a four-character year string.

    Numbers below 10 are prefixed with ``"200"`` (8 -> ``"2008"``); all other
    numbers are prefixed with ``"20"`` (17 -> ``"2017"``).
    """
    prefix = "200" if number < 10 else "20"
    return prefix + str(number)

In [18]:
def extract_quotation(line) -> str:
    """Parse one JSON-encoded line and return the value under ``'quotation'``.

    Raises whatever ``json.loads`` raises on malformed input, and ``KeyError``
    when the parsed object has no ``'quotation'`` key.
    """
    return json.loads(line)['quotation']

In [19]:
# Pipeline driver: build the topic container, prepare output locations, then classify.
# `quotebank` is read as a global by the functions defined in the cells above.
quotebank = Topics("Asymmetry of News", [])
# Presumably fills quotebank.keywords (Topics comes from the `topic` module, not shown) — confirm there.
quotebank.read_keywords_from_file()
create_directories_for_every_year()
# Iterates quotebank.keywords, so it has to run after the keywords are available.
create_json_dumps_filenames_for_each_keyword()
start_quotation_classification()

190it [00:00, 1898.67it/s]

Elaborating file: ./data/quotes-2008.json.bz2
Could not open/read file: ./data/quotes-2008.json.bz2
Elaborating file: ./data/quotes-2009.json.bz2
Could not open/read file: ./data/quotes-2009.json.bz2
Elaborating file: ./data/quotes-2010.json.bz2
Could not open/read file: ./data/quotes-2010.json.bz2
Elaborating file: ./data/quotes-2011.json.bz2


2000it [00:00, 3116.37it/s]
498it [00:00, 2509.09it/s]

Elaborating file: ./data/quotes-2012.json.bz2


2000it [00:00, 2862.41it/s]
265it [00:00, 2582.37it/s]

Elaborating file: ./data/quotes-2013.json.bz2


2000it [00:00, 2868.99it/s]
208it [00:00, 2079.19it/s]

Elaborating file: ./data/quotes-2014.json.bz2


2000it [00:00, 2517.98it/s]
486it [00:00, 2480.98it/s]

Elaborating file: ./data/quotes-2015.json.bz2


2000it [00:00, 2598.69it/s]
231it [00:00, 2305.02it/s]

Elaborating file: ./data/quotes-2016.json.bz2


2000it [00:00, 2697.01it/s]
236it [00:00, 2318.85it/s]

Elaborating file: ./data/quotes-2017.json.bz2


2000it [00:00, 2447.45it/s]


In [20]:
# Sanity check: show every output path registered on the first keyword.
print(quotebank.keywords[0].output_filenames)

['./generated/2008/meningitis-2008.json.bz2', './generated/2009/meningitis-2009.json.bz2', './generated/2010/meningitis-2010.json.bz2', './generated/2011/meningitis-2011.json.bz2', './generated/2012/meningitis-2012.json.bz2', './generated/2013/meningitis-2013.json.bz2', './generated/2014/meningitis-2014.json.bz2', './generated/2015/meningitis-2015.json.bz2', './generated/2016/meningitis-2016.json.bz2', './generated/2017/meningitis-2017.json.bz2']


In [21]:
# Sanity check: first output path of the object get_keyword_by_name returns for 'meningitis'.
quotebank.get_keyword_by_name('meningitis').output_filenames[0]

'./generated/2008/meningitis-2008.json.bz2'