In [17]:
import json
from pathlib import Path
from ast import literal_eval

In [25]:
# The relative path is resolved against the current working directory, then
# walked three levels up to obtain the repository root.
# NOTE(review): this presumably expects the kernel to be started from the
# notebooks/ folder -- confirm before running from elsewhere.
ROOT_DIR = Path("notebooks", "format_data.ipynb").resolve().parents[2]

# Data layout: <root>/data/raw holds the inputs, <root>/data/prepared the outputs.
DATA_DIR = ROOT_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PREPARED_DIR = DATA_DIR.joinpath("prepared")

In [26]:
# Read the raw CENTCOM export as a list of text lines (one string per line).
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(RAW_DIR / "CENTCOM/documents.json", "r", encoding="utf-8") as fp:
    lines = fp.readlines()

In [27]:
# Peek at the first raw line to see how a document is stored in the file.
lines[0]

'{ "_id" : "001C9C3F3DFE16B4921B1E906F66E161", "sourceName" : "CENTCOM", "sourceUrl" : "http://www.centcom.mil/MEDIA/PRESS-RELEASES/Press-Release-View/Article/904608/centcom-reinforces-support-for-syrian-arab-coalition/", "wordCount" : 193, "sentenceCount" : 8, "title" : "CENTCOM reinforces support for Syrian Arab Coalition", "text" : "Photographs of U.S. service members in Syria shown wearing patches associated with the Kurdish YPG have recently circulated through traditional and social media. To clarify: U.S. forces are advising and assisting Syrian Democratic Forces (SDF), primarily Syrian Arab forces operating in the northern part of the country who are leading efforts in the campaign to defeat Daesh.\\n\\nGen. Joseph Votel, commander of U.S. Central Command said, \\"We\'ve been very clear from the outset of this campaign that to defeat Daesh we would need to work with many different elements operating on the ground in Iraq and Syria. In Syria, with the support of the 60-plus natio

In [28]:
# readlines() yields plain strings, so each line still has to be parsed.
type(lines[0])

str

In [29]:
# Parse the line as JSON. json.loads is used rather than ast.literal_eval
# because JSON literals such as true/false/null are not valid Python literals.
json.loads(lines[0])

{'_id': '001C9C3F3DFE16B4921B1E906F66E161',
 'sourceName': 'CENTCOM',
 'sourceUrl': 'http://www.centcom.mil/MEDIA/PRESS-RELEASES/Press-Release-View/Article/904608/centcom-reinforces-support-for-syrian-arab-coalition/',
 'wordCount': 193,
 'sentenceCount': 8,
 'title': 'CENTCOM reinforces support for Syrian Arab Coalition',
 'text': 'Photographs of U.S. service members in Syria shown wearing patches associated with the Kurdish YPG have recently circulated through traditional and social media. To clarify: U.S. forces are advising and assisting Syrian Democratic Forces (SDF), primarily Syrian Arab forces operating in the northern part of the country who are leading efforts in the campaign to defeat Daesh.\n\nGen. Joseph Votel, commander of U.S. Central Command said, "We\'ve been very clear from the outset of this campaign that to defeat Daesh we would need to work with many different elements operating on the ground in Iraq and Syria. In Syria, with the support of the 60-plus nation Count

In [30]:
# A parsed line is a dict; json.loads is the JSON-aware parser
# (ast.literal_eval would reject true/false/null).
type(json.loads(lines[0]))

dict

In [62]:
def prepare_document_json(raw_dir: Path, doc_path: Path, output_dir: Path) -> None:
    """Merge a line-delimited re3d document file into one JSON object.

    Every non-blank line of ``raw_dir / doc_path`` is parsed as a JSON
    document. The documents are collected into a dict that maps each
    document's ``"_id"`` value to the whole document, and that dict is written
    as a single JSON object to ``output_dir / doc_path``. Missing parent
    directories of the output file are created. If two lines share an
    ``"_id"``, the later one wins.

    Args:
        raw_dir (Path): Directory with raw json files
        doc_path (Path): File path relative to raw_dir
        output_dir (Path): Output directory

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON
            (a subclass of ValueError).
        KeyError: If a parsed document has no "_id" key.
    """
    documents = {}
    with open(raw_dir / doc_path, "r", encoding="utf-8") as fp:
        for line in fp:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            # json.loads understands true/false/null, which ast.literal_eval
            # rejects; each line is parsed once and reused as key and value.
            document = json.loads(line)
            documents[document["_id"]] = document

    # make output dir if not exist
    out_path = output_dir / doc_path
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with open(out_path, "w", encoding="utf-8") as fp:
        json.dump(documents, fp)

In [63]:
# Try the conversion on a single source (CENTCOM); the output is written to
# the same sub-path under PREPARED_DIR.
prepare_document_json(RAW_DIR, Path("CENTCOM/documents.json"), PREPARED_DIR)

In [89]:
# Lazily find every documents.json anywhere below RAW_DIR (recursive search).
doc_jsons = RAW_DIR.rglob("documents.json")

In [90]:
# NOTE(review): despite its name, `first` holds every matched path, not only
# the first one; list() also exhausts the `doc_jsons` generator.
first = list(doc_jsons)

[PosixPath('/home/enyquist/repos/RLNER/data/raw/CENTCOM/documents.json'),
 PosixPath('/home/enyquist/repos/RLNER/data/raw/UK Government/documents.json'),
 PosixPath('/home/enyquist/repos/RLNER/data/raw/US State Department/documents.json'),
 PosixPath('/home/enyquist/repos/RLNER/data/raw/Australian Department of Foreign Affairs/documents.json'),
 PosixPath('/home/enyquist/repos/RLNER/data/raw/Wikipedia/documents.json'),
 PosixPath('/home/enyquist/repos/RLNER/data/raw/BBC Online/documents.json'),
 PosixPath('/home/enyquist/repos/RLNER/data/raw/Delegation of the European Union to Syria/documents.json')]

In [91]:
# Convert every documents.json found under RAW_DIR, mirroring its sub-path
# under PREPARED_DIR.
for doc_json in RAW_DIR.glob("**/documents.json"):
    # relative_to keeps the complete sub-path below RAW_DIR, so matches nested
    # more (or less) than one folder deep still map to the right files.
    prepare_document_json(RAW_DIR, doc_json.relative_to(RAW_DIR), PREPARED_DIR)

In [82]:
# `first` is a list of matched paths, so inspect the ancestors of its first entry.
list(first[0].parents)

[PosixPath('/home/enyquist/repos/RLNER/data/raw/CENTCOM'),
 PosixPath('/home/enyquist/repos/RLNER/data/raw'),
 PosixPath('/home/enyquist/repos/RLNER/data'),
 PosixPath('/home/enyquist/repos/RLNER'),
 PosixPath('/home/enyquist/repos'),
 PosixPath('/home/enyquist'),
 PosixPath('/home'),
 PosixPath('/')]

In [87]:
# Keep only the last two components (source folder + file name) of the first
# matched path; `first` is a list, so it has to be indexed.
Path("/".join(first[0].parts[-2:]))

PosixPath('CENTCOM/documents.json')