In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import os
from glob import glob
from collections import defaultdict
import json
from tqdm import tqdm

In [2]:
def corpus_to_TXM(corpus: list, folder_path: str) -> None:
    """
    Script to convert json files on a folder to XML file
    """
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)

    list_metadata = []
    for file in tqdm(corpus):

#         file = self.open_file(file)
        with open(file) as f:
            file = json.load(f)
            
        issue_id = file['issue']['id']
        issue_date = file['issue']['date']
        issue_language = file['issue']['language']
        issue_full_text = file['issue']['full_text']
        issue_en = file['issue']['named_entities']
        xml_text = file_to_XML(issue_full_text, issue_en)
        with open(f"{folder_path}/{issue_id}.xml", 'w') as f:
            f.write(xml_text)

        metadata = {
            "id": issue_id,
            "date": issue_date,
            "language": issue_language
        }
        list_metadata.append(metadata)

    df_metadata = pd.DataFrame.from_dict(list_metadata)
    df_metadata.to_csv(f"{folder_path}/metadata.csv", index = None)

def file_to_XML(issue_full_text: str, issue_en: list) -> str:
    """
    COnverts full issue text into XML. Manages to add named_entity tag
    in the text using their given position by issue_en in the original JSON files
    """
    len_tag = 0
    list_text = []
    start_text_idx = 0
    if issue_en:
        for ne in issue_en:
            ne_id = ne['id']
            link = ne['link']
            ne_type = ne['type']
            mention = ne['mention']
            start_idx = ne['start_idx']
            end_idx = ne['end_idx']
            stance = ne['stance']
            template_ne_tag = f'<named_entity id="{ne_id}" link="{link}" type="{ne_type}" start_idx="{start_idx}" end_idx="{end_idx}" stance="{stance}">{mention}</named_entity>'

            new_text = f"{issue_full_text[start_text_idx:start_idx]}{template_ne_tag}"
            list_text.append(new_text)
            start_text_idx = end_idx
        last_text_slice = issue_full_text[start_text_idx:]
        list_text.append(last_text_slice)

    else:
        list_text.append(issue_full_text)


    annotated_text = ''.join(list_text)
    xml_text = f"<issue>{annotated_text}</issue>"
    soup = BeautifulSoup(xml_text, features='xml')
    return soup.prettify()

In [3]:
txm_data = 'txm_data'

In [4]:
all_files = glob(f"export_hackathon/**/*.json", recursive=True)

In [9]:
def get_metadata(list_files):
    """
    """
    list_dict = []
    for file in list_files:
        list_path = file.split('/')
    #     print(list_path)
        year = list_path[1]
        foldername = list_path[2]
        list_dict.append({
            "foldername": foldername,
            "year": year,
            "file": file
        })

    df = pd.DataFrame.from_dict(list_dict)
    return df
df = get_metadata(all_files)

In [4]:
ny = glob(f"new_york_herald_named_entities/**/*.json", recursive=True)

In [6]:
ny = [file for file in all_files if '1920' in file]

In [7]:
ny

['new_york_herald_named_entities/1920/new_york_herald_12148-bd6t53949n.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t52596x.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t540533.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t53915k.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t52462m.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t52524c.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t53946q.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t53983w.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t52570h.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t539991.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t539874.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t54048c.json',
 'new_york_herald_named_entities/1920/new_york_herald_12148-bd6t

In [5]:
## ONLY FOR THE NEW YORK HERALDS
def get_metadata(list_files):
    """
    """
    list_dict = []
    for file in list_files:
        list_path = file.split('/')
    #     print(list_path)
        year = list_path[1]
        foldername = list_path[0]
        list_dict.append({
            "foldername": foldername,
            "year": year,
            "file": file
        })

    df = pd.DataFrame.from_dict(list_dict)
    return df
df = get_metadata(ny)

In [6]:
df

Unnamed: 0,foldername,year,file
0,new_york_herald_named_entities,1920,new_york_herald_named_entities/1920/new_york_h...
1,new_york_herald_named_entities,1920,new_york_herald_named_entities/1920/new_york_h...
2,new_york_herald_named_entities,1920,new_york_herald_named_entities/1920/new_york_h...
3,new_york_herald_named_entities,1920,new_york_herald_named_entities/1920/new_york_h...
4,new_york_herald_named_entities,1920,new_york_herald_named_entities/1920/new_york_h...
...,...,...,...
2526,new_york_herald_named_entities,1914,new_york_herald_named_entities/1914/new_york_h...
2527,new_york_herald_named_entities,1914,new_york_herald_named_entities/1914/new_york_h...
2528,new_york_herald_named_entities,1914,new_york_herald_named_entities/1914/new_york_h...
2529,new_york_herald_named_entities,1914,new_york_herald_named_entities/1914/new_york_h...


In [7]:
for folder in df['foldername'].unique():
    df_newspaper = df[df['foldername'] == folder]
    print(folder)
    folder_path = f"{txm_data}/{folder}"
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
        
    for year in df_newspaper['year'].unique():
        df_year = df_newspaper[df_newspaper['year'] == year]
        print(year)
        year_path = f"{folder_path}/{year}"
        if not os.path.exists(year_path):
            os.mkdir(year_path)
            
        corpus = df_year['file'].values
        corpus_to_TXM(corpus, year_path)
#         break
    print('------')
#     break

  0%|          | 0/365 [00:00<?, ?it/s]

new_york_herald_named_entities
1920


100%|██████████| 365/365 [01:31<00:00,  3.99it/s]
  0%|          | 1/343 [00:00<00:56,  6.05it/s]

1919


100%|██████████| 343/343 [01:16<00:00,  4.46it/s]
  0%|          | 0/549 [00:00<?, ?it/s]

1916


100%|██████████| 549/549 [02:07<00:00,  4.30it/s]
  0%|          | 0/365 [00:00<?, ?it/s]

1918


100%|██████████| 365/365 [01:13<00:00,  4.98it/s]
  0%|          | 0/365 [00:00<?, ?it/s]

1917


100%|██████████| 365/365 [01:15<00:00,  4.82it/s]
  0%|          | 0/363 [00:00<?, ?it/s]

1915


100%|██████████| 363/363 [01:21<00:00,  4.46it/s]
  0%|          | 0/181 [00:00<?, ?it/s]

1914


100%|██████████| 181/181 [01:25<00:00,  2.12it/s]

------



