this notebook exists to specifically fix an issue caused by the mdpi review spider:

files containing supplementary materials are wrongly logged in the metadata: all PDF/DOCX files are linked to every round, but in reality each only belongs to one round.

in this notebook the JSON files with sub-article metadata are fixed and updated, and the `metadata.json` files are updated to match them

In [50]:
import os
import re
import pandas as pd
import json
import shutil

In [51]:
def getj(a):
    """
    Takes a path to a directory and returns the metadata.json file as a dictionary.

    :param a: path to an article's folder.
    :return: A json object as a dict.
    :rtype: dict

    """
    path = os.path.join(a, "metadata.json")
    if not os.path.exists(path):
        print("no meta for a =", a)
        return pd.NA
    fp = open(path, encoding='utf-8')
    j = json.load(fp)
    fp.close()
    return j

# count number of files in sub-articles for each reviewed
def count_suba_files(x):
    if not x.has_suba: return 0
    else: return len(os.listdir(x.path+"/sub-articles"))

def load_arts_dir(dirpath, load_meta = False):
    """
    The load_arts_dir function takes a directory path and returns a dataframe df with the following columns:
        doi - the name of each subdirectory in dirpath, which should be dois
        path - the full path to each subdirectory in dirpath
        has_suba - whether or not there is a 'sub-articles' folder within each article's folder
        num_suba_files - how many files are contained within 'sub-articles' if it exists
        meta and meta.has_reviews - only if load_meta is True

    Setting the load_meta parameter to True will mean that all JSON files will be loaded into df as dicts and stored in the 'meta' column.
    This could take Extremely long to run, depending on the number of directories in the provided path.
    
    :param dirpath: Used to Specify the directory path of the articles.
    :param load_meta=False: Whether to Load the metadata from the articles into the column 'meta'.
    :return: A pandas dataframe
    """
    df = pd.DataFrame({'doi': os.listdir(dirpath)})  # folder names should be dois
    df = df.loc[df.doi.map(lambda x: os.path.isdir(os.path.join(dirpath, x)))]
    df['path'] = df.doi.map(lambda x: os.path.join(dirpath, x))  # path to the folder relative to cwd
    # check if articles have a sub-articles folder
    df['has_suba'] = df.path.map(lambda p: os.path.exists(os.path.join(p, "sub-articles")))
    df['num_suba_files'] = df.apply(count_suba_files, axis=1)
    if load_meta:
        df['meta'] = df.path.apply(getj)  # dangerous line
        df['meta.has_reviews'] = df.meta.map(lambda a: a['has_reviews'])

    return df
    

In [52]:
r_dir = os.path.join("output", "mdpi", "reviewed")  # this is where the reviewed articles should be stored

In [53]:
# WARNING: this takes ~ 11 minutes on my PC
rarts = load_arts_dir(r_dir, load_meta=True)

In [68]:
print("we have",len(rarts),"articles in",r_dir)

we have 135772 articles in output\mdpi\reviewed


In [83]:
def update_metadata_add_suba(rart):
    """
    The update_metadata_add_suba function takes an article and updates the metadata.json file
    with sub-article information if it exists.
    
    :return: A metadata object.
    """
    meta = rart.meta
    if not rart.has_suba: return meta
    if rart.num_suba_files == 0: return meta  # shouldn't happen really 

    sub_a_path = os.path.join(rart.path, 'sub-articles')
    
    sub_articles = []
    for json_file in [f for f in os.listdir(sub_a_path) if f.endswith(".json")]:
        filepath = os.path.join(sub_a_path, json_file)
        j = json.load(open(filepath, 'rb'))
        sub_articles.append(j)
    
    if len(sub_articles) > 0: 
        meta['sub_articles'] = sub_articles
        with open(os.path.join(rart.path, "metadata.json"), 'w', encoding="utf-8") as fp:
            json.dump(meta, fp, ensure_ascii=False)
    return meta


In [84]:
rarts['num_rounds'] = rarts.meta.apply(lambda m: max([int(x['round']) for x in m['sub_articles']]))
rarts['meta.num_sub_a'] = rarts.meta.apply(lambda m: len(m['sub_articles']))
rarts['meta.sub_articles'] = rarts.meta.apply(lambda m: m['sub_articles'])
assert (rarts['num_suba_files'] >= 2*rarts['meta.num_sub_a']).all()

In [85]:
rarts['meta.num_sub_a_with_supp_m'] = rarts.meta.apply(lambda m: sum(['supplementary_materials' in x for x in m['sub_articles']]))
assert (rarts['meta.num_sub_a_with_supp_m'] == rarts['meta.num_sub_a']).all()

In [86]:
# we only need to fix articles with more than one sub-article
rarts = rarts.loc[rarts['meta.num_sub_a'] > 1]
# actually, we only care about articles that have more than 2*num_rounds files in /sub-articles 
rarts = rarts.loc[rarts['num_suba_files'] > 2*rarts['num_rounds']]

In [87]:
## for each article, get all supplementary materials (in array of dicts)
def get_all_supp_ms(sub_articles):
    supplementary_materials = []
    gone_ids = []  # to avoid duplicates: keep track of id's that were already appended 
    for sub_a in sub_articles:
        for sup_m in sub_a['supplementary_materials']:
            if sup_m['id'] in gone_ids: continue
            gone_ids.append(sup_m['id'])
            supplementary_materials.append(sup_m)
    return supplementary_materials

rarts['r.all_supplementary_materials'] = rarts['meta.sub_articles'].apply(get_all_supp_ms)

In [88]:
## for each article, get the filenames of the reviews in plaintext
def get_plaintexts(sub_articles):
    plaintexts = []
    for sub_a in sub_articles:
        plaintexts += [sup_m['filename'] for sup_m in sub_a['supplementary_materials'] if sup_m['title'].endswith("plaintext.")]
    return plaintexts

rarts['r.plaintext_filenames'] = rarts['meta.sub_articles'].apply(get_plaintexts) 

In [107]:
def get_all_orig_filenames(rart):
    # read each plaintext file and find all original filenames of the supplementary materials for each round
    # the result may be empty
    all_orig_filenames = []
    for i, filename in enumerate(rart['r.plaintext_filenames']):
        filepath = os.path.join(rart['path'], 'sub-articles', filename)
        
        orig_filenames = {}

        with open(filepath, 'r', encoding='utf-8') as fp:
            lines = re.findall(r"File: .+", fp.read())
        names = [line.split(": ")[-1].strip() for line in lines]
        for name in names:
            if name in orig_filenames: orig_filenames[name] += 1
            else: orig_filenames[name] = 1
        all_orig_filenames.append(orig_filenames)
    return all_orig_filenames

# takes very long
rarts['r.all_orig_filenames'] = rarts.apply(get_all_orig_filenames, axis=1)

In [108]:
def reassign_supp_materials(rart):
    """
    The reassign_supp_materials function takes a rarticle, and returns a new, fixed list of sub-articles for that article.
    
    The purpose is to reassign supplementary materials to their respective sub-article. 
    This is necessary because in some cases, supplementary materials are assigned to all sub-articles.
    
    :return: A new list of sub-articles.
    
    """
    sub_articles = rart['meta.sub_articles']
    for sub_a in sub_articles[1:]:
        assert len(sub_articles[0]['supplementary_materials']) == len(sub_a['supplementary_materials'])
    plaintext_filenames = rart['r.plaintext_filenames']
    assert len(sub_articles) == len(plaintext_filenames)
    all_supp_ms = rart['r.all_supplementary_materials']
    assert len(all_supp_ms) > len(sub_articles)
    all_orig_filenames = rart['r.all_orig_filenames']
    assert len(all_orig_filenames) == rart['num_rounds']

    new_sub_articles = []
    gone_ids = []

    for i, d_round in enumerate(all_orig_filenames):
        new_sub_a = sub_articles[i].copy()
        new_sub_a['supplementary_materials'] = []
        assert int(new_sub_a['round']) == i+1

        for sup_m in all_supp_ms:
            if sup_m['id'] in gone_ids: continue
            # case when it's one of the plaintext files:
            if 'original_filename' not in sup_m.keys() or sup_m['original_filename'] not in d_round:
                if sup_m['id'].split('.r')[-1] == new_sub_a['round']:
                    new_sub_a['supplementary_materials'].append(sup_m)
                    gone_ids.append(sup_m['id'])
            # case when it's any other type of file
            elif d_round[sup_m['original_filename']] > 0:
                d_round[sup_m['original_filename']] -= 1
                new_sub_a['supplementary_materials'].append(sup_m)
                gone_ids.append(sup_m['id'])
        new_sub_articles.append(new_sub_a)
            
    return new_sub_articles

In [116]:
rarts['meta.sub_articles'] = rarts.apply(reassign_supp_materials, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rarts['meta.sub_articles'] = rarts.apply(reassign_supp_materials, axis=1)


In [117]:
def update_metadata_new_suba(rart):
    meta = rart['meta']
    num_rounds = rart['num_rounds']
    if num_rounds == 1 or len(rart['r.all_supplementary_materials']) == num_rounds:  # shouldn't happen really
        return meta  
    
    # read each JSON file with  sub-article metadata and update its contents
    sub_articles = rart['meta.sub_articles']
    for i, sub_a in enumerate(sub_articles):
        r_no = i+1
        meta_filename = f"{rart['doi']}.r{r_no}.json"
        filepath = os.path.join(rart['path'], 'sub-articles', meta_filename)
        with open(filepath, 'w', encoding='utf-8') as fp:
            json.dump(sub_a, fp, ensure_ascii=False)

    meta['sub_articles'] = sub_articles
    
    # update the metadata.json file itself
    with open(os.path.join(rart['path'], "metadata.json"), 'w', encoding="utf-8") as fp:
        json.dump(meta, fp, ensure_ascii=False)

    return meta

In [118]:
rarts['meta'] = rarts.apply(update_metadata_new_suba, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rarts['meta'] = rarts.apply(update_metadata_new_suba, axis=1)
