This notebook manages the directories that hold scraped article corpora; it was built with the MDPI corpus in mind.

The idea is to scan the folder containing all scraped articles (`a_dir`), then move the folders of articles that have reviews to a separate folder (`r_dir`, named 'reviewed' by default).

This notebook is also used to fix an issue with the MDPI corpus: a missing 'sub_articles' field in the metadata.json files.

In [5]:
import os
import pandas as pd
import json
import shutil

In [6]:
def getj(a):
    """
    Load the metadata.json file of an article folder.

    :param a: path to an article's folder.
    :return: The parsed JSON content (a dict for the metadata files this notebook
        works with), or ``pd.NA`` when the folder has no metadata.json. In that case
        a short message naming the folder is printed.
    :rtype: dict

    """
    path = os.path.join(a, "metadata.json")
    if not os.path.exists(path):
        print("no meta for a =", a)
        return pd.NA
    # the context manager closes the handle even if json.load raises on a malformed file
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)

# number of entries inside the 'sub-articles' folder of one article row
def count_suba_files(x):
    """
    Count the entries in an article's 'sub-articles' folder.

    :param x: object with the attributes ``has_suba`` and ``path`` (e.g. a dataframe row).
    :return: 0 if ``x.has_suba`` is falsy, otherwise the number of directory entries
        in ``<x.path>/sub-articles``.
    """
    if x.has_suba:
        suba_dir = x.path + "/sub-articles"
        return len(os.listdir(suba_dir))
    return 0

def load_arts_dir(dirpath, load_meta = False):
    """
    Scan a corpus directory and describe every article folder found in it.

    Returns a dataframe with one row per subdirectory of dirpath and the following columns:
        doi - the name of the subdirectory, which should be a doi
        path - dirpath joined with the subdirectory name
        has_suba - whether the article's folder contains a 'sub-articles' directory
        num_suba_files - how many entries 'sub-articles' contains (0 if it does not exist)
        meta and meta.has_reviews - only if load_meta is True

    With load_meta=True every metadata.json is parsed and stored as a dict in the 'meta' column.
    This reads one file per article and can take extremely long on a large corpus.
    Folders without a metadata.json get pd.NA in 'meta' and False in 'meta.has_reviews'.

    :param dirpath: directory that contains the article folders.
    :param load_meta: whether to load the metadata of each article into the column 'meta'.
    :return: A pandas dataframe (empty, but with all columns, if dirpath has no subdirectories)
    """
    # folder names should be dois; plain files lying around in dirpath are ignored
    dois = [name for name in os.listdir(dirpath) if os.path.isdir(os.path.join(dirpath, name))]
    df = pd.DataFrame({'doi': dois})
    df['path'] = df.doi.map(lambda x: os.path.join(dirpath, x))  # path to the folder relative to cwd

    # isdir rather than exists: count_suba_files lists the folder, which would fail on a plain file
    df['has_suba'] = df.path.map(lambda p: os.path.isdir(os.path.join(p, "sub-articles"))).astype(bool)

    # Row-wise comprehension instead of df.apply(axis=1): apply returns a DataFrame
    # (not a Series) for an empty frame, which cannot be assigned to a single column.
    df['num_suba_files'] = pd.Series(
        [count_suba_files(row) for row in df.itertuples(index=False)],
        index=df.index, dtype='int64',
    )

    if load_meta:
        df['meta'] = df.path.apply(getj)  # slow: parses one JSON file per article
        # getj returns pd.NA for folders without metadata.json; those have no known reviews
        df['meta.has_reviews'] = df.meta.map(
            lambda a: a['has_reviews'] if isinstance(a, dict) else False
        )

    return df
    

In [7]:
# folder that is scanned for scraped articles (see load_arts_dir)
a_dir = os.path.join('output', 'mdpi')  # change 'mdpi' to plos/elife to work with others
# destination for articles that have reviews; with this default it is a sub-folder of a_dir
r_dir = os.path.join(a_dir, "reviewed")  # this is where the reviewed articles should be stored. could be different for you

In [4]:
# WARNING: slow cell. Every metadata.json below a_dir is parsed into the dataframe `arts`
# (42 minutes on the author's PC).
# NOTE(review): with the default paths r_dir is a sub-folder of a_dir, so the 'reviewed'
# folder itself is presumably listed as a row of `arts` - confirm.
arts = load_arts_dir(dirpath=a_dir, load_meta=True)

In [8]:
# WARNING: slow as well (~11 minutes on the author's PC): the same load as for `arts`,
# but for the folder that holds the reviewed articles
rarts = load_arts_dir(dirpath=r_dir, load_meta=True)

In [6]:
# size of both corpora, printed side by side
n_root, n_reviewed = len(arts), len(rarts)
print("we have",n_root,"articles in the root (all) folder and\t",n_reviewed,"in the 'reviewed' folder")

we have 493681 articles in the root (all) folder and	 135772 in the 'reviewed' folder


In [None]:
# how many articles in the root folder are flagged as reviewed by their metadata
n_with_reviews = sum(arts['meta.has_reviews'])
print(n_with_reviews, "<- this many articles from " + a_dir + " have reviews!")

In [106]:
# how many article folders contain a sub-articles folder, per corpus
n_arts_suba = sum(arts.has_suba)
n_rarts_suba = sum(rarts.has_suba)
print(n_arts_suba, "out of", len(arts), "arts have a sub-articles folder")
print(n_rarts_suba, "out of", len(rarts), "rarts have a sub-articles folder")

0 out of 493681 arts have a sub-articles folder
151302 out of 151817 rarts have a sub-articles folder


In [None]:
## move reviewed to a separate folder

# .copy(): the columns added below and in later cells must land on an independent frame,
# not on a slice of `arts`
reviewed_arts = arts.loc[arts['meta.has_reviews']].copy()

# first check if already in rarts:
reviewed_arts['in_rarts'] = reviewed_arts['doi'].isin(rarts['doi'])

print(f"{reviewed_arts['in_rarts'].mean()*100}% of dois are already in rarts")

In [105]:
# Destination of every reviewed article: <r_dir>/<doi>.
# It is derived from r_dir and the doi rather than by splitting `path` on '/', which
# hard-coded the depth of a_dir, the folder name 'reviewed' and the POSIX separator.
# The column is created unconditionally because a later cell reads it.
reviewed_arts['new_path'] = reviewed_arts['doi'].map(lambda doi: os.path.join(r_dir, doi))

# Copy only the articles that are not in 'reviewed' yet: copying the ones that already
# are there would either fail or overwrite their (possibly updated) files.
to_copy = reviewed_arts.loc[~reviewed_arts['in_rarts']]
for src, dst in zip(to_copy['path'], to_copy['new_path']):
    # dirs_exist_ok lets an interrupted run be resumed without FileExistsError
    shutil.copytree(src, dst, dirs_exist_ok=True)

In [117]:
## remove reviewed articles from root folder

reviewed_arts['path_exists'] = reviewed_arts['path'].apply(os.path.exists).astype(bool)

# keep only the rows whose source folder is still on disk
reviewed_arts = reviewed_arts.loc[reviewed_arts['path_exists']].copy()

# print(reviewed_arts['path_exists'].sum(), len(reviewed_arts))

# Deleting is irreversible, so a source folder is only removed once a folder for the
# same doi exists under r_dir; everything else is reported and left where it is.
not_copied = []
for doi, src in zip(reviewed_arts['doi'], reviewed_arts['path']):
    dst = os.path.join(r_dir, doi)
    if os.path.abspath(src) == os.path.abspath(dst):
        # `path` already points at the reviewed copy (cell re-run after the paths were
        # updated): that folder is the copy itself and must not be deleted
        continue
    if os.path.isdir(dst):
        shutil.rmtree(src)
    else:
        not_copied.append(doi)

if not_copied:
    print(len(not_copied), "articles have no copy in", r_dir, "and were NOT removed")


In [None]:
# sanity check: re-test every source path; the cell displays True if any of them still exists
still_on_disk = reviewed_arts["path"].apply(os.path.exists)
reviewed_arts.loc[:, 'path_exists'] = still_on_disk
reviewed_arts['path_exists'].any()

In [20]:
# bind new rows to rarts

# 'new_path' is missing when the copy cell had nothing to do; derive it from r_dir then
if 'new_path' not in reviewed_arts.columns:
    reviewed_arts['new_path'] = reviewed_arts['doi'].map(lambda doi: os.path.join(r_dir, doi))
reviewed_arts['path'] = reviewed_arts['new_path']

# membership is checked against the current rarts (not the 'in_rarts' column computed
# earlier), so running this cell twice does not append the same rows twice
new_rows = reviewed_arts.loc[~reviewed_arts['doi'].isin(rarts['doi'])]
rarts = pd.concat([rarts, new_rows])

In [42]:
# .copy(): columns are added to lacking_suba in later cells; they should not be written
# onto a slice of rarts
lacking_suba = rarts[rarts['num_suba_files'] == 0].copy()
print(len(lacking_suba), "<- this many reviewed articles have 0 files in their sub-articles folder")
# guard the percentage against an empty rarts (ZeroDivisionError)
percent_lacking = round(len(lacking_suba)*100/len(rarts)) if len(rarts) else 0
print("That is",percent_lacking,"percent of all rarts")

16045 <- this many reviewed articles have 0 files in their sub-articles folder
That is 11 percent of all rarts


In [None]:
## move reviewed, but without sub-articles to a working dump_dir

# working folder for the reviewed articles that have 0 files in their sub-articles folder
# NOTE(review): a recorded output further down names 'output/mdpi-dump-dir/' instead -
# confirm which folder name is current.
dump_dir = "output/mdpi-to-scrape"

In [None]:
# destination of each article inside the working folder
lacking_suba['new_path'] = lacking_suba['doi'].map(lambda x: os.path.join(dump_dir, x))

# plain loop: the copy is a side effect, there is no result to collect.
# dirs_exist_ok lets the cell be re-run when (part of) the dump folder already exists.
for src, dst in zip(lacking_suba['path'], lacking_suba['new_path']):
    shutil.copytree(src, dst, dirs_exist_ok=True)

In [105]:
## remove reviewed articles without sub-articles from 'reviewed'

# first remove those from rarts
rarts = rarts.loc[~rarts.doi.isin(lacking_suba.doi)]

# check if path exists, if yes remove those files
lacking_suba['path_exists'] = lacking_suba['path'].apply(os.path.exists).astype(bool)

lacking_suba = lacking_suba.loc[lacking_suba['path_exists']].copy()

# print(lacking_suba['path_exists'].sum(), len(lacking_suba))

# Deleting is irreversible: a folder is only removed from 'reviewed' once a folder for
# the same doi exists in dump_dir; the rest is reported and kept.
not_dumped = []
for doi, src in zip(lacking_suba['doi'], lacking_suba['path']):
    if os.path.isdir(os.path.join(dump_dir, doi)):
        shutil.rmtree(src)
    else:
        not_dumped.append(doi)

if not_dumped:
    print(len(not_dumped), "articles have no copy in", dump_dir, "and were NOT removed")

Run the MDPI review crawler on the `dump_dir` folder specified above to try to scrape the missing sub-article files.

Then run the cells below to load the updated articles from that directory and move them back into the 'reviewed' folder.

In [50]:
# reload the working folder (with metadata)
lacking_suba2 = load_arts_dir(dump_dir, load_meta=True)

# check for how many articles we managed to get sub-articles:
got_suba = lacking_suba2['num_suba_files'] > 0
print("sub-articles were scraped for",int(got_suba.sum()),
      "out of",len(lacking_suba2),"articles from",dump_dir)

# only the articles that now have sub-article files are kept
lacking_suba2 = lacking_suba2[got_suba]

sub-articles were scraped for 479 out of 16524 articles from output/mdpi-dump-dir/


In [53]:
## move these articles back to the 'reviewed' folder
lacking_suba2['new_path'] = lacking_suba2['doi'].map(lambda x: os.path.join(r_dir, x))

# Plain loop instead of DataFrame.apply(axis=1): apply returns a DataFrame for an empty
# frame (nothing scraped), which cannot be assigned to the single column 'path'.
for src, dst in zip(lacking_suba2['path'], lacking_suba2['new_path']):
    shutil.copytree(src, dst, dirs_exist_ok=True)

# from here on the rows describe the copies under r_dir
lacking_suba2['path'] = lacking_suba2['new_path']

In [None]:
# and finally update rarts:
# The previous check had a misplaced parenthesis: it took len() of an element-wise
# comparison, i.e. it only tested whether rarts contains any of these dois.
# Stale rows for the same dois are dropped first so the concat cannot create duplicates.
rarts = rarts.loc[~rarts.doi.isin(lacking_suba2.doi)]
rarts = pd.concat([rarts, lacking_suba2])

assert rarts.doi.isin(lacking_suba2.doi).sum() == len(lacking_suba2), \
    "every article from lacking_suba2 should be in rarts exactly once"

In [55]:
## add sub-articles to metadata.json files

# keep only the reviewed articles with at least one file in their sub-articles folder
has_suba_files = rarts['num_suba_files'] > 0
rarts = rarts.loc[has_suba_files]
print(len(rarts), "articles have a sub-articles directory")

135772 articles have a sub-articles directory


In [None]:
# True where the loaded metadata already has the key 'sub_articles'.
# A meta that is not a dict (getj returns pd.NA for a missing metadata.json) counts as
# "key missing" instead of raising.
rarts['meta.has_suba_obj'] = rarts['meta'].map(
    lambda x: isinstance(x, dict) and 'sub_articles' in x
).astype(bool)

# working copy to update (`~` is the boolean negation for masks; .copy() because the
# frame is modified in a later cell)
temp = rarts.loc[~rarts['meta.has_suba_obj']].copy()
print(len(temp),"JSON files should end up with the key 'sub_articles'")

In [82]:
def update_metadata_add_suba(rart):
    """
    Embed an article's sub-article JSON files into its metadata.json.

    Every ``*.json`` file in ``<rart.path>/sub-articles`` is parsed (in file-name order)
    and the resulting list is stored under the key ``'sub_articles'``, both in
    ``rart.meta`` (modified in place) and in the article's metadata.json on disk.
    Nothing is written when the article has no sub-articles folder, the folder holds no
    JSON files, or ``rart.meta`` is not a dict.

    :param rart: an article row; the fields ``meta``, ``has_suba``, ``num_suba_files``
        and ``path`` are read.
    :return: A metadata object (the same object as ``rart.meta``).
    """
    meta = rart.meta
    if not isinstance(meta, dict): return meta  # e.g. pd.NA: no metadata was loaded
    if not rart.has_suba: return meta
    if rart.num_suba_files == 0: return meta  # shouldn't happen really

    sub_a_path = os.path.join(rart.path, 'sub-articles')

    sub_articles = []
    # sorted: os.listdir returns entries in arbitrary order
    for json_file in sorted(f for f in os.listdir(sub_a_path) if f.endswith(".json")):
        filepath = os.path.join(sub_a_path, json_file)
        # binary mode lets json.load detect the encoding; `with` closes each handle
        with open(filepath, 'rb') as fp:
            sub_articles.append(json.load(fp))

    if len(sub_articles) > 0:
        meta['sub_articles'] = sub_articles
        with open(os.path.join(rart.path, "metadata.json"), 'w', encoding="utf-8") as fp:
            json.dump(meta, fp, ensure_ascii=False)
    return meta


In [83]:
# WARNING: this cell will probably also take very long to run, depending on how many articles are in `temp`
# 17 minutes on my PC
if len(temp) > 0:
    # DataFrame.apply(axis=1) returns a DataFrame (not a Series) for an empty frame, which
    # cannot be assigned to the single column 'meta'; with nothing to update, skip the call
    temp['meta'] = temp.apply(update_metadata_add_suba, axis=1)
temp['meta.has_suba_obj'] = temp['meta'].apply(lambda x: 'sub_articles' in x.keys())

In [86]:
# and finally update rarts:
# temp was taken out of rarts, so every doi in temp must still be present there before
# its row is replaced. (The previous check added len(temp) to the number of rows with
# num_suba_files > 0 and compared the sum with len(rarts).)
assert temp.doi.isin(rarts.doi).all(), "temp contains dois that are not in rarts"

# swap the stale rows for the updated ones
rarts = rarts.loc[~rarts.doi.isin(temp.doi)]
rarts = pd.concat([rarts, temp])

In [87]:
# recompute the flag on the merged frame and report the share of articles whose loaded
# metadata carries the key 'sub_articles'
has_suba_key = rarts['meta'].map(lambda meta: 'sub_articles' in meta.keys())
rarts['meta.has_suba_obj'] = has_suba_key
print(f"{round(rarts['meta.has_suba_obj'].mean()*100)}% of rarts now have sub-articles in their metadata.json")

100% of rarts now have sub-articles in their metadata.json
