# Get list of cited DOIs to analyze

In [None]:
import sys
import os
import subprocess
from bs4 import BeautifulSoup
import glob

import pandas as pd

sys.path.insert(0, "./modules")
import bs_preprocess

# Configuration

In [None]:
# Root folder that holds both the downloaded papers and the analysis outputs.
data_path = 'data'
FTP_PUBMED_papers = os.path.join(data_path, 'FTP_PUBMED_papers')

# DOIs to search for, and the dataset name the output file names are built from.
DOI_list = ["10.1126/science.1179052"]
ds_name = 'DOI_cited_science_1179052_retracted'

# Output locations: the raw matches and their preprocessed counterpart.
analysis_path = os.path.join(data_path, 'analysis')
citing_tsv = os.path.join(analysis_path, ds_name + '.tsv')
citing_prep_tsv = os.path.join(analysis_path, ds_name + '_prep.tsv')

# Find papers in FTP_PUBMED citing these DOIs

In [None]:
# Search every file under FTP_PUBMED_papers for each DOI in DOI_list and write
# the matching article fragments to citing_tsv, one row per fragment.
if not os.path.isdir(FTP_PUBMED_papers):
    raise RuntimeError('%s does not exist, did you download the data?' % FTP_PUBMED_papers)

os.makedirs(analysis_path, exist_ok=True)

# Start from a clean output file so rows from an earlier run never survive.
if os.path.isfile(citing_tsv):
    os.remove(citing_tsv)

# Fragments of at most this many characters are discarded; they are taken to
# be left-overs of the tag splitting below rather than whole articles.
MIN_ARTICLE_LENGTH = 800

# The list of files to scan does not depend on the DOI, so build it once.
paper_files = [
    filename
    for filename in sorted(glob.glob(os.path.join(FTP_PUBMED_papers, '*')))
    if os.path.isfile(filename)
]

for doi_index, DOI in enumerate(DOI_list):
    print(DOI)
    # Collect the zgrep output per file and join once at the end.
    # Seeded with a blank so the joined text is never empty.
    matched_chunks = [" "]
    for filename in paper_files:
        print('%s' % filename)
        # -a: treat binary data as text.
        # -F: match the DOI as a literal string; without it every "." in the
        #     DOI is a regex wildcard and near-miss strings would match too.
        command = ["zgrep", "-a", "-F", DOI, filename]
        try:
            DOI_cited_in_bytes = subprocess.check_output(
                command,
                stderr=subprocess.STDOUT
            )
        except subprocess.CalledProcessError as e:
            if e.returncode == 1 and not e.output:
                # assume it just didn't find anything
                continue
            print("Subprocess output:", e.output)
            raise RuntimeError(
                'non-zero exit status %d for command %s' % (e.returncode, command)
            ) from e
        # Convert bytes into a string.
        matched_chunks.append(DOI_cited_in_bytes.decode("utf-8"))

    DOI_cited_in = " ".join(matched_chunks)

    list_articles_citing = []
    DOI_cited_in_BS = BeautifulSoup(DOI_cited_in, "lxml")
    for article in DOI_cited_in_BS.prettify().split('</article>'):
        # Not all the articles start or finish with this tag, so each piece
        # is re-parsed and split a second time on the closing </back> tag.
        article_BS = BeautifulSoup(article, "lxml")
        for article2 in article_BS.prettify().split('</back>'):
            # Keep only pieces that mention the DOI and are long enough not
            # to be a stray end-tag remnant produced by the splitting.
            if DOI in article2 and len(article2) > MIN_ARTICLE_LENGTH:
                list_articles_citing.append(article2)

    if not list_articles_citing:
        raise RuntimeError('no articles matching criteria found')

    df2 = pd.DataFrame([
        [DOI, article_citing]
        for article_citing in list_articles_citing
    ], columns=['DOI_cited', 'article_citing'])

    # The first DOI creates the file and writes the header; later DOIs append.
    df2.to_csv(
        citing_tsv,
        header=doi_index == 0,
        index=False,
        mode='w' if doi_index == 0 else 'a',
        sep='\t',
        encoding='utf-8'
    )
    print("Data saved to", citing_tsv)

# Checking files

In [None]:
def read_citing_tsv(filename):
    """Load a tab-separated, UTF-8 encoded file into a DataFrame.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    return pd.read_csv(filename, sep='\t', encoding='utf-8')

# Load the table written to citing_tsv and show its summary statistics.
df_citing = read_citing_tsv(citing_tsv)
df_citing.describe()

In [None]:
# Preview the first rows of the loaded table.
df_citing.head()

In [None]:
# Print, for each distinct cited DOI, the (rows, columns) shape of its rows.
cited_column = df_citing['DOI_cited']
for DOI in cited_column.unique():
    rows_for_doi = df_citing[cited_column == DOI]
    print(rows_for_doi.shape, '\t\t', DOI)

# Preprocessing data. Remove random spaces and store in a new file.

In [None]:
# Reload the table from citing_tsv instead of reusing the in-memory frame.
df_citing = read_citing_tsv(citing_tsv)

# Run every citing article through bs_preprocess.bs_preprocess and build a
# new frame with the cleaned column, leaving the loaded frame untouched.
cleaned_articles = df_citing['article_citing'].apply(bs_preprocess.bs_preprocess)
df_prep = df_citing.assign(article_citing=cleaned_articles)

# Store the preprocessed table next to the raw one.
df_prep.to_csv(citing_prep_tsv, sep='\t', index=False, encoding='utf-8')
print("wrote to:", citing_prep_tsv)