In [1]:
import requests
import os
import pandas as pd
import json
import time
from datetime import datetime

In [2]:
script_path = os.getcwd()
dump_path = os.path.join(script_path,'gpt_flagged_dumps')
manual_dump_path = os.path.join(script_path,'manually_flagged_spam')
term_dump_path = os.path.join(script_path,'term_based_spam')
data_path = os.path.join(script_path,'data')
spamlist_file = os.path.join(data_path,'GPT-categorized-ads.xlsx')
repo_dumps = os.path.join(script_path,'dumps_from_repos')
manual_spam_file = os.path.join(data_path,'manually_identified_spam.xlsx')

zenodo_spam = pd.read_excel(spamlist_file, 'Zenodo', engine='openpyxl', header=0)
mendeley_spam = pd.read_excel(spamlist_file, 'Mendeley', engine='openpyxl', header=0)
manually_found_spam = pd.read_excel(manual_spam_file, 'spam', engine='openpyxl', header=0)
spam_search_terms = pd.read_excel(manual_spam_file, 'spamterms', engine='openpyxl', header=0)

In [3]:
print(manually_found_spam.head(n=2))
print(spam_search_terms.head(n=2))

                    resource_id repository                  resource_url  \
0  dataverse_10.7910_dvn_hpqe8t  dataverse  dataverse_10.7910_dvn_hpqe8t   
1  dataverse_10.7910_dvn_eowrl9  dataverse  dataverse_10.7910_dvn_eowrl9   

                           access_url  
0  https://doi.org/10.7910/DVN/HPQE8T  
1  https://doi.org/10.7910/DVN/EOWRL9  
           phrases
0  discount coupon
1             keto


In [None]:
zenodo_ids = zenodo_spam['id'].unique().tolist()
clean_zenids = [x.replace('ZENODO_','') for x in zenodo_ids]
print(clean_zenids[0:1])

In [19]:
#### Download all records flagged as spam
def download_gpt_flagged_spam(dump_path, spam_ids):
    repo_deleted = []
    nde_missing = []
    for eachid in spam_ids:
        cleanid = eachid.lower()
        if "zenodo" in cleanid:
            repo = "zenodo"
        elif "mendeley" in cleanid:
            repo = "mendeley"
        elif "dataverse" in cleanid:
            repo = "dataverse"
        nde_api_url = f"https://api-staging.data.niaid.nih.gov/v1/query?&q={cleanid}"
        r = requests.get(nde_api_url)
        if r.status_code == 200:
            tmp = json.loads(r.text)
            if len(tmp['hits'])>0:
                tmphit = tmp['hits'][0]
                with open(os.path.join(dump_path,f"{cleanid}.json"),'w') as outfile:
                    outfile.write(json.dumps(tmphit, indent=4))
        else:
            nde_missing.append(cleanid)
        if repo == "zenodo":
            zenurlid = cleanid.replace("zenodo_","")
            zenrequest = requests.get(f"https://zenodo.org/api/records/{zenurlid}")
            if zenrequest.status_code == 410:
                repo_deleted.append(cleanid)
            time.sleep(1.125)
        elif repo == "mendeley":
            mendeley_id = cleanid.replace("mendeley_","")
            mendeleyrequest = requests.get(f"https://data.mendeley.com/oai?verb=GetRecord&metadataPrefix=datacite&identifier=oai:data.mendeley.com/{mendeley_id}")
            if "idDoesNotExist" in mendeleyrequest.text:
                repo_deleted.append(cleanid)
                time.sleep(1.125)                

    today = datetime.now()
    with open(os.path.join(repo_dumps,f'repo_deletions_{today.strftime("%Y-%m-%d")}.txt'),'w') as dumpfile:
        for eachrecord in repo_deleted:
            dumpfile.write(eachrecord+'\n')

In [20]:
%%time
mendeley_ids = mendeley_spam['id'].unique().tolist()
download_gpt_flagged_spam(dump_path, mendeley_ids)

CPU times: total: 35.9 s
Wall time: 6min 42s


In [None]:
print(r.text)

In [37]:
def download_spam_records(manual_dump_path,idlist):
    repo_deleted = []
    nde_missing = []
    repo_urlist = []
    for eachid in idlist:
        nde_api_url = f"https://api-staging.data.niaid.nih.gov/v1/query?&q={eachid}"
        r = requests.get(nde_api_url)
        if r.status_code == 200:
            tmp = json.loads(r.text)
            if len(tmp['hits'])>0:
                tmphit = tmp['hits'][0]
                with open(os.path.join(manual_dump_path,f"{eachid}.json"),'w') as outfile:
                    outfile.write(json.dumps(tmphit, indent=4))
                repo_urlist.append(tmphit['url'])
    return repo_urlist

def search_for_spam(term_dump_path,spamterms):
    greirepos = ["Zenodo","Mendeley","Harvard Dataverse"]
    summary = []
    for eachterm in spamterms:
        for eachrepo in greirepos:
            nde_search_url = f'https://api-staging.data.niaid.nih.gov/v1/query?&q=includedInDataCatalog.name:"{eachrepo}"+AND+"{eachterm}"&fetch_all=true'
            r = requests.get(nde_search_url)
            tmp = json.loads(r.text)
            if len(tmp['hits'])>0:
                for eachhit in tmp['hits']:
                    tmpid = eachhit['_id']
                    with open(os.path.join(term_dump_path,f"{tmpid}.json"),'w') as outfile:
                        outfile.write(json.dumps(eachhit, indent=4))
                    summary.append({"repo":eachrepo,"search_phrase":eachterm,"id":eachhit['_id'],"data_url":eachhit['url']})
    summary_df = pd.DataFrame(summary)
    return 




In [14]:
%%time
idlist = manually_found_spam['resource_id'].unique().tolist()
repo_url_list = download_spam_records(manual_dump_path,idlist)

CPU times: total: 4.17 s
Wall time: 24.4 s


In [38]:
%%time
spamterms = spam_search_terms["phrases"].unique().tolist()
summary_df = search_for_spam(term_dump_path,spamterms)

CPU times: total: 1.53 s
Wall time: 8.13 s


In [None]:
summary_df.to_csv(os.path.join(data_path,'search_based_spam.tsv'),sep='\t',header=True)

In [17]:
def test_repo_url(repo_url_list):
    failed_urls = []
    for eachurl in repo_url_list:
        if 'zenodo' in eachurl:
            r = requests.get(eachurl)
            if r.status_code == 410:
                failed_urls.append(eachurl)
        elif 'DVN' in eachurl:
            r = requests.get(eachurl)
            if r.status_code == 404:
                failed_urls.append(eachurl)
        elif 'mendeley' in eachurl:
            r = requests.get(eachurl)
            if "Dataset Not Found" in r.text:
                failed_urls.append(eachurl)
        time.sleep(1)
    return failed_urls

### Test for deleted record or broken link status codes

In [4]:
r = requests.get("https://zenodo.org/record/12778828")
print(r.status_code)

410


In [7]:
r = requests.get("https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/2TAGUN")
print(r.status_code)

404


In [10]:
#r = requests.get("https://data.mendeley.com/datasets/8kj2rn7m9z")
r = requests.get("https://data.mendeley.com/datasets/8kj2rn7m9x")
print(r.status_code)


200


In [14]:
summary_df = pd.read_csv(os.path.join(data_path,'search_based_spam.tsv'),delimiter='\t',header=0,index_col=0)
print(summary_df.head(n=2))

                repo    search_phrase                            id  \
0  Harvard Dataverse  discount coupon  dataverse_10.7910_dvn_2taguy   
1             Zenodo             keto               zenodo_10575214   

                             data_url  
0  https://doi.org/10.7910/DVN/2TAGUY  
1  https://zenodo.org/record/10575214  


In [18]:
%%time
repo_url_list = summary_df['data_url'].unique().tolist()
deleted_records = test_repo_url(repo_url_list)

with open(os.path.join(data_path,'deleted_records.txt'),'w') as outwrite:
    for eachrecord in deleted_records:
        outwrite.write(eachrecord+'\n')
print(len(deleted_records))

517
CPU times: total: 47.8 s
Wall time: 20min 28s


In [20]:
gpt_dumplist = os.listdir(dump_path)
manual_dumplist = os.listdir(manual_dump_path)

spam_urls = []
for eachjson in gpt_dumplist:
    with open(os.path.join(dump_path,eachjson),'r') as infile:
        tmpjson = json.load(infile)
        spam_urls.append(tmpjson['url'])
for eachjson in manual_dumplist:
    with open(os.path.join(manual_dump_path,eachjson),'r') as infile:
        tmpjson = json.load(infile)
        spam_urls.append(tmpjson['url'])

not_checked = [x for x in spam_urls if x not in repo_url_list]


In [21]:
more_deleted = test_repo_url(spam_urls)
with open(os.path.join(data_path,'deleted_records.txt'),'a') as outwrite:
    for eachrecord in more_deleted:
        outwrite.write(eachrecord+'\n')
print(len(deleted_records))

517


47
