## Duplicate record check

Determining the extent of the duplicate records issue

Zenodo versioning duplicates
1. Pull name and id fields for 50,000 zenodo records
2. Check for duplicate names on unique ids
3. Calculate rate of duplication

OmicsDI/GEO duplicates
1. Pull name and id fields 1000 GEO records
2. Search OMICS DI for matching names

Zenodo/Dryad duplicates
See OmicsDI/GEO duplicates



In [10]:
import json
import requests
import pandas as pd
import time
from datetime import datetime

In [None]:
%%time
r = requests.get('https://api.data.niaid.nih.gov/v1/query?q=includedInDataCatalog.name:"Zenodo"&fields=name&fetch_all=true')
cleanr = json.loads(r.text)
hits = cleanr['hits']
#print(len(cleanr['hits']))
df1 = pd.DataFrame(cleanr['hits'])
scroll_id = cleanr['_scroll_id']

In [None]:
%%time
i = 0
while i < 10:
    r2 = requests.get(f'https://api.data.niaid.nih.gov/v1/query?scroll_id={scroll_id}')
    tmp = json.loads(r2.text)
    scroll_id = tmp['_scroll_id']
    tmpdf = pd.DataFrame(tmp['hits'])
    df1 = pd.concat((df1,tmpdf),ignore_index=True)
    print(len(df1))
    i = i+1
    time.sleep(0.5)

In [None]:
## Check for replicated records (id and name)

check_for_reps = df1.groupby(['_id','name']).size().reset_index(name='counts')
replicates = check_for_reps.loc[check_for_reps['counts']>1]
nonreps = check_for_reps.loc[check_for_reps['counts']==1]
print("original length: ",len(df1)," replicates: ",len(replicates))

## Check for duplicate/version records (name only)
check_for_dups = nonreps.groupby(['name']).size().reset_index(name='dup_counts')
duplicates = check_for_dups.loc[check_for_dups['dup_counts']>1]
nondups = check_for_dups.loc[check_for_dups['dup_counts']==1]

## Stats
{"run":n,"samples":len(df1),"replicates":len(replicates),"duplicates":len(duplicates),"% dups":len(duplicates)/len(replicates)*100}


In [11]:
def fetch_zenodo_records(record_limit):
    r = requests.get('https://api.data.niaid.nih.gov/v1/query?q=includedInDataCatalog.name:"Zenodo"&fields=name&fetch_all=true')
    cleanr = json.loads(r.text)
    hits = cleanr['hits']
    #print(len(cleanr['hits']))
    df1 = pd.DataFrame(cleanr['hits'])
    scroll_id = cleanr['_scroll_id'] 
    i = 0
    while i < record_limit:
        r2 = requests.get(f'https://api.data.niaid.nih.gov/v1/query?scroll_id={scroll_id}')
        tmp = json.loads(r2.text)
        scroll_id = tmp['_scroll_id']
        tmpdf = pd.DataFrame(tmp['hits'])
        df1 = pd.concat((df1,tmpdf),ignore_index=True)
        #print(len(df1))
        i = i+1
        time.sleep(0.5)  
    return df1

def check_dups(df1):
    check_for_reps = df1.groupby(['_id','name']).size().reset_index(name='counts')
    replicates = check_for_reps.loc[check_for_reps['counts']>1]
    nonreps = check_for_reps.loc[check_for_reps['counts']==1]
    check_for_dups = nonreps.groupby(['name']).size().reset_index(name='dup_counts')
    duplicates = check_for_dups.loc[check_for_dups['dup_counts']>1]
    nondups = check_for_dups.loc[check_for_dups['dup_counts']==1]
    timecheck = datetime.now()
    run_info = timecheck.strftime("%Y-%m-%d")
    tmpdict = {"samples":len(df1),"replicates":len(replicates),
               "duplicates":len(duplicates),"unique records":len(nondups),
               "% dups":len(duplicates)/len(nonreps)*100,"run date":run_info}
    duplicates.to_csv(f"duplicates_{run_info}.tsv",sep='\t',header=True)
    return tmpdict

def get_zenodo_dup_stats(repetitions, record_limit):
    n = 0
    statlist = []
    while n < repetitions:
        print("now performing run #",n)
        df1 = fetch_zenodo_records(record_limit)
        tmpdict = check_dups(df1)
        tmpdict['run number'] = n
        statlist.append(tmpdict)
        time.sleep(300)
        n=n+1
    return statlist

In [12]:
%%time
repetitions = 1
record_limit = 49
statlist = get_zenodo_dup_stats(repetitions, record_limit)
statdf = pd.DataFrame(statlist)
statdf.to_csv('dup_stats.tsv')
print(statdf)

now performing run # 0
   samples  replicates  duplicates  unique records  % dups    run date  \
0    50000           0        2345           43819    4.69  2023-08-10   

   run number  
0           0  
CPU times: total: 3.77 s
Wall time: 6min 7s


## Checking Metadata differences between OMICS-DI and GEO

1. Compare lengths of names and descriptions
2. For duplicate records in this sample, pull 'species', 'measurementTechnique', and 'infectiousAgent' fields to compare the data from the two repos

In [None]:
import pandas as pd
from pandas import read_csv
import requests
import json
import time
import math

In [None]:
df3 = read_csv('data/citation_df_clean.tsv',delimiter='\t',header=0,index_col=0)

In [None]:
print(df3.head(n=2))

In [None]:
#### Find duplicate records
## Since each record has a unique id, if we group by the name and citation pmid, we'll find duplicate records
df3['pmid'] = df3['pmid'].astype(str)
df3_counts = df3.groupby(['name','pmid']).size().reset_index(name='counts')
rep_subset = df3_counts.loc[df3_counts['counts']>1]
print(len(rep_subset))

In [None]:
#### Check to see if the number of unique names matches that of the number of unique citation records
## Note, it does not. There are more unique names than pmids, therefore, some datasets cite the same pmid
unique_names = rep_subset['name'].unique().tolist()
unique_pmids = rep_subset['pmid'].unique().tolist()
print(len(unique_names),len(unique_pmids))

In [None]:
#### Check to see if there are replicates (multiples of more than 2) 
print(df3_counts.sort_values('counts',ascending=False).head(n=5))

In [None]:
#### Using only name and pmid can result in multiple replicates. These may need special handling
#### The issue of replicates may be due to both OMICS-DI ingestion of GEO and versioning
#### First address the duplicates only as these will likely be due to OMICS-DI ingestion of GEO


dup_freq_subset = rep_subset.loc[rep_subset['counts']<3]
dup_subset = dup_freq_subset.merge(df3,on=['name','pmid'],how='left')
print(dup_subset.head(n=6))

In [None]:
#### Get pairs of ids
## Sort the data frame by pmid (to get pairs), then by _id (to ensure orderting)
## Generate one dataframe by dropping duplicates (subset pmid, keeping first)
## Generate second dataframe by dropping duplicates (subset pmid, keeping list)
## Merge the two to get pairs of data

dup_subset.sort_values(by=['pmid','_id'], inplace=True)
keep_first = dup_subset.drop_duplicates(subset='pmid',keep='first').copy()
keep_last = dup_subset.drop_duplicates(subset='pmid',keep='last').copy()
keep_first.rename(columns={'_id':'GEO_id','description':'GEO_desc'},inplace=True)
keep_last.rename(columns={'_id':'OMICS_id','description':'OMICS_desc'},inplace=True)
print(keep_first.head(n=5))
print("===================")
print(keep_last.head(n=5))
print("===================")
clean_dup_df = keep_first.merge(keep_last,on=['name','pmid','counts'],how='inner')
print(clean_dup_df.head(n=2))

In [None]:
def compare_desc_length(row):
    if row['GEO_desc_len'] > row['OMICS_desc_len']:
        compare_result = 'GEO longer'
    elif row['GEO_desc_len'] < row['OMICS_desc_len']:
        compare_result = 'OMICS longer'
    elif row['GEO_desc_len'] == row['OMICS_desc_len']:
        compare_result = 'same length'
    return compare_result

In [None]:
## compare lengths of descriptions
clean_dup_df['GEO_desc_len'] = clean_dup_df['GEO_desc'].str.len()
clean_dup_df['OMICS_desc_len'] = clean_dup_df['OMICS_desc'].str.len()
clean_dup_df['compare'] = clean_dup_df.apply(lambda row : compare_desc_length(row), axis = 1)
print(clean_dup_df.head(n=2))

In [None]:
print(clean_dup_df.iloc[0]['GEO_id'],clean_dup_df.iloc[0]['GEO_desc'])
print('================================')
print(clean_dup_df.iloc[0]['OMICS_id'],clean_dup_df.iloc[0]['OMICS_desc'])

### Summary of comparison of duplicate descriptions

In [None]:
summarydf = clean_dup_df.groupby('compare').size()
print(summarydf)

rep_freq_subset = rep_subset.loc[rep_subset['counts']>3].copy()
trip_freq_subset = rep_subset.loc[rep_subset['counts']==3].copy()

print("replicates (>3): ", len(rep_freq_subset))
print("triplicates (=3): ",len(trip_freq_subset))
print("duplicates (=2): ", len(dup_freq_subset))

Issue of replicates and triplicates seems to primarily be due to the use of a species name as the name of the dataset. These types of datasets are likely to cite the same PMID paper describing the species and may consist of wholly different datasets based on the descriptions

### Investigate source of triplicate records

In [None]:

trip_subset = trip_freq_subset.merge(df3,on=['name','pmid'],how='left')
trip_subset.sort_values(by=['pmid','name'],inplace=True)
trip_subset.to_csv('data/triplicates_by_name_and_pmid.tsv', sep='\t',header=True)
print(trip_subset.head(n=21))

### Identify a heuristic for ommitting replicates based on name length or match to a species name

In [None]:
### Inspecting name lengths
rep_freq_subset['name_length'] = rep_freq_subset['name'].str.len()
rep_freq_subset.sort_values(by='name_length',ascending=True,inplace=True)
rep_name_mean = rep_freq_subset['name_length'].mean()
rep_name_min = rep_freq_subset['name_length'].min() 
rep_name_max = rep_freq_subset['name_length'].max()
print("replicates: ", "min: ", rep_name_min, "max: ", rep_name_max, "mean: ", rep_name_mean)

trip_freq_subset['name_length'] = trip_freq_subset['name'].str.len()
trip_freq_subset.sort_values(by='name_length',ascending=True,inplace=True)
trip_name_mean = trip_freq_subset['name_length'].mean()
trip_name_min = trip_freq_subset['name_length'].min() 
trip_name_max = trip_freq_subset['name_length'].max()
print("triplicates: ", "min: ", trip_name_min, "max: ", trip_name_max, "mean: ", trip_name_mean)

dup_freq_subset['name_length'] = dup_freq_subset['name'].str.len()
dup_freq_subset.sort_values(by='name_length',ascending=True,inplace=True)
dup_name_mean = dup_freq_subset['name_length'].mean()
dup_name_min = dup_freq_subset['name_length'].min() 
dup_name_max = dup_freq_subset['name_length'].max()
print("duplicates: ", "min: ", dup_name_min, "max: ", dup_name_max, "mean: ", dup_name_mean)

In [None]:
print(rep_freq_subset.head(n=3))

print(trip_freq_subset.head(n=3))

print(dup_freq_subset.head(n=3))

In [None]:
cutoff_test = [25, 50, 75, 90]

for eachcutoff in cutoff_test:
    tmprepdf = rep_freq_subset.loc[rep_freq_subset['name_length']>eachcutoff]
    tmprepdf.sort_values('name_length',ascending=True,inplace=True)
    print("reps at "+str(eachcutoff),": ",tmprepdf.head(n=3))
    tmptripdf = trip_freq_subset.loc[trip_freq_subset['name_length']>eachcutoff]
    tmptripdf.sort_values('name_length',ascending=True,inplace=True)
    print("trips at "+str(eachcutoff),": ",tmptripdf.head(n=3))
    tmpdupdf = dup_freq_subset.loc[dup_freq_subset['name_length']>eachcutoff]
    tmpdupdf.sort_values('name_length',ascending=True,inplace=True)
    print("trips at "+str(eachcutoff),": ",tmpdupdf.head(n=3))

In [None]:
print(df3.head(n=2))
replicatesdf = df3.merge(tmprepdf, on=['name','pmid'], how='inner')
print(len(replicatesdf))