In [16]:
import pandas as pd

# Load the replications database export into a DataFrame.
df = pd.read_csv(filepath_or_buffer="replications_database_2025_11_11_180242.csv")

# List the column names available in the export.
df.columns

Index(['original_url', 'replication_url', 'description', 'result',
       'original_authors', 'original_title', 'original_journal',
       'original_volume', 'original_issue', 'original_pages', 'original_year',
       'replication_authors', 'replication_title', 'replication_journal',
       'replication_volume', 'replication_issue', 'replication_pages',
       'replication_year', 'original_n', 'original_es', 'original_es_type',
       'original_es_95_CI', 'original_p_value', 'original_p_value_type',
       'original_p_value_tails', 'replication_n', 'replication_es',
       'replication_es_type', 'replication_es_95_CI', 'replication_p_value',
       'replication_p_value_type', 'replication_p_value_tails', 'discipline',
       'tags', 'validated', 'validated_person', 'openalex_field',
       'openalex_subfield', 'replication_citation_html',
       'original_citation_html', 'Unnamed: 0', 'original_es_r',
       'replication_es_r'],
      dtype='object')

In [17]:
# Distinct original URLs plus distinct replication URLs. The two columns are
# de-duplicated independently, so a URL present in both is counted twice.
sum(len(set(df[column].values)) for column in ('original_url', 'replication_url'))

905

In [18]:
import os
import pandas as pd

# Re-read the database export (the same file the first cell loads).
df = pd.read_csv("replications_database_2025_11_11_180242.csv")

# Directory that holds the downloaded PDFs. The original location remains the
# default; set the PDF_SAVE_DIR environment variable to use another directory
# without editing the notebook.
save_dir = os.environ.get(
    "PDF_SAVE_DIR",
    "/home/dan/Dropbox/AAA_METASCIENCE_OBSERVATORY/PDFs/in_ground_truth_dataset/",
)

# ---------------------------------------------------
# Helpers
# ---------------------------------------------------

def extract_doi(url: str) -> str | None:
    """Extract and normalize DOI from an original/replication URL."""
    if not isinstance(url, str) or not url.strip():
        return None

    url = url.strip()

    # Only accept URLs that actually contain doi.org
    if "doi.org/" not in url.lower():
        return None

    doi = (
        url.replace("https://doi.org/", "")
           .replace("http://doi.org/", "")
           .strip()
           .lower()
    )

    return doi if doi else None


def pdf_path_from_doi(doi: str) -> str:
    """Map normalized DOI to expected PDF path."""
    safe_filename = doi.replace("/", "--") + ".pdf"
    return os.path.join(save_dir, safe_filename)


def pdf_exists_for_url(url: str) -> bool:
    """Report whether the PDF derived from this URL's DOI is present on disk.

    Returns ``False`` when ``extract_doi`` yields no DOI for ``url``;
    otherwise returns whether the path given by ``pdf_path_from_doi`` exists.
    """
    normalized_doi = extract_doi(url)
    return bool(normalized_doi) and os.path.exists(pdf_path_from_doi(normalized_doi))


# ---------------------------------------------------
# Apply checks to dataframe
# ---------------------------------------------------

# Record, per row, whether the PDF for each side of the pair is on disk.
# Note: these flag columns stay on `df` itself; only `filtered_df` drops them.
df["original_pdf_exists"] = df["original_url"].apply(pdf_exists_for_url)
df["replication_pdf_exists"] = df["replication_url"].apply(pdf_exists_for_url)

# Keep the rows for which BOTH PDFs were found, without the helper flags.
filtered_df = (
    df.loc[df["original_pdf_exists"] & df["replication_pdf_exists"]]
    .drop(columns=["original_pdf_exists", "replication_pdf_exists"])
    .copy()
)

print("Rows originally:", len(df))
print("Rows after filtering:", len(filtered_df))

# To persist the filtered table, uncomment:
# filtered_df.to_csv("replications_with_pdfs_only.csv", index=False)


Rows originally: 872
Rows after filtering: 672


In [22]:
# Coerce both effect-size columns to numbers (unparseable entries become NaN),
# then keep only the rows where both values are present.
df = (
    df.assign(
        original_es_r=lambda frame: pd.to_numeric(frame["original_es_r"], errors="coerce"),
        replication_es_r=lambda frame: pd.to_numeric(frame["replication_es_r"], errors="coerce"),
    )
    .dropna(subset=["original_es_r", "replication_es_r"])
)


In [23]:
# Write the table without the DataFrame's row index. Including the index adds
# an unnamed leading column to the file, which comes back as an extra
# "Unnamed: ..." column whenever the CSV is read again.
df.to_csv("ground_truth.csv", index=False)

In [24]:
# Number of rows currently in df.
df.shape[0]

438

In [15]:
# Display the DataFrame.
# NOTE(review): the output saved under this cell carries execution count 15,
# lower than the cells before it, and the rows shown have empty
# original_es_r / replication_es_r values. It appears to predate the
# effect-size filter, so re-run the notebook top to bottom to refresh it.
df

Unnamed: 0.1,original_url,replication_url,description,result,original_authors,original_title,original_journal,original_volume,original_issue,original_pages,...,tags,validated,validated_person,openalex_field,openalex_subfield,replication_citation_html,original_citation_html,Unnamed: 0,original_es_r,replication_es_r
791,http://doi.org/10.1177/0956797612447820,https://web.archive.org/web/20200206205411/htt...,Liberal participants rated liberal groups more...,success,"Chambers, J. R., Schlenker, B. R.; Collisson, B",Ideology and Prejudice The Role of Value Confl...,Psychological Science,24.0,2.0,140,...,Political psychology,yes,Dan Elton,,,<i></i> 2013,"<a href=""https://doi.org/10.1177/0956797612447...",,,
792,http://doi.org/10.1177/0956797612447820,https://web.archive.org/web/20200206205411/htt...,Conservative rated conservative groups more po...,,John R. Chambers; Barry R. Schlenker; Brian Co...,Ideology and Prejudice,Psychological Science,24.0,2.0,140,...,Political psychology,yes,Dan Elton,,,<i></i>,"<a href=""https://doi.org/10.1177/0956797612447...",,,
862,https://scholar.google.com/scholar?cluster=333...,https://doi.org/10.1371/journal.pone.0029081,Hearing about old age makes people walk slower,failure,Luca Giancardo; Diego Sona; Huiping Huang; Sar...,Automaticity of social behaviour,PLoS ONE,8.0,9.0,e74557,...,,yes,Dan Elton,,,"<a href=""https://doi.org/10.1371/journal.pone....",Giancardo L. <i>et al.</i> <i>PLoS ONE</i> 2013,,,
863,http://doi.org/10.1037/0022-3514.37.10.1660,http://doi.org/10.1177/2515245918777487,The paper demonstrated that the ease with whic...,failure,Thomas K. Srull; Robert S. Wyer,The role of category accessibility in the inte...,Journal of Personality and Social Psychology,37.0,10.0,1660,...,,yes,Dan Elton,,,"<a href=""https://doi.org/10.1177/2515245918777...","<a href=""https://doi.org/10.1037/0022-3514.37....",,,
865,http://doi.org/10.14695/kjsos.2015.18.4.15,http://doi.org/10.1027/1864-9335/a000186,Participants who cleansed their hands before j...,failure,David J. Johnson; Felix Cheung; Brent Donnellan,Cleanliness Reduces the Severity of\nMoral Jud...,Korean Society for Emotion and Sensibility,18.0,4.0,15-24,...,priming,yes,Dan Elton,,,"<a href=""https://doi.org/10.1027/1864-9335/a00...","<a href=""https://doi.org/10.14695/kjsos.2015.1...",,,
866,http://doi.org/10.1509/jmkr.45.6.633,http://doi.org/10.1177/2515245918781032,When participants had more opportunity to chea...,failure,Nina Mažar; On Amir; Dan Ariely,The Dishonesty of Honest People: A Theory of S...,Journal of Marketing Research,45.0,6.0,633,...,,yes,Dan Elton,,,"<a href=""https://doi.org/10.1177/2515245918781...","<a href=""https://doi.org/10.1509/jmkr.45.6.633...",,,
867,https://doi.org/10.1111/j.1467-9280.2008.02084.x,https://doi.org/10.1371/journal.pone.0042510,Priming people with cues of physical spatial d...,failure,Lawrence E. Williams; John A. Bargh,Keeping One's Distance,Psychological Science,19.0,3.0,302,...,,yes,Dan Elton,,,"<a href=""https://doi.org/10.1371/journal.pone....","<a href=""https://doi.org/10.1111/j.1467-9280.2...",,,
868,http://doi.org/10.1177/0956797611414726,https://psycnet.apa.org/fulltext/2014-20922-00...,Exposure to the American Shifts Support Toward...,inconclusive,Travis J. Carter; Melissa J. Ferguson; Ran R. ...,A Single Exposure to the American Flag Shifts ...,Psychology Science,22.0,8.0,1011,...,,yes,Dan Elton,,,2014 K. <i>et al.</i> <i>Advances in Methods a...,"<a href=""https://doi.org/10.1177/0956797611414...",,,
869,https://doi.org/10.1111/j.1467-9280.2008.02062.x,http://doi.org/10.31234/osf.io/ux8ef,When people process stimuli with low fluency (...,failure,Adam L. Alter; Daniel M. Oppenheimer,Effects of Fluency on Psychological Distance a...,Psychological Science,19.0,2.0,161,...,,yes,Dan Elton,,,"<a href=""https://doi.org/10.31234/osf.io/ux8ef...","<a href=""https://doi.org/10.1111/j.1467-9280.2...",,,
870,http://doi.org/10.1037/a0029288,http://doi.org/10.1037/xge0000570,Mere exposure to money increases endorsement o...,failure,Eugene M. Caruso; Kathleen D. Vohs; Brittani B...,Mere exposure to money increases endorsement o...,Journal of Experimental Psychology: General,142.0,2.0,301,...,,yes,Dan Elton,,,"<a href=""https://doi.org/10.1037/xge0000570"" t...","<a href=""https://doi.org/10.1037/a0029288"" tar...",,,
