# Step 1 Get New Data

## Libraries

In [1]:
#!/usr/bin/env python3
import math, requests, pandas as pd, re
import textwrap
import numpy as np

## OpenAlex

We will use OpenAlex to mimic the searches from the original LAB within EBSCO given the overlap in content and moving to a reproducible pipeline. This open resource includes the journals from previously investigated searches including Behavior Research Methods, Language Resources and Evaluation, and PLoS One. 

In [2]:
base_url = (
    "https://api.openalex.org/works?"
    "filter=title_and_abstract.search:lexical+database+OR+lexical+norms+OR+linguistic+database+OR+linguistic+norms,"
    "publication_year:2018-2025,"
    "type:types/article|types/dataset|types/preprint|types/supplementary-materials|types/report|types/book-chapter"
    "&sort=relevance_score:desc"
    "&per_page=200"     # bump page size to reduce calls
)

def decode_abstract(inv):
    if not isinstance(inv, dict) or not inv: return None
    pos2tok = {p:t for t,ps in inv.items() for p in ps}
    txt = " ".join(pos2tok.get(i,"") for i in range(max(pos2tok)+1))
    txt = re.sub(r"\s+([,.!?;:])", r"\1", txt)
    return re.sub(r"\s{2,}", " ", txt).strip() or None

# probe for total
probe = requests.get(base_url + "&page=1", timeout=30)
probe.raise_for_status()
meta = probe.json()["meta"]
total, per_page = meta["count"], meta["per_page"]
pages = math.ceil(total / per_page)
print(f"total={total}, per_page={per_page}, pages={pages}")

rows = []
for p in range(1, pages+1):
    r = requests.get(base_url + f"&page={p}", timeout=60)
    r.raise_for_status()
    for w in r.json().get("results", []):
        rows.append({
            "title": w.get("title"),
            "year": w.get("publication_year"),
            "doi": (w.get("doi") or "").replace("https://doi.org/", ""),
            "venue": ((w.get("primary_location") or {}).get("source") or {}).get("display_name"),
            "authors": "; ".join(a["author"]["display_name"] for a in w.get("authorships", [])),
            "abstract": decode_abstract(w.get("abstract_inverted_index")),
            # OpenAlex doesn’t store author-entered keywords; concepts are the closest proxy
            "keywords": [c["display_name"] for c in w.get("concepts", [])],
            "openalex_id": w.get("id"),
            "is_oa": (w.get("open_access") or {}).get("is_oa"),
            "cited_by": w.get("cited_by_count", 0),
        })
    print(f"page {p}/{pages}… collected {len(rows)}", end="\r")

df = pd.DataFrame(rows)
print("\nDone. Rows fetched:", len(df))
df.head()

total=1641, per_page=200, pages=9
page 9/9… collected 1641
Done. Rows fetched: 1641


Unnamed: 0,title,year,doi,venue,authors,abstract,keywords,openalex_id,is_oa,cited_by
0,On the predictive validity of various corpus-b...,2018,10.3758/s13428-017-1001-8,Behavior Research Methods,Xiaocong Chen; Yanping Dong; Xiufen Yu,,"[Lexical diversity, Computer science, Lemma (b...",https://openalex.org/W2784175655,True,50
1,Predicting Lexical Norms: A Comparison between...,2018,10.5334/joc.50,Journal of Cognition,Hendrik Vankrunkelsven; Steven Verheyen; Gert ...,In two studies we compare a distributional sem...,"[Concreteness, Word Association, Word (group t...",https://openalex.org/W2902591385,True,57
2,Psycholinguistic norms for more than 300 lexic...,2021,10.3758/s13428-020-01524-y,Behavior Research Methods,Patrick C. Trettenbrein; Nina-Kristin Pendzich...,Sign language offers a unique perspective on t...,"[Iconicity, Age of Acquisition, German, Comput...",https://openalex.org/W2980432777,True,18
3,"Norms of conceptual familiarity for 3,596 Fren...",2018,10.3758/s13428-018-1106-8,Behavior Research Methods,Georges Chedid; Maximiliano A. Wilson; Christo...,,"[Noun, Linguistics, Lexical decision task, Psy...",https://openalex.org/W2888587255,True,20
4,Norm It! : Lexical Normalization for Italian a...,2020,,,Rob van der Goot; Alan Ramponi; Tommaso Casell...,,"[Computer science, Normalization (sociology), ...",https://openalex.org/W3026608847,False,14


## Examine Abstracts

We need to examine if all articles have abstracts for being able to predict.

In [3]:
def summarize_abstracts(df: pd.DataFrame, n_show: int = 5):
    # treat empty strings/whitespace as missing
    has_abs = df["abstract"].astype("string").str.strip().ne("").fillna(False)

    total = len(df)
    with_abs = int(has_abs.sum())
    without_abs = total - with_abs
    pct = (with_abs / total * 100) if total else 0.0

    print(f"Total rows: {total}")
    print(f"With abstract: {with_abs} ({pct:.1f}%)")
    print(f"Missing abstract: {without_abs}")

    if without_abs:
        # show a few examples that are missing
        missing = df.loc[~has_abs, ["title", "year", "doi", "venue", "openalex_id"]].head(n_show)
        print("\nExamples missing abstracts:")
        for _, r in missing.iterrows():
            print("•", r["year"], "|", (r["title"] or "")[:120].rstrip(), "|", r["venue"] or "", "| DOI:", r["doi"] or "—")

    return has_abs

# Run the summary on your df
has_abs_mask = summarize_abstracts(df, n_show=30)

Total rows: 1641
With abstract: 1623 (98.9%)
Missing abstract: 18

Examples missing abstracts:
• 2018 | On the predictive validity of various corpus-based frequency norms in L2 English lexical processing | Behavior Research Methods | DOI: 10.3758/s13428-017-1001-8
• 2018 | Norms of conceptual familiarity for 3,596 French nouns and their contribution in lexical decision | Behavior Research Methods | DOI: 10.3758/s13428-018-1106-8
• 2020 | Norm It! : Lexical Normalization for Italian and Its Downstream Effects for Dependency Parsing |  | DOI: —
• 2022 | Translation norms for Malay and English words: The effects of word class, semantic variability, lexical characteristics, | Behavior Research Methods | DOI: 10.3758/s13428-022-01977-3
• 2019 | LEXICAL QUANTOR GENESIS VS LANGUAGE NORM DYNAMICS |  | DOI: 10.36059/978-966-397-124-7/39-56
• 2023 | Lexical Norms in Business, Informal and Internet Communication | Studies in systems, decision and control | DOI: 10.1007/978-3-031-27506-7_4
• 2019 

In [4]:
# ---- polite headers (some APIs appreciate a contact) ----
CONTACT_EMAIL = "ebuchanan@harrisburgu.edu"  # set yours
HEADERS = {"Accept": "application/json", "User-Agent": f"LAB-abstract-enricher ({CONTACT_EMAIL})"}

def _clean_doi(doi: str) -> str:
    if not doi: return ""
    doi = doi.strip()
    return re.sub(r"^https?://(dx\.)?doi\.org/", "", doi, flags=re.I)

def safe_request(url, params=None, headers=None):
    try:
        r = requests.get(url, params=params, headers=headers, timeout=30)
        r.raise_for_status()
        return r.json(), None
    except requests.exceptions.HTTPError as e:
        return None, f"HTTP {r.status_code}: {r.text[:200]}"
    except Exception as e:
        return None, f"Other error: {str(e)}"

def fetch_crossref_abstract(doi):
    doi = _clean_doi(doi)
    url = f"https://api.crossref.org/works/{doi}"
    js, err = safe_request(url, headers=HEADERS)
    if err: return None, "ERROR", err
    abs_ = (js.get("message") or {}).get("abstract")
    if abs_:
        # strip tags
        abs_ = re.sub(r"<[^>]+>", "", abs_)
        return abs_.strip(), "Crossref", None
    return None, "MISSING", None

def fetch_europepmc_abstract(doi):
    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    params = {"query": f"DOI:{doi}", "format": "json", "pageSize": 1}
    js, err = safe_request(url, params=params, headers=HEADERS)
    if err: return None, "ERROR", err
    res = js.get("resultList", {}).get("result", [])
    if res and res[0].get("abstractText"):
        return res[0]["abstractText"], "EuropePMC", None
    return None, "MISSING", None

def get_abstract_by_doi(doi):
    doi = _clean_doi(doi)
    if not doi: return None, "MISSING", None
    # try Crossref then Europe PMC
    for fetcher in (fetch_crossref_abstract, fetch_europepmc_abstract):
        abs_, status, err = fetcher(doi)
        if status == "ERROR":  # API error
            return None, status, err
        if status != "MISSING":  # success
            return abs_, status, None
    return None, "MISSING", None

def enrich_missing_abstracts(df, doi_col="doi", abs_col="abstract", sleep=0.3):
    """
    For rows where df[abs_col] is empty, try to fetch an abstract by DOI.
    Prints status for each attempt; only writes into df[abs_col] on success.
    Expects get_abstract_by_doi() -> (abstract, status, err).
    """
    import time
    import pandas as pd

    if abs_col not in df.columns:
        df[abs_col] = None

    mask_missing = df[abs_col].isna() | (df[abs_col].astype(str).str.strip() == "")
    idxs = df.index[mask_missing].tolist()

    for i in idxs:
        doi = str(df.at[i, doi_col] or "").strip()
        if not doi:
            print(f"[Row {i}] No DOI, skipping.")
            continue

        abs_, status, err = get_abstract_by_doi(doi)

        if status == "ERROR":
            print(f"[Row {i}] DOI {doi}: API error -> {err}")
        elif status == "MISSING":
            print(f"[Row {i}] DOI {doi}: No abstract found.")
        else:
            print(f"[Row {i}] DOI {doi}: Abstract found via {status}.")
            df.at[i, abs_col] = abs_

        time.sleep(sleep)  # be polite to APIs

    return df

In [5]:
# df = <your dataframe from OpenAlex>
# Summarize before:
missing_before = df["abstract"].isna() | (df["abstract"].astype(str).str.strip() == "")
print("Missing before:", int(missing_before.sum()), "of", len(df))

Missing before: 18 of 1641


In [6]:
df = enrich_missing_abstracts(df)

[Row 0] DOI 10.3758/s13428-017-1001-8: No abstract found.
[Row 3] DOI 10.3758/s13428-018-1106-8: No abstract found.
[Row 4] No DOI, skipping.
[Row 8] DOI 10.3758/s13428-022-01977-3: No abstract found.
[Row 11] DOI 10.36059/978-966-397-124-7/39-56: No abstract found.
[Row 14] DOI 10.1007/978-3-031-27506-7_4: No abstract found.
[Row 16] DOI 10.17605/osf.io/vu2na: API error -> HTTP 404: Resource not found.
[Row 18] DOI 10.26226/morressier.606f15dd30a2e980041f238c: No abstract found.
[Row 47] No DOI, skipping.
[Row 53] DOI 10.5281/zenodo.7503056: API error -> HTTP 404: Resource not found.
[Row 57] DOI 10.5281/zenodo.7508859: API error -> HTTP 404: Resource not found.
[Row 73] DOI 10.52172/2587-6945_2021_17_3_13: No abstract found.
[Row 89] No DOI, skipping.
[Row 101] DOI 10.30842/26583755202202: No abstract found.
[Row 102] DOI 10.47743/phss-2024-0012: No abstract found.
[Row 104] No DOI, skipping.
[Row 110] No DOI, skipping.
[Row 114] DOI 10.24919/2308-4863/74-2-59: No abstract found.


In [8]:
missing_after = df["abstract"].isna() | (df["abstract"].astype(str).str.strip() == "")
print("Missing after:", int(missing_after.sum()), "of", len(df))

# Peek at newly-filled examples
# df.loc[df["abstract_source"].notna(), ["title", "doi", "abstract_source"]].head()

Missing after: 18 of 1641


## Write to CSV

Write the output for the next step. 

In [9]:
df.to_csv('new_data_to_classify.csv', index=False) 