# Prepare the Data for Classification

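In this notebook, we de-duplicate the `new_data_to_classify.csv` dataset against `both_lab_table.csv` (matching on normalized DOI), so articles that were already processed are ignored. The text of the remaining articles is then cleaned for classification.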

## Libraries

In [1]:
import os, re, unicodedata, pandas as pd
import ast
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

## Import Data

In [3]:
# Input tables: the freshly fetched articles and the previously processed ones.
NEW_DATA_CSV = "../01.get_new_data/new_data_to_classify.csv"
# TODO: point this at the final version of the previously processed table.
OLD_DATA_CSV = "both_lab_table.csv"

df_new = pd.read_csv(NEW_DATA_CSV)
df_old = pd.read_csv(OLD_DATA_CSV)

In [4]:
# normalize DOIs in both dataframes
def clean_doi(x):
    """Return a canonical, comparable form of a DOI.

    The value is stringified, stripped, lower-cased, and any leading resolver
    prefix is removed so that the same DOI written in different styles
    compares equal. Recognised prefixes: ``http(s)://doi.org/``,
    ``http(s)://dx.doi.org/``, the same hosts without a scheme or with
    ``www.``, and a ``doi:`` label.

    Parameters
    ----------
    x : object
        Raw DOI cell value; may be missing (None / NaN).

    Returns
    -------
    str
        The bare lower-case DOI, or "" when the value is missing.
    """
    if pd.isna(x):
        return ""
    x = str(x).strip().lower()
    # A bare DOI starts with "10.", so none of these prefixes can be part of
    # the identifier itself; stripping them is always safe.
    x = re.sub(r"^(?:(?:https?://)?(?:dx\.|www\.)?doi\.org/|doi:\s*)", "", x)
    return x.strip()

df_old["doi_clean"] = df_old["DOI"].apply(clean_doi)
df_new["doi_clean"] = df_new["doi"].apply(clean_doi)

# --- find overlap ---
already = set(df_old["doi_clean"].dropna().unique())
# clean_doi turns a missing DOI into "", which dropna() does not remove.
# Leaving "" in the set would make every new article that lacks a DOI look
# "already processed", so it is discarded here. Rows without a DOI cannot be
# matched by DOI and are therefore kept.
already.discard("")
mask_new = ~df_new["doi_clean"].isin(already)

df_new_only = df_new[mask_new].reset_index(drop=True)

print("Original new fetch:", len(df_new))
print("Already in previous data:", len(df_new) - mask_new.sum())
print("New unique rows after exclusion:", len(df_new_only))

Original new fetch: 1641
Already in previous data: 97
New unique rows after exclusion: 1544


## Create Prediction Information

Clean up the data for prediction purposes including the following steps:

- Paste together keywords, title, and abstract.
- Lowercase
- Convert to ASCII
- Remove any "keywords" or NA values
- Expand contractions
- Remove punctuation
- Remove digits
- Remove stopwords
- Remove extra white space

In [5]:
# convert stringified lists into real lists
def coerce_keywords(val):
    """Normalise a raw keywords cell into a list of strings.

    Accepted inputs:
    - a list: its items are used as-is;
    - a string that looks like a Python list literal (``"['a', 'b']"``,
      surrounding whitespace allowed): parsed with ``ast.literal_eval``;
      if parsing fails the original string is kept as a single keyword;
    - a blank string or a missing value (None / NaN): no keywords;
    - anything else: treated as a single keyword.

    Items that are None or NaN are dropped and every remaining item is
    converted with ``str`` so the result can be passed to ``" ".join``.

    Returns
    -------
    list[str]
    """
    if isinstance(val, list):
        items = val
    elif isinstance(val, str):
        stripped = val.strip()
        if not stripped:
            return []
        if stripped.startswith("[") and stripped.endswith("]"):
            try:
                items = ast.literal_eval(stripped)
            except Exception:
                # Best effort: bracketed text that is not a valid literal is
                # kept verbatim as one keyword rather than raising.
                return [val]
        else:
            return [val]
    elif pd.isna(val):
        return []
    else:
        items = [val]

    # `k != k` is only true for NaN.
    return [
        str(k)
        for k in items
        if k is not None and not (isinstance(k, float) and k != k)
    ]

# Replace the raw keywords column with its parsed (list) form.
keywords_raw = df_new_only["keywords"]
df_new_only["keywords"] = keywords_raw.apply(coerce_keywords)

# now rebuild text
def make_text(row):
    """Concatenate a row's keywords, title and abstract into one string.

    The keywords (an iterable of strings under ``row["keywords"]``) are
    space-joined and placed first when non-empty; ``title`` and ``abstract``
    follow, each only if present and not missing. The pieces are separated
    by single spaces.
    """
    keywords = row["keywords"]
    pieces = [" ".join(keywords)] if keywords else []
    for field in ("title", "abstract"):
        if pd.notna(row.get(field)):
            pieces.append(str(row[field]))
    return " ".join(pieces)

# Build the combined text for every row, then preview the first entry.
combined_text = df_new_only.apply(make_text, axis=1)
df_new_only["text"] = combined_text

first_text = df_new_only["text"].iloc[0]
print(first_text[:500])

Concreteness Word Association Word (group theory) Natural language processing Computer science Association (psychology) Valence (chemistry) Word lists by frequency Lexical decision task Age of Acquisition Artificial intelligence Psychology Cognitive psychology Linguistics Cognition Philosophy Physics Quantum mechanics Neuroscience Psychotherapist Sentence Predicting Lexical Norms: A Comparison between a Word Association Model and Text-Based Word Co-occurrence Models In two studies we compare a d


In [6]:
# --- helpers ---
# Typographic punctuation that NFKD cannot decompose to ASCII. Without this
# table these characters would simply be deleted, gluing words together
# ("text–based" -> "textbased") and destroying contractions ("don’t" -> "dont").
_ASCII_PUNCT = str.maketrans({
    "\u2018": "'", "\u2019": "'", "\u201a": "'", "\u201b": "'",   # single quotes
    "\u201c": '"', "\u201d": '"', "\u201e": '"', "\u201f": '"',   # double quotes
    "\u2010": "-", "\u2011": "-", "\u2012": "-", "\u2013": "-",   # hyphens / en dash
    "\u2014": "-", "\u2015": "-", "\u2212": "-",                  # em dash / minus
})


def to_ascii(s: str) -> str:
    """Convert text to plain ASCII.

    Curly quotes and Unicode dashes are first mapped to their ASCII
    equivalents; the text is then NFKD-normalised (so accented letters
    become base letter + combining mark) and every remaining non-ASCII
    character is dropped.

    Parameters
    ----------
    s : str
        Input text. ``None`` yields ""; other non-string values are
        stringified first.

    Returns
    -------
    str
        ASCII-only text.
    """
    if s is None:
        return ""
    s = str(s).translate(_ASCII_PUNCT)
    s = unicodedata.normalize("NFKD", s)
    return s.encode("ascii", "ignore").decode("ascii")

# minimal contraction expander (no installs needed)
def _with_curly(table):
    """Return `table` plus a copy of each entry keyed with U+2019 instead of "'"."""
    curly = {key.replace("'", "\u2019"): value for key, value in table.items()}
    return {**table, **curly}


def _alternation(keys):
    """Build a regex alternation of `keys`, longest first so longer forms win."""
    return "|".join(map(re.escape, sorted(keys, key=len, reverse=True)))


# Irregular forms that replace the whole word.
_WHOLE_CONTRACTIONS = _with_curly({"can't": "cannot", "won't": "will not"})
# Regular forms that are a suffix attached to the preceding word.
_SUFFIX_CONTRACTIONS = _with_curly({
    "n't": " not", "'re": " are", "'s": " is", "'d": " would",
    "'ll": " will", "'t": " not", "'ve": " have", "'m": " am",
})
# Lookup used by the substitution callback (straight and curly apostrophes).
_CONTRACTIONS = {**_WHOLE_CONTRACTIONS, **_SUFFIX_CONTRACTIONS}

# Whole-word forms must start at a word boundary; suffix forms must directly
# follow a word character. Both must end the word. Without these anchors an
# opening single quote is mistaken for a contraction ("'semantic'" would
# become " isemantic'").
_contr_pat = re.compile(
    r"(?:\b(?:%s)|(?<=\w)(?:%s))(?!\w)"
    % (_alternation(_WHOLE_CONTRACTIONS), _alternation(_SUFFIX_CONTRACTIONS))
)


def expand_contractions(text: str) -> str:
    """Expand English contractions in lower-case text.

    Handles "can't" / "won't" and the suffixes n't, 're, 's, 'd, 'll, 't,
    've, 'm, written with either a straight or a curly (U+2019) apostrophe.
    Matching is case-sensitive: the tables hold lower-case keys only.
    Apostrophes that are not part of a word-final contraction (e.g. single
    quotes around a term) are left untouched.
    """
    return _contr_pat.sub(lambda m: _CONTRACTIONS[m.group(0)], text)

# Stop-word vocabulary, taken from the ENGLISH_STOP_WORDS collection imported above.
STOPWORDS = set(ENGLISH_STOP_WORDS)

def remove_stopwords(text: str) -> str:
    """Drop every whitespace-delimited token that is in STOPWORDS.

    Tokens are compared exactly as written (no case folding happens here);
    the surviving tokens are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

def clean_text_series(s: pd.Series) -> pd.Series:
    """Run the full text-cleaning pipeline over a Series of strings.

    Missing values become "" and everything is cast to ``str``; the steps
    are then applied in the fixed order noted inline. Returns a new Series.
    """
    return (
        s.fillna("")
        .astype(str)
        # 1) ASCII only
        .apply(to_ascii)
        # 2) strip a leading "Keywords" label (any case, optional ":" or "-")
        .str.replace(r"^\s*keywords[:\-]?\s*", " ", case=False, regex=True)
        # 3) drop the standalone token "NA" (any case)
        .str.replace(r"(^|\s)NA\b", " ", case=False, regex=True)
        # 4) lower-case
        .str.lower()
        # 5) expand contractions
        .apply(expand_contractions)
        # 6) punctuation -> space
        .str.replace(r"[^\w\s]", " ", regex=True)
        # 7) digit runs -> space
        .str.replace(r"\d+", " ", regex=True)
        # 8) remove stop words
        .apply(remove_stopwords)
        # 9) squeeze whitespace and trim the ends
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )

# Clean the combined text, write the table to disk, and preview the result.
cleaned_text = clean_text_series(df_new_only["text"])
df_new_only["text"] = cleaned_text

df_new_only.to_csv("complete_processed_data_no_stem.csv", index=False)
print("saved -> complete_processed_data_no_stem.csv")

df_new_only["text"].head(3).to_list()

saved -> complete_processed_data_no_stem.csv


['concreteness word association word group theory natural language processing computer science association psychology valence chemistry word lists frequency lexical decision task age acquisition artificial intelligence psychology cognitive psychology linguistics cognition philosophy physics quantum mechanics neuroscience psychotherapist sentence predicting lexical norms comparison word association model text based word occurrence models studies compare distributional semantic model derived word occurrences word association based model ability predict properties affect lexical processing focus age acquisition concreteness affective variables valence arousal dominance variables shown fundamental word meaning studies use model based data obtained continued free word association task predict variables study directly compare model word occurrence model based syntactic dependency relations model better predicting variables scrutiny dutch study replicate findings english compare results repor