In [1]:
import json
import os
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import gzip
import tqdm
import polars as pl
from tqdm import tqdm
from collections import defaultdict
# import pyarrow as pa

def flatten(matrix):
    return [item for row in matrix for item in row]

def simpleId(text):
    try:
        y=text.split('/')[-1]
    except:
        y=''
    return y

def flush_buffer(lines, output_dir):
    df = pl.DataFrame(lines, orient = "row", schema = columns)

    out_path = os.path.join(output_dir, f"sources.csv")
    file_exists = os.path.isfile(out_path)
    # # USING pandas
    # df.write_csv(
    #     out_path,
    #     has_header=not file_exists,
    #     separator=',',
    #     append=file_exists
    # )
    # USING polars
    csv_str = df.write_csv(separator=';', include_header=not file_exists)
    with open(out_path, 'a', encoding='utf-8') as f:
        f.write(csv_str)
    lines = []
    return lines

In [2]:
snapshot_subfolder = "data/openalex-snapshot/data/sources/"

In [3]:
listdir=[subfolder for subfolder in sorted(os.listdir(snapshot_subfolder)) if 'updated' in subfolder]
print(f"Found {len(listdir)} subfolders")

Found 134 subfolders


In [4]:
print(os.listdir(snapshot_subfolder+listdir[0]))

['part_000.gz']


In [5]:
files=flatten([[snapshot_subfolder+listdir[a]+'/'+i for i in os.listdir(snapshot_subfolder+listdir[a]) if 'part' in i] for a in range(len(listdir))])
print(f"Found {len(files)} files")

Found 134 files


In [6]:
print("Example file:", files[0])

Example file: data/openalex-snapshot/data/sources/updated_date=2023-08-02/part_000.gz


In [17]:
destination_csv_folder = "data/sources/"
os.makedirs(destination_csv_folder, exist_ok=True)

# # before doing any damage, check if there are files in the destination folder and stop if there are, 
# # because if there are already csv, it will append and might cause many duplicates
if os.listdir(destination_csv_folder):  # listdir returns [] if empty
    existing = os.listdir(destination_csv_folder)
    raise RuntimeError(
        f"Destination folder '{destination_csv_folder}' is not empty! "
        f"Found {len(existing)} file(s), e.g., {existing[:3]}. "
        f"Please clean it before running this script to avoid appending duplicates."
    )

columns=['id', 'display_name', 'issn', 'type', 'publisher', 'host_organization', 'country_code', 'apc_usd', 'works_count', 'cited_by_count', 
         'h_index', 'i10_index', '2yr_h_index', '2yr_i10_index', '2yr_mean_citedness',
         'is_oa', 'is_in_doaj', 'is_core', 'is_indexed_in_scopus']

lines=[]
for gzfile in tqdm(files):
    with gzip.open(gzfile, 'rt') as file:
        for line in file:
            try:
                data = json.loads(line)
                ll=[simpleId(data['id']), data['display_name'].replace(';','.'), data['issn_l'], data['type'], data['publisher'], simpleId(data['host_organization'])]
                country_code = data['country_code']
                if country_code is None:
                    country_code = ''
                ll.append(country_code)

                ll.append(data['apc_usd'])
                ll.append(data['works_count'])
                ll.append(data['cited_by_count'])
                ll.append(data['summary_stats']['h_index'])
                ll.append(data['summary_stats']['i10_index'])
                ll.append(data['summary_stats']['2yr_h_index'])
                ll.append(data['summary_stats']['2yr_i10_index'])
                try:
                    ll.append(data['summary_stats']['2yr_mean_citedness'])
                except KeyError:
                    ll.append('')
                ll.append(str(data['is_oa'])[0])
                ll.append(str(data['is_in_doaj'])[0])
                try:
                    ll.append(str(data['is_core'])[0])
                except KeyError:
                    ll.append('')
                try:
                    ll.append(str(data['is_indexed_in_scopus'])[0])
                except KeyError:
                    ll.append('')

                ll = [_ if _ is not None else '' for _ in ll]
                lines.append(ll)
            except json.JSONDecodeError as e:
                print(f"Errore nel parsing della riga: {e}")
            if len(lines) > 1000:
                lines = flush_buffer(lines, destination_csv_folder)
lines = flush_buffer(lines, destination_csv_folder)

100%|██████████| 134/134 [00:51<00:00,  2.62it/s]


# Check source done

In [35]:
sources_df = read_parquet(os.path.join("data", "sources", f"sources.parquet"))


Reading 'sources' from 'data/sources/sources.parquet' using engine='pyarrow'
Read 260,798 rows from 'sources' in 0.34 sec.
Converting dtypes took 0.35 sec. Size before: 0.06GB, after: 0.05GB


Unnamed: 0,id,display_name,issn,type,publisher,host_organization,country_code,apc_usd,works_count,cited_by_count,h_index,i10_index,2yr_h_index,2yr_i10_index,2yr_mean_citedness,is_oa,is_in_doaj,is_core,is_indexed_in_scopus
0,S4306525036,PubMed,,repository,,I1299303238,US,,33075702,1167920935,9,8,283,105289,0.0,F,F,F,F
1,S2764455111,PubMed Central,,repository,,I1299303238,,,8009760,354501581,12,16,263,102290,0.0,T,F,F,F
2,S4306400806,Europe PMC (PubMed Central),,repository,PubMed Central,I1303153112,,,5316261,299645030,0,0,22,29,0.0,T,F,F,F


In [36]:
predatory_list = pd.read_csv(os.path.join("data", "sources", "The Predatory Journals List 2025.csv"), header = None)
predatory_list.columns = ["id", "journal_name"]
predatory_list_df = predatory_list[["journal_name"]]
predatory_list = list(predatory_list_df.journal_name.values)
predatory_list_df

Unnamed: 0,journal_name
0,Abstract and Applied Analysis
1,Academic Exchange Quarterly
2,Academic Research Reviews
3,Academy of Contemporary Research Journal (AOCRJ)
4,ACME Intellects
...,...
2775,World Wide Journal of Multidisciplinary Resear...
2776,Wulfenia (hijacked)
2777,Yangtze Medicine
2778,Youth


In [37]:
sources_set = set(list(sources_df.display_name.values))

In [41]:
import re
from unidecode import unidecode
import pandas as pd

def normalize_name(s: str) -> str:
    if pd.isna(s):
        return ""
    s = unidecode(str(s)).lower()

    # remove parenthetical chunks *and any surrounding spaces*
    # e.g. "wulfenia (hijacked)" -> "wulfenia"
    s = re.sub(r"\s*\([^()]*\)\s*", " ", s)

    s = s.replace("&", " and ")
    s = re.sub(r"[^\w\s]", " ", s)      # drop punctuation
    s = re.sub(r"\s+", " ", s).strip()  # collapse & remove leading/trailing spaces
    return s


# -----------------------------
# Apply normalization
# -----------------------------
pred = predatory_df.copy()
src  = sources_df.copy()

pred["name_norm"] = pred["journal_name"].map(normalize_name)
src["name_norm"]  = src["display_name"].map(normalize_name)

# -----------------------------
# Membership flag
# -----------------------------
src_names = set(src["name_norm"].unique())
pred["exact_in_sources"] = pred["name_norm"].isin(src_names)

# -----------------------------
# Left join for matches
# -----------------------------
exact_matches = pred.merge(
    src[["id", "display_name", "name_norm"]],
    on="name_norm", how="left", suffixes=("_pred", "_src")
)

# -----------------------------
# Split found vs not found
# -----------------------------
predatory_found     = exact_matches[exact_matches["id"].notna()]
predatory_not_found = exact_matches[exact_matches["id"].isna()]

# -----------------------------
# Quick stats
# -----------------------------
print(f"Total predatory journals: {len(pred)}")
print(f"Matched in sources: {len(predatory_found)}")
print(f"Not matched: {len(predatory_not_found)}")


Total predatory journals: 2780
Matched in sources: 2319
Not matched: 860


In [45]:
predatory_found

Unnamed: 0,journal_name,name_norm,exact_in_sources,id,display_name
0,Abstract and Applied Analysis,abstract and applied analysis,True,S54871839,Abstract and Applied Analysis
1,Academic Exchange Quarterly,academic exchange quarterly,True,S2764863161,Academic exchange quarterly
5,Acoustics,acoustics,True,S4210213655,Acoustics
8,Acta Kinesiologica,acta kinesiologica,True,S4210219869,Acta kinesiologica
9,Acta Kinesiologica,acta kinesiologica,True,S4306500367,Acta Kinesiologica
...,...,...,...,...,...
3173,World Scientific News,world scientific news,True,S4306534956,World Scientific News
3175,Wulfenia (hijacked),wulfenia,True,S82240309,Wulfenia
3176,Yangtze Medicine,yangtze medicine,True,S4210176968,Yangtze Medicine
3177,Youth,youth,True,S4210240668,Youth


In [47]:
# make a set of predatory source IDs
pred_ids = set(predatory_found["id"].dropna().astype(str).unique())

# add the flag to sources_df
sources_df["is_predatory"] = sources_df["id"].astype(str).isin(pred_ids)

# (optional) quick check
# sources_df["is_predatory"].value_counts()

In [53]:
print(len(pred_ids))

2303


In [48]:
sources_df

Unnamed: 0,id,display_name,issn,type,publisher,host_organization,country_code,apc_usd,works_count,cited_by_count,h_index,i10_index,2yr_h_index,2yr_i10_index,2yr_mean_citedness,is_oa,is_in_doaj,is_core,is_indexed_in_scopus,is_predatory
0,S4306525036,PubMed,,repository,,I1299303238,US,,33075702,1167920935,9,8,283,105289,0.000000,F,F,F,F,False
1,S2764455111,PubMed Central,,repository,,I1299303238,,,8009760,354501581,12,16,263,102290,0.000000,T,F,F,F,False
2,S4306400806,Europe PMC (PubMed Central),,repository,PubMed Central,I1303153112,,,5316261,299645030,0,0,22,29,0.000000,T,F,F,F,False
3,S4306400194,arXiv (Cornell University),,repository,Cornell University,I205783295,US,,3015134,58177778,792,116952,186,20217,0.491031,T,F,F,F,False
4,S4306401280,DOAJ (DOAJ: Directory of Open Access Journals),,repository,DOAJ: Directory of Open Access Journals,,GB,,2672416,40746279,226,41295,20,70,0.173100,T,F,F,F,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260793,S4390648476,Landscape Architecture,1673-1530,journal,Science Press,P4310319982,,,0,0,4,0,0,0,0.000000,F,F,F,F,False
260794,S4390648488,Substance Use Research and Treatment,2976-8357,journal,SAGE Publishing,P4310320017,,,0,0,2,0,0,0,0.000000,F,F,T,F,False
260795,S4393248858,Journal of Geophysical Research Machine Learni...,2993-5210,journal,Wiley,P4310320595,,,0,0,4,0,0,0,0.000000,F,F,F,F,False
260796,S4404533546,Discover Plants.,3005-1207,journal,"Academy of Medicine, Singapore",P4310317082,SI,,0,0,4,0,0,0,0.000000,F,F,F,F,False


In [49]:
sources_df.is_predatory.value_counts()

is_predatory
False    258495
True       2303
Name: count, dtype: int64

In [50]:
sources_df.to_parquet(os.path.join("data", "sources", f"sources_with_predatory.parquet"))