In [1]:
import json
import os
os.chdir("../") # resets notebook directory to repository root folder (DO ONLY ONCE!)
import gzip
import tqdm
import polars as pl
from tqdm import tqdm
from collections import defaultdict
# import pyarrow as pa

def flatten(matrix):
    return [item for row in matrix for item in row]

def simpleId(text):
    try:
        y=text.split('/')[-1]
    except:
        y=''
    return y

def flush_buffer(lines, output_dir):
    df = pl.DataFrame(lines, orient = "row", schema = columns)

    out_path = os.path.join(output_dir, f"sources.csv")
    file_exists = os.path.isfile(out_path)
    # # USING pandas
    # df.write_csv(
    #     out_path,
    #     has_header=not file_exists,
    #     separator=',',
    #     append=file_exists
    # )
    # USING polars
    csv_str = df.write_csv(separator=';', include_header=not file_exists)
    with open(out_path, 'a', encoding='utf-8') as f:
        f.write(csv_str)
    lines = []
    return lines

In [2]:
snapshot_subfolder = "data/openalex-snapshot/data/sources/"

In [3]:
listdir=[subfolder for subfolder in sorted(os.listdir(snapshot_subfolder)) if 'updated' in subfolder]
print(f"Found {len(listdir)} subfolders")

Found 42 subfolders


In [4]:
print(os.listdir(snapshot_subfolder+listdir[0]))

['part_000.gz']


In [5]:
files=flatten([[snapshot_subfolder+listdir[a]+'/'+i for i in os.listdir(snapshot_subfolder+listdir[a]) if 'part' in i] for a in range(len(listdir))])
print(f"Found {len(files)} files")

Found 42 files


In [6]:
print("Example file:", files[0])

Example file: data/openalex-snapshot/data/sources/updated_date=2016-06-24/part_000.gz


In [7]:
destination_csv_folder = "data/sources/"
os.makedirs(destination_csv_folder, exist_ok=True)

# # before doing any damage, check if there are files in the destination folder and stop if there are, 
# # because if there are already csv, it will append and might cause many duplicates
if os.listdir(destination_csv_folder):  # listdir returns [] if empty
    existing = os.listdir(destination_csv_folder)
    raise RuntimeError(
        f"Destination folder '{destination_csv_folder}' is not empty! "
        f"Found {len(existing)} file(s), e.g., {existing[:3]}. "
        f"Please clean it before running this script to avoid appending duplicates."
    )

columns=['id', 'display_name', 'issn', 'type', 'publisher', 'host_organization', 'country_code', 'apc_usd', 'works_count', 'cited_by_count', 
         'h_index', 'i10_index', '2yr_h_index', '2yr_i10_index', '2yr_mean_citedness',
         'is_oa', 'is_in_doaj', 'is_core', 'is_indexed_in_scopus']

lines=[]
for gzfile in tqdm(files):
    with gzip.open(gzfile, 'rt') as file:
        for line in file:
            try:
                data = json.loads(line)
                ll=[simpleId(data['id']), data['display_name'].replace(';','.'), data['issn_l'], data['type']]
                
                host_organization = data['host_organization']
                if host_organization is None:
                    host_organization = ''
                ll.append(host_organization)
                
                host_organization_name = data['host_organization_name']
                if host_organization_name is None:
                    host_organization_name = ''
                ll.append(host_organization_name)
                
                country_code = data['country_code']
                if country_code is None:
                    country_code = ''
                ll.append(country_code)

                ll.append(data['apc_usd'])
                ll.append(data['works_count'])
                ll.append(data['cited_by_count'])
                ll.append(data['summary_stats']['h_index'])
                ll.append(data['summary_stats']['i10_index'])
                try:
                    ll.append(data['summary_stats']['2yr_h_index'])
                except KeyError:
                    ll.append('')
                try:
                    ll.append(data['summary_stats']['2yr_i10_index'])
                except KeyError:
                    ll.append('')
                try:
                    ll.append(data['summary_stats']['2yr_mean_citedness'])
                except KeyError:
                    ll.append('')
                ll.append(str(data['is_oa'])[0])
                ll.append(str(data['is_in_doaj'])[0])
                try:
                    ll.append(str(data['is_core'])[0])
                except KeyError:
                    ll.append('')
                try:
                    ll.append(str(data['is_indexed_in_scopus'])[0])
                except KeyError:
                    ll.append('')

                ll = [_ if _ is not None else '' for _ in ll]
                lines.append(ll)
            except json.JSONDecodeError as e:
                print(f"Errore nel parsing della riga: {e}")
            if len(lines) > 1000:
                lines = flush_buffer(lines, destination_csv_folder)
lines = flush_buffer(lines, destination_csv_folder)

100%|██████████| 42/42 [00:40<00:00,  1.03it/s]


# Check source done

In [8]:
import sys
# Add utils directory in the list of directories to look for packages to import
sys.path.insert(0, os.path.join(os.getcwd(),'utils'))
from read_parquet import *

Using PyArrow strings!


In [9]:
def convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = None, separator=',', compression='brotli', do_peek = True, do_print = True):
    os.makedirs(os.path.dirname(parquet_file_path), exist_ok=True)
    df = pl.read_csv(csv_file_path, separator=separator, infer_schema_length=1000)
    if sort_by is not None:
        # sort first by first column name in title, than second if equal, etc.
        df = df.sort(sort_by, descending = True)
    df.write_parquet(parquet_file_path, compression=compression)
    if do_print:
        print(f"Successfully converted {csv_file_path} to {parquet_file_path}.")
    if do_peek:
        if do_print:
            print("Here's a peek.")
        peek_parquet(parquet_file_path)

In [10]:
csv_file_path = "data/sources/sources.csv"
parquet_file_path = "data/sources/sources.parquet"

convert_csv2parquet(csv_file_path, parquet_file_path, sort_by = "works_count", separator = ';')

Successfully converted data/sources/sources.csv to data/sources/sources.parquet.
Here's a peek.
Name: 'sources'
Path: 'data/sources/sources.parquet'
Files: 1
Rows: 255,250
Schema:
    id: large_string
    display_name: large_string
    issn: large_string
    type: large_string
    publisher: large_string
    host_organization: large_string
    country_code: large_string
    apc_usd: large_string
    works_count: int64
    cited_by_count: int64
    h_index: int64
    i10_index: int64
    2yr_h_index: large_string
    2yr_i10_index: large_string
    2yr_mean_citedness: double
    is_oa: large_string
    is_in_doaj: large_string
    is_core: large_string
    is_indexed_in_scopus: large_string
5 random rows:


Unnamed: 0,id,display_name,issn,type,publisher,host_organization,country_code,apc_usd,works_count,cited_by_count,h_index,i10_index,2yr_h_index,2yr_i10_index,2yr_mean_citedness,is_oa,is_in_doaj,is_core,is_indexed_in_scopus
0,S7407052672,NIFS,,repository,,National Institute for Fusion Science,,,28552162,0,0,0,,,0.0,F,F,F,
1,S4306400562,Zenodo (CERN European Organization for Nuclear...,,repository,,CERN European Organization for Nuclear Research,,,10147031,1250513,221,21500,,,0.063284,T,F,F,
2,S4306525036,PubMed,,repository,,,US,,7891251,46362456,939,965613,,,1.275151,F,F,F,
3,S4377196541,Internet Archive (Internet Archive),,repository,,,,,7808691,785301,305,11526,,,0.018623,F,F,F,
4,S4306400572,OPAL (Open@LaTrobe) (La Trobe University),,repository,,La Trobe University,,,6743036,591312,197,8617,,,0.03426,F,F,F,


In [11]:
sources_df = read_parquet(os.path.join("data", "sources", f"sources.parquet"))


Reading 'sources' from 'data/sources/sources.parquet' using engine='pyarrow'
Read 255,250 rows from 'sources' in 0.50 sec.
Converting dtypes took 0.04 sec. Size before: 0.06GB, after: 0.05GB


Unnamed: 0,id,display_name,issn,type,publisher,host_organization,country_code,apc_usd,works_count,cited_by_count,h_index,i10_index,2yr_h_index,2yr_i10_index,2yr_mean_citedness,is_oa,is_in_doaj,is_core,is_indexed_in_scopus
0,S7407052672,NIFS,,repository,,National Institute for Fusion Science,,,28552162,0,0,0,,,0.0,F,F,F,
1,S4306400562,Zenodo (CERN European Organization for Nuclear...,,repository,,CERN European Organization for Nuclear Research,,,10147031,1250513,221,21500,,,0.063284,T,F,F,
2,S4306525036,PubMed,,repository,,,US,,7891251,46362456,939,965613,,,1.275151,F,F,F,


In [13]:
predatory = pd.read_csv(os.path.join("data", "sources", "The Predatory Journals List 2025.csv"), header = None)
predatory.columns = ["id", "journal_name"]
predatory_df = predatory[["journal_name"]]
predatory = list(predatory_df.journal_name.values)
predatory_df

Unnamed: 0,journal_name
0,Abstract and Applied Analysis
1,Academic Exchange Quarterly
2,Academic Research Reviews
3,Academy of Contemporary Research Journal (AOCRJ)
4,ACME Intellects
...,...
2775,World Wide Journal of Multidisciplinary Resear...
2776,Wulfenia (hijacked)
2777,Yangtze Medicine
2778,Youth


In [14]:
sources_set = set(list(sources_df.display_name.values))

In [15]:
import re
from unidecode import unidecode
import pandas as pd

def normalize_name(s: str) -> str:
    if pd.isna(s):
        return ""
    s = unidecode(str(s)).lower()

    # remove parenthetical chunks *and any surrounding spaces*
    # e.g. "wulfenia (hijacked)" -> "wulfenia"
    s = re.sub(r"\s*\([^()]*\)\s*", " ", s)

    s = s.replace("&", " and ")
    s = re.sub(r"[^\w\s]", " ", s)      # drop punctuation
    s = re.sub(r"\s+", " ", s).strip()  # collapse & remove leading/trailing spaces
    return s


# -----------------------------
# Apply normalization
# -----------------------------
pred = predatory_df.copy()
src  = sources_df.copy()

pred["name_norm"] = pred["journal_name"].map(normalize_name)
src["name_norm"]  = src["display_name"].map(normalize_name)

# -----------------------------
# Membership flag
# -----------------------------
src_names = set(src["name_norm"].unique())
pred["exact_in_sources"] = pred["name_norm"].isin(src_names)

# -----------------------------
# Left join for matches
# -----------------------------
exact_matches = pred.merge(
    src[["id", "display_name", "name_norm"]],
    on="name_norm", how="left", suffixes=("_pred", "_src")
)

# -----------------------------
# Split found vs not found
# -----------------------------
predatory_found     = exact_matches[exact_matches["id"].notna()]
predatory_not_found = exact_matches[exact_matches["id"].isna()]

# -----------------------------
# Quick stats
# -----------------------------
print(f"Total predatory journals: {len(pred)}")
print(f"Matched in sources: {len(predatory_found)}")
print(f"Not matched: {len(predatory_not_found)}")


Total predatory journals: 2780
Matched in sources: 2242
Not matched: 831


In [16]:
predatory_found

Unnamed: 0,journal_name,name_norm,exact_in_sources,id,display_name
0,Abstract and Applied Analysis,abstract and applied analysis,True,S54871839,Abstract and Applied Analysis
1,Academic Exchange Quarterly,academic exchange quarterly,True,S2764863161,Academic exchange quarterly
5,Acoustics,acoustics,True,S4210213655,Acoustics
8,Acta Kinesiologica,acta kinesiologica,True,S4210219869,Acta kinesiologica
9,Acta Medica International,acta medica international,True,S4210195041,Acta Medica International
...,...,...,...,...,...
3067,World Scientific News,world scientific news,True,S4306534956,World Scientific News
3069,Wulfenia (hijacked),wulfenia,True,S82240309,Wulfenia
3070,Yangtze Medicine,yangtze medicine,True,S4210176968,Yangtze Medicine
3071,Youth,youth,True,S4210240668,Youth


In [17]:
# make a set of predatory source IDs
pred_ids = set(predatory_found["id"].dropna().astype(str).unique())

# add the flag to sources_df
sources_df["is_predatory"] = sources_df["id"].astype(str).isin(pred_ids)

# (optional) quick check
# sources_df["is_predatory"].value_counts()

In [18]:
print(len(pred_ids))

2223


In [19]:
sources_df

Unnamed: 0,id,display_name,issn,type,publisher,host_organization,country_code,apc_usd,works_count,cited_by_count,h_index,i10_index,2yr_h_index,2yr_i10_index,2yr_mean_citedness,is_oa,is_in_doaj,is_core,is_indexed_in_scopus,is_predatory
0,S7407052672,NIFS,,repository,,National Institute for Fusion Science,,,28552162,0,0,0,,,0.000000,F,F,F,,False
1,S4306400562,Zenodo (CERN European Organization for Nuclear...,,repository,,CERN European Organization for Nuclear Research,,,10147031,1250513,221,21500,,,0.063284,T,F,F,,False
2,S4306525036,PubMed,,repository,,,US,,7891251,46362456,939,965613,,,1.275151,F,F,F,,False
3,S4377196541,Internet Archive (Internet Archive),,repository,,,,,7808691,785301,305,11526,,,0.018623,F,F,F,,False
4,S4306400572,OPAL (Open@LaTrobe) (La Trobe University),,repository,,La Trobe University,,,6743036,591312,197,8617,,,0.034260,F,F,F,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
255245,S5407053255,Wichcha Journal Nakhon Si Thammarat Rajabhat U...,3027-737X,journal,,Nakhon Si Thammarat Rajabhat University,,,1,0,0,0,,,0.000000,T,F,F,,False
255246,S5407053264,Advanced Interventional Materials,3051-1577,journal,,Elsevier,,,1,0,0,0,,,0.000000,F,F,F,,False
255247,S5407053267,Omni Pengabdian Masyarakat,3063-7023,journal,,PT. Bantayang Omni Cendekia,,,1,0,0,0,,,0.000000,F,F,F,,False
255248,S5407053274,Muse,3103-3733,journal,,Milano University Press,,,1,0,0,0,,,0.000000,F,F,F,,False


In [20]:
sources_df.is_predatory.value_counts()

is_predatory
False    253027
True       2223
Name: count, dtype: int64

In [21]:
sources_df.to_parquet(os.path.join("data", "sources", f"sources_with_predatory.parquet"))