In [None]:
# If plotting libraries fail to load, try resetting the LD path for Julia.
# It can be set in the kernel spec under ~/.local/share/jupyter/kernels/
# An unset variable is as good as an empty one, so read it with a default
# instead of indexing ENV directly (which throws KeyError when unset).
isempty(get(ENV, "LD_LIBRARY_PATH", "")) ||
    error("LD_LIBRARY_PATH must be empty, got: $(ENV["LD_LIBRARY_PATH"])")
import Pkg
# Work in a throwaway environment so this notebook does not touch the default one.
Pkg.activate(;temp=true)
Pkg.add("Revise")
import Revise

# Use the local development checkout of Mycelia.
Pkg.develop(path="$(homedir())/workspace/Mycelia")
import Mycelia

pkgs = String[
    "DataFrames",
    "HTTP",
    "CSV",
    "StatsBase"
]
Pkg.add(pkgs)
# Import each package by name (qualified access only, e.g. `DataFrames.DataFrame`).
# Building the import from a Symbol avoids parsing source text at runtime.
for pkg in pkgs
    @eval import $(Symbol(pkg))
end

In [None]:
# Working directory for this analysis; created (with parents) if it does not exist yet.
DIR = joinpath(homedir(), "workspace", "20240930.journal-filter")
mkpath(DIR)

In [None]:
# Fetch the PubMed journal listing (J_Medline.txt) from the NCBI server
# and hold the whole response body in memory as text.
url = "https://ftp.ncbi.nih.gov/pubmed/J_Medline.txt"
response = HTTP.get(url)
content = String(response.body)

# Journal records are delimited by a line made of 56 dashes;
# cut the text at each such line to get one chunk per record.
entries = split(content, repeat('-', 56) * "\n")

"""
    parse_entry(entry)

Parse a single journal record into a `Dict{String, String}`.

`entry` is a block of text with one `key: value` field per line. Each line is
split at the first `": "`; keys and values are stripped of surrounding
whitespace. A line that has a colon but no `": "` (for example a bare `key:`
whose trailing space was trimmed) is split at its first `:` instead, giving an
empty value rather than raising an error. Lines without any colon are ignored.
If a key occurs more than once, the last occurrence wins.
"""
function parse_entry(entry)
    data = Dict{String, String}()
    for line in split(strip(entry), "\n")
        fields = split(line, ": ", limit=2)
        if length(fields) < 2
            # No ": " separator on this line: fall back to a bare ':' so that
            # "key:" still parses, and skip the line when there is no colon.
            fields = split(line, ':', limit=2)
            length(fields) == 2 || continue
        end
        key, value = fields
        data[strip(key)] = strip(value)
    end
    return data
end

# Build one table row per parsed journal record. `cols=:union` lets records
# with differing field sets share a table; fields absent from a record are
# filled with `missing`.
medline_metadata = DataFrames.DataFrame()
for entry in entries
    record = parse_entry(entry)
    # Text before the first / after the last separator parses to no fields;
    # pushing it would add an all-`missing` row, so skip it.
    isempty(record) && continue
    push!(medline_metadata, record, cols=:union)
end

# Keep only journals that have an online ISSN. The check tolerates `missing`
# (records lacking the field entirely) as well as empty strings.
has_online_issn = map(x -> !ismissing(x) && !isempty(x), medline_metadata[!, "ISSN (Online)"])
medline_metadata = medline_metadata[has_online_issn, :]

# Set for fast membership tests when matching against the Scimago table.
medline_online_issn = Set(medline_metadata[!, "ISSN (Online)"])

scimagojr_metadata = CSV.read("$(DIR)/scimagojr 2023.csv", DataFrames.DataFrame, delim=';')

# New join-key column; stays "" for rows with no matching online ISSN.
scimagojr_metadata[!, "ISSN (Online)"] .= ""

for (i, row) in enumerate(DataFrames.eachrow(scimagojr_metadata))
    raw_issns = row["Issn"]
    ismissing(raw_issns) && continue
    # The "Issn" field is a comma-separated list of unhyphenated ISSNs.
    for token in split(raw_issns, ",")
        issn = strip(token)
        # Skip placeholders or malformed tokens (e.g. "-") that are too short
        # to be rewritten as NNNN-NNNN; slicing them would throw.
        (length(issn) >= 8 && isascii(issn)) || continue
        hyphenated = issn[1:4] * "-" * issn[5:8]
        if hyphenated in medline_online_issn
            # Record the first ISSN (in listed order) that is a known online ISSN.
            scimagojr_metadata[i, "ISSN (Online)"] = hyphenated
            break
        end
    end
end

joint_metadata = DataFrames.innerjoin(medline_metadata, scimagojr_metadata, on="ISSN (Online)")

target_areas = [
    "Agricultural and Biological Sciences",
    "Biochemistry, Genetics and Molecular Biology",
    "Medicine",
    "Immunology and Microbiology",
    "Neuroscience"
]

# Keep journals whose "; "-separated "Areas" field names at least one target area.
in_target_area = map(joint_metadata[!, "Areas"]) do areas
    !ismissing(areas) && !isempty(intersect(target_areas, split(areas, "; ")))
end
joint_metadata = joint_metadata[in_target_area, :]

In [None]:
# Category names of interest. `filter_by_categories` keeps a row when any of its
# categories (with the trailing " (...)" part removed) appears in this list.
# `unique` guards against accidental duplicates when the list is edited.
# NOTE(review): presumably these are meant to match Scimago category names —
# confirm each entry against the labels actually present in the "Categories" column.
target_categories = unique([
    "Artificial Intelligence",
    "Computer Science Applications",
    "Computational Mathematics",
    "Statistics and Probability",
    "Bioinformatics",
    "Genetics",
    "Molecular Biology",
    "Plant Science",
    "Agronomy and Crop Science",
    "Microbiology",
    "Virology",
    "Immunology and Microbiology",
    "Biotechnology",
    "Applied Microbiology and Biotechnology",
    "Agricultural and Biological Sciences",
    "Ecology, Evolution, Behavior and Systematics",
    "Cell Biology",
    "Biochemistry, Genetics and Molecular Biology",
    "Bioengineering",
    "Health Informatics",
    "Information Systems and Management",
    "Database Management",
    "Systems Biology",
    "Genomics",
    "Computational Biology",
    "Data Mining and Machine Learning",
    "Biostatistics"
])

"""
    filter_by_categories(row; targets=target_categories)

Return `true` if `row.Categories` lists at least one category found in `targets`.

`row.Categories` is expected to be a `"; "`-separated string in which each item
may carry a parenthesised suffix such as `" (Q1)"`. Everything from the first
`" ("` onward is dropped before comparison, so `"X (miscellaneous) (Q1)"` is
compared as `"X"`. Rows whose `Categories` is not a string (e.g. `missing`)
yield `false`.

# Keywords
- `targets`: collection of category names to match against. Defaults to the
  global `target_categories`, looked up at call time.
"""
function filter_by_categories(row; targets=target_categories)
    listed = row.Categories
    listed isa AbstractString || return false
    return any(split(listed, "; ")) do category
        # Drop the rating suffix, i.e. everything from the first " (" onward.
        name = first(split(category, " ("; limit=2))
        return strip(name) in targets
    end
end

# Keep journals that list at least one target category.
filtered_df = filter(filter_by_categories, joint_metadata)

# Keep journals whose Region is Western Europe or Northern America.
filtered_df = filter(:Region => in(["Western Europe", "Northern America"]), filtered_df)

# Keep journals whose best SJR quartile is Q1, Q2 or Q3.
filtered_df = filter("SJR Best Quartile" => in(["Q1", "Q2", "Q3"]), filtered_df)

# Keep journals with an H index of at least 15.
filtered_df = filter("H index" => >=(15), filtered_df)

In [None]:
# Assemble a search expression of the form ("A"[jour] OR "B"[jour] OR ...).
# Every name becomes its own complete term before joining, so the expression
# stays well-formed even when no journals survive the filters.
preprint_servers = ["arXiv", "medRxiv", "bioRxiv"]

# Drop missing/blank abbreviations (they would produce ""[jour] or
# "missing"[jour]) and repeated ones (redundant in an OR expression).
journal_abbreviations = unique(
    String(strip(x)) for x in skipmissing(filtered_df[!, "IsoAbbr"]) if !isempty(strip(x))
)

journal_terms = ["\"$(name)\"[jour]" for name in vcat(preprint_servers, journal_abbreviations)]
journals_string = "(" * join(journal_terms, " OR ") * ")"
println(journals_string)