In [28]:
import time
import yaml
import requests
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from bs4 import BeautifulSoup

# Read the private and the public settings from their YAML files.
with open("../config_private.yaml", "r") as private_file:
    config_private = yaml.safe_load(private_file)
with open("../config_public.yaml", "r") as public_file:
    config_public = yaml.safe_load(public_file)

# Shallow merge: on a duplicate top-level key the public value wins.
config = dict(config_private)
config.update(config_public)

# Build the OpenAI client from the merged configuration.
client = OpenAI(api_key=config["OPENAI"]["API_KEY"])

In [2]:
# Scrape the journal overview page into a (title, link) table.

# load the soup from url
url = config["DATA"]["JOURNALS_URL"]
headers = config["DATA"]["HEADERS"]
# a finite timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, headers=headers, timeout=30)
# fail loudly on an HTTP error instead of silently parsing an error page
r.raise_for_status()
soap = BeautifulSoup(r.content, "html5lib")

# find all <td> with class 'journal-name-cell'
journals_html = soap.find_all("td", class_="journal-name-cell")
print("Number of journals:", len(journals_html))
journals = []
skipped_cells = 0

# for each journal extract the title and the link
for j in journals_html:

    # look the anchor up once; a cell without a usable link cannot be crawled
    anchor = j.find("a")
    if anchor is None or not anchor.get("href"):
        skipped_cells += 1
        continue

    # extract the title and remove the \n
    title = anchor.text.replace("\n", "")

    # extract the link (the href is appended to the site root)
    link = f"https://www.mdpi.com{anchor.get('href')}"
    journals.append((title, link))

# make skipped cells visible instead of dropping them silently
if skipped_cells:
    print("Cells skipped (no link):", skipped_cells)

# convert journals to pandas
journals = pd.DataFrame(journals, columns=["title", "link"])
journals

Number of journals: 439


Unnamed: 0,title,link
0,Acoustics,https://www.mdpi.com/journal/acoustics
1,Acta Microbiologica Hellenica,https://www.mdpi.com/journal/amh
2,Actuators,https://www.mdpi.com/journal/actuators
3,Administrative Sciences,https://www.mdpi.com/journal/admsci
4,Adolescents,https://www.mdpi.com/journal/adolescents
...,...,...
434,Women,https://www.mdpi.com/journal/women
435,World,https://www.mdpi.com/journal/world
436,World Electric Vehicle Journal,https://www.mdpi.com/journal/wevj
437,Youth,https://www.mdpi.com/journal/youth


In [3]:
# for each journal navigate to the link and extract the description
descriptions = []
# (link, reason) pairs for every page that could not be scraped
failed_links = []
for link in tqdm(journals["link"]):
    # placeholder kept when the page cannot be fetched or parsed, so that
    # `descriptions` stays aligned with the rows of `journals`
    description = ""
    try:
        # a finite timeout prevents one stalled request from blocking the crawl
        r = requests.get(link, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.RequestException as exc:
        # one bad page must not throw away the results gathered so far
        failed_links.append((link, str(exc)))
    else:
        soap = BeautifulSoup(r.content, "html5lib")
        description_div = soap.find("div", class_="journal__description__content")
        if description_div is None:
            failed_links.append((link, "description block not found"))
        else:
            description = description_div.text
    descriptions.append(description)

    # sleep for 1 second
    time.sleep(1)

# report every journal that ended up with an empty description
if failed_links:
    print("Journals without description:", len(failed_links))
    for failed_link, reason in failed_links:
        print(f"  {failed_link}: {reason}")

# add the descriptions to the journals
journals["description"] = descriptions
journals

  0%|          | 0/439 [00:00<?, ?it/s]

100%|██████████| 439/439 [17:27<00:00,  2.39s/it]


Unnamed: 0,title,link,description
0,Acoustics,https://www.mdpi.com/journal/acoustics,"\nAcoustics\nis an international, peer-reviewe..."
1,Acta Microbiologica Hellenica,https://www.mdpi.com/journal/amh,\nActa Microbiologica Hellenica\nis an interna...
2,Actuators,https://www.mdpi.com/journal/actuators,"\nActuators\nis an international, peer-reviewe..."
3,Administrative Sciences,https://www.mdpi.com/journal/admsci,\nAdministrative Sciences\nis an international...
4,Adolescents,https://www.mdpi.com/journal/adolescents,"\nAdolescents\nis an international, peer-revie..."
...,...,...,...
434,Women,https://www.mdpi.com/journal/women,"\nWomen\nis an international, peer-reviewed, o..."
435,World,https://www.mdpi.com/journal/world,"\nWorld\nis an international, peer-reviewed, o..."
436,World Electric Vehicle Journal,https://www.mdpi.com/journal/wevj,\nWorld Electric Vehicle Journal\nis the first...
437,Youth,https://www.mdpi.com/journal/youth,"\nYouth\nis an international, peer-reviewed, o..."


In [4]:
# clean the descriptions
def extract_open_access(description: str) -> str:
    """Return the part of a description that precedes "Open Access".

    The text before the first occurrence of the marker "Open Access" is
    stripped of surrounding whitespace and its newlines are replaced by
    single spaces.

    When the marker is absent the whole description is kept. The previous
    ``find``-based slice used the "not found" result (-1) as the end index
    and therefore silently dropped the last character.

    Args:
        description: Raw description text.

    Returns:
        The cleaned, single-line text before the marker.
    """
    # partition() yields the full string as the head when the marker is missing
    head, _, _ = description.partition("Open Access")
    return head.strip().replace("\n", " ")


# run extract_open_access over every description and store the result back
cleaned_descriptions = journals["description"].apply(extract_open_access)
journals["description"] = cleaned_descriptions

# write the table to CSV (without the index column) and display it
journals.to_csv("../data/journals.csv", index=False)
journals

Unnamed: 0,title,link,description
0,Acoustics,https://www.mdpi.com/journal/acoustics,"Acoustics is an international, peer-reviewed, ..."
1,Acta Microbiologica Hellenica,https://www.mdpi.com/journal/amh,Acta Microbiologica Hellenica is an internatio...
2,Actuators,https://www.mdpi.com/journal/actuators,"Actuators is an international, peer-reviewed, ..."
3,Administrative Sciences,https://www.mdpi.com/journal/admsci,"Administrative Sciences is an international, p..."
4,Adolescents,https://www.mdpi.com/journal/adolescents,"Adolescents is an international, peer-reviewed..."
...,...,...,...
434,Women,https://www.mdpi.com/journal/women,"Women is an international, peer-reviewed, open..."
435,World,https://www.mdpi.com/journal/world,"World is an international, peer-reviewed, open..."
436,World Electric Vehicle Journal,https://www.mdpi.com/journal/wevj,World Electric Vehicle Journal is the first pe...
437,Youth,https://www.mdpi.com/journal/youth,"Youth is an international, peer-reviewed, open..."


In [7]:
# Ask the chat model for journal recommendations for one sample manuscript.

# synthetic manuscript
title = "PDBcor: An automated correlation extraction calculator for multi-state protein structures"
abstract = "Allostery and correlated motion are key elements linking protein dynamics with the mechanisms of action of proteins. Here, we present PDBCor, an automated and unbiased method for the detection and analysis of correlated motions from experimental multi-state protein structures. It uses torsion angle and distance statistics and does not require any structure superposition. Clustering of protein conformers allows us to extract correlations in the form of mutual information based on information theory. With PDBcor, we elucidated correlated motion in the WW domain of PIN1, the protein GB3, and the enzyme cyclophilin, in line with reported findings. Correlations extracted with PDBcor can be utilized in subsequent assays including nuclear magnetic resonance (NMR) multi-state structure optimization and validation. As a guide for the interpretation of PDBcor results, we provide a series of protein structure ensembles that exhibit different levels of correlation, including non-correlated, locally correlated, and globally correlated ensembles."

# create the user prompt
USER_PROMPT = config["GENAI"]["USER_PROMPT"].format(title=title, abstract=abstract)

# collect context on journal descriptions
# A single join over the columns avoids repeated string concatenation, and
# enumerate numbers the journals 1..N whatever the DataFrame index holds.
context = "".join(
    f"Journal {number}. {journal_title}\nLink: {journal_link}\nDescription: {journal_description}\n\n"
    for number, (journal_title, journal_link, journal_description) in enumerate(
        zip(journals["title"], journals["link"], journals["description"]), start=1
    )
)

# create the context prompt
CONTEXT_PROMPT = config["GENAI"]["CONTEXT_PROMPT"].format(context=context)

# create completion
# NOTE(review): the journal context is sent with the "assistant" role —
# confirm this is intended rather than "system" or "user".
completion = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {"role": "system", "content": config["GENAI"]["SYSTEM_PROMPT"]},
        {"role": "user", "content": USER_PROMPT},
        {"role": "assistant", "content": CONTEXT_PROMPT},
    ],
)

print(completion.choices[0].message.content)

Based on your manuscript, here are the top three MDPI journals that are best suited for your work:

| Journal Title | Degree of the Match | MDPI Journal Link |
|---------------|---------------------|------------------|
| Biomolecules | High | [Biomolecules](https://www.mdpi.com/journal/biomolecules) |
| Biosensors | High | [Biosensors](https://www.mdpi.com/journal/biosensors) |
| Applied Sciences | Middle | [Applied Sciences](https://www.mdpi.com/journal/applsci) |

### Justification for Matching:

1. **Biomolecules**:
   - **High Match**: Your manuscript focuses on protein structure analysis, which is well-aligned with the scope of Biomolecules. This journal covers molecular mechanisms and interactions, which are crucial to understanding protein dynamics discussed in your study.

2. **Biosensors**:
   - **High Match**: As your tool, PDBcor, deals with the detection and analysis of protein structures, Biosensors is suitable because it focuses on the development and application of senso

In [30]:
# test Flask endpoint: POST a sample manuscript to /recommend_streamlit
import requests

# synthetic manuscript
title = "PDBcor: An automated correlation extraction calculator for multi-state protein structures"
abstract = "Allostery and correlated motion are key elements linking protein dynamics with the mechanisms of action of proteins. Here, we present PDBCor, an automated and unbiased method for the detection and analysis of correlated motions from experimental multi-state protein structures. It uses torsion angle and distance statistics and does not require any structure superposition. Clustering of protein conformers allows us to extract correlations in the form of mutual information based on information theory. With PDBcor, we elucidated correlated motion in the WW domain of PIN1, the protein GB3, and the enzyme cyclophilin, in line with reported findings. Correlations extracted with PDBcor can be utilized in subsequent assays including nuclear magnetic resonance (NMR) multi-state structure optimization and validation. As a guide for the interpretation of PDBcor results, we provide a series of protein structure ensembles that exhibit different levels of correlation, including non-correlated, locally correlated, and globally correlated ensembles."

manuscript = {
    "title": title,
    "abstract": abstract,
}

# Dedicated names: reusing `url` / `headers` would overwrite the variables of
# the scraping cells, and re-running those cells would then send the Flask
# bearer token to the journal website.
flask_url = "http://127.0.0.1:5000/recommend_streamlit"
flask_headers = {"Authorization": f'Bearer {config["FLASK"]["SECRET"]}'}

# generous but finite timeout — presumably the endpoint is slow to answer
# (TODO confirm the expected latency)
r = requests.post(flask_url, headers=flask_headers, json=manuscript, timeout=120)

# surface HTTP failures while still showing the body the server returned
if not r.ok:
    print("HTTP status:", r.status_code)
try:
    print(r.json())
except ValueError:
    # the body is not valid JSON — show the raw text instead of crashing
    print(r.text)

Based on the provided abstract for the manuscript titled "PDBcor: An automated correlation extraction calculator for multi-state protein structures," here are the three most suitable MDPI journals along with their degree of match, titles, and links:

| Degree of Match | Journal Title            | Journal Link                                              |
|-----------------|--------------------------|-----------------------------------------------------------|
| High            | Biomolecules             | [Biomolecules](https://www.mdpi.com/journal/biomolecules) |
| High            | Molecules                | [Molecules](https://www.mdpi.com/journal/molecules)       |
| Middle          | Computers                | [Computers](https://www.mdpi.com/journal/computers)       |

### Justification for Matching:

1. **[Biomolecules](https://www.mdpi.com/journal/biomolecules)**: This journal focuses on molecular mechanisms, biochemical processes, and structural biology, which align closely w

In [29]:
# test Flask endpoint: POST a sample manuscript to /recommend_api
import requests

# synthetic manuscript
title = "PDBcor: An automated correlation extraction calculator for multi-state protein structures"
abstract = "Allostery and correlated motion are key elements linking protein dynamics with the mechanisms of action of proteins. Here, we present PDBCor, an automated and unbiased method for the detection and analysis of correlated motions from experimental multi-state protein structures. It uses torsion angle and distance statistics and does not require any structure superposition. Clustering of protein conformers allows us to extract correlations in the form of mutual information based on information theory. With PDBcor, we elucidated correlated motion in the WW domain of PIN1, the protein GB3, and the enzyme cyclophilin, in line with reported findings. Correlations extracted with PDBcor can be utilized in subsequent assays including nuclear magnetic resonance (NMR) multi-state structure optimization and validation. As a guide for the interpretation of PDBcor results, we provide a series of protein structure ensembles that exhibit different levels of correlation, including non-correlated, locally correlated, and globally correlated ensembles."

manuscript = {
    "title": title,
    "abstract": abstract,
}

# Dedicated names: reusing `url` / `headers` would overwrite the variables of
# the scraping cells, and re-running those cells would then send the Flask
# bearer token to the journal website.
flask_url = "http://127.0.0.1:5000/recommend_api"
flask_headers = {"Authorization": f'Bearer {config["FLASK"]["SECRET"]}'}

# generous but finite timeout — presumably the endpoint is slow to answer
# (TODO confirm the expected latency)
r = requests.post(flask_url, headers=flask_headers, json=manuscript, timeout=120)

# surface HTTP failures while still showing the body the server returned
if not r.ok:
    print("HTTP status:", r.status_code)
try:
    print(r.json())
except ValueError:
    # the body is not valid JSON — show the raw text instead of crashing
    print(r.text)

{'error': 'Unauthorized'}
