From 7cce6395af989983506dba30f54a2aa07a84f894 Mon Sep 17 00:00:00 2001 From: Yani Guan Date: Thu, 21 Nov 2024 14:26:15 -0800 Subject: [PATCH] update citations for uni-mol --- source/_data/pub.bib | 412 +++++++++++++++++++++++++++++++++ source/papers/index.md | 6 +- source/papers/uni-mol/index.md | 30 +++ 3 files changed, 444 insertions(+), 4 deletions(-) create mode 100644 source/papers/uni-mol/index.md diff --git a/source/_data/pub.bib b/source/_data/pub.bib index 02efeb8e..c882d0c5 100644 --- a/source/_data/pub.bib +++ b/source/_data/pub.bib @@ -12572,3 +12572,415 @@ @Article{Chen_MatterRadiatExtrem_2024_v9 transport coefficients of warm dense matter.{\ensuremath{<}}/jats:p{\ensuremath{>}}}, } +@Article{Yao_JacsAu_2024_v4_p992, + author = {Lin Yao and Wentao Guo and Zhen Wang and Shang Xiang and Wentan Liu + and Guolin Ke}, + title = {{Node-Aligned Graph-to-Graph: Elevating Template-free Deep Learning + Approaches in Single-Step Retrosynthesis}}, + journal = {Jacs Au}, + year = 2024, + volume = 4, + number = 3, + pages = {992--1003}, + doi = {10.1021/jacsau.3c00737}, + abstract = {Single-step retrosynthesis in organic chemistry increasingly benefits + from deep learning (DL) techniques in computer-aided synthesis design. + While template-free DL models are flexible and promising for + retrosynthesis prediction, they often ignore vital 2D molecular + information and struggle with atom alignment for node generation, + resulting in lower performance compared to the template-based and + semi-template-based methods. To address these issues, we introduce + node-aligned graph-to-graph (NAG2G), a transformer-based template-free + DL model. NAG2G combines 2D molecular graphs and 3D conformations to + retain comprehensive molecular details and incorporates product- + reactant atom mapping through node alignment, which determines the + order of the node-by-node graph outputs process in an autoregressive + manner. 
Through rigorous benchmarking and detailed case studies, we + have demonstrated that NAG2G stands out with its remarkable predictive + accuracy on the expansive data sets of USPTO-50k and USPTO-FULL. + Moreover, the model's practical utility is underscored by its + successful prediction of synthesis pathways for multiple drug + candidate molecules. This proves not only NAG2G's robustness but also + its potential to revolutionize the prediction of complex chemical + synthesis processes for future synthetic route design tasks.}, +} +@Article{Luo_arXiv_2024_p2406.09841, + author = {Yizhen Luo and Kai Yang and Massimo Hong and Xing Yi Liu and Zikun Nie + and Hao Zhou and Zaiqing Nie}, + title = {{Learning Multi-view Molecular Representations with Structured and + Unstructured Knowledge}}, + journal = {arXiv}, + year = 2024, + pages = {2406.09841}, + doi = {10.48550/arXiv.2406.09841}, + abstract = {Capturing molecular knowledge with representation learning approaches + holds significant potential in vast scientific fields such as + chemistry and life science. An effective and generalizable molecular + representation is expected to capture the consensus and complementary + molecular expertise from diverse views and perspectives. However, + existing works fall short in learning multi-view molecular + representations, due to challenges in explicitly incorporating view + information and handling molecular knowledge from heterogeneous + sources. To address these issues, we present MV-Mol, a molecular + representation learning model that harvests multi-view molecular + expertise from chemical structures, unstructured knowledge from + biomedical texts, and structured knowledge from knowledge graphs. We + utilize text prompts to model view information and design a fusion + architecture to extract view-based molecular representations. We + develop a two-stage pre-training procedure, exploiting heterogeneous + data of varying quality and quantity. 
Through extensive experiments,
+              we show that MV-Mol provides improved representations that
+              substantially benefit molecular property prediction. Additionally, MV-
+              Mol exhibits state-of-the-art performance in multi-modal comprehension
+              of molecular structures and texts. Code and data are available at
+              https://github.com/PharMolix/OpenBioMed.},
+}
+@Article{Yang_Biorxiv_2024,
+  author   = {Junwei Yang and Kangjie Zheng and Siyu Long and Zaiqing Nie and Ming
+              Zhang and Xinyu Dai and Wei-Ying Ma and Hao Zhou},
+  title    = {{Mol-AE: Auto-Encoder Based Molecular Representation Learning With 3D
+              Cloze Test Objective}},
+  journal  = {bioRxiv},
+  year     = 2024,
+  doi      = {10.1101/2024.04.13.589331},
+  abstract = {3D molecular representation learning has gained tremendous
+              interest and achieved promising performance in various downstream
+              tasks. A series of recent approaches follow a prevalent framework: an
+              encoder-only model coupled with a coordinate denoising objective.
+              However, through a series of analytical experiments, we prove that the
+              encoder-only model with coordinate denoising objective exhibits
+              inconsistency between pre-training and downstream objectives, as well
+              as issues with disrupted atomic identifiers. To address these two
+              issues, we propose Mol-AE for molecular representation learning, an
+              auto-encoder model using positional encoding as atomic identifiers. We
+              also propose a new training objective named 3D Cloze Test to make the
+              model learn better atom spatial relationships from real molecular
+              substructures. Empirical results demonstrate that Mol-AE achieves a
+              large margin performance gain compared to the current state-of-the-art
+              3D molecular modeling approach. The source codes of Mol-AE are
+              publicly available at https://github.com/yjwtheonly/MolAE.},
+}
+@Article{Feng_arXiv_2024_p2406.17797,
+  author   = {Shikun Feng and Jiaxin Zheng and Yinjun Jia and Yanwen Huang and
+              Fengfeng Zhou and Wei-Ying Ma and Yanyan Lan},
+  title    = {{MoleculeCLA: Rethinking Molecular Benchmark via Computational Ligand-
+              Target Binding Analysis}},
+  journal  = {arXiv},
+  year     = 2024,
+  pages    = {2406.17797},
+  doi      = {10.48550/arXiv.2406.17797},
+  abstract = {Molecular representation learning is pivotal for various molecular
+              property prediction tasks related to drug discovery. Robust and
+              accurate benchmarks are essential for refining and validating current
+              methods. Existing molecular property benchmarks derived from wet
+              experiments, however, face limitations such as data volume
+              constraints, unbalanced label distribution, and noisy labels. To
+              address these issues, we construct a large-scale and precise molecular
+              representation dataset of approximately 140,000 small molecules,
+              meticulously designed to capture an extensive array of chemical,
+              physical, and biological properties, derived through a robust
+              computational ligand-target binding analysis pipeline. We conduct
+              extensive experiments on various deep learning models, demonstrating
+              that our dataset offers significant physicochemical interpretability
+              to guide model development and design. Notably, the dataset's
+              properties are linked to binding affinity metrics, providing
+              additional insights into model performance in drug-target interaction
+              tasks.
We believe this dataset will serve as a more accurate and + reliable benchmark for molecular representation learning, thereby + expediting progress in the field of artificial intelligence-driven + drug discovery.}, +} +@Article{Li_arXiv_2024_p2401.13923, + author = {Sihang Li and Zhiyuan Liu and Yanchen Luo and Xiang Wang and Xiangnan + He and Kenji Kawaguchi and Tat-Seng Chua and Qi Tian}, + title = {{Towards 3D Molecule-Text Interpretation in Language Models}}, + journal = {arXiv}, + year = 2024, + pages = {2401.13923}, + doi = {10.48550/arXiv.2401.13923}, + abstract = {Language Models (LMs) have greatly influenced diverse domains. + However, their inherent limitation in comprehending 3D molecular + structures has considerably constrained their potential in the + biomolecular domain. To bridge this gap, we focus on 3D molecule-text + interpretation, and propose 3D-MoLM: 3D-Molecular Language Modeling. + Specifically, 3D-MoLM enables an LM to interpret and analyze 3D + molecules by equipping the LM with a 3D molecular encoder. This + integration is achieved by a 3D molecule-text projector, bridging the + 3D molecular encoder's representation space and the LM's input space. + Moreover, to enhance 3D-MoLM's ability of cross-modal molecular + understanding and instruction following, we meticulously curated a 3D + molecule-centric instruction tuning dataset -- 3D-MoIT. Through 3D + molecule-text alignment and 3D molecule-centric instruction tuning, + 3D-MoLM establishes an integration of 3D molecular encoder and LM. It + significantly surpasses existing baselines on downstream tasks, + including molecule-text retrieval, molecule captioning, and more + challenging open-text molecular QA tasks, especially focusing on + 3D-dependent properties. 
We release our codes and datasets at + https://github.com/lsh0520/3D-MoLM.}, +} +@Article{Pei_arXiv_2024_p2406.05797, + author = {Qizhi Pei and Lijun Wu and Kaiyuan Gao and Jinhua Zhu and Rui Yan}, + title = {{3D-MolT5: Towards Unified 3D Molecule-Text Modeling with 3D Molecular + Tokenization}}, + journal = {arXiv}, + year = 2024, + pages = {2406.05797}, + doi = {10.48550/arXiv.2406.05797}, + abstract = {The integration of molecule and language has garnered increasing + attention in molecular science. Recent advancements in Language Models + (LMs) have demonstrated potential for the comprehensive modeling of + molecule and language. However, existing works exhibit notable + limitations. Most existing works overlook the modeling of 3D + information, which is crucial for understanding molecular structures + and also functions. While some attempts have been made to leverage + external structure encoding modules to inject the 3D molecular + information into LMs, there exist obvious difficulties that hinder the + integration of molecular structure and language text, such as modality + alignment and separate tuning. To bridge this gap, we propose + 3D-MolT5, a unified framework designed to model both 1D molecular + sequence and 3D molecular structure. The key innovation lies in our + methodology for mapping fine-grained 3D substructure representations + (based on 3D molecular fingerprints) to a specialized 3D token + vocabulary for 3D-MolT5. This 3D structure token vocabulary enables + the seamless combination of 1D sequence and 3D structure + representations in a tokenized format, allowing 3D-MolT5 to encode + molecular sequence (SELFIES), molecular structure, and text sequences + within a unified architecture. Alongside, we further introduce 1D and + 3D joint pre-training to enhance the model's comprehension of these + diverse modalities in a joint representation space and better + generalize to various tasks for our foundation model. 
Through + instruction tuning on multiple downstream datasets, our proposed + 3D-MolT5 shows superior performance than existing methods in molecular + property prediction, molecule captioning, and text-based molecule + generation tasks. Our code will be available on GitHub soon.}, +} +@Article{Gao_arXiv_2024_p2406.08980, + author = {Bowen Gao and Haichuan Tan and Yanwen Huang and Minsi Ren and Xiao + Huang and Wei-Ying Ma and Ya-Qin Zhang and Yanyan Lan}, + title = {{From Theory to Therapy: Reframing SBDD Model Evaluation via Practical + Metrics}}, + journal = {arXiv}, + year = 2024, + pages = {2406.08980}, + doi = {10.48550/arXiv.2406.08980}, + abstract = {Recent advancements in structure-based drug design (SBDD) have + significantly enhanced the efficiency and precision of drug discovery + by generating molecules tailored to bind specific protein pockets. + Despite these technological strides, their practical application in + real-world drug development remains challenging due to the + complexities of synthesizing and testing these molecules. The + reliability of the Vina docking score, the current standard for + assessing binding abilities, is increasingly questioned due to its + susceptibility to overfitting. To address these limitations, we + propose a comprehensive evaluation framework that includes assessing + the similarity of generated molecules to known active compounds, + introducing a virtual screening-based metric for practical deployment + capabilities, and re-evaluating binding affinity more rigorously. Our + experiments reveal that while current SBDD models achieve high Vina + scores, they fall short in practical usability metrics, highlighting a + significant gap between theoretical predictions and real-world + applicability. 
Our proposed metrics and dataset aim to bridge this + gap, enhancing the practical applicability of future SBDD models and + aligning them more closely with the needs of pharmaceutical research + and development.}, +} +@Article{Xiao_arXiv_2024_p2403.08167, + author = {Teng Xiao and Chao Cui and Huaisheng Zhu and Vasant G. Honavar}, + title = {{MolBind: Multimodal Alignment of Language, Molecules, and Proteins}}, + journal = {arXiv}, + year = 2024, + pages = {2403.08167}, + doi = {10.48550/arXiv.2403.08167}, + abstract = {Recent advancements in biology and chemistry have leveraged multi- + modal learning, integrating molecules and their natural language + descriptions to enhance drug discovery. However, current pre-training + frameworks are limited to two modalities, and designing a unified + network to process different modalities (e.g., natural language, 2D + molecular graphs, 3D molecular conformations, and 3D proteins) remains + challenging due to inherent gaps among them. In this work, we propose + MolBind, a framework that trains encoders for multiple modalities + through contrastive learning, mapping all modalities to a shared + feature space for multi-modal semantic alignment. To facilitate + effective pre-training of MolBind on multiple modalities, we also + build and collect a high-quality dataset with four modalities, + MolBind-M4, including graph-language, conformation-language, graph- + conformation, and conformation-protein paired data. MolBind shows + superior zero-shot learning performance across a wide range of tasks, + demonstrating its strong capability of capturing the underlying + semantics of multiple modalities.}, +} +@Article{Zhang_IntJComputIntellSyst_2024_v17_p165, + author = {Yi-Lun Zhang and Wen-Tao Wang and Jia-Hui Guan and Deepak Kumar Jain + and Tian-Yang Wang and Swalpa Kumar Roy}, + title = {{MocFormer: A Two-Stage Pre-training-Driven Transformer for + Drug{\textendash}Target Interactions Prediction}}, + journal = {Int J Comput. 
Intell Syst},
+  year     = 2024,
+  volume   = 17,
+  number   = 1,
+  pages    = 165,
+  doi      = {10.1007/s44196-024-00561-1},
+  abstract = {Drug{\textendash}target interactions is essential for
+              advancing pharmaceuticals. Traditional drug{\textendash}target
+              interaction studies rely on labor-intensive laboratory techniques.
+              Still, recent advancements in computing power have elevated the
+              importance of deep learning methods, offering faster, more precise,
+              and cost-effective screening and prediction. Nonetheless, general deep
+              learning methods often yield low-confidence results due to the complex
+              nature of drugs and proteins, bias, limited labeled data, and feature
+              extraction challenges. To address these challenges, a novel two-stage
+              pre-trained framework is proposed for drug{\textendash}target
+              interactions prediction. In the first stage, pre-trained molecule and
+              protein models develop a comprehensive feature representation,
+              enhancing the framework{\textquoteright}s ability to handle drug and
+              protein diversity. This also reduces bias, improving prediction
+              accuracy. In the second stage, a transformer with bilinear pooling and
+              a fully connected layer enables predictions based on feature vectors.
+              Comprehensive experiments were conducted using public datasets from
+              DrugBank and Epigenetic-regulators datasets to evaluate the
+              framework{\textquoteright}s effectiveness. The results demonstrate
+              that the proposed framework outperforms the state-of-the-art methods
+              regarding accuracy, area under the receiver operating characteristic
+              curve, recall, and area under the precision-recall curve. The code is
+              available at: https://github.com/DHCGroup/MocFormer.},
+}
+@Article{Feng_arXiv_2023_p2311.16160,
+  author   = {Shikun Feng and Minghao Li and Yinjun Jia and Weiying Ma and Yanyan
+              Lan},
+  title    = {{Protein-ligand binding representation learning from fine-grained
+              interactions}},
+  journal  = {arXiv},
+  year     = 2023,
+  pages    = {2311.16160},
+  doi      = {10.48550/arXiv.2311.16160},
+  abstract = {The binding between proteins and ligands plays a crucial role in the
+              realm of drug discovery. Previous deep learning approaches have shown
+              promising results over traditional computationally intensive methods,
+              but resulting in poor generalization due to limited supervised data.
+              In this paper, we propose to learn protein-ligand binding
+              representation in a self-supervised learning manner. Different from
+              existing pre-training approaches which treat proteins and ligands
+              individually, we emphasize to discern the intricate binding patterns
+              from fine-grained interactions. Specifically, this self-supervised
+              learning problem is formulated as a prediction of the conclusive
+              binding complex structure given a pocket and ligand with a Transformer
+              based interaction module, which naturally emulates the binding
+              process. To ensure the representation of rich binding information, we
+              introduce two pre-training tasks, i.e.{\textasciitilde}atomic pairwise
+              distance map prediction and mask ligand reconstruction, which
+              comprehensively model the fine-grained interactions from both
+              structure and feature space.
Extensive experiments have demonstrated + the superiority of our method across various binding tasks, including + protein-ligand affinity prediction, virtual screening and protein- + ligand docking.}, +} +@Article{Gao_arXiv_2024_p2310.07229, + author = {Bowen Gao and Yinjun Jia and Yuanle Mo and Yuyan Ni and Weiying Ma and + Zhiming Ma and Yanyan Lan}, + title = {{ProFSA: Self-supervised Pocket Pretraining via Protein Fragment- + Surroundings Alignment}}, + journal = {arXiv}, + year = 2024, + pages = {2310.07229}, + doi = {10.48550/arXiv.2310.07229}, + abstract = {Pocket representations play a vital role in various biomedical + applications, such as druggability estimation, ligand affinity + prediction, and de novo drug design. While existing geometric features + and pretrained representations have demonstrated promising results, + they usually treat pockets independent of ligands, neglecting the + fundamental interactions between them. However, the limited pocket- + ligand complex structures available in the PDB database (less than 100 + thousand non-redundant pairs) hampers large-scale pretraining + endeavors for interaction modeling. To address this constraint, we + propose a novel pocket pretraining approach that leverages knowledge + from high-resolution atomic protein structures, assisted by highly + effective pretrained small molecule representations. By segmenting + protein structures into drug-like fragments and their corresponding + pockets, we obtain a reasonable simulation of ligand-receptor + interactions, resulting in the generation of over 5 million complexes. + Subsequently, the pocket encoder is trained in a contrastive manner to + align with the representation of pseudo-ligand furnished by some + pretrained small molecule encoders. Our method, named ProFSA, achieves + state-of-the-art performance across various tasks, including pocket + druggability prediction, pocket matching, and ligand binding affinity + prediction. 
Notably, ProFSA surpasses other pretraining methods by a + substantial margin. Moreover, our work opens up a new avenue for + mitigating the scarcity of protein-ligand complex data through the + utilization of high-quality and diverse protein structure databases.}, +} +@Article{Zhang_GreenChem_2024_v26_p4181, + author = {Jun Zhang and Qin Wang and Yang Lei and Weifeng Shen}, + title = {{An interpretable 3D multi-hierarchical representation-based deep + neural network for environmental, health and safety properties + prediction of organic solvents}}, + journal = {Green Chem.}, + year = 2024, + volume = 26, + number = 7, + pages = {4181--4191}, + doi = {10.1039/D3GC04801B}, + abstract = {A 3D multi-hierarchical representation-based deep neural network + (3D-MrDNN) architecture for prediction of the environmental, health + and safety properties of organic solvents.}, +} +@Article{Comajuncosa-Creus_JCheminformatics_2024_v16_p70, + author = {Arnau Comajuncosa-Creus and Aksel Lenes and Miguel S{\'a}nchez- + Palomino and Dylan Dalton and Patrick Aloy}, + title = {{Stereochemically-aware bioactivity descriptors for uncharacterized + chemical compounds}}, + journal = {J. Cheminformatics}, + year = 2024, + volume = 16, + number = 1, + pages = 70, + doi = {10.1186/s13321-024-00867-4}, + abstract = {Stereochemistry plays a fundamental role in pharmacology. Here, we + systematically investigate the relationship between stereoisomerism + and bioactivity on over 1{~}M compounds, finding that a very + significant fraction ({\textasciitilde}{\,}40{\%}) of spatial isomer + pairs show, to some extent, distinct bioactivities. We then use the 3D + representation of these molecules to train a collection of deep neural + networks (Signaturizers3D) to generate bioactivity descriptors + associated to small molecules, that capture their effects at + increasing levels of biological complexity (i.e. from protein targets + to clinical outcomes). 
Further, we assess the ability of the
+              descriptors to distinguish between stereoisomers and to recapitulate
+              their different target binding profiles. Overall, we show how these
+              new stereochemically-aware descriptors provide an even more faithful
+              description of complex small molecule bioactivity properties,
+              capturing key differences in the activity of stereoisomers.
+              Scientific contribution: We systematically assess the relationship between
+              stereoisomerism and bioactivity on a large scale, focusing on
+              compound-target binding events, and use our findings to train novel
+              deep learning models to generate stereochemically-aware bioactivity
+              signatures for any compound of interest.},
+}
+@Article{Chang_JChemInfModel_2024_v64_p3149,
+  author   = {Jiamin Chang and Xiaoyu Fan and Boxue Tian},
+  title    = {{DeepP450: Predicting Human P450 Activities of Small Molecules by
+              Integrating Pretrained Protein Language Model and Molecular
+              Representation}},
+  journal  = {J. Chem. Inf. Model.},
+  year     = 2024,
+  volume   = 64,
+  number   = 8,
+  pages    = {3149--3160},
+  doi      = {10.1021/acs.jcim.4c00115},
+  abstract = {Cytochrome P450 enzymes (CYPs) play a crucial role in Phase I drug
+              metabolism in the human body, and CYP activity toward compounds can
+              significantly affect druggability, making early prediction of CYP
+              activity and substrate identification essential for therapeutic
+              development. Here, we established a deep learning model for assessing
+              potential CYP substrates, DeepP450, by fine-tuning protein and
+              molecule pretrained models through feature integration with cross-
+              attention and self-attention layers. This model exhibited high
+              prediction accuracy (0.92) on the test set, with area under the
+              receiver operating characteristic curve (AUROC) values ranging from
+              0.89 to 0.98 in substrate/nonsubstrate predictions across the nine
+              major human CYPs, surpassing current benchmarks for CYP activity
+              prediction. Notably, DeepP450 uses only one model to predict
+              substrates/nonsubstrates for any of the nine CYPs and exhibits certain
+              generalizability on novel compounds and different categories of human
+              CYPs, which could greatly facilitate early stage drug design by
+              avoiding CYP-reactive compounds.},
+}
\ No newline at end of file
diff --git a/source/papers/index.md b/source/papers/index.md
index 420b2267..f66c7205 100644
--- a/source/papers/index.md
+++ b/source/papers/index.md
@@ -9,7 +9,5 @@ title: Publications citing DeepModeling's work
 
 - [DeePMD-kit](deepmd-kit/)
 - [DP-GEN](dpgen/)
-- DeePKS-kit
-- RiD-kit
-- ABACUS
-- FEALPy
+- [ABACUS](abacus/)
+- [Uni-Mol](uni-mol/)
\ No newline at end of file
diff --git a/source/papers/uni-mol/index.md b/source/papers/uni-mol/index.md
new file mode 100644
index 00000000..0761ae40
--- /dev/null
+++ b/source/papers/uni-mol/index.md
@@ -0,0 +1,30 @@
+---
+title: Publications driven by Uni-Mol
+date: 2023-03-06
+update: 2024-11-21
+mathjax: true
+---
+
+The following publications have used the Uni-Mol software. Publications that only mention Uni-Mol will not be included below.
+
+We encourage explicitly mentioning Uni-Mol with proper citations in your publications, so we can more easily find and list these publications.
+
+Last update date: 11/21/2024
+
+## 2024
+{% publications %}
+Yao_JacsAu_2024_v4_p992,
+Luo_arXiv_2024_p2406.09841,
+Yang_Biorxiv_2024,
+Feng_arXiv_2024_p2406.17797,
+Li_arXiv_2024_p2401.13923,
+Pei_arXiv_2024_p2406.05797,
+Gao_arXiv_2024_p2406.08980,
+Xiao_arXiv_2024_p2403.08167,
+Zhang_IntJComputIntellSyst_2024_v17_p165,
+Feng_arXiv_2023_p2311.16160,
+Gao_arXiv_2024_p2310.07229,
+Zhang_GreenChem_2024_v26_p4181,
+Comajuncosa-Creus_JCheminformatics_2024_v16_p70,
+Chang_JChemInfModel_2024_v64_p3149
+{% endpublications %}
\ No newline at end of file