# Embedding Project Abstracts and Acronyms using Colab

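1. Upload all downloaded and processed project data from `inputs/data/cordis/<fp>/project.csv`, one file per framework programme, named `project_<fp>.csv` (e.g. `project_fp1.csv`, `project_h2020.csv`) so the cell below can find it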
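2. Upload all downloaded and processed acronym data from `outputs/data/cordis/<fp>/acronyms.csv`, one file per framework programme, named `acronym_<fp>.csv` (e.g. `acronym_fp1.csv`, `acronym_h2020.csv`) so the cell below can find it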
3. Run the cell below

In [None]:
!pip install git+https://github.com/georgerichardson/acronym.git

import pandas as pd
import numpy as np
from acronym.pipeline.cordis.embed_text import (
    remove_matches,
    fetch_encoder,
    embed,
)


# Embed the project abstracts and acronyms of every CORDIS framework programme.
#
# For each programme the cell reads `project_<fp>.csv` and `acronym_<fp>.csv`
# from the working directory, passes the acronyms and abstracts through
# `remove_matches`, embeds both with the configured encoder, and saves the
# results as `abstract_embeddings_<fp>.npy` and `acronym_embeddings_<fp>.npy`
# (`np.save` appends the `.npy` extension).

# Each programme is listed exactly once so that no programme is embedded twice.
FRAMEWORK_PROGRAMMES = ("fp1", "fp2", "fp3", "fp4", "fp5", "fp6", "fp7", "h2020")

# NOTE(review): `get_yaml_config`, `convert_str_to_pathlib_path` and
# `PROJECT_DIR` are not imported in this cell; they must be brought into scope
# (presumably from the `acronym` package -- confirm the module paths) before
# the cell can run.
embed_config = get_yaml_config(
    convert_str_to_pathlib_path(f"{PROJECT_DIR}/acronym/config/embedding.yml")
)

# Neither the config nor the encoder depends on the programme, so they are
# loaded once up front instead of on every loop iteration.
encoder = fetch_encoder(embed_config["sentence_transformer_model"])

for fp in FRAMEWORK_PROGRAMMES:
    # f-strings are required here: a plain string would look for a file
    # literally named "project_{fp}.csv".
    projects_fp = pd.read_csv(f"project_{fp}.csv", usecols=["rcn", "objective"])
    acronyms_fp = pd.read_csv(f"acronym_{fp}.csv", usecols=["rcn", "acronym"])

    # NOTE(review): the two frames are passed positionally without being
    # joined on `rcn`; this presumably relies on both CSVs listing the same
    # projects in the same order -- confirm against `remove_matches`.
    # Missing abstracts are replaced by empty strings before processing.
    abstracts_modified = remove_matches(
        acronyms_fp["acronym"],
        projects_fp["objective"].fillna(""),
    )

    abstract_embeddings_fp = embed(encoder, abstracts_modified)
    acronym_embeddings_fp = embed(encoder, acronyms_fp["acronym"].tolist())

    np.save(
        f"abstract_embeddings_{fp}",
        abstract_embeddings_fp,
    )
    np.save(
        f"acronym_embeddings_{fp}",
        acronym_embeddings_fp,
    )