# Embedding Project Abstracts and Acronyms using Colab

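1. For each framework programme `<fp>`, upload the downloaded and processed project data from `inputs/data/cordis/<fp>/project.csv`, named `projects_<fp>.csv` (the file name the last cell reads)
2. For each framework programme `<fp>`, upload the downloaded and processed acronym data from `outputs/data/cordis/<fp>/acronyms.csv`, named `acronyms_<fp>.csv`
3. Run the cells below in order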

In [None]:
import numpy as np
import os
import pandas as pd
import regex
from sentence_transformers import SentenceTransformer
from typing import Optional, List, Sequence


# Identifier of the sentence transformer model to load (see `fetch_encoder`).
ENCODER_NAME = "all-MiniLM-L12-v2"
# Framework programmes to process. Each entry is used as an output directory
# name and as the suffix of the input CSV file names, so entries must be unique.
FRAMEWORK_PROGRAMMES = ["fp1", "fp2", "fp3", "fp4", "fp5", "fp6", "fp7", "h2020"]

In [None]:
def fetch_encoder(model_name: str) -> SentenceTransformer:
    """Instantiate a sentence transformer from its model name.

    Args:
        model_name: Identifier passed straight to ``SentenceTransformer``.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    loaded_model = SentenceTransformer(model_name)
    return loaded_model


def embed(
    model: SentenceTransformer, texts: Sequence, chunk_size: Optional[int] = None
) -> np.ndarray:
    """Embeds a sequence of texts using a sentence transformer.

    Args:
        model: A sentence transformer.
        texts: A sequence of texts. Must support ``len`` and slicing when
            ``chunk_size`` is given.
        chunk_size: Splits the texts into chunks to be embedded sequentially.
            Useful for breaking up large sequences which might exceed memory.
            ``None`` (the default) embeds everything in a single call.

    Returns:
        The embeddings produced by ``model.encode``, one row per text. When
        chunking is used the per-chunk results are concatenated along the
        first axis, in the original order.

    Raises:
        ValueError: If ``chunk_size`` is given but is not positive.
    """
    if chunk_size is None:
        # Use the model that was passed in, not a global `encoder`.
        return model.encode(texts)

    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    n_texts = len(texts)
    if n_texts <= chunk_size:
        # A single chunk (or empty input): nothing to concatenate.
        return model.encode(texts)

    chunk_embeddings = [
        model.encode(texts[start : start + chunk_size])
        for start in range(0, n_texts, chunk_size)
    ]
    return np.concatenate(chunk_embeddings, axis=0)


def remove_mentions(acronyms: Sequence[str], abstracts: Sequence[str]) -> List[str]:
    """Removes close and exact matches of the acronym from the abstract (ignores case).

    Acronyms and abstracts are paired by position with ``zip``; if the two
    sequences differ in length, the surplus items of the longer one are ignored.

    Each acronym is matched as literal text using the ``regex`` module's fuzzy
    matching (at most 2 substitutions, 1 insertion, 2 deletions and 2 errors
    in total). Every occurrence of each matched string is replaced by a
    single space.

    Args:
        acronyms (Sequence[str]): Project acronyms. Entries that are not
            strings (e.g. missing values) or are blank leave the paired
            abstract unchanged.
        abstracts (Sequence[str]): Project abstracts.

    Returns:
        List[str]: Modified abstracts.
    """
    abstracts_mod = []
    for acronym, abstract in zip(acronyms, abstracts):
        # A missing acronym would otherwise be formatted into the pattern
        # (NaN becomes "nan"), and a blank one gives an empty pattern.
        if not isinstance(acronym, str) or not acronym.strip():
            abstracts_mod.append(abstract)
            continue

        # Escape the acronym so characters such as "+", "(" or "." are matched
        # literally instead of being interpreted as regex syntax.
        pattern = rf"({regex.escape(acronym)}){{s<=2,i<=1,d<=2,e<=2}}"
        matches = regex.findall(pattern, abstract, flags=regex.IGNORECASE)

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # replacements are applied deterministically.
        for match in dict.fromkeys(matches):
            # An empty match would make str.replace insert a space between
            # every character of the abstract.
            if match:
                abstract = abstract.replace(match, " ")
        abstracts_mod.append(abstract)
    return abstracts_mod

In [None]:
# Load the encoder once: the same model is used for every framework programme.
encoder = fetch_encoder(ENCODER_NAME)

for fp in FRAMEWORK_PROGRAMMES:
    # exist_ok lets the cell be re-run without failing on existing directories.
    os.makedirs(fp, exist_ok=True)

    # Uploaded inputs for this framework programme.
    projects_fp = pd.read_csv(f"projects_{fp}.csv")
    acronyms_fp = pd.read_csv(f"acronyms_{fp}.csv")

    # Acronyms and abstracts are paired by row position.
    # NOTE(review): assumes both files list the same projects in the same
    # order - confirm against the upstream processing step.
    abstracts_modified = remove_mentions(
        acronyms_fp["acronym"],
        projects_fp["objective"].fillna(""),
    )

    abstract_embeddings_fp = embed(encoder, abstracts_modified)
    acronym_embeddings_fp = embed(encoder, acronyms_fp["acronym"].tolist())

    # np.save appends the ".npy" extension to these paths.
    np.save(
        f"{fp}/abstract_embeddings",
        abstract_embeddings_fp,
    )
    np.save(
        f"{fp}/acronym_embeddings",
        acronym_embeddings_fp,
    )