# Semantic alignment

In [1]:
import pandas as pd


In [2]:
topics_df = pd.read_csv("data/processed/topics_df.csv")

topic_labels = {
    22: "Sustainable Pharmaceutical Manufacturing Technologies",
    21: "Chronic Kidney Disease and Organoids",
    20: "Retinal and Ocular Disease Mechanisms and Therapies",
    19: "Cancer Prevention and Patient-Centered Care",
    18: "Infectious Disease Dynamics and Pandemic Response",
    17: "Catalysis and Synthetic Organic Chemistry",
    16: "Diabetes Monitoring and Hormonal Regulation Methods",
    15: "Brain Sensory and Cognitive Processing",
    14: "NAFLD and Translational Liver Research",
    13: "Regenerative Cardiac Models and Therapeutics",
    12: "Trustworthy AI and Data Privacy in Digital Health",
    11: "Next-Generation Cardiovascular Diagnostics and Therapies",
    10: "Neuromodulation and Neurotechnology Therapies",
    9: "Molecular and Ecological Plant Adaptation",
    8: "Precision Medicine and Therapeutics for Gut Microbiome Disorders",
    7: "Holistic Approaches to Antimicrobial and Antibiotic Resistance",
    6: "Bone Bioprinting and Regenerative Implant Technologies",
    5: "Sustainable Food Systems and Nutritional Innovations",
    4: "Molecular Mechanisms and Innovative Treatments for Neurodegeneration",
    3: "Multimodal Cancer Imaging and Diagnostics Technologies",
    2: "Digital, Social, and Policy Innovations in Health",
    1: "Innovative Vaccines, Antivirals, and Diagnostics Tools",
    0: "Cancer Immunotherapy and Tumor Biology",
}


In [3]:
from sentence_transformers import SentenceTransformer

# Load model (MiniLM is small & fast; mpnet is more accurate)
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")


In [32]:
import ast
from sklearn.metrics.pairwise import cosine_similarity

pos_kws = (
    topics_df[["topic", "POS"]]
    .drop_duplicates()
    .sort_values(by="topic")
    .reset_index(drop=True)
)


# Function to parse and flatten the string of lists
def parse_and_flatten(s):
    wrapped = f"[{s}]"  # Wrap to parse as list of lists
    list_of_lists = ast.literal_eval(wrapped)
    return [item for sublist in list_of_lists for item in sublist]

eu_kws = topics_df.groupby("topic", as_index=False).agg({"euroSciVocTitle": ", ".join})
eu_kws["euroSciVocTitle"] = eu_kws["euroSciVocTitle"].apply(parse_and_flatten)

# Remove duplicates preserving order
eu_kws["euroSciVocTitle"] = eu_kws["euroSciVocTitle"].apply(
    lambda x: list(dict.fromkeys(x))
)

merged_kws = pd.merge(pos_kws, eu_kws, on="topic")

def embed_keywords(keyword_list):
    # Join keywords as a space-separated string
    text = " ".join(keyword_list)
    return model.encode(text)

# Compute embeddings for each row
merged_kws["POS_embedding"] = merged_kws["POS"].apply(embed_keywords)
merged_kws["euroSciVocTitle_embedding"] = merged_kws["euroSciVocTitle"].apply(
    embed_keywords
)


# Compute cosine similarity for each row
def cosine_sim(row):
    emb1 = row["POS_embedding"].reshape(1, -1)
    emb2 = row["euroSciVocTitle_embedding"].reshape(1, -1)
    return cosine_similarity(emb1, emb2)[0][0]

merged_kws["cosine_similarity"] = merged_kws.apply(cosine_sim, axis=1)

# Keep relevant columns
result = merged_kws[["topic", "cosine_similarity"]]

print(result["cosine_similarity"].describe())

result["topic_label"] = result["topic"].map(topic_labels)

# Reoreder columns topic_label after topic
result = result[["topic", "topic_label", "cosine_similarity"]]
temp = result.sort_values(by="cosine_similarity", ascending=False)
temp.to_csv("data/processed/euscivoc_similarity.csv", index=False)
temp


count    23.000000
mean      0.226143
std       0.059245
min       0.106812
25%       0.196490
50%       0.238915
75%       0.262474
max       0.323647
Name: cosine_similarity, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result["topic_label"] = result["topic"].map(topic_labels)


Unnamed: 0,topic,topic_label,cosine_similarity
2,2,"Digital, Social, and Policy Innovations in Health",0.323647
5,5,Sustainable Food Systems and Nutritional Innov...,0.3012
12,12,Trustworthy AI and Data Privacy in Digital Health,0.296031
19,19,Cancer Prevention and Patient-Centered Care,0.277445
11,11,Next-Generation Cardiovascular Diagnostics and...,0.2682
18,18,Infectious Disease Dynamics and Pandemic Response,0.264402
8,8,Precision Medicine and Therapeutics for Gut Mi...,0.260545
1,1,"Innovative Vaccines, Antivirals, and Diagnosti...",0.2585
15,15,Brain Sensory and Cognitive Processing,0.247935
7,7,Holistic Approaches to Antimicrobial and Antib...,0.246912


In [29]:
import ast

topic_to_compare = 20

topic_row = merged_kws[merged_kws["topic"] == topic_to_compare]

if topic_row.empty:
    print(f"No topic found with topic = {topic_to_compare}")
else:
    POS_keywords = topic_row.iloc[0]["POS"]
    euroSciVocTitle_keywords = topic_row.iloc[0]["euroSciVocTitle"]

    # Convert POS_keywords string to list:
    if isinstance(POS_keywords, str):
        # Try parsing with ast.literal_eval first
        try:
            POS_keywords = ast.literal_eval(POS_keywords)
        except (ValueError, SyntaxError):
            # If fails, assume comma-separated string
            POS_keywords = [k.strip() for k in POS_keywords.split(",")]

    print(f"Topic ID: {topic_to_compare}")
    print("BERTopic Keywords:".ljust(40), "euroSciVocTitle Keywords:")

    max_len = max(len(POS_keywords), len(euroSciVocTitle_keywords))
    POS_keywords += [""] * (max_len - len(POS_keywords))
    euroSciVocTitle_keywords += [""] * (max_len - len(euroSciVocTitle_keywords))

    for k1, k2 in zip(POS_keywords, euroSciVocTitle_keywords):
        print(f"{k1.ljust(40)} {k2}")


Topic ID: 20
BERTopic Keywords:                       euroSciVocTitle Keywords:
eye                                      drug discovery
retinal                                  ophthalmology
ocular                                   stem cells
retina                                   biomaterials
vision                                   entrepreneurship
blindness                                pathology
diseases                                 cells technologies
degeneration                             software
macular                                  radiology
related macular                          parkinson
                                         surgery
                                         robotics
                                         medical biotechnology
                                         physical chemistry
                                         physiology
                                         gene therapy
                                         RNA
         

# Trend in topics

In [None]:
# Set Plotly theme globally
import plotly.io as pio

pio.templates.default = "plotly_white"


In [None]:
import pandas as pd
import plotly.express as px

# Ensure the date column is a datetime object
topics_df["ecSignatureDate"] = pd.to_datetime(topics_df["ecSignatureDate"])

# Optional: group by month or week
topics_df["time_period"] = topics_df["ecSignatureDate"].dt.to_period("M").dt.to_timestamp()  # use 'W' for weekly

# Count number of documents per topic per time period
topic_trend = topics_df.groupby(["time_period", "topic"]).size().reset_index(name="count")

# Plot using Plotly
fig = px.line(
    topic_trend,
    x="time_period",
    y="count",
    color="topic",
    markers=True,
    title="Topic Trends Over Time"
)

fig.update_layout(
    xaxis_title="Time",
    yaxis_title="Project Count",
    legend_title="Topic",
    hovermode="x unified",
)

fig.show()


# Funding allocation

In [None]:
topics_df.columns


In [None]:
topics_df["ecMaxContribution"] = (
    topics_df["ecMaxContribution"].str.replace(",", ".", regex=False).astype(float)
)

funding_by_topic = topics_df.groupby("topic")["ecMaxContribution"].sum().reset_index()
funding_by_topic["topic_label"] = funding_by_topic["topic"].map(topic_labels)


In [None]:
# Plot bar chart
fig = px.bar(
    funding_by_topic,
    x="topic",
    y="ecMaxContribution",
    title="Total Funding Allocation by Topic",
    labels={"ecMaxContribution": "Total EC Contribution", "topic": "Topic"},
    hover_name="topic_label",
)

fig.update_layout(
    xaxis_title="Topic",
    yaxis_title="Total EC Contribution (€)",
    xaxis=dict(type="category"),
    hovermode="x unified",
)

fig.show()


In [None]:
avg_funding_by_topic = topics_df.groupby("topic")["ecMaxContribution"].mean().reset_index()
avg_funding_by_topic["topic_label"] = funding_by_topic["topic"].map(topic_labels)


In [None]:
fig = px.bar(
    avg_funding_by_topic,
    x="topic",
    y="ecMaxContribution",
    title="Average Funding Per Project by Topic",
    labels={"ecMaxContribution": "Average EC Contribution Per Project", "topic": "Topic"},
    hover_name="topic_label",
)

fig.update_layout(
    xaxis_title="Topic",
    yaxis_title="Average EC Contribution Per Project (€)",
    xaxis=dict(type="category"),
    hovermode="x unified",
)

fig.show()


# Publication Outcome

In [None]:
pub = pd.read_excel("data/raw/projectPublications.xlsx")
pub_count = pub.groupby("projectID").size().reset_index(name="publication_count")
topics_df = topics_df.merge(pub_count, on="projectID", how="left")


In [None]:
topics_df.columns


In [None]:
pub_by_topic = topics_df.groupby("topic")["publication_count"].sum().reset_index()
pub_by_topic["topic_label"] = pub_by_topic["topic"].map(topic_labels)


In [None]:
fig = px.bar(
    pub_by_topic,
    x="topic",
    y="publication_count",
    title="Average Funding Per Project by Topic",
    labels={
        "publication_count": "Total Publications",
        "topic": "Topic",
    },
    hover_name="topic_label",
)

fig.update_layout(
    xaxis_title="Topic",
    yaxis_title="Number of Publications",
    xaxis=dict(type="category"),
    hovermode="x unified",
)

fig.show()


In [None]:
avg_pub_by_topic = topics_df.groupby("topic")["publication_count"].mean().reset_index()
avg_pub_by_topic["topic_label"] = avg_pub_by_topic["topic"].map(topic_labels)


In [None]:
fig = px.bar(
    avg_pub_by_topic,
    x="topic",
    y="publication_count",
    title="Average Publication Count Per Project by Topic",
    labels={
        "publication_count": "Total Publications",
        "topic": "Topic",
    },
    hover_name="topic_label",
)

fig.update_layout(
    xaxis_title="Topic",
    yaxis_title="Number of Publications Per Project",
    xaxis=dict(type="category"),
    hovermode="x unified",
)

fig.show()
