# Semantic alignment

In [1]:
import pandas as pd


In [27]:
topics_df = pd.read_csv("data/processed/topics_df.csv")


In [None]:
from sentence_transformers import SentenceTransformer

# Load model (MiniLM is small & fast; mpnet is more accurate)
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
pos_kws = topics_df[["topic", "POS"]].drop_duplicates().sort_values(by="topic").reset_index(drop=True)


In [None]:
import ast

# Function to parse and flatten the string of lists
def parse_and_flatten(s):
    wrapped = f"[{s}]"  # Wrap to parse as list of lists
    list_of_lists = ast.literal_eval(wrapped)
    return [item for sublist in list_of_lists for item in sublist]


# eu_kws = topics_df.copy()
# eu_kws["euroSciVocTitle"] = eu_kws["euroSciVocTitle"].apply(
    # lambda x: x if isinstance(x, list) else [x]
# )
# eu_kws = topics_df.groupby("topic", as_index=False).agg({"euroSciVocTitle": ", ".join})

# Apply the function to column D

# eu_kws["euroSciVocTitle"] = eu_kws["euroSciVocTitle"].apply(parse_and_flatten)

eu_kws = topics_df.groupby("topic", as_index=False).agg({"euroSciVocTitle": ", ".join})
eu_kws["euroSciVocTitle"] = eu_kws["euroSciVocTitle"].apply(parse_and_flatten)

# Remove duplicates preserving order
eu_kws["euroSciVocTitle"] = eu_kws["euroSciVocTitle"].apply(
    lambda x: list(dict.fromkeys(x))
)
eu_kws


Unnamed: 0,topic,euroSciVocTitle
0,0,"[planets, sensors, oncology, microbiology, pha..."
1,1,"[bacteriology, antibiotics, drug discovery, ma..."
2,2,"[pathology, RNA, obesity, entrepreneurship, HI..."
3,3,"[reproductive medicine, fertility, business mo..."
4,4,"[pharmaceutical drugs, parkinson, neurobiology..."
5,5,"[proteins, nutrition, ecosystems, fisheries, m..."
6,6,"[proteins, oncology, biochemistry, rheumatolog..."
7,7,"[entrepreneurship, antibiotics, bacteriology, ..."
8,8,"[proteomics, bacteriology, homeostasis, asthma..."
9,9,"[sociology, oilseeds, pathology, ecosystems, m..."


In [63]:
merged_kws = pd.merge(
    pos_kws, eu_kws, on="topic"
)


In [64]:
def embed_keywords(keyword_list):
    # Join keywords as a space-separated string
    text = " ".join(keyword_list)
    return model.encode(text)

# Compute embeddings for each row
merged_kws["POS_embedding"] = merged_kws["POS"].apply(embed_keywords)
merged_kws["euroSciVocTitle_embedding"] = merged_kws["euroSciVocTitle"].apply(
    embed_keywords
)


In [65]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity for each row
def cosine_sim(row):
    emb1 = row["POS_embedding"].reshape(1, -1)
    emb2 = row["euroSciVocTitle_embedding"].reshape(1, -1)
    return cosine_similarity(emb1, emb2)[0][0]

merged_kws["cosine_similarity"] = merged_kws.apply(cosine_sim, axis=1)

# Keep relevant columns
result = merged_kws[["topic", "cosine_similarity"]]
print(result)


    topic  cosine_similarity
0       0           0.217476
1       1           0.258500
2       2           0.323647
3       3           0.230977
4       4           0.215596
5       5           0.301200
6       6           0.231087
7       7           0.246912
8       8           0.260545
9       9           0.242003
10     10           0.238915
11     11           0.268200
12     12           0.296031
13     13           0.106812
14     14           0.165878
15     15           0.247935
16     16           0.228096
17     17           0.167242
18     18           0.264402
19     19           0.277445
20     20           0.107129
21     21           0.127872
22     22           0.177383


In [None]:
import ast

topic_to_compare = 20

topic_row = merged_kws[merged_kws["topic"] == topic_to_compare]

if topic_row.empty:
    print(f"No topic found with topic = {topic_to_compare}")
else:
    POS_keywords = topic_row.iloc[0]["POS"]
    euroSciVocTitle_keywords = topic_row.iloc[0]["euroSciVocTitle"]

    # Convert POS_keywords string to list:
    if isinstance(POS_keywords, str):
        # Try parsing with ast.literal_eval first
        try:
            POS_keywords = ast.literal_eval(POS_keywords)
        except (ValueError, SyntaxError):
            # If fails, assume comma-separated string
            POS_keywords = [k.strip() for k in POS_keywords.split(",")]

    print(f"Topic ID: {topic_to_compare}")
    print("BERTopic Keywords:".ljust(40), "Other Method Keywords:")

    max_len = max(len(POS_keywords), len(euroSciVocTitle_keywords))
    POS_keywords += [""] * (max_len - len(POS_keywords))
    euroSciVocTitle_keywords += [""] * (max_len - len(euroSciVocTitle_keywords))

    for k1, k2 in zip(POS_keywords, euroSciVocTitle_keywords):
        print(f"{k1.ljust(40)} {k2}")


Topic ID: 20
BERTopic Keywords:                       Other Method Keywords:
eye                                      drug discovery
retinal                                  ophthalmology
ocular                                   stem cells
retina                                   biomaterials
vision                                   entrepreneurship
blindness                                pathology
diseases                                 cells technologies
degeneration                             software
macular                                  radiology
related macular                          parkinson
                                         surgery
                                         robotics
                                         medical biotechnology
                                         physical chemistry
                                         physiology
                                         gene therapy
                                         RNA
            

# Trend in topics

In [68]:
topics_df.columns

Index(['projectID', 'title', 'abstract', 'topic', 'euroSciVocTitle',
       'ecSignatureDate', 'totalCost', 'ecMaxContribution', 'MMR', 'KeyBERT',
       'POS', 'Count'],
      dtype='object')

In [69]:
topics_df["ecSignatureDate"]


0       2025-02-18
1       2023-05-26
2       2023-05-31
3       2022-05-24
4       2023-06-01
           ...    
3460    2022-12-22
3461    2023-05-31
3462    2023-06-07
3463    2023-06-02
3464    2023-07-30
Name: ecSignatureDate, Length: 3465, dtype: object

In [75]:
import pandas as pd
import plotly.express as px

# Ensure the date column is a datetime object
topics_df["ecSignatureDate"] = pd.to_datetime(topics_df["ecSignatureDate"])

# Optional: group by month or week
topics_df["time_period"] = topics_df["ecSignatureDate"].dt.to_period("M").dt.to_timestamp()  # use 'W' for weekly

# Count number of documents per topic per time period
topic_trend = topics_df.groupby(["time_period", "topic"]).size().reset_index(name="count")

# Plot using Plotly
fig = px.line(
    topic_trend,
    x="time_period",
    y="count",
    color="topic",
    markers=True,
    title="Topic Trends Over Time",
    template="plotly_white"
)

fig.update_layout(
    xaxis_title="Time",
    yaxis_title="Project Count",
    legend_title="Topic",
    hovermode="x unified",
)

fig.show()


# Funding allocation

In [77]:
topics_df.columns

Index(['projectID', 'title', 'abstract', 'topic', 'euroSciVocTitle',
       'ecSignatureDate', 'totalCost', 'ecMaxContribution', 'MMR', 'KeyBERT',
       'POS', 'Count', 'time_period'],
      dtype='object')

In [78]:
import plotly.express as px

# Aggregate total funding per topic
funding_summary = topics_df.groupby("topic")["ecMaxContribution"].sum().reset_index()

# Plot bar chart
fig = px.bar(
    funding_summary,
    x="topic",
    y="ecMaxContribution",
    title="Total Funding Allocation by Topic",
    labels={"ecMaxContribution": "Total Funding", "topic": "Topic"},
    text_auto=".2s",
)

fig.update_layout(
    xaxis_title="Topic",
    yaxis_title="Total Funding",
    xaxis=dict(type="category"),
    hovermode="x unified",
)

fig.show()
