In [13]:
import os
import re
import time
import pickle
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings

Cargar los embeddings

In [2]:
# Embedding model configuration handed to the vector store loader.
model_name = "intfloat/e5-small-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True, "batch_size": 32}

emb = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized (pickle-style) data from disk; only point this at an index
# folder you created or otherwise trust.
vectorstore = FAISS.load_local("faiss_index_proyectos_10_txt", embeddings=emb, 
                               allow_dangerous_deserialization=True)

# Fetch the documents in FAISS row order through the store's own
# position -> docstore-id mapping, so documents[i] / metadata[i] are
# guaranteed to describe embeddings[i]. Reading docstore._dict.values()
# only lines up when the dict's insertion order happens to match the index.
n_vectors = vectorstore.index.ntotal
documents = [
    vectorstore.docstore.search(vectorstore.index_to_docstore_id[position])
    for position in range(n_vectors)
]
embeddings = vectorstore.index.reconstruct_n(0, n_vectors)
metadata = [doc.metadata for doc in documents]

# Every later cell zips labels derived from `metadata` against rows of
# `embeddings`, so fail loudly here if the two ever disagree in length.
assert len(metadata) == len(embeddings), "documents and vectors are misaligned"

  emb = HuggingFaceBgeEmbeddings(


CSV

In [3]:
# Location of the processed CORDIS table. The CORDIS_CSV_PATH environment
# variable overrides the original local default, so the notebook can run on
# another machine without editing this cell.
csv_path = os.environ.get(
    "CORDIS_CSV_PATH",
    r"C:\Users\emolt\OneDrive - UMH\MASTER\TFM\BASE\cordis_data_processed.csv",
)
df = pd.read_csv(csv_path, encoding="utf-8")

# Build a lookup dictionary: grant agreement -> type of proposal.
# NOTE(review): the labelling cell looks keys up with int(...), so this
# presumably relies on "grant agreement" being parsed as integers - confirm.
proposal_map = dict(zip(df["grant agreement"], df["type of proposal"]))

TYPE 

In [4]:
def normalize_label(label: str) -> str:
    """Collapse a proposal-type label to the part before its first hyphen.

    Labels that contain no hyphen are returned unchanged, e.g.
    ``"MSCA-IF-GF"`` gives ``"MSCA"`` while ``"RIA"`` stays ``"RIA"``.
    """
    return label.split("-", 1)[0] if "-" in label else label

# Two parallel lists, one entry per metadata record:
#   labels_original - proposal type looked up in proposal_map
#   labels_main     - the same label reduced by normalize_label
labels_original = []
labels_main = []

for meta in metadata:
    # The grant id is the run of digits following "grant_agreement_" in the
    # record's "source" value (an absent "source" counts as an empty string).
    match = re.search(r"grant_agreement_(\d+)", meta.get("source", ""))
    if match is None:
        # No grant id in the source: both lists get the placeholder.
        original_label = "unknown"
        main_label = "unknown"
    else:
        # Grant ids missing from the CSV lookup also fall back to "unknown".
        original_label = proposal_map.get(int(match.group(1)), "unknown")
        main_label = normalize_label(original_label)
    labels_original.append(original_label)
    labels_main.append(main_label)


In [5]:
from collections import Counter

# Tally how many entries carry each original (un-normalized) proposal label.
label_counts = Counter(labels_original)
print(label_counts)

Counter({'RIA': 107056, 'MSCA-IF-EF-ST': 52178, 'IA': 51935, 'CSA': 48082, 'SME-1': 44331, 'MSCA-IF': 33295, 'ERC-STG': 30377, 'ERC-COG': 24636, 'ERC-ADG': 17435, 'SME-2': 13383, 'MSCA-RISE': 12052, 'MSCA-ITN-ETN': 10381, 'MSCA-ITN': 9905, 'ERC-POC': 8428, 'MSCA-IF-GF': 7892, 'MSCA-IF-EF-RI': 5179, 'CS2-IA': 3283, 'MSCA-IF-EF-CAR': 3109, 'ERA-NET-Cofund': 2837, 'ERC-POC-LS': 2222, 'SESAR-RIA': 2089, 'MSCA-IF-EF-SE': 1888, 'CS2-RIA': 1880, 'SME-2b': 1856, 'MSCA-ITN-EID': 1586, 'ERC-SyG': 1353, 'SME': 1254, 'MSCA-COFUND-DP': 1179, 'MSCA-COFUND-FP': 1133, 'CSA-LSP': 1023, 'MSCA-ITN-EJD': 980, 'ECSEL-RIA': 957, 'FCH2-RIA': 912, 'Shift2Rail-RIA': 893, 'ECSEL-IA': 812, 'COFUND-EJP': 779, 'BBI-RIA': 760, 'IMI2-RIA': 757, 'MSCA-COFUND': 744, 'PCP': 679, 'SESAR-IA': 386, 'BBI-IA-DEMO': 266, 'SESAR-CSA': 237, 'FCH2-IA': 185, 'BBI-CSA': 174, 'RIA-LS': 114, 'IMI2-CSA': 92, 'BBI-IA-FLAG': 85, 'FCH2-CSA': 85, 'COFUND': 84, 'EuroHPC-IA': 83, 'H2020-EEN-SGA': 80, 'Shift2Rail-CSA': 76, 'Shift2Rail-IA':

PCA

In [9]:
# Project the embeddings onto two principal components and time the fit.
start = time.time()

# random_state keeps the projection reproducible when scikit-learn picks a
# randomized SVD solver; 42 matches the seed already used for t-SNE.
reducer_PCA = PCA(n_components=2, random_state=42)
reduced_PCA = reducer_PCA.fit_transform(embeddings)

end = time.time()
# This message used to say "t-SNE", mislabelling the PCA timing.
print(f"PCA ejecutado en {end - start:.2f} segundos")

In [11]:
# Write one PCA scatter plot per main proposal category: points of that
# category are coloured by their original sub-label, all others are grey.
os.makedirs("plots_type_proposal", exist_ok=True)

# Object arrays keep the original Python values, so the `==` comparisons
# below are the same element-wise equality as a per-point Python loop, but
# computed in one vectorised pass instead of once per point per sub-label.
original_arr = np.asarray(labels_original, dtype=object)
main_arr = np.asarray(labels_main, dtype=object)

for target_label in sorted(set(labels_main)):
    in_category = main_arr == target_label
    sublabels = sorted(set(original_arr[in_category]))

    # plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and
    # removed in recent Matplotlib releases; the resampled colours are the same.
    cmap = plt.get_cmap("tab20", len(sublabels))
    color_map = {sublabel: cmap(i) for i, sublabel in enumerate(sublabels)}

    fig, ax = plt.subplots(figsize=(7, 6))

    # First the points that do NOT belong to the category (grey background).
    background = ~in_category
    ax.scatter(reduced_PCA[background, 0], reduced_PCA[background, 1],
               c="lightgray", alpha=0.2, s=5)

    # Then the points of the highlighted category, one colour per sub-label.
    for sublabel in sublabels:
        selected = in_category & (original_arr == sublabel)
        ax.scatter(reduced_PCA[selected, 0], reduced_PCA[selected, 1],
                   c=[color_map[sublabel]], alpha=0.8, s=10, label=sublabel)

    ax.set_title(f"Distribución destacando: {target_label}", fontsize=10)
    ax.set_xlabel("Primera PCA")
    ax.set_ylabel("Segunda PCA")
    ax.set_xticks([])
    ax.set_yticks([])
    ax.legend(title="Subtipos", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
    fig.tight_layout()

    # Make the label safe to use as a file name.
    safe_label = str(target_label).replace(" ", "_").replace("/", "_")
    fig.savefig(f"plots_type_proposal/pca_{safe_label}.png", dpi=300, bbox_inches="tight")
    # Close explicitly so figures do not accumulate in memory across the loop.
    plt.close(fig)

  cmap = cm.get_cmap("tab20", len(sublabels))


TSNE

In [14]:
# Embed the vectors in 2-D with t-SNE and report the wall-clock time.
# The output recorded with this notebook shows the fit took 9042.23 seconds,
# so re-run this cell deliberately.
tsne_options = {
    "n_components": 2,
    "perplexity": 30,
    "init": "pca",
    "random_state": 42,
}

start = time.time()
reducer_TSNE = TSNE(**tsne_options)
reduced_TSNE = reducer_TSNE.fit_transform(embeddings)
end = time.time()

print(f"t-SNE ejecutado en {end - start:.2f} segundos")

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



t-SNE ejecutado en 9042.23 segundos


In [15]:
# Write one t-SNE scatter plot per main proposal category: points of that
# category are coloured by their original sub-label, all others are grey.
os.makedirs("plots_type_proposal_tsne", exist_ok=True)

# Object arrays keep the original Python values, so the `==` comparisons
# below are the same element-wise equality as a per-point Python loop, but
# computed in one vectorised pass instead of once per point per sub-label.
original_arr = np.asarray(labels_original, dtype=object)
main_arr = np.asarray(labels_main, dtype=object)

for target_label in sorted(set(labels_main)):
    in_category = main_arr == target_label
    sublabels = sorted(set(original_arr[in_category]))

    # plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and
    # removed in recent Matplotlib releases; the resampled colours are the same.
    cmap = plt.get_cmap("tab20", len(sublabels))
    color_map = {sublabel: cmap(i) for i, sublabel in enumerate(sublabels)}

    fig, ax = plt.subplots(figsize=(7, 6))

    # Grey background: every point outside the highlighted category.
    background = ~in_category
    ax.scatter(reduced_TSNE[background, 0], reduced_TSNE[background, 1],
               c="lightgray", alpha=0.2, s=5)

    # Highlighted category, one colour per sub-label.
    for sublabel in sublabels:
        selected = in_category & (original_arr == sublabel)
        ax.scatter(reduced_TSNE[selected, 0], reduced_TSNE[selected, 1],
                   c=[color_map[sublabel]], alpha=0.8, s=10, label=sublabel)

    ax.set_title(f"t-SNE destacando: {target_label}", fontsize=10)
    ax.set_xlabel("t-SNE dimensión 1")
    ax.set_ylabel("t-SNE dimensión 2")
    ax.set_xticks([])
    ax.set_yticks([])
    ax.legend(title="Subtipos", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8)
    fig.tight_layout()

    # Make the label safe to use as a file name.
    safe_label = str(target_label).replace(" ", "_").replace("/", "_")
    fig.savefig(f"plots_type_proposal_tsne/tsne_{safe_label}.png", dpi=300, bbox_inches="tight")
    # Close explicitly so figures do not accumulate in memory across the loop.
    plt.close(fig)

  cmap = cm.get_cmap("tab20", len(sublabels))
