### Install the required packages

In [None]:
!pip install umap-learn

In [None]:
!pip install gliner

### Import packages

In [None]:
import pandas as pd
from gliner import GLiNER
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

### Load objects and articles csv files



In [None]:
# Read the two metadata tables (museum objects and articles) from /content.
objects_df = pd.read_csv('/content/objects_metadata.csv')
articles_df = pd.read_csv('/content/articles_metadata.csv')

In [None]:
# Plain Python list of every object's 'Collection Online Title' value.
object_titles_column = objects_df['Collection Online Title']
object_texts = object_titles_column.to_list()


### Identify 'object' entities in a single article
We do this using GLiNER, a 'universal' named entity recognition model.
Here we use gliner_medium-v2.1 to extract the entities that the model recognises as 'objects' in a single article.

To see a full list of models in the GLiNER family, visit the model's [github repository](https://github.com/urchade/GLiNER).

In [None]:
# Load the pretrained GLiNER checkpoint used for entity extraction below.
gliner_checkpoint = "urchade/gliner_medium-v2.1"
model = GLiNER.from_pretrained(gliner_checkpoint)

In [None]:
# Select one article by row position (zero-based position 55, not the first row)
# and take the contents of its 'combined_text' column.
article_position = 55
text = articles_df['combined_text'].iloc[article_position]

# Split the text into word-based chunks so each model call receives a bounded-size input
def chunk_text(text, max_words=200):
    """Split ``text`` into consecutive chunks of at most ``max_words`` words.

    Words are whatever ``str.split()`` yields (whitespace-separated), so the limit
    is a word count, not a model token count. Words within a chunk are re-joined
    with single spaces; the original whitespace is not preserved.

    Args:
        text: The string to split.
        max_words: Maximum number of words per chunk. Must be at least 1.

    Returns:
        A list of chunk strings in original order. Empty or whitespace-only
        input returns an empty list.

    Raises:
        ValueError: If ``max_words`` is less than 1. Without this check a
            negative value would silently produce no chunks at all.
    """
    if max_words < 1:
        raise ValueError("max_words must be a positive integer")

    words = text.split()
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]

# Break the selected article into pieces of at most 200 words each.
chunks = chunk_text(text=text, max_words=200)

In [None]:
# The only entity label we ask GLiNER to predict
labels = ["object"]

# Run GLiNER over every chunk (threshold 0.4) and flatten the per-chunk
# results into a single list of entity dicts.
all_entities = [
    found
    for piece in chunks
    for found in model.predict_entities(piece, labels, threshold=0.4)
]

In [None]:
# Extract the unique object terms from the entities, keeping first-seen order.
# dict.fromkeys de-duplicates like set() but preserves insertion order; the order of
# list(set(...)) can differ between kernel sessions because string hashing is
# randomised, which would make the embeddings and plot below non-reproducible.
article_objects = list(dict.fromkeys(
    entity['text'] for entity in all_entities if entity['label'] == "object"
))
print("Identified Objects:", article_objects)

In [None]:
# Number of unique object terms extracted from the article
# (the list is named `article_objects`; no variable called `objects` is defined).
len(article_objects)

### Create embeddings
This section of code will create embeddings for:

*   the 'object' entities extracted using GLiNER
*   the titles of all objects in the dataset, which is a subset of 'textile' objects from the SMG collection



In [None]:
# Initialize embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Embed every extracted 'object' entity in a single batched call rather than one
# call per term. The guard keeps an empty entity list from reaching the model.
term_vectors = embedding_model.encode(article_objects) if article_objects else []
article_embeddings = dict(zip(article_objects, term_vectors))

# Encode the museum object titles with the same model so both sets of vectors
# live in the same embedding space
object_texts = objects_df['Collection Online Title'].tolist()
objects_embeddings = embedding_model.encode(object_texts)

# For each extracted term, find the museum object whose title is closest by
# cosine similarity
matches = []
for term, term_embedding in article_embeddings.items():
    similarities = cosine_similarity([term_embedding], objects_embeddings)[0]
    best_match_idx = np.argmax(similarities)
    best_match_score = similarities[best_match_idx]

    matches.append({
        # Key name kept as-is for compatibility; it holds the extracted term,
        # whatever kind of object it is.
        'textile_machine': term,
        'best_match_object': object_texts[best_match_idx],
        'similarity_score': best_match_score
    })

# Convert matches to a DataFrame for easy viewing. Columns are given explicitly so
# that an empty `matches` list still yields a frame that can be sorted and printed.
match_columns = ['textile_machine', 'best_match_object', 'similarity_score']
matches_df = (
    pd.DataFrame(matches, columns=match_columns)
    .sort_values(by='similarity_score', ascending=False)
)
print(matches_df[match_columns])

### Visualise the results

Here we visualise the results in vector space. This will help us to see how closely the extracted entities match the objects from the SMG dataset.

In [None]:
# Stack the extracted-term embeddings on top of the museum object embeddings.
# The term embeddings live in `article_embeddings`; no variable called
# `textile_machine_embeddings` is defined in this notebook.
combined_embeddings = np.vstack((list(article_embeddings.values()), objects_embeddings))

# One label per row of combined_embeddings, in the same order, for colouring the plot.
# NOTE: this rebinds `labels`, which earlier held the GLiNER label list.
labels = ['NER Object'] * len(article_embeddings) + ['SMG Object'] * len(objects_embeddings)

In [None]:
from umap.umap_ import UMAP

# Reduce the stacked embeddings to three dimensions for plotting.
# random_state is fixed at 42.
umap_settings = dict(
    n_components=3,
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine',
    random_state=42,
)
reducer = UMAP(**umap_settings)
embedding_3d = reducer.fit_transform(combined_embeddings)

In [None]:
import plotly.express as px

# Table with the three UMAP coordinates and the source label of every point.
embedding_df = pd.DataFrame(
    embedding_3d[:, :3], columns=['x', 'y', 'z']
).assign(label=labels)

# Interactive 3D scatter plot, coloured by whether a point is an extracted
# term or a museum object.
scatter_options = {
    'x': 'x',
    'y': 'y',
    'z': 'z',
    'color': 'label',
    'title': '3D Visualization of NER Terms and Museum Objects',
    'opacity': 0.7,
}
fig = px.scatter_3d(embedding_df, **scatter_options)

# Shrink the markers so dense regions stay readable.
fig.update_traces(marker={'size': 4})
fig.show()