In [28]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import os

class SemanticMatcher:
    def __init__(self, entries):
        """
        Initialize the semantic matcher

        :param entries: Dict, {key: {description}}
        """
        # Add print for debugging
        print(f"Initializing SemanticMatcher with {len(entries)} entries")

        # Specify a local download path
        cache_folder = os.path.expanduser("/Users/shou/Code/huggingface_models")
        
        try:
            # Try downloading manually first
            self.model = SentenceTransformer(
                "intfloat/multilingual-e5-large",
                cache_folder=cache_folder,
                local_files_only=False,
            )
            print("Model loaded successfully")
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            raise

        self.entries = entries
        # Embed all data
        self.entry_embeddings = {}
        print("Starting to embed entries...")
        
        for key, entry_info in entries.items():
            if "identification" not in entry_info:
                print(f"Warning: 'identification' field missing for key {key}")
                continue
                
            entry_text = entry_info["identification"]
            try:
                self.entry_embeddings[key] = self.model.encode(entry_text)
                if len(self.entry_embeddings) % 100 == 0:  # Progress indicator
                    print(f"Embedded {len(self.entry_embeddings)} entries")
            except Exception as e:
                print(f"Error embedding entry {key}: {str(e)}")
                
        print(f"Finished embedding {len(self.entry_embeddings)} entries")

    def match(self, query, top_k=3, threshold=0):
        """
        Search the queries.

        :param query: Query keywords
        :param top_k: Returns the top k most similar results
        :param threshold: Similarity threshold
        :return: Matching entries and their similarities
        """
        if not self.entry_embeddings:
            print("Warning: No entries were embedded. Check if entries were processed correctly.")
            return []

        print(f"Processing query: {query}")
        
        try:
            # Generate an embedding vector for the query
            query_with_prefix = f"query: {query}"
            query_embedding = self.model.encode(query_with_prefix)
            
            # Calculating similarity
            similarities = {}
            for key, entry_embedding in self.entry_embeddings.items():
                similarity = np.dot(query_embedding, entry_embedding) / (
                    np.linalg.norm(query_embedding) * np.linalg.norm(entry_embedding)
                )
                similarities[key] = similarity

            # Sort by similarity
            sorted_matches = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

            # Filter and return results
            filtered_matches = [
                (key, similarity)
                for key, similarity in sorted_matches
                if similarity >= threshold
            ]
            
            print(f"Found {len(filtered_matches)} matches above threshold {threshold}")
            return filtered_matches[:top_k]
            
        except Exception as e:
            print(f"Error during matching: {str(e)}")
            return []

In [29]:
# Input bird identifications
with open("ebird_data.json",'r', encoding='UTF-8') as f:
     entries = json.load(f)

# Encode the passages
matcher = SemanticMatcher(entries)

Initializing SemanticMatcher with 396 entries
Model loaded successfully
Starting to embed entries...
Embedded 100 entries
Embedded 200 entries
Embedded 300 entries
Finished embedding 396 entries


In [None]:
# Queries
test_queries = ["blue"]
for query in test_queries:
    print(f"\nSearch: {query}")
    results = matcher.match(query)

    if not results:
        print("No results found")
    else:
        for key, similarity in results:
            print(f"Matched: {key}, Similarity: {similarity:.4f}")
            print("Detail:", entries[key]["binomialName"], entries[key]["url"])


Search: 青い
Processing query: 青い
Found 396 matches above threshold 0
Matched: Brown-eared Bulbul, Similarity: 0.7675
Detail: Hypsipetes amaurotis https://ebird.org/species/brebul1/JP-13
Matched: Yellow Bittern, Similarity: 0.7674
Detail: Botaurus sinensis https://ebird.org/species/yelbit/JP-13
Matched: Brown-headed Thrush, Similarity: 0.7665
Detail: Turdus chrysolaus https://ebird.org/species/brhthr1/JP-13


In [None]:
from dash import Dash, dcc, html, Input, Output, State, callback

external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"]

app = Dash(__name__, title='RAG-ebird', external_stylesheets=external_stylesheets)

app.layout = html.Div(
    [
        dcc.Input(id="input-text-state", type="text", value="blue",style={"width": "900px", "margin": "20px"}),
        html.Button(id="submit-button-state", n_clicks=0, children="Submit",style={"width": "120px"}),
        html.Div(id="output-state", style={"display": "flex"}),
    ]
)

def return_iframe(macaulayID):
    macaulayLink = "https://macaulaylibrary.org/asset/" + macaulayID + "/embed"

    iframeObj = html.Iframe(
        src=macaulayLink,
        height=600,
        width=320,
        style={"border": "none", "margin": "20px"},
        allow="fullscreen",
    )

    return iframeObj
    

@callback(
    Output("output-state", "children"),
    Input("submit-button-state", "n_clicks"),
    State("input-text-state", "value"),
)
def update_output(n_clicks, input_text):
    # 3 iframes
    results = matcher.match(input_text)
    iframes = []
    
    for key, similarity in results:
        thisIframe = return_iframe(entries[key]["macaulayID"])
        # iframes.append([html.P(str(similarity)), thisIframe])
        iframes.append(thisIframe)

    return iframes


if __name__ == "__main__":
    app.run(debug=True)

Processing query: blue
Found 396 matches above threshold 0
Processing query: blue
Found 396 matches above threshold 0
Processing query: blue
Found 396 matches above threshold 0
Processing query: blue
Found 396 matches above threshold 0
