In [None]:
!pip install -U sentence-transformers

In [None]:
import os
import json
import pandas as pd

# Path to the directory with JSON files
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
folder_path = "/Users/cpysleeper/comp631_proj/data_solana/separated_collections"

# Initialize a list to hold records (one dict per successfully parsed file)
records = []

# Loop through all JSON files in the directory.
# os.listdir() returns entries in arbitrary order, so sort them: the document_id
# assigned below is positional and should be reproducible across runs/machines.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(folder_path, filename)
    try:
        # Decode explicitly as UTF-8 instead of the platform default encoding,
        # which is not UTF-8 on every OS and breaks on non-ASCII descriptions.
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(f"⚠️ Skipping invalid JSON: {filename}")
        continue

    # A valid JSON document can be a list/str/number; only objects have .get()
    if not isinstance(data, dict):
        print(f"⚠️ Skipping non-object JSON: {filename}")
        continue

    records.append({
        "title": data.get("name", ""),
        "text": data.get("description", "")
    })

# Create DataFrame and add document_id as a column.
# Columns are pinned so the CSV keeps its header even when no record was loaded.
df = pd.DataFrame(records, columns=["title", "text"])
df.insert(0, "document_id", range(len(df)))  # Add integer index as a column

# Save to CSV
# NOTE(review): the search cells further down read "solana_collections.csv",
# not this file — confirm which file name is intended.
df.to_csv("NFT_collections.csv", index=False)

print("✅ Data has been extracted to 'NFT_collections.csv' with document_id.")


In [None]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

# Load your NFT collection CSV
df = pd.read_csv("solana_collections.csv")

# Combine title and text for better embeddings (missing values become empty strings)
documents = (df["title"].fillna('') + " " + df["text"].fillna('')).tolist()

# Load the multilingual E5 *instruct* model
model = SentenceTransformer("intfloat/multilingual-e5-large-instruct")

# Embed the documents.
# - The instruct variant of E5 expects documents WITHOUT any prefix (the
#   "passage: " prefix belongs to the non-instruct E5 models), so the raw text
#   is encoded as-is.
# - Embeddings are L2-normalised so that ranking by L2 distance in the flat
#   index below is equivalent to ranking by cosine similarity.
doc_embeddings = model.encode(
    documents,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
)

# Create FAISS index (exact, brute-force L2 search over all document vectors)
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

# Map index to original document IDs: row position i in id_map corresponds to
# the i-th vector added to the index, hence the positional reset_index.
id_map = df[["document_id", "title", "text"]].reset_index(drop=True)

# --- SEARCH FUNCTION ---
def search_nft(query: str, k: int = 5):
    """Return up to ``k`` documents from the FAISS index closest to ``query``.

    Relies on the module-level ``model`` (embedding model), ``index`` (FAISS
    index) and ``id_map`` (DataFrame whose row positions match the index).

    Args:
        query: Free-text search query.
        k: Maximum number of hits to return. Values <= 0 yield an empty list.

    Returns:
        A list of dicts with keys ``document_id``, ``title`` and ``text``, in
        the order returned by the index (nearest first). The list is shorter
        than ``k`` when the index holds fewer than ``k`` vectors.
    """
    if k <= 0:
        return []

    # The instruct variant of E5 expects queries in the form
    # "Instruct: <task description>\nQuery: <query>" rather than "query: ...".
    task = "Given a web search query, retrieve relevant passages that answer the query"
    query_embed = model.encode(
        f"Instruct: {task}\nQuery: {query}",
        convert_to_numpy=True,
        normalize_embeddings=True,  # unit length, so L2 ranking matches cosine
    )
    # FAISS expects a 2-D batch of shape (n_queries, dim)
    query_embed = np.expand_dims(query_embed, axis=0)

    _, indices = index.search(query_embed, k)

    results = []
    for idx in indices[0]:
        # FAISS pads with -1 when fewer than k neighbours exist; without this
        # guard iloc[-1] would silently return the LAST document as a hit.
        if idx < 0:
            continue
        result = id_map.iloc[idx]
        results.append({
            "document_id": result["document_id"],
            "title": result["title"],
            "text": result["text"]
        })

    return results

# --- Example Usage ---
# Run one sample search and print every hit as a numbered, labelled record.
query = "lottery ticket with unique rewards"
results = search_nft(query, k=3)

for i, doc in enumerate(results, start=1):
    # One print call per hit; sep="\n" puts each field on its own line
    print(
        f"\nResult {i}:",
        f"Document ID: {doc['document_id']}",
        f"Title: {doc['title']}",
        f"Text: {doc['text']}",
        sep="\n",
    )


In [None]:
# Second sample search: top five hits, printed as numbered, labelled records.
query = "gaming or animation"
results = search_nft(query, k=5)

for i, doc in enumerate(results, start=1):
    # One print call per hit; sep="\n" puts each field on its own line
    print(
        f"\nResult {i}:",
        f"Document ID: {doc['document_id']}",
        f"Title: {doc['title']}",
        f"Text: {doc['text']}",
        sep="\n",
    )

In [None]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer, models
import numpy as np
import torch

# Set device: MPS (Apple GPU) or CPU fallback
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"✅ Using device: {device}")

# Load your NFT CSV
df = pd.read_csv("solana_collections.csv")
# Combine title and text; missing values become empty strings
documents = (df["title"].fillna('') + " " + df["text"].fillna('')).tolist()

# Load the E5 *instruct* embedding model directly onto the selected device
model = SentenceTransformer("intfloat/multilingual-e5-large-instruct", device=device)

# Encode documents on the selected device.
# - The instruct variant of E5 expects documents WITHOUT any prefix (the
#   "passage: " prefix belongs to the non-instruct E5 models), so the raw text
#   is encoded as-is.
# - Embeddings are L2-normalised so that ranking by L2 distance in the flat
#   index below is equivalent to ranking by cosine similarity.
doc_embeddings = model.encode(
    documents,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
    device=device  # critical for MPS support
)

# Create FAISS index (CPU only): exact, brute-force L2 search
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

# Map index to original documents: row position i in id_map corresponds to the
# i-th vector added to the index, hence the positional reset_index.
id_map = df[["document_id", "title", "text"]].reset_index(drop=True)

# --- Search Function ---
def search_nft(query: str, k: int = 5):
    """Return up to ``k`` documents from the FAISS index closest to ``query``.

    Relies on the module-level ``model`` (embedding model), ``device``,
    ``index`` (FAISS index) and ``id_map`` (DataFrame whose row positions match
    the index).

    NOTE(review): an earlier cell defines a function with the same name;
    whichever cell ran last is the one in effect.

    Args:
        query: Free-text search query.
        k: Maximum number of hits to return. Values <= 0 yield an empty list.

    Returns:
        A list of dicts with keys ``document_id``, ``title`` and ``text``, in
        the order returned by the index (nearest first). The list is shorter
        than ``k`` when the index holds fewer than ``k`` vectors.
    """
    if k <= 0:
        return []

    # The instruct variant of E5 expects queries in the form
    # "Instruct: <task description>\nQuery: <query>" rather than "query: ...".
    task = "Given a web search query, retrieve relevant passages that answer the query"
    query_embed = model.encode(
        f"Instruct: {task}\nQuery: {query}",
        convert_to_numpy=True,
        normalize_embeddings=True,  # unit length, so L2 ranking matches cosine
        device=device
    )
    # FAISS expects a 2-D batch of shape (n_queries, dim)
    query_embed = np.expand_dims(query_embed, axis=0)

    _, indices = index.search(query_embed, k)

    results = []
    for idx in indices[0]:
        # FAISS pads with -1 when fewer than k neighbours exist; without this
        # guard iloc[-1] would silently return the LAST document as a hit.
        if idx < 0:
            continue
        result = id_map.iloc[idx]
        results.append({
            "document_id": result["document_id"],
            "title": result["title"],
            "text": result["text"]
        })

    return results

# --- Example Usage ---
# Run one sample search and print every hit as a numbered, labelled record.
query = "lottery ticket with essence rewards"
results = search_nft(query, k=3)

for i, doc in enumerate(results, start=1):
    # One print call per hit; sep="\n" puts each field on its own line
    print(
        f"\n🔎 Result {i}:",
        f"ID: {doc['document_id']}",
        f"Title: {doc['title']}",
        f"Text: {doc['text']}",
        sep="\n",
    )
