In [3]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.50.0-py3-none-any.whl.metadata (39 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.29.3-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl.metadata (3.8 kB)
Downloading sentence_transformers-3.4.1-py3-none-any.whl (275 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.9/275.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading huggingface_hub-0.29.3-py3-none-any.whl (468 kB)
[2K 

In [1]:
import os
import json
import pandas as pd

# Path to the directory with JSON files
folder_path = "/Users/cpysleeper/comp631_proj/data_solana/separated_collections"

# Initialize a list to hold records
records = []

# Loop through all JSON files in the directory
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as file:
            try:
                data = json.load(file)
                records.append({
                    "title": data.get("name", ""),
                    "text": data.get("description", "")
                })
            except json.JSONDecodeError:
                print(f"⚠️ Skipping invalid JSON: {filename}")

# Create DataFrame and add document_id as a column
df = pd.DataFrame(records)
df.insert(0, "document_id", range(len(df)))  # Add integer index as a column

# Save to CSV
df.to_csv("solana_collections.csv", index=False)

print("✅ Data has been extracted to 'solana_collections.csv' with document_id.")


✅ Data has been extracted to 'solana_collections.csv' with document_id.


In [4]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

# Load your NFT collection CSV
df = pd.read_csv("solana_collections.csv")

# Combine title and text for better embeddings
documents = (df["title"].fillna('') + " " + df["text"].fillna('')).tolist()

# Load the multilingual E5 model
model = SentenceTransformer("intfloat/multilingual-e5-large-instruct")

# Add instruction prefix for embedding queries and documents (E5-specific)
doc_embeddings = model.encode(
    [f"passage: {doc}" for doc in documents],
    convert_to_numpy=True,
    show_progress_bar=True
)

# Create FAISS index
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

# Map index to original document IDs
id_map = df[["document_id", "title", "text"]].reset_index(drop=True)

# --- SEARCH FUNCTION ---
def search_nft(query: str, k: int = 5):
    query_embed = model.encode(f"query: {query}", convert_to_numpy=True)
    query_embed = np.expand_dims(query_embed, axis=0)
    
    distances, indices = index.search(query_embed, k)
    
    results = []
    for idx in indices[0]:
        result = id_map.iloc[idx]
        results.append({
            "document_id": result["document_id"],
            "title": result["title"],
            "text": result["text"]
        })
    
    return results

# --- Example Usage ---
query = "lottery ticket with unique rewards"
results = search_nft(query, k=3)

for i, doc in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(f"Document ID: {doc['document_id']}")
    print(f"Title: {doc['title']}")
    print(f"Text: {doc['text']}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/140k [00:00<?, ?B/s]

sentence_xlm-roberta_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Batches:   0%|          | 0/1240 [00:00<?, ?it/s]


Result 1:
Document ID: 20543
Title: monies TIX
Text: Essence lottery tickets

Result 2:
Document ID: 31037
Title: triggered TIX
Text: Essence lottery tickets

Result 3:
Document ID: 15551
Title: zombie lottery
Text: biggest lottery


In [10]:
query = "gaming or animation"
results = search_nft(query, k=5)

for i, doc in enumerate(results, 1):
    print(f"\nResult {i}:")
    print(f"Document ID: {doc['document_id']}")
    print(f"Title: {doc['title']}")
    print(f"Text: {doc['text']}")


Result 1:
Document ID: 33812
Title: Anime Art
Text: A collection of anime art
JPG and Animation

Result 2:
Document ID: 25554
Title: Gamer
Text: Game fantastics

Result 3:
Document ID: 25563
Title: Childhood Game
Text: Collections of animated pixel art, based on childhood game.

Result 4:
Document ID: 24993
Title: Genjitsu in Pixels
Text: Visions from my mind, thoughts, brought to life in pixels on a screen...

Result 5:
Document ID: 24013
Title: SC art space
Text: 1/1 cartoon 2D art



In [6]:
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer, models
import numpy as np
import torch

# Set device: MPS (Apple GPU) or CPU fallback
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"✅ Using device: {device}")

# Load your NFT CSV
df = pd.read_csv("solana_collections.csv")
documents = (df["title"].fillna('') + " " + df["text"].fillna('')).tolist()

# Load the E5 embedding model with MPS support
model = SentenceTransformer("intfloat/multilingual-e5-large-instruct")
model.to(device)

# Encode documents using MPS backend
doc_embeddings = model.encode(
    [f"passage: {doc}" for doc in documents],
    convert_to_numpy=True,
    show_progress_bar=True,
    device=device  # critical for MPS support
)

# Create FAISS index (CPU only)
dimension = doc_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(doc_embeddings)

# Map index to original documents
id_map = df[["document_id", "title", "text"]].reset_index(drop=True)

# --- Search Function ---
def search_nft(query: str, k: int = 5):
    query_embed = model.encode(
        f"query: {query}",
        convert_to_numpy=True,
        device=device
    )
    query_embed = np.expand_dims(query_embed, axis=0)
    
    distances, indices = index.search(query_embed, k)
    
    results = []
    for idx in indices[0]:
        result = id_map.iloc[idx]
        results.append({
            "document_id": result["document_id"],
            "title": result["title"],
            "text": result["text"]
        })
    
    return results

# --- Example Usage ---
query = "lottery ticket with essence rewards"
results = search_nft(query, k=3)

for i, doc in enumerate(results, 1):
    print(f"\n🔎 Result {i}:")
    print(f"ID: {doc['document_id']}")
    print(f"Title: {doc['title']}")
    print(f"Text: {doc['text']}")


✅ Using device: mps


Batches:   0%|          | 0/1240 [00:00<?, ?it/s]

KeyboardInterrupt: 