In [7]:
import pickle

import numpy as np
from fastembed import TextEmbedding
from pydantic_settings import BaseSettings, SettingsConfigDict
from sklearn.decomposition import PCA

from tagmatch.vec_db import VecDB

class Settings(BaseSettings):
    """Notebook configuration, populated from the environment / ``.env`` file.

    No field declares a default, so every value must be supplied; otherwise
    instantiation raises a validation error.
    """

    # pydantic v2 style configuration, replacing the deprecated nested
    # ``class Config``.  ``protected_namespaces`` is narrowed so that the
    # ``model_name`` field does not collide with pydantic's reserved
    # ``model_`` prefix (which otherwise triggers a warning on some versions).
    model_config = SettingsConfigDict(
        env_file=".env",
        protected_namespaces=("settings_",),
    )

    model_name: str
    cache_dir: str
    qdrant_host: str
    qdrant_port: int
    qdrant_collection: str  # collection that receives the PCA-reduced vectors
    reduced_embed_dim: int  # PCA component count and reduced vector size


settings = Settings()





In [2]:
# Space-themed tag vocabulary that is embedded, reduced and indexed below.
# (The original used a redundant chained assignment, `tags = tags = [...]`.)
tags = [
    "Apollo 11",
    "Moon",
    "Spacecraft",
    "Orbiter",
    "Lander",
    "Rover",
    "Probe",
    "Shuttle",
    "Satellite",
    "Space Station",
    "Space Capsule",
    "Space Telescope",
    "Interstellar",
    "Extraterrestrial",
    "NASA",
    "ESA",
    "Roscosmos",
    "ISRO",
    "CNSA",
    "Launch Vehicle",
    "Rocket",
    "Booster",
    "Thruster",
    "Propulsion",
    "Fuel",
    "Solar Panel",
    "Antenna",
    "Docking",
    "Command Module",
    "Service Module",
    "Heat Shield",
    "Parachute",
    "Escape System",
    "Crewed Mission",
    "Uncrewed Mission",
    "Space Exploration",
    "Deep Space",
    "Low Earth Orbit",
    "Geostationary Orbit",
    "Spacewalk",
    "Extravehicular Activity",
    "Gravity Assist",
    "Orbital Insertion",
    "Reentry",
    "Landing",
    "Mars Mission",
    "Moon Mission",
    "Venus Mission",
    "Jupiter Mission",
    "Saturn Mission",
    "Asteroid Belt",
    "Kuiper Belt",
    "Oort Cloud",
    "Exoplanet",
    "Habitat",
    "Colonization",
    "Terraforming",
    "Space Mining",
    "Space Tourism",
    "Commercial Spaceflight",
    "Reusable Rocket",
    "Launch Pad",
    "Mission Control",
    "Telemetry",
    "Trajectory",
    "Orbital Mechanics",
    "Thrust-to-Weight Ratio",
    "Payload",
    "Fairing",
    "Stage Separation",
    "Vacuum",
    "Microgravity",
    "Artificial Gravity",
    "Radiation Shielding",
    "Life Support",
    "Space Suit",
    "EVA Suit",
    "MMU",
    "Space Habitat",
    "Space Colony",
    "Lunar Base",
    "Mars Base",
    "Astrobiology",
    "Astrophysics",
    "Cosmology",
    "Stellar Navigation",
    "In-Situ Resource Utilization",
    "Cryogenic Fuel",
    "Ion Propulsion",
    "Plasma Propulsion",
    "Fusion Propulsion",
    "Antimatter Propulsion",
    "Space Debris",
    "Orbital Decay",
    "Space Junk",
    "Space Weather",
    "Solar Flare",
    "Cosmic Rays",
    "Van Allen Belts",
    "Magnetosphere",
    "Atmospheric Entry",
    "Suborbital Flight",
    "Trans-lunar Injection",
    "Trans-Martian Injection",
    "Aerobraking",
    "Gravity Well",
    "Lagrange Points",
    "Hohmann Transfer",
    "Biological Contamination",
    "Planetary Protection",
    "Deep Space Network",
    "Space Communications",
    "Interplanetary Travel",
    "Intergalactic Travel",
    "Space Elevator",
    "Launch Window",
    "Mission Timeline",
    "Spacecraft Design",
    "Spacecraft Testing",
    "Spacecraft Manufacturing",
    "Rocket Engine",
    "Liquid Fuel",
    "Solid Fuel",
    "Hybrid Fuel",
    "Cryogenics",
    "Space Law",
    "Space Policy",
    "Astronaut",
    "Cosmonaut",
    "Taikonaut",
    "Space Program",
    "Space Agency",
    "Space Race",
    "Space Treaty",
    "Outer Space",
    "Space History",
    "Space Future",
    "Space Economy",
    "Space Business",
    "Space Industry",
    "Space Conference",
    "Space Symposium",
    "Space Research",
    "Space Innovation",
    "Space Technology",
    "Space Science",
    "Space Medicine",
    "Space Psychology",
    "Space Education",
    "Space Training",
    "Space Simulation",
    "Zero Gravity",
    "Microgravity Research",
    "Space Experiment",
    "Space Laboratory",
    "Space Observatory",
    "Space Exploration Rover",
    "Mars Rover",
    "Lunar Rover",
    "Space Probe",
    "Deep Space Probe",
    "Interstellar Probe",
    "Planetary Rover",
    "Solar System",
    "Galaxy",
    "Universe",
    "Black Hole",
    "Wormhole",
    "White Dwarf",
    "Red Giant",
    "Neutron Star",
    "Pulsar",
    "Quasar",
    "Supernova",
    "Dark Matter",
    "Dark Energy",
    "Big Bang",
    "Cosmic Microwave Background",
    "Gravitational Waves",
    "Space Art",
    "Space Culture",
    "Science Fiction",
    "Space Opera",
    "Starship",
    "Alien Life",
    "UFO",
    "Space Documentary",
    "Space Movie",
    "Space TV Show",
]

In [3]:
# Build the embedder with the library's defaults and embed one throw-away
# sentence purely to learn the length of the vectors it produces.
# NOTE(review): settings.model_name / settings.cache_dir are not passed
# here -- confirm the default model is the intended one.
embedding_model = TextEmbedding()
probe_vector = next(iter(embedding_model.embed("Test for dims")))
embedding_dim: int = probe_vector.shape[0]

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 58743.75it/s]


In [9]:
# Embed all tags in a single call so the model can batch the inputs,
# instead of invoking it once per tag.  Each yielded vector is 1-D, so
# stacking them gives one row per tag in the same order as `tags`; the
# array is already 2-D and a later `.squeeze()` leaves it unchanged.
tag_embeddings = list(embedding_model.embed(tags))
embed_vec = np.asarray(tag_embeddings)

In [10]:
# Drop any length-1 axes so that each row holds one tag's embedding.
embed_vec = np.squeeze(embed_vec)

In [11]:
# Fit a PCA on the tag embeddings, keeping the configured number of components.
pca = PCA(n_components=settings.reduced_embed_dim).fit(embed_vec)

# Share of the total variance retained by the kept components.
sum(pca.explained_variance_ratio_)

0.9513890146045014

In [45]:
# Vector store for the PCA-reduced embeddings.
# NOTE(review): host and port are hard-coded although settings.qdrant_host /
# settings.qdrant_port exist -- confirm whether they should be used here.
reduced_db_params = dict(
    host="http://localhost",
    port=6333,
    collection=settings.qdrant_collection,
    vector_size=settings.reduced_embed_dim,
)
vec_db = VecDB(**reduced_db_params)

In [46]:
# Project the full-size embeddings into the fitted PCA space.  No cell in the
# notebook defined `red_embed_vecs`, so on a fresh kernel this cell (and the
# query cell after it) failed with NameError.
red_embed_vecs = pca.transform(embed_vec)

# Store every reduced vector with its tag name as the payload.
for name, vec in zip(tags, red_embed_vecs):
    vec_db.store(vec, {"name": name})

In [47]:
# Query the reduced collection once per reduced vector, asking for 5 results each.
reduced_vector_matches = [
    vec_db.find_closest(reduced_vec, 5)
    for reduced_vec in red_embed_vecs
]

In [60]:
# Show the matches returned for the first query vector (index 0) in the reduced collection.
reduced_vector_matches[0]

[ScoredPoint(id=10356360874192234358, version=0, score=0.99999994, payload={'name': 'Apollo 11'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=9793711092478664242, version=46, score=0.3996141, payload={'name': 'Moon Mission'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=12207098595657543863, version=1, score=0.3779359, payload={'name': 'Moon'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=12894050092224864253, version=80, score=0.37239692, payload={'name': 'Lunar Base'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=11172108735014080718, version=49, score=0.28158426, payload={'name': 'Saturn Mission'}, vector=None, shard_key=None, order_value=None)]

In [56]:
# Second vector store holding the unreduced embeddings, kept in its own
# "full" collection so results can be compared with the reduced one.
full_db_params = dict(
    host="http://localhost",
    port=6333,
    collection="full",
    vector_size=embedding_dim,
)
vec_db_full = VecDB(**full_db_params)

In [57]:
# Store every full-size embedding with its tag name as the payload.
for name, vec in zip(tags, embed_vec):
    tag_payload = {"name": name}
    vec_db_full.store(vec, tag_payload)

In [58]:

# Query the full-size collection once per embedding, asking for 5 results each.
vector_matches = [
    vec_db_full.find_closest(full_vec, 5)
    for full_vec in embed_vec
]


In [59]:
# Show the matches returned for the first query vector (index 0) in the full-size collection.
vector_matches[0]

[ScoredPoint(id=13583038290980744031, version=0, score=0.9999999, payload={'name': 'Apollo 11'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=9869776390968345825, version=46, score=0.7471009, payload={'name': 'Moon Mission'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=9306135850958137005, version=1, score=0.72500336, payload={'name': 'Moon'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=13627112133842308861, version=80, score=0.7157799, payload={'name': 'Lunar Base'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=13746131591790535275, version=14, score=0.6981933, payload={'name': 'NASA'}, vector=None, shard_key=None, order_value=None)]

In [61]:
# Persist the fitted PCA.  The context manager guarantees the file is flushed
# and closed; the original left the handle returned by open() unclosed.
with open("pca.pkl", "wb") as pca_file:
    pickle.dump(pca, pca_file)