In [27]:
import pandas as pd

# Load the cleaned product tables and the reference list of known brands.
keyboards_df = pd.read_csv("keyboards_clean2.csv")
monitors_df = pd.read_csv("monitors_clean2.csv")
mice_df = pd.read_csv("mice_clean2.csv")
brands_df = pd.read_csv("brands.csv")

# Get all brands from product CSVs.
# Missing values are dropped first: a NaN would otherwise be counted as a
# brand and make sorted() below fail when it compares a float with a str.
product_brands = []
for df in [keyboards_df, monitors_df, mice_df]:
    product_brands.extend(df["Brand"].dropna().tolist())

product_brands_unique = set(product_brands)
existing_brands = set(brands_df["brand_name"].dropna().tolist())

# Find brands that are in products but NOT in brands.csv.
# NOTE(review): the comparison is exact (case- and whitespace-sensitive) —
# confirm both sources spell brand names identically.
missing_brands = product_brands_unique - existing_brands

print("Missing brands from brands.csv:")
for brand in sorted(missing_brands):
    print(f"{brand}")

print(f"\nTotal missing: {len(missing_brands)}")
print(f"Total unique product brands: {len(product_brands_unique)}")
print(f"Total brands in brands.csv: {len(existing_brands)}")

Missing brands from brands.csv:
3Dconnexion
AOC
ATTACK SHARK
Acer
Anker
BenQ
COUGAR
DELUX
DREVO
ELECOM
EQEOVGA
Endgame Gear
Fantech
Finalmouse
G-Wolves
GameBall
Gamesense
Gigabyte
HK Gaming
HP
Havit
Hitscan
INNOCN
J-Tech Digital
LAMZU
LG
Lepow
MSI
Mad Catz
Marsback
Mobile Pixels
Ninjutso
Nixeus
Philips
Pixio
Ploopy
Pulsar
Pwnage
Samsung
Sceptre
Scyrox
Sharkoon
Swiftpoint
UtechSmart
VGN
VXE
Vancer
Vaxee
Vegcoo
VicTsing
ViewSonic
WLmouse
XTRFY
Xenics
Xiaomi
Zaunkoenig
espresso

Total missing: 57
Total unique product brands: 122
Total brands in brands.csv: 65


In [None]:
import json
import re

# Read the raw guide. The explicit encoding matches the one used when
# writing texts.json below; the context manager closes the file itself.
with open("Guides-2.txt", "r", encoding="utf-8") as f:
    texts = f.readlines()


def _spec_record(category, spec, text):
    """Build the JSON record for one finished spec section.

    Any category whose name contains "key" is stored as "keyboards";
    every other category name is simply lower-cased.
    """
    category_name = category.lower()
    if "key" in category_name:
        category_name = "keyboards"
    return {
        "type": "spec",
        "category_name": category_name,
        "spec_name": spec.lower(),
        "text": "\n".join(text)
    }


# Line conventions handled by the loop:
#   a line containing "**"  -> category header
#   a line containing "*"   -> spec header inside the current category
#   anything else           -> body text of the current spec
# NOTE(review): any body line that contains "*" is treated as a header —
# confirm the guide never uses asterisks inside body text.
category = None
spec = None
text = []
json_data = []
for line in texts:
    if "**" in line:
        # Flush the pending spec under the category it was read in; without
        # this, the last spec of each category is attributed to the next one.
        if category and spec:
            json_data.append(_spec_record(category, spec, text))
        category = line.replace("*", "").strip()
        spec = None
        text = []
    elif "*" in line:
        if category and spec:
            json_data.append(_spec_record(category, spec, text))
        spec = line.replace("*", "").strip()
        text = []
    else:
        text.append(line)

# Flush the trailing spec with the same record shape as every other entry
# (previously it lacked "type" and used different normalisation).
if category and spec and text:
    json_data.append(_spec_record(category, spec, text))

with open("texts.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=2)

In [None]:
from perplexity import Perplexity
from dotenv import load_dotenv
import os

# Load environment variables (presumably from a local .env file) so that
# PERPLEXITY_KEY is available below.
load_dotenv()

perplexity_key = os.environ.get("PERPLEXITY_KEY")
client = Perplexity(api_key=perplexity_key)

# NOTE(review): the query is an empty placeholder string — fill it in before running.
search = client.search.create(query=[""])

# Print the snippet of the first search result only, if there is one.
# (Titles and URLs are available as result.title / result.url.)
result = next(iter(search.results), None)
if result is not None:
    print(result.snippet)

# Today we are launching Comet.

Comet is a web browser built for today’s internet. In the last 30 years, the internet has evolved from something we simply “browse” or “search.” The internet is where we live, work, and connect.

It’s also where we ask questions.

Curious minds have questions everywhere, and they find answers on every page, in every idea, through every task. Yet we've been trapped in long lines of tabs and hyperlinks, disjointed experiences that interrupt our natural flow of thought.

In other words, the internet has become humanity's extended mind while our tools for using it remain primitive. Our interface for the web should be as fluid and responsive as human thought itself.

We built Comet to let the internet do what it has been begging to do: to amplify our intelligence.

### From Navigation To Cognition

Comet powers a shift from browsing to thinking.

Tabs that piled up waiting for your return now join one intelligent interface that understands how your mind work

## Create VectorDB

In [14]:
import os
import json
# Load the parsed guide sections; the file is written as UTF-8, so read it
# back with the same encoding.
with open(os.path.join("guide", "texts.json"), "r", encoding="utf-8") as f:
    data = json.load(f)

# One document, one metadata dict and one id per guide section, index-aligned.
documents = [item["text"] for item in data]
metadatas = [
    {
        # Records written without a "type" key would raise KeyError here;
        # "spec" is the only type the guide parser emits, so use it as default.
        "type": item.get("type", "spec"),
        "category_name": item["category_name"],
        "spec_name": item["spec_name"]
    }
    for item in data
]
ids = [f"doc_{i}" for i in range(len(data))]

In [1]:
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables before the client is constructed.
# NOTE(review): presumably reads a local .env file — confirm where it lives.
load_dotenv()

# Initialize OpenAI client
# NOTE(review): no api_key is passed, so the client presumably takes its
# credentials from the environment (e.g. OPENAI_API_KEY) — confirm.
openai_client = OpenAI()

def get_embeddings(texts, model="text-embedding-3-small", batch_size=2048):
    """Return one embedding vector per input text, in input order.

    Args:
        texts: A single string or a sequence of strings to embed.
        model: Name of the embedding model passed to the API.
        batch_size: Maximum number of texts sent per request. The default
            matches the documented per-request array limit of the embeddings
            endpoint. NOTE(review): the API also caps total tokens per
            request, which is not checked here — lower batch_size for long texts.

    Returns:
        A list of embedding vectors (lists of floats); empty when `texts`
        is empty, in which case no request is made.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # A bare string is one input; slicing it below would split it into pieces.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        response = openai_client.embeddings.create(
            input=texts[start:start + batch_size],
            model=model
        )
        embeddings.extend(d.embedding for d in response.data)
    return embeddings

In [2]:
import chromadb

# Open a Chroma client backed by the "vectorDB" path.
# NOTE(review): the path is relative, so it presumably resolves against the
# notebook's working directory — confirm.
chroma_client = chromadb.PersistentClient(path="vectorDB")

# Create (or get) a collection
# The collection named "electronics" is reused if it already exists.
collection = chroma_client.get_or_create_collection(name="electronics")

In [None]:
embeddings = get_embeddings(documents)

# Every document must have exactly one vector before anything is stored.
assert len(embeddings) == len(documents), "embedding count does not match document count"

# Upsert instead of add: the ids are deterministic ("doc_0", "doc_1", ...)
# and the collection is persistent, so re-running this cell with add() would
# not refresh entries whose id already exists. upsert() inserts new ids and
# overwrites existing ones, which keeps the cell idempotent.
# NOTE(review): ids left over from an earlier, longer run are not removed.
collection.upsert(
    embeddings=embeddings,
    documents=documents,
    metadatas=metadatas,
    ids=ids
)

In [1]:
import pandas as pd
import os

import re
from decimal import Decimal, ROUND_HALF_UP


def _is_blank(val):
    """Return True for cells that carry no value: NaN/None, '' or the text 'nan'."""
    return bool(pd.isna(val)) or val == '' or val == 'nan'


def _leading_number_text(val):
    """Return the first unsigned number found in `val` as text, or None.

    Only the part before the first '/' is searched. Per the original inline
    comment this simulates the clean_value function used elsewhere in the
    project (not visible here).
    """
    s = str(val).strip()
    if '/' in s:
        s = s.split('/')[0].strip()
    match = re.search(r'\d+(\.\d+)?', s)
    return match.group(0) if match else None


def _decimal_spec(sql_type):
    """Return (precision, scale) for a 'DECIMAL(p, s)' type string, else None."""
    match = re.fullmatch(r'DECIMAL\((\d+),\s*(\d+)\)', sql_type)
    if match is None:
        return None
    return int(match.group(1)), int(match.group(2))


def _fits_decimal(number_text, precision, scale):
    """Return True if the number fits DECIMAL(precision, scale) after rounding to `scale`.

    NOTE(review): assumes the database rounds to the declared scale on insert
    (as PostgreSQL does) rather than rejecting extra fractional digits — confirm.
    """
    quantum = Decimal(1).scaleb(-scale)
    rounded = Decimal(number_text).quantize(quantum, rounding=ROUND_HALF_UP)
    return rounded < Decimal(10) ** (precision - scale)


def validate_monitor_data(df):
    """
    Validate monitor CSV data against SQL schema types.

    Each column listed in the schema mapping is checked as follows:
      * int / float: the value must contain a number; for DECIMAL(p, s)
        columns the number must also fit the declared precision.
      * bool: the value must be one of yes/no/true/false/1/0/t/f
        (case-insensitive).
      * str: the value must not be longer than the VARCHAR limit.
    Blank cells (NaN, '' or 'nan') are skipped. Progress and findings are
    printed while validating, followed by a summary.

    Args:
        df: DataFrame holding the monitor data, one column per CSV column.

    Returns:
        A list of problem descriptions (strings); empty when nothing was found.
    """

    # Define expected types for each column
    schema_mapping = {
        # Physical specifications
        'Size (inch)': ('float', 'DECIMAL(4, 1)'),
        'Curve Radius': ('str', 'VARCHAR(20)'),
        'Wall Mount': ('str', 'VARCHAR(50)'),
        'Borders Size (cm)': ('float', 'DECIMAL(4, 2)'),

        # Performance ratings
        'Brightness': ('float', 'DECIMAL(3, 1)'),
        'Response Time': ('float', 'DECIMAL(3, 1)'),
        'HDR Picture': ('float', 'DECIMAL(3, 1)'),
        'SDR Picture': ('float', 'DECIMAL(3, 1)'),
        'Color Accuracy': ('float', 'DECIMAL(3, 1)'),

        # Display technology
        'Pixel Type': ('str', 'VARCHAR(20)'),
        'Subpixel Layout': ('str', 'VARCHAR(15)'),
        'Backlight': ('str', 'VARCHAR(50)'),
        'Color Depth (Bit)': ('int', 'INTEGER'),

        # Contrast
        'Native Contrast': ('float', 'DECIMAL(8, 1)'),
        'Contrast With Local Dimming': ('float', 'DECIMAL(8, 1)'),
        'Local Dimming': ('bool', 'BOOLEAN'),

        # Brightness measurements
        'SDR Real Scene (cd/m2)': ('float', 'DECIMAL(6, 2)'),
        'SDR Peak 100% Window (cd/m2)': ('float', 'DECIMAL(6, 2)'),
        'SDR Sustained 100% Window (cd/m2)': ('float', 'DECIMAL(6, 2)'),
        'HDR Real Scene (cd/m2)': ('float', 'DECIMAL(6, 2)'),
        'HDR Peak 100% Window (cd/m2)': ('float', 'DECIMAL(6, 2)'),
        'HDR Sustained 100% Window (cd/m2)': ('float', 'DECIMAL(6, 2)'),
        'Minimum Brightness (cd/m2)': ('float', 'DECIMAL(6, 2)'),

        # Viewing angles
        'Color Washout From Left (degrees)': ('int', 'INTEGER'),
        'Color Washout From Right (degrees)': ('int', 'INTEGER'),
        'Color Shift From Left (degrees)': ('int', 'INTEGER'),
        'Color Shift From Right (degrees)': ('int', 'INTEGER'),
        'Brightness Loss From Left (degrees)': ('int', 'INTEGER'),
        'Brightness Loss From Right (degrees)': ('int', 'INTEGER'),
        'Black Level Raise From Left (degrees)': ('int', 'INTEGER'),
        'Black Level Raise From Right (degrees)': ('int', 'INTEGER'),

        # Color accuracy
        'Black Uniformity Native (Std Dev)': ('float', 'DECIMAL(6, 3)'),
        'White Balance (dE)': ('float', 'DECIMAL(4, 2)'),

        # Refresh rate
        'Native Refresh Rate (Hz)': ('int', 'INTEGER'),
        'Max Refresh Rate (Hz)': ('int', 'INTEGER'),
        'Native Resolution': ('str', 'VARCHAR(20)'),
        'Aspect Ratio': ('str', 'VARCHAR(10)'),
        'Flicker-Free': ('bool', 'BOOLEAN'),

        # Connectivity
        'Max Refresh Rate Over HDMI (Hz)': ('int', 'INTEGER'),
        'DisplayPort': ('str', 'VARCHAR(50)'),
        'HDMI': ('str', 'VARCHAR(50)'),
        'USB-C Ports': ('int', 'INTEGER'),
    }

    problems = []

    for col_name, (dtype, sql_type) in schema_mapping.items():
        if col_name not in df.columns:
            problems.append(f"❌ Column '{col_name}' not found in CSV")
            continue

        print(f"\n{'='*60}")
        print(f"Validating: {col_name} (Expected: {sql_type})")
        print(f"{'='*60}")

        col_data = df[col_name]

        if dtype in ('int', 'float'):
            # None for INTEGER columns, (precision, scale) for DECIMAL ones.
            decimal_spec = _decimal_spec(sql_type)
            for idx, val in col_data.items():
                if _is_blank(val):
                    continue
                try:
                    number_text = _leading_number_text(val)
                    if number_text is None:
                        problems.append(f"❌ {col_name} Row {idx}: Cannot parse '{val}' as {dtype}")
                        print(f"  ⚠️  Row {idx}: '{val}' → FAILED")
                        continue

                    # Same conversions the loader performs; they raise on
                    # values that cannot be represented (e.g. overflow).
                    if dtype == 'int':
                        int(round(float(number_text)))
                    else:
                        float(number_text)

                    # A parseable number can still be too large for the
                    # declared DECIMAL(p, s) and would fail on insert.
                    if decimal_spec is not None and not _fits_decimal(number_text, *decimal_spec):
                        problems.append(f"❌ {col_name} Row {idx}: '{val}' does not fit {sql_type}")
                        print(f"  ⚠️  Row {idx}: '{val}' → OUT OF RANGE for {sql_type}")
                except Exception as e:
                    problems.append(f"❌ {col_name} Row {idx}: {val} → {e}")
                    print(f"  ⚠️  Row {idx}: '{val}' → ERROR: {e}")

        elif dtype == 'bool':
            for idx, val in col_data.items():
                if _is_blank(val):
                    continue
                val_str = str(val).strip().lower()
                if val_str not in ['yes', 'no', 'true', 'false', '1', '0', 't', 'f']:
                    problems.append(f"❌ {col_name} Row {idx}: '{val}' not a valid boolean")
                    print(f"  ⚠️  Row {idx}: '{val}' → INVALID BOOLEAN")

        elif dtype == 'str':
            # Check string length constraints
            if 'VARCHAR' in sql_type:
                max_len = int(sql_type.split('(')[1].split(')')[0])
                for idx, val in col_data.items():
                    if _is_blank(val):
                        continue
                    if len(str(val)) > max_len:
                        problems.append(f"❌ {col_name} Row {idx}: '{val}' exceeds max length {max_len}")
                        print(f"  ⚠️  Row {idx}: '{val}' → TOO LONG (len={len(str(val))})")

        print(f"✅ {col_name} validation complete")

    # Summary
    print(f"\n\n{'='*60}")
    print("VALIDATION SUMMARY")
    print(f"{'='*60}")

    if problems:
        print(f"\n❌ Found {len(problems)} problems:\n")
        for p in problems:
            print(f"  {p}")
    else:
        print("\n✅ All columns validated successfully!")

    return problems


# Load and validate
# Reads the cleaned monitor CSV from the data/ folder and runs the full
# validation; `problems` is the list of findings returned by the validator.
df = pd.read_csv(os.path.join('data', 'monitors_clean2.csv'))
problems = validate_monitor_data(df)

# Focus on first few rows if too many errors
# With more than 50 findings, validation is repeated on the first 10 rows
# only; the result is kept separately in `problems_test`.
if len(problems) > 50:
    print("\n⚠️  Too many errors! Testing first 10 rows only...")
    df_test = df.head(10)
    problems_test = validate_monitor_data(df_test)


Validating: Size (inch) (Expected: DECIMAL(4, 1))
  ⚠️  Row 13: 'Nano-Texture Glass and Tilt Adjustable / Nano-texture Glass Tilt and Height-Adjustable / Nano-texture Glass and VESA Mount Adapter / Standard Glass Tilt and Height-Adjustable / Standard Glass and Tilt Adjustable / Standard Glass and VESA Mount Adapter' → FAILED
  ⚠️  Row 183: 'Blue / Green / Pink / White' → FAILED
✅ Size (inch) validation complete

Validating: Curve Radius (Expected: VARCHAR(20))
✅ Curve Radius validation complete

Validating: Wall Mount (Expected: VARCHAR(50))
✅ Wall Mount validation complete

Validating: Borders Size (cm) (Expected: DECIMAL(4, 2))
✅ Borders Size (cm) validation complete

Validating: Brightness (Expected: DECIMAL(3, 1))
✅ Brightness validation complete

Validating: Response Time (Expected: DECIMAL(3, 1))
✅ Response Time validation complete

Validating: HDR Picture (Expected: DECIMAL(3, 1))
✅ HDR Picture validation complete

Validating: SDR Picture (Expected: DECIMAL(3, 1))
✅ SDR Picture