In [3]:
import pickle
import json
import pandas as pd
import ast

In [4]:
# Load the pickled embeddings object from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this against a file from a trusted source.
with open("embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

# Space-separated table; the code below reads its org_id and remap_id columns.
df_map = pd.read_csv("item_list.txt", sep=" ")
# Lookup keyed by the stringified remap_id, yielding the stringified org_id.
id_map = dict(zip(map(str, df_map.remap_id), map(str, df_map.org_id)))


import json
import ast
import re
from tqdm import tqdm

def fix_line(line: str):
    """Try to repair the most common defects in an Amazon metadata JSON line.

    Two repairs are attempted:

    * the HTML entity ``&amp;`` is replaced with ``&``;
    * when the line contains more single quotes than double quotes,
      single-quoted strings that follow a colon or whitespace are re-quoted
      with double quotes, and the bare Python literals ``None`` / ``True`` /
      ``False`` are replaced with ``null`` / ``true`` / ``false``.

    The result is a best-effort guess and is not guaranteed to be valid JSON.

    Args:
        line: One raw line of the metadata file.

    Returns:
        The (possibly) repaired line as a string.
    """
    # Decode the HTML entity back into a plain ampersand.
    line = line.replace("&amp;", "&")

    # Only touch the quoting when single quotes dominate the line.
    if line.count('"') < line.count("'"):
        # Re-quote '...' as "..." when the opening quote follows ':' or
        # whitespace. A quote directly after '{', '[' or ',' is left alone by
        # this lookbehind, and apostrophes inside words are not matched.
        line = re.sub(r"(?<=[:\s])'([^']*)'", r'"\1"', line)
        # Translate Python literals to JSON ones. Whole-word matching keeps
        # words that merely contain these names (e.g. "Nonetheless") intact.
        py_to_json = {"None": "null", "True": "true", "False": "false"}
        line = re.sub(
            r"\b(None|True|False)\b",
            lambda match: py_to_json[match.group(1)],
            line,
        )
    return line

def try_parse(line: str):
    """Parse a single line, falling back through progressively looser methods.

    Order of attempts:
      1. ``json.loads`` on the line as given;
      2. ``json.loads`` on the output of ``fix_line``;
      3. ``ast.literal_eval`` on the line as given.

    Returns the parsed value, or None when every attempt fails.
    """
    # Strict JSON first: the untouched line, then the repaired one.
    for use_repair in (False, True):
        try:
            candidate = fix_line(line) if use_repair else line
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    # Last resort: treat the line as a Python literal.
    try:
        parsed = ast.literal_eval(line)
    except Exception:
        parsed = None
    return parsed

def read_meta_file(path: str):
    """Read a line-delimited metadata file and keep the usable records.

    Every line is passed to ``try_parse``. A line is kept only when it parses
    to a dict containing both an "asin" and a "categories" key; all other
    lines are skipped silently.

    Args:
        path: Path of the text file to read, one record per line.

    Returns:
        A list of the retained dict records, in file order.
    """
    data = []
    with open(path, "r") as f:
        for line in tqdm(f):
            obj = try_parse(line)
            # try_parse can yield any literal (number, list, string, None).
            # A membership test on those would either raise TypeError or
            # mean something else (substring / element lookup), so only
            # dict results are treated as records.
            if not isinstance(obj, dict):
                continue
            if "asin" not in obj or "categories" not in obj:
                continue
            data.append(obj)
    return data

# Usage example: load the metadata file, report how many records were kept,
# and show the first one.
meta = read_meta_file("meta_Sports_and_Outdoors.json")
record_count = len(meta)
print(f"Wczytano {record_count} rekordów.")
print(meta[0])


532197it [01:12, 7348.19it/s] 

Wczytano 532197 rekordów.
{'asin': '0000032069', 'title': 'Adult Ballet Tutu Cheetah Pink', 'price': 7.89, 'imUrl': 'http://ecx.images-amazon.com/images/I/51EzU6quNML._SX342_.jpg', 'related': {'also_bought': ['0000032050', 'B00D0DJAEG', '0000032042', 'B00D0F450I', 'B00D2JTMS2', 'B00D0FDUAY', 'B00D2JSRFQ', '0000032034', 'B00D0D5F6S', 'B00D2JRWWA', 'B00D0FIIJM', 'B00D0FCQQI', 'B00EXVN9PU', 'B0041EOTJO', 'B004PYEE8G', 'B001GTKPDQ', 'B00EON0SJ2', 'B005HMHOQ4', 'B002XZMGGQ'], 'also_viewed': ['B00D0F450I', '0000032050', 'B00D2JTMS2', '0000032042', 'B004PYEE8G', 'B00JHNSNSM', 'B00D0DJAEG', 'B00D2JSRFQ', 'B00D0FCQQI', 'B00D2JRWWA', 'B003AVNY6I', 'B0071KR2LC', 'B00GOR07RE', 'B00D0FIIJM', 'B005F50FXC', 'B0079MCIMU', 'B00D0FDUAY', 'B00H3RYN3I', 'B005C4Y4F6', 'B007IEFT84', 'B00D0D5F6S', 'B002BZX8Z6', 'B00JHONN1S', 'B008F0SU0Y', 'B00FNNFXAG', 'B007R2RM8W', 'B007VM3AMK', 'B00C0PLENA', 'B00BJGG6VG', 'B00E1YRI4C', 'B00IIK61WA', 'B009UC638W', 'B00KZN6RVI', 'B00CSFEENY', 'B002GZGI4E', 'B00HSOJJ94', 'B00




In [5]:
# Build a DataFrame from the parsed metadata records.
df = pd.DataFrame(meta)
# NOTE(review): the recorded run of this cell fails here with
# KeyError: 'embedding_idx'. No visible cell adds an "embedding_idx" column to
# df; presumably it should be derived from the asin via id_map - TODO confirm
# the intended mapping and how `embeddings` is indexed.
X = [embeddings[idx] for idx in df["embedding_idx"]]
# NOTE(review): "sub_cat" is likewise not created in any visible cell;
# presumably it is meant to come from the "categories" field - TODO confirm.
y = df["sub_cat"]

KeyError: 'embedding_idx'

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing, stratified on the labels, with a
# fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a 5-nearest-neighbour classifier once per distance metric and report
# the accuracy each one reaches on the held-out split.
for metric in ("cosine", "euclidean", "manhattan"):
    # fit() returns the estimator itself, so construction and fitting chain.
    knn = KNeighborsClassifier(n_neighbors=5, metric=metric).fit(X_train, y_train)
    preds = knn.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"{metric=}, accuracy={acc:.3f}")