# NLP

In [10]:
import os, ast, re, json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from dotenv import load_dotenv

# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()

# Runtime configuration; each value can be overridden through the environment.
CSV_PATH = os.getenv("CSV_PATH", "intern_data_ikarus.csv")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV", "us-east-1-aws")
PINECONE_TEXT_INDEX = os.getenv("PINECONE_TEXT_INDEX", "products-text")

# Fail fast when a required input is missing.
assert os.path.exists(CSV_PATH), f"CSV not found: {CSV_PATH}"
assert PINECONE_API_KEY, "Missing PINECONE_API_KEY in environment"

df = pd.read_csv(CSV_PATH)

# Columns the rest of the notebook reads; any that the CSV lacks are created
# and filled with pd.NA so later cells can index them unconditionally.
_expected_cols = (
    "uniq_id", "title", "brand", "description", "price",
    "categories", "material", "color", "country_of_origin",
)
for _absent in [name for name in _expected_cols if name not in df.columns]:
    df[_absent] = pd.NA

# Ids are used as string keys downstream.
df["uniq_id"] = df["uniq_id"].astype(str)

print("Rows:", len(df))
df.head(3)


Rows: 312


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",GOYMFK,"multiple shoes, coats, hats, and other items E...",$24.99,"['Home & Kitchen', 'Storage & Organization', '...",['https://m.media-amazon.com/images/I/416WaLx1...,GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded
1,"subrtex Leather ding Room, Dining Chairs Set o...",subrtex,subrtex Dining chairs Set of 2,,"['Home & Kitchen', 'Furniture', 'Dining Room F...",['https://m.media-amazon.com/images/I/31SejUEW...,Subrtex Houseware INC,"18.5""D x 16""W x 35""H",,Sponge,Black,5938d217-b8c5-5d3e-b1cf-e28e340f292e
2,Plant Repotting Mat MUYETOL Waterproof Transpl...,MUYETOL,,$5.98,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/41RgefVq...,MUYETOL,"26.8""L x 26.8""W",,Polyethylene,Green,b2ede786-3f51-5a45-9a5b-bcf856958cd8


In [11]:
import numpy as np

def s(x) -> str:
    """Safe string conversion: missing values become '', everything else ``str(x)``.

    Treated as missing:
      * ``None``
      * ``pd.NA`` (this notebook fills absent CSV columns with ``pd.NA``, and
        ``str(pd.NA)`` would otherwise leak the literal ``"<NA>"`` into text)
      * floating-point NaN, for both Python floats and NumPy float scalars

    Parameters
    ----------
    x : any scalar value.

    Returns
    -------
    str
    """
    if x is None or x is pd.NA:
        return ""
    # np.floating covers float32/float16 scalars, which are not `float` subclasses.
    if isinstance(x, (float, np.floating)) and np.isnan(x):
        return ""
    return str(x)

def to_list(x):
    """Turn a list-like string or a comma-separated string into a list of non-empty strings.

    A value starting with ``[`` is first tried as a Python literal; if that
    parse (or cleaning its items) fails, the value falls through to plain
    comma splitting. Missing or blank input yields ``[]``.
    """
    text = s(x).strip()
    if text == "":
        return []

    if text[0] == "[":
        try:
            cleaned = []
            for item in ast.literal_eval(text):
                token = s(item).strip()
                if token:
                    cleaned.append(token)
            return cleaned
        except Exception:
            # Not a valid literal: treat it as a comma-separated string below.
            pass

    pieces = (chunk.strip() for chunk in text.split(","))
    return [chunk for chunk in pieces if chunk]

def to_float(x):
    """Extract the first number found in ``x`` as a float; NaN when there is none.

    Commas and the rupee sign are removed before searching, so a value such as
    ``"$1,299.50"`` yields ``1299.5``. ``None`` and float NaN return NaN.
    """
    missing = x is None or (isinstance(x, float) and np.isnan(x))
    if missing:
        return np.nan

    cleaned = s(x)
    for junk in (",", "₹"):
        cleaned = cleaned.replace(junk, "")

    hit = re.search(r"[-+]?\d*\.?\d+", cleaned.strip())
    if hit is None:
        return np.nan
    return float(hit.group(0))

# Per-column converters, applied in this order: categories -> list[str],
# free-text fields -> str (missing -> ''), price -> float (missing -> NaN).
_string_cols = ["title", "brand", "description", "material", "color", "country_of_origin"]
_converters = {
    "categories": to_list,
    **{name: s for name in _string_cols},
    "price": to_float,
}

for col, _convert in _converters.items():
    # Columns that are absent from the frame are skipped.
    if col in df.columns:
        df[col] = df[col].apply(_convert)

print("Nulls after normalization:\n", df[["title","brand","description","price","categories"]].isna().sum())


Nulls after normalization:
 title           0
brand           0
description     0
price          97
categories      0
dtype: int64


In [12]:
import torch
from sentence_transformers import SentenceTransformer

# Use the GPU when torch reports one, otherwise run the encoder on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Sentence encoder used for product text; the same instance is reused by later cells.
text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)

def build_text(row: dict) -> str:
    """Compose the text that gets embedded for one product record.

    Segments, in order: title, ``brand …``, ``category …``, ``material …``,
    ``color …``, description. Empty or whitespace-only segments are dropped and
    the remainder is joined with ``" | "``. Categories are used only when the
    value is a list.
    """
    raw_cats = row.get("categories")
    cats = ", ".join(raw_cats) if isinstance(raw_cats, list) else ""

    title = s(row.get("title"))
    brand = s(row.get("brand"))
    material = s(row.get("material"))
    color = s(row.get("color"))
    description = s(row.get("description"))

    segments = [
        title,
        f"brand {brand}" if brand else "",
        f"category {cats}" if cats else "",
        f"material {material}" if material else "",
        f"color {color}" if color else "",
        description,
    ]
    return " | ".join(seg for seg in segments if seg and seg.strip())

# One dict per DataFrame row; build_text() flattens each into the string to embed.
records = df.to_dict(orient="records")
texts   = [build_text(r) for r in records]
# Embeddings are requested L2-normalised; row i of `emb` corresponds to row i of `df`.
emb     = text_model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)  # (N, 384)

# Displayed as the cell result: embedding matrix shape and the device in use.
emb.shape, device




((312, 384), 'cpu')

# Clustering (HDBSCAN by default, KMeans as fallback)

In [13]:
from sklearn.cluster import KMeans
import hdbscan
import numpy as np

USE_HDBSCAN = True  # set False to use KMeans

if USE_HDBSCAN:
    # Density-based clustering on the text embeddings; no cluster count is supplied.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=8, min_samples=2, metric='euclidean')
    labels = clusterer.fit_predict(emb)          # -1 = noise/outliers
else:
    K = 50  # tune per dataset size
    labels = KMeans(n_clusters=K, n_init="auto", random_state=42).fit_predict(emb)

# Persist the assignment on the frame. NOTE: a later cell in this notebook rebinds
# the name `labels` to a list of class names, so df["cluster_id"] is the durable copy.
df["cluster_id"] = labels
# Cell result: sizes of the ten largest groups (the noise group -1 included).
df["cluster_id"].value_counts().head(10)


cluster_id
 0    260
-1     32
 1     20
Name: count, dtype: int64

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Near-duplicate detection on the text embeddings.
#
# Within every cluster-label group (the HDBSCAN noise group -1 included), rows
# are visited in frame order. The first not-yet-flagged row becomes a
# representative, and every later row whose cosine similarity to it is at least
# THRESH_DUP is flagged as its duplicate.
#
# Two cases are handled here:
#   * Duplicate pairs are found even when neither item is the cluster's most
#     central element; comparing only against that single item misses them.
#   * The noise group is searched too: a tight pair of near-identical listings
#     that sits far from everything else is smaller than min_cluster_size and
#     is therefore labelled -1.
#
# Rows that repeat an earlier uniq_id are the same product record, not a
# duplicate listing. They are left out of the comparison and simply inherit
# the flags of the first row with that id, so the flags stay consistent for
# anything keyed by uniq_id.

THRESH_DUP = 0.97  # cosine similarity at or above this => duplicate

# Read assignments from the frame rather than from the loose `labels` variable,
# which another cell in this notebook rebinds to a list of class names.
cluster_ids = df["cluster_id"].to_numpy()
uid_values = df["uniq_id"].to_numpy()

# uniq_id -> position of its first row in the frame.
first_row_of = {}
for pos, uid in enumerate(uid_values):
    first_row_of.setdefault(uid, pos)

dup_flags = np.zeros(len(df), dtype=bool)
parent_id = ["" for _ in range(len(df))]

for cid in sorted(set(cluster_ids.tolist())):
    # Positions in this group, keeping only the first row of each uniq_id.
    members = [
        int(i) for i in np.where(cluster_ids == cid)[0]
        if first_row_of[uid_values[i]] == i
    ]
    if len(members) < 2:
        continue
    idx = np.asarray(members)
    sims = cosine_similarity(emb[idx])

    for rep_local, rep_global in enumerate(idx):
        if dup_flags[rep_global]:
            continue  # already attached to an earlier representative
        # Later members that are close enough to this representative.
        close = np.where(sims[rep_local, rep_local + 1:] >= THRESH_DUP)[0] + rep_local + 1
        for j in close:
            target = idx[j]
            if not dup_flags[target]:
                dup_flags[target] = True
                parent_id[target] = uid_values[rep_global]

# Repeated rows of the same uniq_id mirror the flags of the first such row.
for pos, uid in enumerate(uid_values):
    src = first_row_of[uid]
    if src != pos:
        dup_flags[pos] = dup_flags[src]
        parent_id[pos] = parent_id[src]

df["is_duplicate"] = dup_flags
df["dup_of"] = parent_id

df[["uniq_id","title","cluster_id","is_duplicate","dup_of"]].head(10)


Unnamed: 0,uniq_id,title,cluster_id,is_duplicate,dup_of
0,02593e81-5c09-5069-8516-b0b29f439ded,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",0,False,
1,5938d217-b8c5-5d3e-b1cf-e28e340f292e,"subrtex Leather ding Room, Dining Chairs Set o...",0,False,
2,b2ede786-3f51-5a45-9a5b-bcf856958cd8,Plant Repotting Mat MUYETOL Waterproof Transpl...,-1,False,
3,8fd9377b-cfa6-5f10-835c-6b8eca2816b5,"Pickleball Doormat, Welcome Doormat Absorbent ...",1,False,
4,bdc9aa30-9439-50dc-8e89-213ea211d66a,JOIN IRON Foldable TV Trays for Eating Set of ...,0,False,
5,20da3703-26f1-53e5-aa0b-a8104527d1bb,"LOVMOR 30'' Bathroom Vanity Sink Base Cabine, ...",0,False,
6,aba4138e-6401-52ca-a099-02e30b638db4,Folews Bathroom Organizer Over The Toilet Stor...,0,False,
7,02593e81-5c09-5069-8516-b0b29f439ded,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",0,False,
8,5938d217-b8c5-5d3e-b1cf-e28e340f292e,"subrtex Leather ding Room, Dining Chairs Set o...",0,False,
9,b2ede786-3f51-5a45-9a5b-bcf856958cd8,Plant Repotting Mat MUYETOL Waterproof Transpl...,-1,False,


In [15]:
from keybert import KeyBERT

kw_model = KeyBERT(model=text_model)  # reuse encoder

# cluster id -> list of keyphrases extracted from that cluster's titles
cluster_keywords = {}
for cid in sorted(set(labels)):
    idx = np.where(labels == cid)[0]
    # Skip the noise label (-1) and any group with fewer than 3 members.
    if cid == -1 or len(idx) < 3:
        continue
    # small corpus per cluster (avoid huge strings)
    titles = df.iloc[idx]["title"].fillna("").astype(str).tolist()
    # Titles are concatenated and hard-capped at 5000 characters.
    blob = " ".join(titles)[:5000]
    kw = kw_model.extract_keywords(blob, keyphrase_ngram_range=(1,2), stop_words="english", top_n=8)
    # Keep only the phrase from each (phrase, score) pair.
    cluster_keywords[cid] = [k for (k,score) in kw]

def tag_for(cid):
    """Return up to three keyphrases for cluster ``cid`` joined by ', ', or '' if none exist."""
    keywords = cluster_keywords.get(cid)
    if not keywords:
        return ""
    return ", ".join(keywords[:3])

# Attach a short human-readable tag to every row based on its cluster id.
df["cluster_tag"] = [tag_for(c) for c in df["cluster_id"]]

# Cell result: one line per distinct (cluster_id, cluster_tag) pair.
df[["cluster_id","cluster_tag"]].drop_duplicates().head(10)


Unnamed: 0,cluster_id,cluster_tag
0,0,"storage cabinet, bathroom organizer, shoe orga..."
2,-1,
3,1,"door mats, door mat, doormat 18x27"


In [17]:
from pinecone import Pinecone
from tqdm.auto import tqdm
import time

# Client and handle for the text index named by PINECONE_TEXT_INDEX.
pc = Pinecone(api_key=PINECONE_API_KEY)
text_index = pc.Index(PINECONE_TEXT_INDEX)

def to_int_or_default(x, default=-1):
    """Convert ``x`` with ``int()``; return ``default`` if the conversion raises."""
    try:
        value = int(x)
    except Exception:
        value = default
    return value

# Push the clustering / duplicate fields to the text index, one id at a time.
# The run is best-effort: a failing id is counted and the loop continues. The
# first few errors are kept and printed afterwards, so a systematic failure
# (wrong namespace, auth, schema) is visible rather than hidden in the
# "skipped" count.
records = df[["uniq_id","cluster_id","is_duplicate","dup_of","cluster_tag"]].to_dict(orient="records")

updated, skipped = 0, 0
failures = []        # (uniq_id, repr(error)) for every update that raised
MAX_FAILURES_SHOWN = 5

for r in tqdm(records, desc="pinecone.update (per-id)"):
    uid = str(r["uniq_id"])
    meta = {
        "cluster_id": to_int_or_default(r.get("cluster_id"), -1),
        "is_duplicate": bool(r.get("is_duplicate", False)),
        "dup_of": (r.get("dup_of") or ""),
        "cluster_tag": (r.get("cluster_tag") or "")
    }
    try:
        # Per-id update in v5
        text_index.update(id=uid, set_metadata=meta, namespace="default")
    except Exception as e:
        # Common case: id not present in this index (e.g., rows that failed ingest)
        skipped += 1
        failures.append((uid, repr(e)))
        continue
    updated += 1
    # (Optional) be gentle if your project has strict rate limits
    # time.sleep(0.001)

print(f"Updated items: {updated} | Skipped (not found/errored): {skipped}")
for failed_uid, failed_err in failures[:MAX_FAILURES_SHOWN]:
    print(f"  skip {failed_uid}: {failed_err}")


pinecone.update (per-id): 100%|██████████| 312/312 [01:57<00:00,  2.66it/s]

Updated items: 312 | Skipped (not found/errored): 0





In [18]:
def preview_neighbors(q, k=5):
    """Embed the query ``q``, fetch its ``k`` nearest items from the text index and print them.

    Each printed line shows the match score, the stored title and the
    cluster / duplicate / tag metadata fields.
    """
    encoded = text_model.encode([q], normalize_embeddings=True, convert_to_numpy=True)
    query_vec = encoded[0].tolist()
    response = text_index.query(vector=query_vec, top_k=k, include_metadata=True, namespace="default")
    for m in response.get("matches", []):
        md = m["metadata"] or {}
        line = f"{m['score']:.4f} | {md.get('title','')} | cl:{md.get('cluster_id')} dup:{md.get('is_duplicate')} tag:{md.get('cluster_tag')}"
        print(line)

preview_neighbors("wooden dining chair", 6)


0.6082 | Armen Living Julius 30" Cream Faux Leather and Walnut Wood Bar Stool | cl:0.0 dup:False tag:storage cabinet, bathroom organizer, shoe organizer
0.5851 | YuiHome Extendable Round, Farmhouse 16" Leaf Table for Dining Room, Kitchen,Natural Wood Wash | cl:0.0 dup:False tag:storage cabinet, bathroom organizer, shoe organizer
0.5774 | CangLong Mid Century Modern Side Chair with Wood Legs for Kitchen, Living Dining Room, Set of 1, Black | cl:0.0 dup:False tag:storage cabinet, bathroom organizer, shoe organizer
0.5770 | VECELO Modern Industrial Style 3-Piece Dining Room Kitchen Table and Pu Cushion Chair Sets for Small Space, 2, Retro Brown | cl:0.0 dup:False tag:storage cabinet, bathroom organizer, shoe organizer
0.5757 | Modway Baronet Button-Tufted Vegan Leather Parsons Dining Chair in Gray | cl:0.0 dup:False tag:storage cabinet, bathroom organizer, shoe organizer
0.5691 | Leather At Home, Decorative 13 Inch Rounded Pillow Handmade from Full Grain Leather - Chair Seat, Confortable 

# CV - ResNet18

In [19]:
import os, ast, re, json, random, glob
import numpy as np, pandas as pd
from PIL import Image, ImageFile
from tqdm.auto import tqdm
from dotenv import load_dotenv

# PIL setting: tolerate truncated image files when loading.
ImageFile.LOAD_TRUNCATED_IMAGES = True
load_dotenv()

# --- Paths / env ---
CSV_PATH = os.getenv("CSV_PATH", "intern_data_ikarus.csv")
IMAGE_DIR = os.getenv("IMAGE_DIR", "./data/images_all")  # folder/uniq_id/*.jpg|.png|.webp

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_TEXT_INDEX = os.getenv("PINECONE_TEXT_INDEX", "products-text")
PINECONE_IMAGE_INDEX = os.getenv("PINECONE_IMAGE_INDEX", "products-image")

# Fail fast when a required input is missing.
assert os.path.exists(CSV_PATH), f"CSV not found: {CSV_PATH}"
assert os.path.isdir(IMAGE_DIR), f"Image dir not found: {IMAGE_DIR}"
assert PINECONE_API_KEY, "Missing PINECONE_API_KEY"

# NOTE: this re-reads the CSV and rebinds `df`, replacing the frame that the
# NLP section enriched with cluster / duplicate columns.
df = pd.read_csv(CSV_PATH)
if "uniq_id" not in df.columns: raise ValueError("CSV must include uniq_id")
df["uniq_id"] = df["uniq_id"].astype(str)

# Parse categories -> list; simple label = first category token
def to_list(x):
    """Parse a scalar cell into a list of non-empty, stripped strings.

    Missing values give ``[]``. A value starting with ``[`` is tried as a
    Python literal first; if that fails, or for any other text, the value is
    split on commas.
    """
    if pd.isna(x):
        return []

    text = str(x).strip()
    if text.startswith("["):
        try:
            stripped = [str(item).strip() for item in ast.literal_eval(text)]
            return [item for item in stripped if item]
        except Exception:
            # Not a valid literal: fall back to comma splitting.
            pass

    return [token for token in map(str.strip, text.split(",")) if token]

# Parse the categories column into lists; if the CSV has no such column, an all-missing
# placeholder is parsed instead, so every row gets an empty list.
df["categories"] = df.get("categories", pd.Series([pd.NA]*len(df))).apply(to_list)

def primary_cat(cats):
    """Return the first category, stripped, lower-cased and cut to 64 characters.

    An empty or missing category list gives ``"unknown"``.
    """
    if cats:
        head = str(cats[0])
        return head.strip().lower()[:64]
    return "unknown"

# Classification target: the first (top-level) category of each product.
df["label"] = df["categories"].apply(primary_cat)

# Map uniq_id -> image paths
def img_paths_for(uid):
    """Return the image files stored under ``IMAGE_DIR/<uid>``, sorted by path.

    Only files with a .jpg/.jpeg/.png/.webp extension (case-insensitive) are
    returned. A missing directory yields ``[]``.

    The result is sorted because callers slice the first few paths per product
    and glob's own ordering depends on the file system; without sorting, which
    images get used could differ between machines or runs.
    """
    p = os.path.join(IMAGE_DIR, str(uid))
    if not os.path.isdir(p):
        return []
    exts = (".jpg", ".jpeg", ".png", ".webp")
    # Escape the directory part so characters such as '[' in a path are matched literally.
    pattern = os.path.join(glob.escape(p), "*")
    return sorted(q for q in glob.glob(pattern) if os.path.splitext(q)[1].lower() in exts)

# uniq_id -> list of image paths (possibly empty), one entry per distinct id.
uid_to_imgs = {uid: img_paths_for(uid) for uid in df["uniq_id"].unique()}
# Keep only uids that actually have images
uids_with_imgs = [u for u, paths in uid_to_imgs.items() if paths]
# Both sides of the ratio count distinct ids. The frame can hold several rows
# for the same uniq_id, so the row count is reported separately instead of
# being used as the denominator.
print(f"Products with images: {len(uids_with_imgs)} / {len(uid_to_imgs)} unique ids ({len(df)} rows)")


Products with images: 305 / 312


In [24]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# Restrict to products that have at least one image and build the class-name <-> index maps.
# NOTE: `labels` is rebound here to the sorted list of class names (it held cluster
# assignments in the NLP section of this notebook).
df_img = df[df["uniq_id"].isin(uids_with_imgs)].copy()
labels = sorted(df_img["label"].unique().tolist())
cls2id = {c:i for i,c in enumerate(labels)}
id2cls = {i:c for c,i in cls2id.items()}

# Split by uid (not by image) to keep leakage low
# The split is a seeded random 80/20 shuffle of ids; it is NOT stratified by label.
uids = df_img["uniq_id"].unique().tolist()
random.seed(42); random.shuffle(uids)
cut = int(0.8*len(uids))
train_uids = set(uids[:cut]); val_uids = set(uids[cut:])

def collect_rows(uid_set, max_imgs_per_uid=2):
    """Build ``(image_path, class_index, uid)`` samples for the given product ids.

    Parameters
    ----------
    uid_set : iterable of str
        Product ids to include; each must be present in ``df_img``.
    max_imgs_per_uid : int
        Upper bound on images taken per product.

    Returns
    -------
    list[tuple[str, int, str]]
        Samples ordered by uid, so the result does not depend on set iteration
        order (which varies between interpreter runs for strings).
    """
    # One lookup table instead of a boolean scan of the frame per uid.
    # The first row wins when an id occurs more than once.
    label_of = df_img.drop_duplicates("uniq_id").set_index("uniq_id")["label"].to_dict()

    rows = []
    for uid in sorted(uid_set):
        label = label_of[uid]
        for p in uid_to_imgs.get(uid, [])[:max_imgs_per_uid]:
            rows.append((p, cls2id[label], uid))
    return rows

# At most two images per product in each split.
train_rows = collect_rows(train_uids, max_imgs_per_uid=2)
val_rows   = collect_rows(val_uids,   max_imgs_per_uid=2)

IMG_SIZE = 224  # 224 works for ResNet/EfficientNet/ViT (patch16)

# Training pipeline: resize, random crop/flip/colour jitter, then tensor conversion.
# NOTE(review): no mean/std normalisation is applied here or in val_tf — confirm
# this is intended for the pretrained backbone.
train_tf = transforms.Compose([
    transforms.Resize((256,256)),
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.7,1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.2,0.2,0.2,0.1),
    transforms.ToTensor(),
])

# Validation pipeline: deterministic resize and tensor conversion only.
val_tf = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
])

class RowsDataset(Dataset):
    """Dataset over ``(image_path, class_index, uid)`` rows.

    Parameters
    ----------
    rows : sequence of (path, class_index, uid) tuples.
    tf : callable applied to the RGB PIL image to produce the model input.
    """

    def __init__(self, rows, tf):
        self.rows = rows
        self.tf = tf

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, i):
        """Load sample ``i`` and return ``(transformed_image, class_index, uid)``."""
        p, y, uid = self.rows[i]
        # Close the file handle deterministically; convert() returns an
        # independent image, so it stays usable after the file is closed.
        with Image.open(p) as src:
            im = src.convert("RGB")
        return self.tf(im), y, uid

# Wrap the sample rows with their respective transform pipelines.
train_ds = RowsDataset(train_rows, train_tf)

val_ds   = RowsDataset(val_rows,   val_tf)
# Only the training loader shuffles; both load in the main process (num_workers=0).
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True,  num_workers=0, pin_memory=False)
val_dl   = DataLoader(val_ds,   batch_size=64, shuffle=False, num_workers=0, pin_memory=False)

# Cell result: number of training samples, validation samples and classes.
len(train_ds), len(val_ds), len(labels)


Device: cpu


(482, 120, 7)

In [25]:
import timm, torch.nn as nn, torch

# Backbone name handed to timm; override with the BACKBONE environment variable.
# Examples: "resnet18", "efficientnet_b0", "vit_base_patch16_224", "convnext_tiny"
BACKBONE = os.getenv("BACKBONE", "resnet18")

def build_model(backbone: str, num_classes: int):
    """Create a pretrained timm model named ``backbone`` with ``num_classes`` outputs."""
    return timm.create_model(backbone, pretrained=True, num_classes=num_classes)

# One output per class name in `labels`; the model is moved to the selected device.
model = build_model(BACKBONE, num_classes=len(labels)).to(device)
# Report the parameter count in millions.
print(f"Backbone: {BACKBONE} | Params: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")


Backbone: resnet18 | Params: 11.18M


In [26]:
from torch.cuda.amp import autocast, GradScaler
import torch.optim as optim
import math, time

# Loss, optimizer over all model parameters, and a gradient scaler that is
# only enabled when running on CUDA.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
scaler = GradScaler(enabled=(device=="cuda"))

def run_epoch(dl, train=True):
    """Run one pass over ``dl`` and return ``(mean_loss, accuracy)``.

    Parameters
    ----------
    dl : iterable yielding ``(images, targets, uids)`` batches.
    train : bool
        ``True`` puts the model in train mode and performs optimizer steps.
        ``False`` puts it in eval mode and only measures; gradient tracking is
        switched off so no autograd graph is built during evaluation.

    Returns
    -------
    tuple[float, float]
        Sample-weighted mean loss and the fraction of correct predictions.
    """
    model.train(train)
    total, correct, loss_sum = 0, 0, 0.0
    for x, y, _ in dl:
        x, y = x.to(device), y.to(device)
        if train:
            optimizer.zero_grad(set_to_none=True)
        # Track gradients only when training; mixed precision only on CUDA.
        with torch.set_grad_enabled(bool(train)), autocast(enabled=(device=="cuda")):
            logits = model(x)
            loss = criterion(logits, y)
        if train:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        # Weight the batch loss by batch size so the epoch mean is per-sample.
        loss_sum += loss.item() * x.size(0)
        pred = logits.argmax(1)
        correct += (pred==y).sum().item()
        total += x.size(0)
    return loss_sum/total, correct/total

# Best validation loss seen so far, the accuracy recorded at that epoch, and
# early-stopping state (stop after `patience` epochs without a new best loss).
best_val = math.inf
best_acc = 0.0
patience, patience_ctr = 3, 0

os.makedirs("./models", exist_ok=True)

EPOCHS = 6
for e in range(1, EPOCHS+1):
    t0=time.time()
    tr_loss, tr_acc = run_epoch(train_dl, True)
    va_loss, va_acc = run_epoch(val_dl, False)
    dt = time.time()-t0
    print(f"Epoch {e}/{EPOCHS} | train {tr_loss:.4f}/{tr_acc:.3f} | val {va_loss:.4f}/{va_acc:.3f} | {dt:.1f}s")

    # Checkpoint whenever validation loss improves; the file is overwritten each time.
    if va_loss < best_val:
        best_val = va_loss
        best_acc = va_acc
        # The checkpoint stores what is needed to rebuild the classifier:
        # backbone name, weights, class names and input size.
        torch.save({
            "backbone": BACKBONE,
            "state_dict": model.state_dict(),
            "labels": labels,
            "img_size": IMG_SIZE
        }, f"./models/cv_{BACKBONE}.pt")
        patience_ctr = 0
    else:
        patience_ctr += 1
        if patience_ctr >= patience:
            print("Early stopping.")
            break

# best_acc is the accuracy at the best-loss epoch, not necessarily the highest accuracy seen.
print("Best val:", best_val, "acc:", best_acc)


Epoch 1/6 | train 1.6536/0.585 | val 1.0976/0.925 | 58.7s
Epoch 2/6 | train 1.0123/0.790 | val 0.5586/0.925 | 48.1s
Epoch 3/6 | train 0.7581/0.790 | val 0.4640/0.925 | 47.5s
Epoch 4/6 | train 0.6764/0.790 | val 0.4306/0.925 | 47.1s
Epoch 5/6 | train 0.6039/0.790 | val 0.3631/0.925 | 48.5s
Epoch 6/6 | train 0.5392/0.793 | val 0.3423/0.925 | 48.4s
Best val: 0.34232182999451954 acc: 0.925


In [27]:
# Load best weights (optional if you just trained)
ckpt_path = f"./models/cv_{BACKBONE}.pt"
ckpt = torch.load(ckpt_path, map_location=device)
model.load_state_dict(ckpt["state_dict"])
# Switch to eval mode for inference.
model.eval()

import torchvision.transforms as T
# Inference preprocessing: the same resize + tensor conversion as the validation pipeline.
infer_tf = T.Compose([T.Resize((IMG_SIZE, IMG_SIZE)), T.ToTensor()])

@torch.inference_mode()
def predict_paths(paths):
    """Classify each image path and return ``(path, class_name, confidence)`` tuples.

    Images are processed one at a time. Any failure while loading or
    classifying an image is reported as ``(path, "unknown", 0.0)`` instead of
    raising, so one bad file does not abort a whole run.
    """
    outs = []
    for p in paths:
        try:
            # Close the file handle deterministically; convert() returns an
            # independent image that stays valid after the file is closed.
            with Image.open(p) as src:
                im = src.convert("RGB")
            x = infer_tf(im).unsqueeze(0).to(device)
            logits = model(x)
            prob = torch.softmax(logits, dim=1).squeeze(0).cpu().numpy()
            i = int(prob.argmax())
            outs.append((p, labels[i], float(prob[i])))
        except Exception:
            # Sentinel for "could not classify this image".
            outs.append((p, "unknown", 0.0))
    return outs

def classify_uid(uid: str, max_imgs=4):
    """Predict a category for product ``uid`` from up to ``max_imgs`` of its images.

    The class is chosen by majority vote over per-image predictions, with the
    highest per-image confidence as tiebreaker.

    Returns
    -------
    tuple[str, float]
        ``(class_name, confidence)`` where confidence is the highest per-image
        probability among images that voted for the winning class.
        ``("unknown", 0.0)`` when the product has no images or none of them
        could be classified.
    """
    paths = uid_to_imgs.get(uid, [])[:max_imgs]
    if not paths:
        return "unknown", 0.0

    preds = predict_paths(paths)
    # Images that failed to load or classify come back as ("unknown", 0.0).
    # They carry no information, so they must not take part in the vote;
    # otherwise a few unreadable files could outvote a real prediction.
    # A genuine prediction always has a confidence above zero.
    votes = [(cls, prob) for _, cls, prob in preds if not (cls == "unknown" and prob == 0.0)]
    if not votes:
        return "unknown", 0.0

    # Majority vote with confidence as tiebreaker
    counts, confs = {}, {}
    for cls, prob in votes:
        counts[cls] = counts.get(cls, 0) + 1
        confs[cls] = max(confs.get(cls, 0.0), prob)
    cls = max(counts, key=lambda c: (counts[c], confs[c]))
    return cls, confs[cls]


In [28]:
# Classify every product that has images; one result record per uid.
predicted = []
for uid in tqdm(uids_with_imgs, desc="classify"):
    cls, prob = classify_uid(uid, max_imgs=4)
    predicted.append({"uniq_id": uid, "predicted_category": cls, "pred_conf": prob})

# Tabular view of the predictions for inspection.
pred_df = pd.DataFrame(predicted)
pred_df.head()


classify: 100%|██████████| 305/305 [01:08<00:00,  4.44it/s]


Unnamed: 0,uniq_id,predicted_category,pred_conf
0,02593e81-5c09-5069-8516-b0b29f439ded,home & kitchen,0.952235
1,5938d217-b8c5-5d3e-b1cf-e28e340f292e,home & kitchen,0.998851
2,b2ede786-3f51-5a45-9a5b-bcf856958cd8,home & kitchen,0.673488
3,8fd9377b-cfa6-5f10-835c-6b8eca2816b5,home & kitchen,0.780289
4,bdc9aa30-9439-50dc-8e89-213ea211d66a,home & kitchen,0.988293


In [31]:
from pinecone import Pinecone
from tqdm.auto import tqdm

# Handles for both indexes; the predictions are written to each of them.
pc = Pinecone(api_key=PINECONE_API_KEY)
img_index  = pc.Index(PINECONE_IMAGE_INDEX)
text_index = pc.Index(PINECONE_TEXT_INDEX)

def update_meta(index, recs, label: str, namespace: str = "default", max_failures_shown: int = 5):
    """Write ``predicted_category`` / ``pred_conf`` metadata to ``index``, one id at a time.

    The run is best-effort: an id whose update raises is counted as skipped and
    the loop continues. The first few errors are printed after the summary so
    a systematic failure is visible rather than hidden in the skipped count.

    Parameters
    ----------
    index : object exposing ``update(id=..., set_metadata=..., namespace=...)``.
    recs : iterable of dicts with ``uniq_id`` and, optionally,
        ``predicted_category`` and ``pred_conf``.
    label : name used in the progress bar and the summary line.
    namespace : namespace passed to every update call.
    max_failures_shown : how many individual errors to print after the summary.
    """
    updated, skipped = 0, 0
    failures = []  # (uniq_id, repr(error)) for every update that raised
    for r in tqdm(recs, desc=f"update {label}"):
        uid = str(r["uniq_id"])
        meta = {
            "predicted_category": r.get("predicted_category", "") or "",
            # `or 0.0` covers a missing or None confidence.
            "pred_conf": float(r.get("pred_conf") or 0.0),
        }
        try:
            index.update(id=uid, set_metadata=meta, namespace=namespace)
        except Exception as e:
            skipped += 1
            failures.append((uid, repr(e)))
            continue
        updated += 1
    print(f"{label}: updated={updated} skipped={skipped}")
    for failed_uid, failed_err in failures[:max_failures_shown]:
        print(f"  skip {failed_uid}: {failed_err}")

# Write the same prediction records to the image index and to the text index.
update_meta(img_index,  predicted, label=PINECONE_IMAGE_INDEX)
update_meta(text_index, predicted, label=PINECONE_TEXT_INDEX)

print("Done.")


update products-image: 100%|██████████| 305/305 [02:02<00:00,  2.49it/s]


products-image: updated=305 skipped=0


update products-text: 100%|██████████| 305/305 [01:58<00:00,  2.57it/s]

products-text: updated=305 skipped=0
Done.





In [32]:
# Spot check: pick one product with images and show its first two paths and its prediction.
# The choice is random, so the product shown can differ between runs.
sample_uid = random.choice(uids_with_imgs)
print("sample uid:", sample_uid)
print("paths:", uid_to_imgs[sample_uid][:2])
print("pred:", classify_uid(sample_uid))


sample uid: 91688563-4731-5976-bc36-85e98ca7ba1a
paths: ['./data/images_all\\91688563-4731-5976-bc36-85e98ca7ba1a\\000_eceafd48.jpg', './data/images_all\\91688563-4731-5976-bc36-85e98ca7ba1a\\001_474d31f7.jpg']
pred: ('home & kitchen', 0.9946603178977966)
