# 2) Multilingual Query Understanding: LangID, NER (brand/attr), Multilingual Embeddings

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip -q install --upgrade pip
!pip -q install datasets transformers sentence-transformers faiss-cpu rank-bm25 torchmetrics scikit-learn lightgbm langdetect unidecode pandas matplotlib tqdm nltk

In [1]:
import os, re, numpy as np, pandas as pd, torch, faiss
from datasets import load_dataset
from langdetect import detect as lang_detect
from unidecode import unidecode
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForTokenClassification
from tqdm.auto import tqdm
device = "cuda" if torch.cuda.is_available() else "cpu"
CONFIG = {"language":"ja","N_DOCS":300000,"N_QUERIES":3000}

ModuleNotFoundError: No module named 'faiss'

In [10]:
# ds = load_dataset("amazon_reviews_multi", CONFIG["language"], split="train")
# df = ds.to_pandas()[["product_id","review_title","review_body"]].dropna().head(CONFIG["N_DOCS"]).reset_index(drop=True)
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Datasets/amazon_reviews_multi/en/train.csv")
df = df[df["language"] == CONFIG["language"]]
print(df.shape)
df = df[["product_id","review_title","review_body"]].dropna().head(CONFIG["N_DOCS"]).reset_index(drop=True)
df["query"] = df["review_title"].astype(str)
df["doc_text"] = (df["review_title"].astype(str)+" "+df["review_body"].astype(str)).str.strip()
df = df[df["doc_text"].str.len()>16].reset_index(drop=True)

(200000, 9)


In [5]:
def normalize(q):
    qn = str(q).strip()
    try: lang = lang_detect(qn)
    except: lang = "unk"
    return qn, lang, unidecode(qn)
print([normalize(q) for q in df['query'].head(5)])

[('本革でも防水でもない', 'ja', 'Ben Ge demoFang Shui demonai'), ('Amazonが転売を認めてるから…', 'ja', 'AmazongaZhuan Mai woRen meterukara...'), ('時計として使えません。', 'ja', 'Shi Ji toshiteShi emasen. '), ('安かろう、悪かろう', 'ja', 'An karou, E karou'), ('元々使えなかったのか？', 'ja', 'Yuan "Shi enakatsutanoka?')]


In [7]:
try:
    ner_tok = AutoTokenizer.from_pretrained("Davlan/xlm-roberta-base-ner-hrl")
    ner_model = AutoModelForTokenClassification.from_pretrained("Davlan/xlm-roberta-base-ner-hrl").to(device)
    id2label = ner_model.config.id2label
except Exception as e:
    ner_tok, ner_model, id2label = None, None, None
    print("NER fallback due to:", e)

def ner_extract(text):
    if ner_tok is None:
        toks = str(text).split(); brands = [t for t in toks if (len(t)>1 and t[0].isupper()) or any(ch.isdigit() for ch in t)]
        return {"BRAND":brands[:5], "ATTR":[]}
    enc = ner_tok(str(text), return_tensors="pt", truncation=True, max_length=128).to(device)
    with torch.no_grad():
        out = ner_model(**enc).logits[0].argmax(-1).detach().cpu().numpy()
    toks = ner_tok.convert_ids_to_tokens(enc["input_ids"][0].detach().cpu().numpy())
    labels = [id2label.get(int(i),"O") for i in out]
    brands, attrs = [], []
    for t,l in zip(toks, labels):
        if l.endswith("ORG"): brands.append(t)
        if l.endswith("MISC") or l.endswith("PROD"): attrs.append(t)
    return {"BRAND":brands[:5], "ATTR":attrs[:5]}
df_small = df.head(200).copy(); df_small["ner"] = df_small["doc_text"].apply(ner_extract)
df_small[["query","ner"]].head(30)

Unnamed: 0,query,ner
0,本革でも防水でもない,"{'BRAND': [], 'ATTR': []}"
1,Amazonが転売を認めてるから…,"{'BRAND': ['▁Amazon', 'Amazon', '▁Amazon'], 'A..."
2,時計として使えません。,"{'BRAND': [], 'ATTR': []}"
3,安かろう、悪かろう,"{'BRAND': [], 'ATTR': []}"
4,元々使えなかったのか？,"{'BRAND': [], 'ATTR': []}"
5,6000円無駄にした,"{'BRAND': [], 'ATTR': []}"
6,ゲームソフトが出来ないハードなんてゴミじゃないですかね？,"{'BRAND': [], 'ATTR': []}"
7,買ってはいけません！,"{'BRAND': [], 'ATTR': []}"
8,鍵が閉まらない,"{'BRAND': ['ア', 'マ', 'ゾ', 'ン'], 'ATTR': []}"
9,2回目の使用で壊れる。,"{'BRAND': [], 'ATTR': []}"


In [8]:
e5 = SentenceTransformer("intfloat/multilingual-e5-base", device=device)
def encode(texts, bs=128):
    vecs = []
    for i in tqdm(range(0,len(texts),bs), desc="Encoding"):
        emb = e5.encode(texts[i:i+bs], batch_size=bs, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=False)
        vecs.append(emb)
    return np.vstack(vecs).astype("float32")
qs = df["query"].head(CONFIG["N_QUERIES"]).tolist(); ds_text = df["doc_text"].tolist()
qv = encode(qs, 128); dv = encode(ds_text, 128)
index = faiss.IndexFlatIP(dv.shape[1]); index.add(dv)
_, idx = index.search(qv, 10)
hits = sum([np.any(row < len(qs)) for row in idx])
print("Approx Recall@10:", hits/len(qs))

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Encoding:   0%|          | 0/24 [00:00<?, ?it/s]

Encoding:   0%|          | 0/235 [00:00<?, ?it/s]

Approx Recall@10: 0.839
