# Overview of MT Pipeline

### Step 1: Query Expansion (code located in queryexpansion.py)
We first use DeepSeek v3 to carry out query expansion on our queries. This is implemented as a function in queryexpansion.py, but will be demonstrated here:

In [1]:
from openai import OpenAI
from dotenv import load_dotenv
import os
import json
import re

# Load environment variables from the .env file one directory above this notebook,
# then read the API key stored under the name 'deepseek_API_KEY'.
load_dotenv(dotenv_path='../.env')
api_key = os.getenv('deepseek_API_KEY')

# Create the OpenAI-compatible client used by the query-expansion call below.
# NOTE(review): base_url points at OpenRouter, so the key stored under
# 'deepseek_API_KEY' is presumably an OpenRouter key -- confirm.
client = OpenAI(api_key=api_key, base_url="https://openrouter.ai/api/v1")


We define a function to expand our query:

In [2]:
def get_expanded_queries(user_query):
    """Ask the chat model to expand an e-commerce search query.

    The prompt requests a JSON object holding the original query and a list of
    expanded terms, each tagged with a type. The request is sent through the
    module-level ``client``.

    Args:
        user_query: The raw search query typed by the user.

    Returns:
        The parsed JSON object from the model's reply (per the prompt, a dict
        with ``"original_query"`` and ``"expanded_terms"``).

    Raises:
        ValueError: If the model returns an empty or whitespace-only reply.
        json.JSONDecodeError: If the reply does not contain valid JSON.
    """
    prompt=f'''You are an expert search query optimizer. Your task is to expand the following e-commerce search query to improve retrieval of relevant products. Generate a list of semantically related terms, synonyms, and common user variations while preserving the original intent.

**Rules:**
1. Prioritize **contextual relevance** (e.g., "running shoes" → "jogging sneakers").
2. Include **common misspellings** (e.g., "earbuds" → "airbuds").
3. Add **technical/layman variants** (e.g., "4K TV" → "ultra HD television").
4. For non-English queries, provide **translations/transliterations** if applicable (e.g., "スマホ" → "smartphone").
5. Output in JSON format for easy parsing.

**Input Query:** "{user_query}"

**Output Format:**  
{{
  "original_query": "...",
  "expanded_terms": [
    {{"term": "...", "type": "synonym"}},
    {{"term": "...", "type": "misspelling"}},
    {{"term": "...", "type": "technical"}}
  ]
}}

**Example Output for "wireless headphones":**
{{
  "original_query": "wireless headphones",
  "expanded_terms": [
    {{"term": "Bluetooth headphones", "type": "synonym"}},
    {{"term": "cordless earphones", "type": "synonym"}},
    {{"term": "wireless headsets", "type": "synonym"}},
    {{"term": "airbuds", "type": "misspelling"}},
    {{"term": "noise-cancelling headphones", "type": "technical"}}
  ]
}}

**Now process this query:** "{user_query}"'''
    response = client.chat.completions.create(
        model="deepseek/deepseek-chat-v3-0324:free",
        messages=[
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )

    raw_reply = response.choices[0].message.content
    if not raw_reply or raw_reply.strip() == "":
        raise ValueError("API returned an empty response")

    # The model often wraps its answer in a ```json fence. Keep the match in its
    # own variable: overwriting the reply text with the (possibly None) match
    # object made the raw-text fallback crash with AttributeError.
    fenced = re.search(r'```json\n({.*?})\n```', raw_reply, re.DOTALL)
    json_text = fenced.group(1) if fenced else raw_reply.strip()

    return json.loads(json_text)

This function should return an expanded version of the user's original query, accounting for misspellings, vague queries, etc.

In [4]:
query="shir long sleeve"
#demo with misspelling
expanded_queries=get_expanded_queries(query)

expanded_queries

{'original_query': 'shir long sleeve',
 'expanded_terms': [{'term': 'shirt long sleeve', 'type': 'misspelling'},
  {'term': 'long sleeve shirt', 'type': 'synonym'},
  {'term': 'long sleeve t-shirt', 'type': 'synonym'},
  {'term': 'long sleeve blouse', 'type': 'synonym'},
  {'term': 'long sleeve top', 'type': 'synonym'},
  {'term': 'long sleeve tee', 'type': 'synonym'},
  {'term': 'long sleeve polo', 'type': 'synonym'},
  {'term': 'long sleeve button-up', 'type': 'synonym'},
  {'term': 'long sleeve dress shirt', 'type': 'synonym'},
  {'term': 'long sleeve casual shirt', 'type': 'synonym'},
  {'term': 'long sleeve henley', 'type': 'technical'},
  {'term': 'long sleeve oxford shirt', 'type': 'technical'},
  {'term': 'long sleeve flannel shirt', 'type': 'technical'},
  {'term': 'long sleeve thermal shirt', 'type': 'technical'},
  {'term': 'long sleeve knit shirt', 'type': 'technical'},
  {'term': 'shirt long sleeved', 'type': 'misspelling'},
  {'term': 'long sleeved shir', 'type': 'misspel

We then weight these expanded queries based on their types, giving the most importance to the original query.

In [5]:
#weight the different output types
def assign_weights(term_type):
    """Return the weight for an expansion type; unknown types get 0.5."""
    known_weights = dict(
        synonym=0.8,
        misspelling=0.3,
        technical=0.7,
        translation=0.6,
    )
    try:
        return known_weights[term_type]
    except KeyError:
        # Any type the table does not list falls back to the default weight.
        return 0.5

def return_weighted_dict(expanded_queries, include_translations): #option to remove translations for certain pipelines
    """Turn an expansion result into a list of ``{"term", "weight"}`` dicts.

    The original query always comes first with weight 1.0. Every expanded term
    follows in its original order, weighted by ``assign_weights`` on its type.
    When ``include_translations`` is falsy, terms whose type is
    ``"translation"`` are left out.
    """
    head = {"term": expanded_queries["original_query"], "weight": 1.0}

    selected = (
        item
        for item in expanded_queries["expanded_terms"]
        if include_translations or item["type"] != "translation"
    )
    tail = [
        {"term": item["term"], "weight": assign_weights(item["type"])}
        for item in selected
    ]
    return [head] + tail

In [17]:
#demo with the above expansions
weighted_queries=return_weighted_dict(expanded_queries, include_translations=False)
weighted_queries

[{'term': 'shir long sleeve', 'weight': 1.0},
 {'term': 'shirt long sleeve', 'weight': 0.3},
 {'term': 'long sleeve shirt', 'weight': 0.8},
 {'term': 'long sleeve t-shirt', 'weight': 0.8},
 {'term': 'long sleeve blouse', 'weight': 0.8},
 {'term': 'long sleeve top', 'weight': 0.8},
 {'term': 'long sleeve tee', 'weight': 0.8},
 {'term': 'long sleeve polo', 'weight': 0.8},
 {'term': 'long sleeve button-up', 'weight': 0.8},
 {'term': 'long sleeve dress shirt', 'weight': 0.8},
 {'term': 'long sleeve casual shirt', 'weight': 0.8},
 {'term': 'long sleeve henley', 'weight': 0.7},
 {'term': 'long sleeve oxford shirt', 'weight': 0.7},
 {'term': 'long sleeve flannel shirt', 'weight': 0.7},
 {'term': 'long sleeve thermal shirt', 'weight': 0.7},
 {'term': 'long sleeve knit shirt', 'weight': 0.7},
 {'term': 'shirt long sleeved', 'weight': 0.3},
 {'term': 'long sleeved shir', 'weight': 0.3},
 {'term': 'long sleeved tshirt', 'weight': 0.3}]

### Step 2: Fine-tuning of mBART model
We first fine-tune an mBART model on our Spanish, Italian and Chinese datasets to carry out our machine translation task. The code for fine-tuning can be found in finetune.py, while the model is saved in ./final.

### Step 3: Machine Translation of expanded queries
We then translate these queries using our fine-tuned mBART model. Similarly, this is implemented in translate.py but showcased here.

In [15]:
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Maps the short language keys used throughout this notebook to the language
# codes handed to the tokenizer (as tokenizer.src_lang and as the key into
# tokenizer.lang_code_to_id in translate_sentence).
lang_code_map = {
    "en": "en_XX",
    "es": "es_XX",
    "it": "it_IT", 
    "cn": "zh_CN"
}

#function to load model and tokenizer
def load_model_and_tokenizer(model_path):
    """Load an mBART model and its MBart50 tokenizer from ``model_path``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    return (
        MBartForConditionalGeneration.from_pretrained(model_path),
        MBart50TokenizerFast.from_pretrained(model_path),
    )

#translation function.
def translate_sentence(model, tokenizer, text, src_lang, tgt_lang):
    """Translate one piece of text from ``src_lang`` to ``tgt_lang``.

    ``src_lang`` and ``tgt_lang`` are keys of ``lang_code_map``. The tokenizer's
    ``src_lang`` attribute is updated as a side effect. Returns the first
    decoded sequence as a string.
    """
    # Tell the tokenizer which language the input is written in.
    tokenizer.src_lang = lang_code_map[src_lang]

    # Encode the text; anything beyond 64 tokens is truncated.
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=64)

    with torch.no_grad():
        generation_options = dict(
            # Force the decoder to start with the target-language token.
            forced_bos_token_id=tokenizer.lang_code_to_id[lang_code_map[tgt_lang]],
            max_length=64,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,  # Prevent repeating n-grams
            repetition_penalty=2.0,  # Penalize repetition
            length_penalty=1.0,      # Balance between length and score
            temperature=0.7,         # Control randomness
            # NOTE(review): sampling is enabled, so repeated calls with the same
            # input presumably can yield different translations -- confirm.
            do_sample=True,
        )
        generated = model.generate(**encoded, **generation_options)

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]

def translate_expanded(model, tokenizer, query_list, src_lang, tgt_lang):
    """Translate the ``'term'`` of every weighted query.

    Args:
        model, tokenizer: Passed straight to ``translate_sentence``.
        query_list: Iterable of dicts, each with at least a ``'term'`` key.
        src_lang, tgt_lang: Language keys understood by ``translate_sentence``.

    Returns:
        A new list of new dicts in the same order. Each is a copy of the input
        dict with ``'term'`` replaced by its translation; other keys such as
        ``'weight'`` carry over unchanged.

    The input dicts are deliberately not modified. Overwriting ``'term'`` in
    place destroyed the source-language terms, and re-running the call on the
    same list translated text that had already been translated.
    """
    return [
        {**query, 'term': translate_sentence(model, tokenizer, query['term'], src_lang, tgt_lang)}
        for query in query_list
    ]


In [18]:
#demo using expanded queries
tgt_lang='cn'
model, tokenizer = load_model_and_tokenizer("./final")
weighted_queries = translate_expanded(model, tokenizer, weighted_queries, 'en', tgt_lang)

In [20]:
weighted_queries

[{'term': '雪龍袖', 'weight': 1.0},
 {'term': '衬衫長袖', 'weight': 0.3},
 {'term': '長袖衬衫', 'weight': 0.8},
 {'term': '長袖t恤衫', 'weight': 0.8},
 {'term': '長袖 blouse', 'weight': 0.8},
 {'term': '長袖上衣', 'weight': 0.8},
 {'term': '長袖 tee', 'weight': 0.8},
 {'term': '長袖保羅', 'weight': 0.8},
 {'term': '長袖上扣', 'weight': 0.8},
 {'term': '長袖裙衫', 'weight': 0.8},
 {'term': '長袖休闲衣', 'weight': 0.8},
 {'term': '長袖亨利', 'weight': 0.7},
 {'term': '長袖牛仔衫', 'weight': 0.7},
 {'term': '長袖flannel衬衫', 'weight': 0.7},
 {'term': '長袖熱帶衫', 'weight': 0.7},
 {'term': '長袖編織衬衫', 'weight': 0.7},
 {'term': '衬衫長袖', 'weight': 0.3},
 {'term': '長袖雪子', 'weight': 0.3},
 {'term': '長袖tshirt', 'weight': 0.3}]

### Step 4: Hybrid Search of expanded queries


#### 4.1 Data Loading

In [9]:
#returns a dictionary of DataFrames, keyed by language code
import pandas as pd

def get_data(data_paths):
    """Load one pickled object per language.

    ``data_paths`` maps a language key to a pickle file path; the result maps
    the same keys to whatever ``pd.read_pickle`` returns for each path.
    """
    # NOTE: read_pickle unpickles the file, so only point this at trusted files.
    return {lang: pd.read_pickle(path) for lang, path in data_paths.items()}

In [10]:
data_paths={'cn':'en_to_cn_embeddings.pkl', 'es':'en_to_sp_embeddings.pkl', 'it':'en_to_it_embeddings.pkl'}
data = get_data(data_paths)

#### BM25 Search

In [11]:
from rank_bm25 import BM25Okapi
import pandas as pd
import jieba

In [12]:
#Build BM_25 corpus
def build_BM25(data):
    """Build one BM25 index per target language from the product titles.

    Args:
        data: Mapping with keys ``'cn'``, ``'es'`` and ``'it'``. Each value
            must expose the title column read below (``'chinese translation'``,
            ``'title_spanish'``, ``'title_italian'``) as an iterable of strings.

    Returns:
        Dict mapping ``'cn'``, ``'es'`` and ``'it'`` to a ``BM25Okapi`` index.
    """
    # Chinese: lowercase, then segment with jieba's search-mode tokenizer.
    tokenized_cn = [
        list(jieba.cut_for_search(title.lower()))
        for title in data['cn']['chinese translation']
    ]

    # Spanish / Italian: lowercase before splitting on whitespace. The search
    # side lowercases query terms, so indexing titles in their original case
    # meant capitalised title words could never match a query token.
    tokenized_es = [title.lower().split() for title in data['es']['title_spanish']]
    tokenized_it = [title.lower().split() for title in data['it']['title_italian']]

    bm25_corpus = {
        'cn': BM25Okapi(tokenized_cn),
        'es': BM25Okapi(tokenized_es),
        'it': BM25Okapi(tokenized_it),
    }
    return bm25_corpus


#Search BM25
def search_bm25_expanded(query_list, corpus, tgt_lang='cn', top_k=5):
    """Score every document against all weighted queries and return the best.

    Each query dict supplies a ``'term'`` and a ``'weight'``. The term is
    lowercased and tokenized (jieba search mode for ``'cn'``, whitespace split
    otherwise), scored with ``corpus[tgt_lang].get_scores``, and the scores are
    accumulated per document after multiplying by the weight.

    Returns:
        ``(ids, scores)``: the indices of the ``top_k`` highest-scoring
        documents in descending score order, and their accumulated scores.
    """
    # One running total per document, starting from zero.
    totals = [0.0] * len(corpus[tgt_lang].doc_len)

    for entry in query_list:
        lowered = entry['term'].lower()
        factor = entry['weight']
        tokens = jieba.cut_for_search(lowered) if tgt_lang == 'cn' else lowered.split()
        per_doc = corpus[tgt_lang].get_scores(tokens)
        totals = [running + factor * hit for running, hit in zip(totals, per_doc)]

    # Document indices ordered from highest to lowest total, cut to top_k.
    ranked = sorted(range(len(totals)), key=totals.__getitem__, reverse=True)
    best = ranked[:top_k]
    return best, [totals[doc] for doc in best]

In [13]:
bm25_corpus = build_BM25(data)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\liuru\AppData\Local\Temp\jieba.cache
Loading model cost 0.491 seconds.
Prefix dict has been built successfully.


In [21]:
top_ids, top_scores = search_bm25_expanded(weighted_queries, bm25_corpus)

In [22]:
#remember our original search was 'shir long sleeve', misspelled on purpose.

for i, score in zip(top_ids, top_scores):
    print(f"{score:.4f} | {data['cn']['title'][i]}  | {data['cn']['chinese translation'][i]}")

141.3012 | Polo Shirt Classic Denim Pocket Long Sleeve Shirt  | Polo衫 經典丹寧 口袋長袖襯衫
112.8441 | Kids Set Table Bay - Thin Long Sleeve Home Suit Magic Baby ~ K60092  | 兒童套裝 台灣製薄長袖居家套裝 魔法Baby~k60092
88.6328 | GAP Kids Long Sleeve Logo Patch Top  | GAP 童裝長袖Logo貼布上衣
71.6278 | Long version Pocket Cardigan Knit Coat  | 長版口袋開襟針織外套
65.9903 | IFairies large size long sleeve T shirt Tops ifairies [59000] 【 59000 】  | iFairies 中大尺碼長袖T恤上衣★ifairies【59000】【59000】


#### Dense Search

In [23]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-m3")

In [44]:
from pinecone import Pinecone
from pinecone import ServerlessSpec
from dotenv import load_dotenv
import os

#Builds a single dense query vector from the weighted expanded queries
def embed_expanded(query_list, model):
    """Combine the embeddings of all expanded queries into one vector.

    Each query's ``'term'`` is encoded with ``model.encode`` and multiplied by
    its ``'weight'``. The scaled embeddings are summed and divided by the
    number of queries (not by the sum of the weights).
    """
    scaled = [
        # NOTE(review): embeddings are presumably 1024-dimensional to match the
        # Pinecone index dimension -- confirm for the model in use.
        model.encode(entry['term'], convert_to_tensor=True).cpu().numpy() * entry["weight"]
        for entry in query_list
    ]
    return sum(scaled) / len(scaled)


def init_index(pc, index_name, data, embedding_col, eng_col, tgt_col, tgt_lang, dimension=1024, batch_size=100):
    """Create a Pinecone index if it is missing and upsert one vector per row.

    Args:
        pc: Pinecone client.
        index_name: Name of the index to create or reuse.
        data: Table exposing ``iterrows()`` (e.g. a pandas DataFrame).
        embedding_col: Column holding each row's embedding vector.
        eng_col: Column holding the English title.
        tgt_col: Column holding the target-language title.
        tgt_lang: Label stored in each vector's ``embedding_type`` metadata.
        dimension: Vector dimension used when the index has to be created.
        batch_size: Number of vectors sent per upsert request.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    if index_name not in pc.list_indexes().names():
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",  # by cosine similarity
            spec=ServerlessSpec(
                cloud="aws",  # or "gcp"
                region="us-east-1"
            )
        )

    index = pc.Index(index_name)

    vectors_to_upsert = [
        {
            "id": str(row_id),  # the row's index label doubles as the vector id
            "values": row[embedding_col],
            "metadata": {
                "title": row[eng_col],
                # The key stays "chinese_title" for every language so existing
                # index contents keep the same schema; the value is the
                # target-language title whatever tgt_lang is.
                "chinese_title": row[tgt_col],
                "embedding_type": tgt_lang,  # which language's embedding this is
            },
        }
        for row_id, row in data.iterrows()
    ]

    # Send the vectors in fixed-size batches rather than one huge request.
    for start in range(0, len(vectors_to_upsert), batch_size):
        index.upsert(vectors=vectors_to_upsert[start:start + batch_size])

def setup_pinecone(data):
    """Populate the three per-language Pinecone indexes.

    Reads the Pinecone API key from ``../.env``, then calls ``init_index`` for
    Chinese, Italian and Spanish in that order, using ``data['cn']``,
    ``data['it']`` and ``data['es']``.

    Returns:
        Dict mapping each language key to its Pinecone index name.
    """
    load_dotenv(dotenv_path='../.env')
    pc = Pinecone(api_key=os.getenv('pinecone_API_KEY'))

    indexes={'cn':'cn-search', 'it':'it-search', 'es':'es-search'}

    # (language key, embedding column, target-title column, metadata label)
    layouts = (
        ('cn', 'chinese_embedding', 'chinese translation', 'chinese'),
        ('it', 'italian_embedding', 'title_italian', 'italian'),
        ('es', 'spanish_embedding', 'title_spanish', 'spanish'),
    )
    for key, embedding_col, tgt_col, label in layouts:
        init_index(
            pc,
            index_name=indexes[key],
            data=data[key],
            embedding_col=embedding_col,
            eng_col='title',
            tgt_col=tgt_col,
            tgt_lang=label,
        )

    return indexes

def search_pinecone(query_list, embedding_model, index_name, top_k=5):
    """Query a Pinecone index with the combined embedding of the query list.

    A Pinecone client is created on every call from the key in ``../.env``.

    Returns:
        ``(id_list, score_list)``: the match ids converted to ``int`` and the
        match scores converted to ``float``, in the order Pinecone returned them.
    """
    load_dotenv(dotenv_path='../.env')
    pc = Pinecone(api_key=os.getenv('pinecone_API_KEY'))
    index = pc.Index(index_name)

    query_vector = embed_expanded(query_list, embedding_model).tolist()
    results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=False,
    )

    matches = results.matches
    id_list = [int(match['id']) for match in matches]
    score_list = [float(match['score']) for match in matches]
    return id_list, score_list


In [25]:
pinecone_indices=setup_pinecone(data)

In [45]:
top_ids_pc, top_scores_pc =search_pinecone(weighted_queries, model, pinecone_indices['cn'])

In [47]:
#remember our original search was 'shir long sleeve', misspelled on purpose.

for i, score in zip(top_ids_pc, top_scores_pc):
    print(f"{score:.4f} | {data['cn']['title'][i]}  | {data['cn']['chinese translation'][i]}")

0.7519 | Long version Pocket Cardigan Knit Coat  | 長版口袋開襟針織外套
0.7019 | Polo Shirt Classic Denim Pocket Long Sleeve Shirt  | Polo衫 經典丹寧 口袋長袖襯衫
0.6916 | Shiny Glossy Long Edition Perspective Shirt  | 閃亮光澤長版透視襯衫
0.6759 | Korean Made. Thick Straps Cross Vest  | 韩制。粗肩带交叉背心
0.6581 | IFairies large size long sleeve T shirt Tops ifairies [59000] 【 59000 】  | iFairies 中大尺碼長袖T恤上衣★ifairies【59000】【59000】


#### RRF

In [31]:
#recap: Right now, we have BM25 results, returned as
print(top_ids, top_scores)

[656, 996, 701, 276, 418] [141.30117982812962, 112.84411227054272, 88.63276428673784, 71.6278067714302, 65.99026659028007]


In [48]:
#recap: We also have semantic results, returned as
print(top_ids_pc, top_scores_pc)

[276, 656, 134, 76, 418] [0.7518996, 0.701878309, 0.691553712, 0.675868392, 0.658072412]


In [None]:
import numpy as np

def scores_to_ranking(scores: list[float]) -> np.ndarray:
    """Convert scores into 1-based ranks (1 = highest score).

    ``result[i]`` is the rank of ``scores[i]``. Tied scores get consecutive
    ranks in input order, so a list already sorted in descending order always
    maps to 1, 2, 3, ...

    The previous ``argsort(scores)[::-1] + 1`` returned the *ordering* of the
    indices rather than each item's rank. That is only correct when the input
    is strictly descending, and it swapped the ranks of tied items.
    """
    values = np.asarray(scores, dtype=float)
    # Stable sort on the negated scores: descending order, ties keep input order.
    order = np.argsort(-values, kind="stable")
    ranks = np.empty(len(values), dtype=int)
    # Invert the permutation: the item at sorted position p gets rank p + 1.
    ranks[order] = np.arange(1, len(values) + 1)
    return ranks

def rrf(keyword_rank: int, semantic_rank: int, k: int = 60) -> float:
    """Return the Reciprocal Rank Fusion score for one document.

    The score is the sum of ``1 / (k + rank)`` over the keyword rank and the
    semantic rank.
    """
    keyword_part = 1 / (k + keyword_rank)
    semantic_part = 1 / (k + semantic_rank)
    return keyword_part + semantic_part


In [58]:
def hybrid_expanded_search(query_list, bm25_corpus, pinecone_indices, embedding_model, tgt_lang='cn', top_k=5 ):
    """Fuse BM25 and dense (Pinecone) results with Reciprocal Rank Fusion.

    Args:
        query_list: Weighted query dicts with ``'term'`` and ``'weight'``.
        bm25_corpus: Dict of BM25 indexes keyed by language.
        pinecone_indices: Dict of Pinecone index names keyed by language.
        embedding_model: Model used to embed the queries for the dense search.
        tgt_lang: Language key selecting both the BM25 corpus and the index.
        top_k: Number of results requested from each retriever.

    Returns:
        ``(doc_ids, rrf_scores)`` sorted by descending RRF score. Every
        document returned by either retriever is included, so the lists may
        hold up to ``2 * top_k`` entries.
    """
    # Forward tgt_lang to BM25 as well; without it BM25 always searched the
    # default 'cn' corpus while Pinecone used the requested language.
    bm25_top_ids, _ = search_bm25_expanded(query_list, bm25_corpus, tgt_lang=tgt_lang, top_k=top_k)
    pc_top_ids, _ = search_pinecone(query_list, embedding_model, pinecone_indices[tgt_lang], top_k=top_k)

    # Both retrievers return ids best-first, so position is the rank (1 = best).
    bm25_rank_dict = {doc_id: rank for rank, doc_id in enumerate(bm25_top_ids, start=1)}
    pc_rank_dict = {doc_id: rank for rank, doc_id in enumerate(pc_top_ids, start=1)}

    # A document missing from one retriever's list is given this penalty rank.
    missing_rank = top_k * 2

    all_doc_ids = list(set(bm25_top_ids) | set(pc_top_ids))
    rrf_scores = [
        (doc_id, rrf(bm25_rank_dict.get(doc_id, missing_rank), pc_rank_dict.get(doc_id, missing_rank)))
        for doc_id in all_doc_ids
    ]

    # Highest fused score first; the sort is stable, so ties keep their order.
    rrf_scores.sort(key=lambda pair: pair[1], reverse=True)

    hybrid_top_ids = [doc_id for doc_id, _ in rrf_scores]
    hybrid_top_scores = [fused for _, fused in rrf_scores]
    return hybrid_top_ids, hybrid_top_scores



In [59]:
hybrid_top_id, hybrid_top_scores=hybrid_expanded_search(weighted_queries, bm25_corpus, pinecone_indices, model)

In [60]:
for i, score in zip(hybrid_top_id, hybrid_top_scores):
    print(f"{score:.4f} | {data['cn']['title'][i]}  | {data['cn']['chinese translation'][i]}")

0.0325 | Polo Shirt Classic Denim Pocket Long Sleeve Shirt  | Polo衫 經典丹寧 口袋長袖襯衫
0.0320 | Long version Pocket Cardigan Knit Coat  | 長版口袋開襟針織外套
0.0308 | IFairies large size long sleeve T shirt Tops ifairies [59000] 【 59000 】  | iFairies 中大尺碼長袖T恤上衣★ifairies【59000】【59000】
0.0304 | Kids Set Table Bay - Thin Long Sleeve Home Suit Magic Baby ~ K60092  | 兒童套裝 台灣製薄長袖居家套裝 魔法Baby~k60092
0.0302 | Shiny Glossy Long Edition Perspective Shirt  | 閃亮光澤長版透視襯衫
0.0302 | GAP Kids Long Sleeve Logo Patch Top  | GAP 童裝長袖Logo貼布上衣
0.0299 | Korean Made. Thick Straps Cross Vest  | 韩制。粗肩带交叉背心


In [75]:
from bert_score import score
import warnings

def calculate_bertscore(candidate, reference, lang = "en"):
    """Compute BERTScore between one candidate and one reference string.

    Scoring uses the ``bert-base-multilingual-cased`` model, with warnings
    suppressed while it runs.

    Returns:
        ``(precision, recall, f1)`` as Python floats.
    """
    with warnings.catch_warnings():
        # Silence warnings raised while the scorer runs.
        warnings.simplefilter("ignore")
        precision, recall, f_measure = score(
            [candidate],
            [reference],
            lang=lang,
            model_type="bert-base-multilingual-cased",  # Multilingual BERT
            verbose=False,  # Disable progress messages
        )
    return tuple(metric.item() for metric in (precision, recall, f_measure))


def get_final_output(query, hybrid_top_id, data, tgt_lang='cn'):
    """Score each retrieved title against the query with BERTScore F1.

    Args:
        query: The user's original query string.
        hybrid_top_id: Ids (row labels) of the retrieved documents.
        data: Dict of per-language tables holding the title columns.
        tgt_lang: One of ``'cn'``, ``'es'`` or ``'it'``.

    Returns:
        Dict mapping each retrieved target-language title to its F1 score.
        Documents sharing an identical title collapse into a single entry.

    Raises:
        ValueError: If ``tgt_lang`` is not a supported language key.
    """
    title_columns = {
        'cn': 'chinese translation',
        'es': 'title_spanish',
        'it': 'title_italian',
    }
    # Fail with a clear message; previously an unknown language left `txt`
    # unassigned and surfaced as an UnboundLocalError inside the loop.
    if tgt_lang not in title_columns:
        raise ValueError(f"Unsupported target language: {tgt_lang!r}")
    titles = data[tgt_lang][title_columns[tgt_lang]]

    final_output={}
    for doc_id in hybrid_top_id:
        txt = titles[doc_id]
        # calculate_bertscore returns (precision, recall, f1); only F1 is kept.
        _precision, _recall, f1 = calculate_bertscore(txt, query)
        final_output[txt]=f1
    return final_output



In [76]:
query="shir long sleeve"

In [77]:
final_output = get_final_output(query, hybrid_top_id, data, tgt_lang='cn')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transf

In [74]:
final_output

{'Polo衫 經典丹寧 口袋長袖襯衫': 0.6623846292495728,
 '長版口袋開襟針織外套': 0.6716357469558716,
 'iFairies 中大尺碼長袖T恤上衣★ifairies【59000】【59000】': 0.6212053894996643,
 '兒童套裝 台灣製薄長袖居家套裝 魔法Baby~k60092': 0.6144145727157593,
 '閃亮光澤長版透視襯衫': 0.6694958806037903,
 'GAP 童裝長袖Logo貼布上衣': 0.659843921661377,
 '韩制。粗肩带交叉背心': 0.655915379524231}

In [None]:
import torch
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import argparse
from dotenv import load_dotenv
import os
from translate import load_model_and_tokenizer, translate_sentence
from queryexpansion import expand
from langdetect import detect
import jieba

In [None]:
def mtpipeline(input_query, tgt_lang="cn"):
    """Expand a query and translate every expanded term into ``tgt_lang``.

    The mBART checkpoint location is read from the ``mBART_path`` variable in
    ``../.env``.

    Args:
        input_query: The user's search query (translated from ``'en'``).
        tgt_lang: Target language key passed to ``translate_sentence``.

    Returns:
        The expanded query entries with each ``'term'`` replaced by its
        translation.
        NOTE(review): assumes ``expand`` returns a list of dicts with a
        ``'term'`` key -- confirm against queryexpansion.py.
    """
    # The path was '..../.env', a typo that made load_dotenv find no file;
    # every other cell loads '../.env'.
    load_dotenv(dotenv_path='../.env')
    model_path = os.getenv('mBART_path')
    model, tokenizer = load_model_and_tokenizer(model_path)
    input_query_expanded=expand(input_query, include_translations=False)
    for query in input_query_expanded:
        query['term']=translate_sentence(model, tokenizer, query['term'], 'en', tgt_lang)
    # Return the translated entries; previously the result was discarded.
    return input_query_expanded
    

SyntaxError: invalid syntax (2893126158.py, line 6)