In [3]:
import torch
import os
import openvino as ov
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM
from optimum.intel import OVQuantizer, OVConfig, OVQuantizationConfig, OVWeightQuantizationConfig

# ov_config = {"INFERENCE_PRECISION_HINT": "f16"}


In [4]:
from SearchApps import resquest_google

In [5]:
# Local directory that caches the exported OpenVINO model: the loading cell
# reads from it when it exists and otherwise exports the model and saves it here.
model_save_path = "saved_model_qu"

In [6]:
# Query the OpenVINO runtime for the inference devices it can see on this machine
# and display the list as the cell output.
core = ov.Core()
devices = core.available_devices
devices

['CPU', 'GPU', 'NPU']

In [7]:
def get_inputs(pairs, tokenizer, prompt=None, max_length=1024):
    """Tokenize (query, passage) pairs into one padded batch for the reranker.

    Each pair is laid out as ``<bos> "A: {query}" <sep> "B: {passage}" <sep> {prompt}``
    where ``<sep>`` is the tokenization of a newline.

    Args:
        pairs: Iterable of two-item sequences ``(query, passage)``.
        tokenizer: Tokenizer object; must be callable and provide
            ``bos_token_id``, ``prepare_for_model`` and ``pad``.
        prompt: Instruction appended after every pair. When ``None`` the
            built-in Yes/No matching instruction is used.
        max_length: Token budget for the ``<bos> + query + <sep> + passage``
            part. The trailing separator and prompt are appended afterwards,
            so the final sequences may be longer than this value.

    Returns:
        Whatever ``tokenizer.pad`` returns for the collected features with
        ``return_tensors='pt'`` and padding to a multiple of 8.
    """
    if prompt is None:
        prompt = "Given a query A and a app json B, determine whether the app json match to the query by providing a prediction of either 'Yes' or 'No'."

    def encode(text, **extra):
        # Every piece is tokenized without special tokens and as plain lists;
        # the BOS token is added by hand when the pair is assembled.
        return tokenizer(text, return_tensors=None, add_special_tokens=False, **extra)

    prompt_ids = encode(prompt)['input_ids']
    sep_ids = encode("\n")['input_ids']

    features = []
    for query, passage in pairs:
        # The query may use at most three quarters of the budget on its own.
        query_ids = encode(f'A: {query}',
                           max_length=max_length * 3 // 4,
                           truncation=True)['input_ids']
        passage_ids = encode(f'B: {passage}',
                             max_length=max_length,
                             truncation=True)['input_ids']

        # Join both halves; with 'only_second' any further truncation needed
        # to respect max_length is requested on the passage side.
        feature = tokenizer.prepare_for_model(
            [tokenizer.bos_token_id] + query_ids,
            sep_ids + passage_ids,
            truncation='only_second',
            max_length=max_length,
            padding=False,
            return_attention_mask=False,
            return_token_type_ids=False,
            add_special_tokens=False
        )

        # The instruction goes last so that the final position is the one
        # whose logits are read by the caller.
        full_ids = feature['input_ids'] + sep_ids + prompt_ids
        feature['input_ids'] = full_ids
        feature['attention_mask'] = [1] * len(full_ids)
        features.append(feature)

    # Allow for the separator + prompt that were appended after truncation.
    padded_length = max_length + len(sep_ids) + len(prompt_ids)
    return tokenizer.pad(
        features,
        padding=True,
        max_length=padded_length,
        pad_to_multiple_of=8,
        return_tensors='pt',
    )


In [8]:
# model_q = OVQuantizer.from_pretrained(model)
# Default OpenVINO quantization settings wrapped in an OVConfig.
# NOTE(review): no executable cell visible in this notebook reads `ov_config`;
# confirm whether it is still needed.
ov_config = OVConfig(quantization_config=OVQuantizationConfig())

In [9]:
# Load the tokenizer and the OpenVINO reranker model.
# The Hugging Face model id is kept in one place so the tokenizer and the
# exported model can never drift apart.
MODEL_ID = 'BAAI/bge-reranker-v2-gemma'

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if os.path.exists(model_save_path):
    # A previous run already exported the model: reuse the cached copy.
    model = OVModelForCausalLM.from_pretrained(model_save_path)
else:
    # First run: export to OpenVINO with 8-bit weight quantization, then cache
    # the result so later runs skip the export.
    model = OVModelForCausalLM.from_pretrained(
        MODEL_ID,
        export=True,
        quantization_config=OVWeightQuantizationConfig(bits=8),
    )
    model.save_pretrained(model_save_path)

# Prefer the GPU, but fall back to the CPU when the OpenVINO runtime does not
# report one, so the notebook still runs on machines without a supported GPU.
target_device = "GPU" if any(
    name.startswith("GPU") for name in ov.Core().available_devices
) else "CPU"
model.to(target_device)
model.compile()

In [10]:

# tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-gemma')

# model = OVModelForCausalLM.from_pretrained('BAAI/bge-reranker-v2-gemma',export=True)

# model.save_pretrained("saved_model")

# Vocabulary id of the first token produced for 'Yes'; the scoring loop reads
# the logit at this index. Presumably 'Yes' encodes to a single token — TODO confirm.
yes_loc = tokenizer('Yes', add_special_tokens=False)['input_ids'][0]


In [11]:
# Search query sent to the app search helper and later paired with each result.
query = "中国常用的支付软件"

# NOTE(review): the meaning of the second positional argument is defined in
# SearchApps.resquest_google — confirm against that module.
queryres = resquest_google(query, True)

# App names in the original search order; used later to label the reranked scores.
google_res = [app["name"] for app in queryres]


{
    "id": "com.unionpay",
    "url": "https://play.google.com/store/apps/details?id=com.unionpay",
    "name": "云闪付",
    "category": "财务",
    "contentRating": "适合所有人",
    "releaseNotes": "修复了已知的问题。",
    "description": "云闪付APP，银行业统一移动支付战略产品，银行卡管理一步到位，银行卡优惠一网打尽。\n【主要功能】\n1.全面的银联银行优惠权益查询，实时更新省钱省心。\n2.手机扫码支付，银联金融级安全保护，资金安全无忧。\n3.绑定银联卡，享受消费提醒、交易查询、手工记账服务。\n4.信用卡还款，手机充值，公共缴费，转账，收款一站搞定。\n5.在线办卡，尊享一元停车、出行贵宾厅服务等银联权益。\n【官方网站】https://www.95516.com/\n【官方微博】云闪付\n【官方微信】Yinlian_KY",
    "icons": {
        "small": "https://play-lh.googleusercontent.com/_KMxe53g6yqPsRPpwjsekvz8dZxZ0xwOdGnIETdCBSErM9HV0hcpz2fP9tL99Z1gnw=w128",
        "medium": "https://play-lh.googleusercontent.com/_KMxe53g6yqPsRPpwjsekvz8dZxZ0xwOdGnIETdCBSErM9HV0hcpz2fP9tL99Z1gnw=w256",
        "large": "https://play-lh.googleusercontent.com/_KMxe53g6yqPsRPpwjsekvz8dZxZ0xwOdGnIETdCBSErM9HV0hcpz2fP9tL99Z1gnw=w512"
    },
    "developer": {
        "id": "China+UnionPay",
        "name": "China UnionPay",
        "url": "https://

In [12]:
google_res

['云闪付',
 '支付宝',
 'Google Wallet',
 'PayPal',
 'Venmo',
 'Zelle',
 '中国银行（境外版）',
 'PayPal Business',
 '淘宝',
 'Cards - 卡片 - 手机钱包',
 '電子支付',
 'WeChat',
 'Remitly: Send Money & Transfer',
 'Samsung Wallet (Samsung Pay)',
 'Mobily Pay',
 'Amazon Shopping',
 'A+ Wallet',
 'Wise',
 'Tonkeeper — TON Wallet',
 'Chime – Mobile Banking']

In [13]:
def _strip_newlines(value):
    """Return a copy of ``value`` with newline characters removed from every string in it."""
    if isinstance(value, str):
        return value.replace("\n", "")
    if isinstance(value, dict):
        return {_strip_newlines(key): _strip_newlines(val) for key, val in value.items()}
    if isinstance(value, (list, tuple)):
        return [_strip_newlines(element) for element in value]
    return value


def build_pairs(query, res):
    """Pair ``query`` with a single-line JSON rendering of every search result.

    Newline characters are removed from the result data *before* it is
    serialized. Removing the two-character sequence backslash + ``n`` from the
    serialized text instead would also hit an escaped literal backslash that
    happens to be followed by the letter ``n`` (e.g. ``C:\\new``) and corrupt it.

    Args:
        query: The search query; reused unchanged as the first item of each pair.
        res: Iterable of JSON-serializable search results.

    Returns:
        A list of ``[query, json_text]`` lists, one per item of ``res`` and in
        the same order. Non-ASCII characters are kept as-is in ``json_text``.
    """
    return [
        [query, json.dumps(_strip_newlines(item), ensure_ascii=False)]
        for item in res
    ]

In [14]:
# One [query, serialized app JSON] pair per search result, in search order.
pairs = build_pairs(query, queryres)

In [15]:
# pairs = pairs[0:8]


In [16]:
# Score every candidate relative to the first search result.
#
# Each forward pass holds two rows: the reference pair (pairs[0]) and the
# candidate. The stored score is the candidate's 'Yes' logit divided by the
# reference's 'Yes' logit from the same pass, so the reference itself scores 1.0.
#
# NOTE(review): a ratio of raw logits only preserves the ranking while the
# reference logit is positive, and it raises ZeroDivisionError if that logit is
# exactly 0. Consider ranking by the candidate logit directly.
model.eval()  # loop-invariant, so called once instead of on every iteration

scores = []
with torch.no_grad():
    for item in pairs:
        inputs = get_inputs([pairs[0], item], tokenizer)
        model_out = model(**inputs, return_dict=True)
        # 'Yes' logit at the last sequence position of each row.
        # Assumes the tokenizer pads on the left so that position -1 is a real
        # token for both rows — TODO confirm the tokenizer's padding side.
        cur_scores = model_out.logits[:, -1, yes_loc].view(-1).float()
        print(cur_scores)
        scores.append(cur_scores[1].item() / cur_scores[0].item())

        

    

You're using a GemmaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([7.5117, 7.5117])
tensor([7.5117, 6.9141])
tensor([7.5078, 4.7461])
tensor([7.5664, 4.2344])
tensor([7.5156, 3.4336])
tensor([7.5273, 4.3047])
tensor([7.5117, 6.8125])
tensor([7.5547, 3.0820])
tensor([7.5156, 4.3398])
tensor([7.5117, 5.0195])
tensor([7.5234, 6.7695])
tensor([7.5664, 6.4219])
tensor([7.5234, 4.8008])
tensor([7.5547, 3.8750])
tensor([7.5117, 3.9355])
tensor([7.4844, 2.7773])
tensor([7.5469, 6.0938])
tensor([7.4648, 4.8281])
tensor([7.5547, 3.6055])
tensor([7.5430, 3.8359])


In [17]:
# Attach each app name to the score computed for the same position.
rerank_sort = [[name, scores[idx]] for idx, name in enumerate(google_res)]

In [18]:
# Order the [name, score] entries in place, highest score first.
rerank_sort.sort(key=lambda entry: entry[1], reverse=True)

In [19]:
google_res

['云闪付',
 '支付宝',
 'Google Wallet',
 'PayPal',
 'Venmo',
 'Zelle',
 '中国银行（境外版）',
 'PayPal Business',
 '淘宝',
 'Cards - 卡片 - 手机钱包',
 '電子支付',
 'WeChat',
 'Remitly: Send Money & Transfer',
 'Samsung Wallet (Samsung Pay)',
 'Mobily Pay',
 'Amazon Shopping',
 'A+ Wallet',
 'Wise',
 'Tonkeeper — TON Wallet',
 'Chime – Mobile Banking']

In [20]:
rerank_sort


[['云闪付', 1.0],
 ['支付宝', 0.9204368174726989],
 ['中国银行（境外版）', 0.906916276651066],
 ['電子支付', 0.8997923156801662],
 ['WeChat', 0.8487351574599896],
 ['A+ Wallet', 0.8074534161490683],
 ['Cards - 卡片 - 手机钱包', 0.6682267290691628],
 ['Wise', 0.6467817896389325],
 ['Remitly: Send Money & Transfer', 0.6381100726895119],
 ['Google Wallet', 0.6321540062434964],
 ['淘宝', 0.5774428274428275],
 ['Zelle', 0.5718733783082511],
 ['PayPal', 0.5596282911719154],
 ['Mobily Pay', 0.5239209568382736],
 ['Samsung Wallet (Samsung Pay)', 0.5129265770423992],
 ['Chime – Mobile Banking', 0.5085447954427758],
 ['Tonkeeper — TON Wallet', 0.4772492244053775],
 ['Venmo', 0.45686070686070684],
 ['PayPal Business', 0.4079627714581179],
 ['Amazon Shopping', 0.37108559498956156]]