In [1]:
import torch
import gc
import json
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from langdetect import detect
from sentence_transformers import SentenceTransformer

import numpy as np


# Tone-mark placement table for Vietnamese "oa" / "oe" / "uy" vowel pairs.
# Each key carries the tone mark on the first vowel of the pair; the matching
# value is the same pair with the mark moved to the second vowel. Lower-case,
# capitalized and all-caps spellings are listed separately.
# translate_vi2en() applies every key -> value replacement to its input text
# before tokenizing it.
dict_map = {
    "òa": "oà",
    "Òa": "Oà",
    "ÒA": "OÀ",
    "óa": "oá",
    "Óa": "Oá",
    "ÓA": "OÁ",
    "ỏa": "oả",
    "Ỏa": "Oả",
    "ỎA": "OẢ",
    "õa": "oã",
    "Õa": "Oã",
    "ÕA": "OÃ",
    "ọa": "oạ",
    "Ọa": "Oạ",
    "ỌA": "OẠ",
    "òe": "oè",
    "Òe": "Oè",
    "ÒE": "OÈ",
    "óe": "oé",
    "Óe": "Oé",
    "ÓE": "OÉ",
    "ỏe": "oẻ",
    "Ỏe": "Oẻ",
    "ỎE": "OẺ",
    "õe": "oẽ",
    "Õe": "Oẽ",
    "ÕE": "OẼ",
    "ọe": "oẹ",
    "Ọe": "Oẹ",
    "ỌE": "OẸ",
    "ùy": "uỳ",
    "Ùy": "Uỳ",
    "ÙY": "UỲ",
    "úy": "uý",
    "Úy": "Uý",
    "ÚY": "UÝ",
    "ủy": "uỷ",
    "Ủy": "Uỷ",
    "ỦY": "UỶ",
    "ũy": "uỹ",
    "Ũy": "Uỹ",
    "ŨY": "UỸ",
    "ụy": "uỵ",
    "Ụy": "Uỵ",
    "ỤY": "UỴ",
    }

In [2]:
# Sentence-embedding model. post_process_output() uses it as a fallback: when no
# option text is found in the LLM output, the option whose embedding is closest
# to the last lines of the output is selected.
embedding_model = SentenceTransformer('BAAI/bge-large-en-v1.5')

In [3]:
# Sanity check: langdetect is expected to report "vi" for this Vietnamese sentence.
detect("Một cửa hàng đã bán 30% số hàng hiện có và thu được 15 000 000 đồng")

'vi'

In [4]:
# Vietnamese -> English translation checkpoint and its tokenizer (source language
# vi_VN). Downloaded files are cached under ./cache.
tokenizer_vi2en = AutoTokenizer.from_pretrained("vinai/vinai-translate-vi2en", src_lang="vi_VN", cache_dir="./cache")
model_vi2en = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-vi2en", cache_dir="./cache")

In [5]:
def translate_vi2en(vi_text: str) -> str:
    """Translate Vietnamese text to English with the vi2en seq2seq model.

    The text is first rewritten with the tone-mark replacements in ``dict_map``,
    then tokenized and decoded with beam search (5 beams, one returned sequence).

    Args:
        vi_text: Vietnamese source text.

    Returns:
        The decoded English text; decoded sequences are joined with a space.
    """
    # Normalize tone-mark placement before the tokenizer sees the text.
    for old_form, new_form in dict_map.items():
        vi_text = vi_text.replace(old_form, new_form)

    encoded = tokenizer_vi2en(vi_text, return_tensors="pt")
    generated_ids = model_vi2en.generate(
        encoded.input_ids,
        # Force the decoder to start in English.
        decoder_start_token_id=tokenizer_vi2en.lang_code_to_id["en_XX"],
        num_return_sequences=1,
        num_beams=5,
        early_stopping=True,
    )
    decoded = tokenizer_vi2en.batch_decode(generated_ids, skip_special_tokens=True)
    return " ".join(decoded)

# Smoke test of the translator.
# NOTE(review): in the recorded output of this cell the amount "15 000 000" came
# back as "VND15,000", so numeric values may be altered by the translation step.
translate_vi2en("Một cửa hàng đã bán 30% số hàng hiện có và thu được 15 000 000 đồng.")

'One store sold 30% of its existing stock and earned VND15,000.'

In [6]:
# Load the quantized MetaMath-Mistral-7B GGUF checkpoint through llama-cpp-python.
# Make sure the model path is correct for your system!
math_model = Llama(
#     Alternative (Kaggle) location of a Q5_K_M build of the same model:
#     model_path="/kaggle/input/llama-cpp-math-models/metamath-mistral-7b.Q5_K_M.gguf",
    model_path="models/metamath-mistral-7b.Q8_0.gguf",
    n_gpu_layers=-1,  # presumably "offload all layers to the GPU" -- confirm in llama-cpp-python docs
    n_ctx=4096,
    n_batch=1024,
    # NOTE(review): answer() passes max_tokens=1024 on every call; confirm that
    # this constructor argument has any effect.
    max_tokens=-1,
    verbose=False, # NOTE(review): no callback manager is used anywhere in this notebook
)


ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from models/metamath-mistral-7b.Q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q8_0     [  4096, 32001,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q8_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q8_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q8_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q8_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q8_0     [  4096, 14336,     1,     1 ]
llama_model_loade

## Prompt

In [7]:
# Prompt templates for the math LLM. "prompt_input" is an instruction + input +
# response layout with two placeholders, {instruction} and {input}; it is filled
# in by get_model_str().
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
}

# Input template for a question with exactly four options.
# NOTE(review): get_input_str() does not read this module-level constant (it
# builds the text for any number of options itself), so this definition appears
# to be unused in the visible code.
INPUT_STR_PROMPT = """Question: {question}
Options:
{choices[0]}
{choices[1]}
{choices[2]}
{choices[3]}"""

def get_instruction_str(sample):
    """Build the instruction sentence that lists the selectable option letters.

    Letters are assigned by position ("A" for the first choice, "B" for the
    second, ...); the choice texts themselves are not inspected.

    Args:
        sample: Mapping with a "choices" sequence.

    Returns:
        A sentence such as
        "Answer the following question for me by choosing option A, B, C, or D."

    Raises:
        ValueError: If ``sample["choices"]`` is empty.
    """
    num_options = len(sample["choices"])
    if num_options == 0:
        raise ValueError("sample['choices'] must contain at least one option")

    option_chars = [chr(ord("A") + i) for i in range(num_options)]
    if num_options == 1:
        # A single option has nothing to join; avoid producing "option , or A.".
        listing = option_chars[0]
    elif num_options == 2:
        listing = " or ".join(option_chars)
    else:
        listing = f"{', '.join(option_chars[:-1])}, or {option_chars[-1]}"
    return f"Answer the following question for me by choosing option {listing}."

def get_input_str(sample):
    """Render the question and its options as the "Input" section of the prompt.

    Args:
        sample: Mapping with a "question" value and a "choices" sequence.

    Returns:
        "Question: <question>", then "Options:", then one choice per line,
        joined with newlines and without a trailing newline.
    """
    option_lines = [f"{choice}" for choice in sample["choices"]]
    header_lines = [f"Question: {sample['question']}", "Options:"]
    return "\n".join(header_lines + option_lines)

In [8]:
def get_model_str(sample):
    """Assemble the full LLM prompt for one sample.

    Fills the "prompt_input" template of ``PROMPT_DICT`` with the instruction
    sentence and the rendered question/options text for ``sample``.
    """
    template = PROMPT_DICT["prompt_input"]
    return template.format(
        instruction=get_instruction_str(sample),
        input=get_input_str(sample),
    )

In [28]:
def detect_lang(text):
    """Return the language code langdetect reports for ``text``, or None on failure.

    Detection is best effort: any error raised by ``detect`` results in None so
    callers can simply compare the result against a language code.

    Args:
        text: Text to classify.

    Returns:
        The value returned by ``detect(text)`` (e.g. "vi"), or None if detection
        raised an exception.
    """
    try:
        return detect(text)
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long inference run.
        # presumably detection fails for option strings without letters such as
        # "20%" -- verify against langdetect's documentation.
        return None

def process_option(option, lower=False):
    """Strip the two-character option prefix (e.g. "A.") from a choice string.

    Args:
        option: Choice text; its first two characters are always dropped.
        lower: When true, the remaining text is lower-cased.

    Returns:
        The remaining text with surrounding whitespace removed.
    """
    body = option[2:].strip()
    if lower:
        return body.lower()
    return body

def preprocess_sample(sample, debug=False):
    """Translate the Vietnamese parts of a sample to English and re-letter the choices.

    The option prefix is stripped from every choice, the question and each
    choice detected as Vietnamese ("vi") are translated with translate_vi2en(),
    and the choices are prefixed again with "A. ", "B. ", ... by position.

    Args:
        sample: Mapping with a "question" string and a "choices" list of
            prefixed option strings.
        debug: When true, print each text that is about to be translated.

    Returns:
        A new dict with only the keys "question" and "choices".
    """
    question = sample["question"]
    choices = [process_option(choice, lower=False) for choice in sample["choices"]]

    if detect_lang(question) == "vi":
        if debug:
            print(f"Translating question: {question}")
        question = translate_vi2en(question)

    for i, choice in enumerate(choices):
        if detect_lang(choice) == "vi":
            # Log only choices that really are translated; previously the
            # message was printed for every choice, translated or not.
            if debug:
                print(f"Translating choice {i}: {choice}")
            choices[i] = translate_vi2en(choice)

    choices = [chr(ord("A") + i) + ". " + choice for i, choice in enumerate(choices)]

    return {
        "question": question,
        "choices": choices
    }

def post_process_output(output, sample):
    """Map the LLM's free-text output to the index of the selected option.

    First, the option texts (prefix stripped, lower-cased) are searched for in
    the lower-cased output, longest option first, and the first hit wins. If no
    option text occurs in the output, the last three lines of the output are
    embedded and the choice with the highest dot-product similarity is used.

    Args:
        output: Text generated by the LLM.
        sample: Mapping with a "choices" list of prefixed option strings.

    Returns:
        The index of the selected choice as a plain ``int``.
    """
    options = [process_option(option, lower=True) for option in sample["choices"]]
    # The options are lower-cased, so the searched text has to be as well.
    output_lower = output.lower()

    # Try longer options first and stop at the first hit, so that a short option
    # such as "5" cannot override a longer matching option such as "25".
    by_length = sorted(range(len(options)), key=lambda i: len(options[i]), reverse=True)
    for idx in by_length:
        # An empty option text is a substring of everything; never match on it.
        if options[idx] and options[idx] in output_lower:
            return idx

    # Fallback: semantic similarity between the tail of the output and the choices.
    pred = " ".join(output.split("\n")[-3:])
    choices = sample["choices"]
    embeddings_1 = embedding_model.encode(pred, normalize_embeddings=True)
    embeddings_2 = embedding_model.encode(choices, normalize_embeddings=True)
    similarity = embeddings_1 @ embeddings_2.T
    # Convert the NumPy integer so both code paths return the same type.
    return int(np.argmax(similarity))
    
def answer(sample, math_llm, num_trials=2, debug=False):
    """Answer one multiple-choice sample with the math LLM.

    The sample is preprocessed (translated where needed), turned into a prompt
    and sent to ``math_llm``; the generated text is then mapped back to one of
    the sample's original choices.

    Args:
        sample: Mapping with "id", "question" and "choices".
        math_llm: Callable taking the prompt plus ``top_k`` / ``max_tokens``
            keyword arguments and returning a mapping whose
            ``["choices"][0]["text"]`` is the generated text.
        num_trials: Maximum number of generation attempts; a further attempt is
            made only while the stripped output is empty.
        debug: When true, print the prompt, the model output and the result.

    Returns:
        A tuple ``(result, output)`` where ``result`` is
        ``{"id": ..., "answer": <original choice string>}`` and ``output`` is
        the stripped model output used to pick the answer.
    """
    processed_sample = preprocess_sample(sample, debug)

    model_str = get_model_str(processed_sample)
    if debug:
        print(model_str)

    # Defined up front so that num_trials=0 does not leave ``output`` unbound.
    output = ""
    for _ in range(num_trials):
        math_output = math_llm(
            model_str,
            top_k=1,
            max_tokens=1024,
        )
        output = math_output["choices"][0]["text"].strip()
        # Stop at the first non-empty output. The previous ``i > 0`` condition
        # forced a second generation even when the first one had succeeded.
        if output:
            break
    if debug:
        print(output)

    idx = post_process_output(output, processed_sample)
    result = {
        "id": sample["id"],
        # Report the original (untranslated) choice string.
        "answer": sample["choices"][idx]
    }

    if debug:
        print(result)

    return result, output

In [24]:
# sample = {
#     "question": "10% of 11.5m2 is:",
#     "choices": ["A. 10,15dm2", "B. 1,5m2", "C. 15,5m2", "D. 1,15m2"],
# }

# Hand-written sample for a debug run of the whole pipeline. Only the last
# (Vietnamese) "question"/"choices" pair is active; the commented-out pairs
# appear to be alternative English inputs kept for manual experiments.
example_sample = {
    "id": "xxxx",


    
    # "question": "A store sold 30% of its existing goods and earned 15,000,000 VND. If all goods were sold, how much money would the store earn?",
    # "choices": [
    #     "A. 4 500 000 VND",
    #     "B. 45 000 000 VND",
    #     "C. 50 000 000 VND",
    #     "D. 450 000 000 VND"
    # ]
    # "question": "8 dm2 24 cm2 = ……… dm2. The appropriate number to fill in the blanks is:",
    #       "choices": [
    #          "A. 824",
    #          "B. 82.4",
    #          "C. 8.24",
    #          "D. 0.824"
    #       ],
    
#     "question": "2 ${\\times}$ 9 ? – 28 5 ${\\times}$ 3",
#     "choices": [
#     "A. 52",
#     "B. 53",
#     "C. 41",
#     "D. 45"
#     ],
    
#     "question": "The appropriate number to fill in the blanks 5kg 30g = …….. kg is:",
#           "choices": [
#              "A. 53",
#              "B. 50.3",
#              "C. 5.03",
#              "D. 5.3"
#           ]

#     "question": "A cyclist rode from A at 7 o'clock at a speed of 12km/h. At 8 o'clock a motorcyclist also from A chased the cyclist at a speed of 42km/h. Ask the cyclist What time did the machine catch up with the cyclist?",
#           "choices": [
#              "A. 24 minutes",
#              "B. 1 hour",
#              "C. 7 hours 24 minutes",
#              "D. 8 hours 24 minutes"
#           ]
    "question": "Một cửa hàng đã bán 30% số hàng hiện có và thu được 15 000 000 đồng. Hỏi nếu bán hết hàng thì cửa hàng thu được bao nhiêu tiền?",
        "choices": [
            "A. 4 500 000 đồng",
            "B. 45 000 000 đồng",
            "C. 50 000 000 đồng",
            "D. 450 000 000 đồng"
        ]
}

# Run the pipeline once with debug printing; returns (result, raw model output).
answer(example_sample, math_model, debug=True)

Translating question: Một cửa hàng đã bán 30% số hàng hiện có và thu được 15 000 000 đồng. Hỏi nếu bán hết hàng thì cửa hàng thu được bao nhiêu tiền?
Translating choice 0: 4 500 000 đồng
Translating choice 1: 45 000 000 đồng
Translating choice 2: 50 000 000 đồng
Translating choice 3: 450 000 000 đồng
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Answer the following question for me by choosing option A, B, C, or D.

### Input:
Question: A store sold 30% of its existing goods and earned VND15, 000, 000. Ask how much money did the store earn if it sold out?
Options:
A. 4, 500, 000 VND
B. VND 45, 000, 000
C. 50, 000, 000 VND
D. 450, 000, 000 VND

### Response:
To solve this problem, we need to determine the value of x, which represents the total amount of money the store earned if it sold out.
We know that the store sold 30% of its existing goods and earned VND

({'id': 'xxxx', 'answer': 'C. 50 000 000 đồng'},
 "To solve this problem, we need to determine the value of x, which represents the total amount of money the store earned if it sold out.\nWe know that the store sold 30% of its existing goods and earned VND15, 000, 000.\nLet's set up the equation as follows:\n30% of existing goods * Total amount earned = VND15, 000, 000\n0.3x = VND15, 000, 000\nTo solve for x, we divide both sides of the equation by 0.3:\nx = VND15, 000, 000 / 0.3\nx = VND50, 000, 000\nTherefore, the store would earn VND50, 000, 000 if it sold out.\nThe answer is: 50,000,000")

## Inference

In [16]:
# Load the public test set; the samples live under the top-level "data" key.
# A context manager closes the file handle, which the bare open() call leaked.
with open("./datasets/public_test/math_test.json", "r", encoding="utf-8") as f:
    test_samples = json.load(f)["data"]
test_samples[3]

{'id': '01-0209',
 'question': 'Một thửa ruộng hình thang có đáy bé dài 8m, đáy lớn dài 12m. Kéo dài đáy lớn thêm 5m thì diện tích thửa ruộng tăng thêm 25m2. Hỏi diện tích thửa ruộng tăng thêm bao nhiêu phần trăm?',
 'choices': ['A. 125m^{2}', 'B. 20%', 'C. 25%', 'D. 50%']}

In [25]:
# Run the full pipeline on a single public-test sample with debug printing enabled.
answer(test_samples[3], math_model, debug=True)

Translating question: Một thửa ruộng hình thang có đáy bé dài 8m, đáy lớn dài 12m. Kéo dài đáy lớn thêm 5m thì diện tích thửa ruộng tăng thêm 25m2. Hỏi diện tích thửa ruộng tăng thêm bao nhiêu phần trăm?
Translating choice 0: 125m^{2}
Translating choice 1: 20%
Translating choice 2: 25%
Translating choice 3: 50%
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Answer the following question for me by choosing option A, B, C, or D.

### Input:
Question: A trapezoidal field has a small bottom of 8m long and a big bottom of 12m long. Extending the big bottom by 5m, the area of the field will be increased by 25m2. Ask how much percent of the area of the field will be increased?
Options:
A. 125m^{2}
B. 20%
C. 25%
D. 50%

### Response:
To solve this problem, we need to determine the percentage increase in the area of the trapezoidal field.
Let's denote the height of th

({'id': '01-0209', 'answer': 'C. 25%'},
 "To solve this problem, we need to determine the percentage increase in the area of the trapezoidal field.\nLet's denote the height of the trapezoid as h.\nThe original area of the field is (8 + 12) * h / 2 = 20h/2 = 10h.\nWhen we extend the big bottom by 5m, the new length becomes 12 + 5 = 17m.\nThe new area of the field is (8 + 17) * h / 2 = 25h/2 = 12.5h.\nThe increase in area is 12.5h - 10h = 2.5h.\nTo find the percentage increase, we divide the increase by the original area and multiply by 100: (2.5h / 10h) * 100 = 25%.\nTherefore, the correct answer is option C, which is 25%.\n#### 25#### 25\nThe answer is: 25")

In [None]:
import random
from tqdm import tqdm

# Run the model over every test sample.
#   answer_data  -- one {"id", "answer"} dict per sample (submission rows)
#   logging_data -- a copy of each sample plus the raw "model_output" text
answer_data = []
logging_data = []
for sample in tqdm(test_samples):
    # Print debug output for roughly 10% of the samples.
    debug = random.random() < 0.1
    ans, model_output = answer(sample, math_model, debug=debug)
    if debug:
        print("*" * 50 + "\n\n")
    answer_data.append(ans)

    # Log a copy rather than writing into the sample, so that test_samples is
    # left unchanged and this cell can be re-run safely.
    logging_data.append({**sample, "model_output": model_output})

In [29]:
from pathlib import Path

import pandas as pd

# Write the predictions as a CSV with one row per sample (columns: id, answer).
submission_path = Path("./submissions/baseline1.csv")
# Create the output directory first so the write cannot fail on a missing
# folder after the long inference run.
submission_path.parent.mkdir(parents=True, exist_ok=True)

submission_df = pd.DataFrame(answer_data)
submission_df.to_csv(submission_path, index=False)

In [30]:
from pathlib import Path

# Save every sample together with its raw model output for later inspection.
log_path = Path("./logs/baseline1.json")
# Create the log directory if needed so the dump cannot fail on a missing folder.
log_path.parent.mkdir(parents=True, exist_ok=True)
with open(log_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps the Vietnamese text readable in the file.
    json.dump(logging_data, f, ensure_ascii=False, indent=4)