# Install, Import, and Helper functions

In [None]:
# ! pip install -q torch==2.7.0 ms-swift==3.7.2 transformers==4.51.3 trl pydantic peft torchvision tqdm ipywidgets torchmetrics bitsandbytes accelerate protobuf pandas decord tokenizers sentencepiece pyarrow pydantic_core markdown2[all] numpy scikit-learn requests httpx uvicorn fastapi einops einops-exts timm tiktoken transformers_stream_generator scipy pandas torchaudio xformers pillow deepspeed pysubs2 moviepy==1.0.3 gradio
# ! pip install -q https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.7cxx11abiTRUE-cp310-cp310-linux_x86_64.whl

In [11]:
import pandas as pd
import requests
import numpy as np
from PIL import Image
from io import BytesIO
import torch
from torch.utils.data import Dataset
from torchvision import transforms
import ast
import os
import hashlib
from tqdm import tqdm
import base64
import binascii
import re
import smtplib
import sys
import logging
import time
import random
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.model_selection import train_test_split
import json
warnings.filterwarnings("ignore")

In [12]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = True
#     torch.backends.cudnn.benchmark = False
#     torch.backends.cudnn.deterministic = True
seed_everything(42)

def set_logging(file_path):
    ! rm {file_path}
    nblog = open(file_path, "a+")
    sys.stdout.echo = nblog
    sys.stderr.echo = nblog
    get_ipython().log.handlers[0].stream = nblog
    get_ipython().log.setLevel(logging.INFO)
    %autosave 5
set_logging("logs/whole.log")
    
def send_email(subject, text):
    smtpObj = smtplib.SMTP('smtp.gmail.com', 25)
    smtpObj.ehlo()
    smtpObj.starttls()
    smtpObj.login(address := 'chio4696@gmail.com', 'xnhunyoqaqvflpgw')
    smtpObj.sendmail(address, address, f"Subject: {subject}\n{text}\n")
    smtpObj.quit()

Autosaving every 5 seconds


# Data Preprocessing

In [None]:
def process_row(row_data):
    index, row, save_dir = row_data
    if row['input_type'] != 'image':
        return index, row['input']
    try:
        image_data = row['input']
        filename = hashlib.sha256(str(image_data).encode()).hexdigest() + '.jpg'
        save_path = os.path.join(save_dir, filename)
        image_bytes = None
        if isinstance(image_data, str) and image_data.startswith(('http', 'https')):
            response = requests.get(image_data, stream=True, timeout=10)
            response.raise_for_status()
            image_bytes = response.raw
        elif isinstance(image_data, bytes):
            image_bytes = BytesIO(image_data)
        elif isinstance(image_data, str):
            try:
                decoded_data = base64.b64decode(image_data)
                image_bytes = BytesIO(decoded_data)
            except (binascii.Error, ValueError):
                return index, None
        if image_bytes:
            with Image.open(image_bytes) as img:
                img.convert('RGB').save(save_path, 'jpeg')
            return index, save_path
        else:
            return index, None
    except Exception as e:
        return index, None

def preprocess_with_executor(parquet_path, save_dir):
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        print(f"Created directory: {save_dir}")
    try:
        df = pd.read_parquet(parquet_path)
        print(f"Successfully loaded {parquet_path}")
    except Exception as e:
        print(f"Error loading parquet file: {e}")
        return
    tasks = [(index, row, save_dir) for index, row in df.iterrows()]
    new_input_paths = [None] * len(df)
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_row, task): task for task in tasks}
        for future in tqdm(as_completed(futures), total=len(tasks), desc=f"Processing {os.path.basename(parquet_path)}"):
            index, new_path = future.result()
            if index is not None and index < len(new_input_paths):
                if new_path is None and df.iloc[index]['input_type'] != 'image':
                    new_input_paths[index] = df.iloc[index]['input']
                else:
                    new_input_paths[index] = new_path
    df['input'] = new_input_paths
    output_filename = os.path.splitext(os.path.basename(parquet_path))[0] + '_path_converted.parquet'
    output_path = os.path.join(os.path.dirname(parquet_path), output_filename)
    try:
        df.to_parquet(output_path, index=False)
        print(f"Successfully saved processed data to {output_path}")
    except Exception as e:
        print(f"Error saving processed parquet file: {e}")

In [21]:
train_parquet_file = '../data/raw/deep_chal_multitask_dataset.parquet'
train_image_save_directory = '../data/image/train_images'
preprocess_with_executor(train_parquet_file, train_image_save_directory)

test_parquet_file = '../data/raw/deep_chal_multitask_dataset_test.parquet'
test_image_save_directory = '../data/image/test_images'
preprocess_with_executor(test_parquet_file, test_image_save_directory)

train = pd.read_parquet("../data/raw/deep_chal_multitask_dataset_path_converted.parquet")
train = train[train['input'].notna()]
train.to_parquet("../data/converted/deep_chal_multitask_dataset_path_converted.parquet", index=False)

test = pd.read_parquet("../data/raw/deep_chal_multitask_dataset_test_path_converted.parquet")
test.to_parquet("../data/converted/deep_chal_multitask_dataset_test_path_converted.parquet", index=False)

send_email("Preprocessing Done", "It is.")

In [22]:
train = pd.read_parquet("../data/converted/deep_chal_multitask_dataset_path_converted.parquet")

text_qa_df = train[train["task"] == "text_qa"].copy()
all_qa_pairs = []
for index, row in text_qa_df.iterrows():
    context = row['input']
    answers_dict = ast.literal_eval(row['output'])
    questions_list = ast.literal_eval(row['question'])
    answer_texts = answers_dict.get('input_text', [])
    for question, answer in zip(questions_list, answer_texts):
        all_qa_pairs.append({'input': context, 'question': question, 'output': answer})
processed_qa_df = pd.DataFrame(all_qa_pairs)
pronouns_to_remove = ['it', 'this', 'that', 'these', 'those', 'he', 'she', 'they', 'them', 'him', 'her', 'his', 'its', 'their', 'theirs']
pattern = r'\b(' + '|'.join(pronouns_to_remove) + r')\b'
mask_to_remove = processed_qa_df['question'].str.contains(pattern, case=False, regex=True)
final_filtered_df = processed_qa_df[~mask_to_remove]
allowed_start_words = ('What', 'How', 'Why', 'When', 'Where', 'Who', 'Is', 'Are', 'Am', 'Were', 'Was', 'Which', "List", "Did", "Before", "After", "Name", "Do", 'Can', 'Could', 'From', 'About', 'According', 'At', 'Can', 'Could', 'Define', 'Describe', 'Does', 'During', 'For', 'From', 'Had', 'Has', 'Have', 'In', 'On', 'Under', 'Whom', 'Whose', 'Will')
mask_starts_with = final_filtered_df['question'].str.startswith(allowed_start_words)
mask_long_enough = final_filtered_df['question'].str.split().str.len() >= 6
final_mask = mask_starts_with & mask_long_enough
final_filtered_df = final_filtered_df[final_mask]
other_tasks_df = train[train['task'] != 'text_qa'].copy()
final_filtered_df["input_type"] = "text"
final_filtered_df["task"] = "text_qa"
final_train_df = pd.concat([other_tasks_df, final_filtered_df], ignore_index=True)
train = final_train_df.reindex()
print("Penultimate Train DF: ", len(train))

Penultimate Train DF:  46638


In [23]:
IMAGE_BASE_PATH = '../data/image/train_images/'
SFT_FILE = '../data/dataset/sft_data_stratified.jsonl'
# VAL_FILE = '../data/dataset/val_data_stratified.jsonl'
SPLIT_RATIO = 0.1

df = train.copy()
df['question'] = df['question'].fillna('')
df['input'] = df['input'].fillna('')
print("Original task distribution:")
print(df['task'].value_counts())

df_text_qa = df[df['task'] == 'text_qa']
df_other_tasks = df[df['task'] != 'text_qa']
df_text_qa_undersampled = df_text_qa.groupby('input', group_keys=False).apply(lambda x: x.sample(n=min(len(x), 3), random_state=42))
df = pd.concat([df_other_tasks, df_text_qa_undersampled])
print("\nTask distribution after undersampling 'text_qa':")
print(df['task'].value_counts())

sft_df, val_df = train_test_split(df, test_size=SPLIT_RATIO, random_state=42, stratify=df['task'])

print("\n--- Dataset Split ---")
print(f"Total samples: {len(df)}")
print(f"SFT samples (90%): {len(sft_df)}")

print("\n--- Task Distribution in SFT set ---")
print(sft_df['task'].value_counts(normalize=True))
print("-" * 20)


with open(SFT_FILE, 'w') as f:
    for _, row in sft_df.iterrows():
        task = row['task']
        question = row['question']
        output = row['output']
        record = {}
        if task == 'captioning':
            query = '<image> Generate a single, detailed, and objective descriptive paragraph for the given image. Each description must begin with the phrase "The image is..." or "The image shows...", followed by a structured analysis that moves from the main subject to its details, and then to the background elements. You must use positional language, such as "on the left" or "at the top of the cover" to clearly orient the reader. If any text is visible in the image, transcribe it exactly and describe its visual characteristics like color and style. Conclude the entire description with a sentence that summarizes the overall atmosphere of the image, using a phrase like "The overall mood of the image is...". Throughout the paragraph, maintain a strictly factual, declarative tone with specific, descriptive vocabulary, avoiding any personal opinions or interpretations.'
            image_path = os.path.join(IMAGE_BASE_PATH, row['input'])
            record = {"input": query, "output": output, "images": [image_path]}
        elif task == 'vqa':
            query = f'<image> Given a document image and a question, extract the precise answer. Your response must be only the literal text found in the image, with no extra words or explanation.\n\nQuestion: {question}'
            image_path = os.path.join(IMAGE_BASE_PATH, row['input'])
            record = {"input": query, "output": output, "images": [image_path]}
        elif task == 'summarization':
            prompt = f"Generate a summary of the following legislative text. Start with the bill's official title, then state its primary purpose and key provisions. Use formal, objective language and focus on the actions the bill takes, such as what it amends, requires, prohibits, or establishes.\n\nText: {row['input']}"
            record = {"input": prompt, "output": output}
        elif task == 'text_qa':
            prompt = f"Given a context and a question, extract the most concise, direct answer from the text. Your answer should be a short phrase, not a complete sentence.\n\nContext: {row['input']}\n\nQuestion: {question}"
            record = {"input": prompt, "output": output}
        elif task == 'math_reasoning':
            prompt = f"Given a math word problem, solve the question by generating a step-by-step reasoning process. After detailing all the steps in your reasoning, you must conclude your response by placing the final numerical answer on its own separate line, prefixed with #### .\n\nQuestion: {row['input']}"
            record = {"input": prompt, "output": output}
        f.write(json.dumps(record) + '\n')
print(f"Saved {SFT_FILE}")

# NO VALIDATION FOR FASTER REPRODUCE

Original task distribution:
task
text_qa           15860
summarization     10000
vqa               10000
math_reasoning     7473
captioning         3305
Name: count, dtype: int64

Task distribution after undersampling 'text_qa':
task
text_qa           11229
summarization     10000
vqa               10000
math_reasoning     7473
captioning         3305
Name: count, dtype: int64

--- Dataset Split ---
Total samples: 42007
SFT samples (90%): 37806

--- Task Distribution in SFT set ---
task
text_qa           0.267312
vqa               0.238057
summarization     0.238057
math_reasoning    0.177908
captioning        0.078665
Name: proportion, dtype: float64
--------------------
Saved ../data/dataset/sft_data_stratified.jsonl


In [24]:
test = pd.read_parquet("../data/converted/deep_chal_multitask_dataset_test_path_converted.parquet")
IMAGE_BASE_PATH = '../data/image/test_images/'
TEST_FILE = '../data/dataset/test_data.jsonl'
test_df = test.copy()

with open(TEST_FILE, 'w') as f:
    for _, row in test_df.iterrows():
        task = row['task']
        question = row['question']
        record = {}
        if task == 'captioning':
            query = '<image> Generate a single, detailed, and objective descriptive paragraph for the given image. Each description must begin with the phrase "The image is..." or "The image shows...", followed by a structured analysis that moves from the main subject to its details, and then to the background elements. You must use positional language, such as "on the left" or "at the top of the cover" to clearly orient the reader. If any text is visible in the image, transcribe it exactly and describe its visual characteristics like color and style. Conclude the entire description with a sentence that summarizes the overall atmosphere of the image, using a phrase like "The overall mood of the image is...". Throughout the paragraph, maintain a strictly factual, declarative tone with specific, descriptive vocabulary, avoiding any personal opinions or interpretations.'
            image_path = os.path.join(IMAGE_BASE_PATH, row['input'])
            record = {"input": query, "images": [image_path]}
        elif task == 'vqa':
            query = f'<image> Given a document image and a question, extract the precise answer. Your response must be only the literal text found in the image, with no extra words or explanation.\n\nQuestion: {question}'
            image_path = os.path.join(IMAGE_BASE_PATH, row['input'])
            record = {"input": query, "images": [image_path]}
        elif task == 'summarization':
            prompt = f"Generate a summary of the following legislative text. Start with the bill's official title, then state its primary purpose and key provisions. Use formal, objective language and focus on the actions the bill takes, such as what it amends, requires, prohibits, or establishes.\n\nText: {row['input']}"
            record = {"input": prompt}
        elif task == 'text_qa':
            prompt = f"Given a context and a question, extract the most concise, direct answer from the text. Your answer should be a short phrase, not a complete sentence.\n\nContext: {row['input']}\n\nQuestion: {question}"
            record = {"input": prompt}
        elif task == 'math_reasoning':
            prompt = f"Given a math word problem, solve the question by generating a step-by-step reasoning process. After detailing all the steps in your reasoning, you must conclude your response by placing the final numerical answer on its own separate line, prefixed with #### .\n\nQuestion: {row['input']}"
            record = {"input": prompt}
        f.write(json.dumps(record) + '\n')
print(f"Saved {TEST_FILE}")

TypeError: join() argument must be str, bytes, or os.PathLike object, not 'NoneType'

In [None]:
display(pd.read_parquet("../data/converted/deep_chal_multitask_dataset_path_converted.parquet").groupby("task").head(1))
display(pd.read_parquet("../data/converted/deep_chal_multitask_dataset_test_path_converted.parquet").groupby("task").head(1))

# SFT Experimental Runs w/ CE Loss vs. Dynamic Loss

In [None]:
# MAY SET # OF DEVICE TO 0, 1 AND RUN EACH ON TERMINAL TO UTILIZE MULTIPLE GPUs

In [None]:
! CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model AIDC-AI/Ovis2.5-9B \
    --dataset '../data/dataset/sft_data_stratified.jsonl' \
    --dataloader_num_workers 16 \
    --save_steps 200 \
    --save_total_limit 1 \
    --logging_steps 1 \
    --output_dir 'output_ce_loss' \
    --use_hf true \
    --train_type lora \
    --lora_rank 64 \
    --lora_alpha 128 \
    --init_weights pissa \
    --use_rslora true \
    --target_modules all-linear \
    --freeze_vit true \
    --freeze_llm false \
    --freeze_aligner true \
    --torch_dtype bfloat16 \
    --max_length 8192 \
    --loss_scale ignore_empty_think \
    --attn_impl flash_attn \
    --num_train_epochs 1 \
    --per_device_train_batch_size 8 \
    --learning_rate 2e-5 \
    --gradient_accumulation_steps 2 \
    --padding_free true \
    --warmup_ratio 0.1 \
    --weight_decay 0.

In [None]:
! CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model AIDC-AI/Ovis2.5-9B \
    --dataset '../data/dataset/sft_data_stratified.jsonl' \
    --dataloader_num_workers 16 \
    --save_steps 200 \
    --save_total_limit 1 \
    --logging_steps 1 \
    --output_dir 'output_dynamic_loss' \
    --use_hf true \
    --train_type lora \
    --lora_rank 64 \
    --lora_alpha 128 \
    --init_weights pissa \
    --use_rslora true \
    --target_modules all-linear \
    --freeze_vit true \
    --freeze_llm false \
    --freeze_aligner true \
    --torch_dtype bfloat16 \
    --max_length 8192 \
    --enable_dft_loss true \
    --loss_scale ignore_empty_think \
    --attn_impl flash_attn \
    --num_train_epochs 1 \
    --per_device_train_batch_size 8 \
    --learning_rate 2e-5 \
    --gradient_accumulation_steps 2 \
    --padding_free true \
    --warmup_ratio 0.1 \
    --weight_decay 0.

In [None]:
send_email("SFT Done", "WAKE UP")

# Inference, Read Comments

In [None]:
# **** CHANGE THE ADAPTERS PATH INTO EACH RUN'S OUTPUT/SOME_VERSION/"CHECKPOINT-{}" PATH TO LOAD PROPERLY ****
# **** AFTER SUBMISSION FILE CREATION, SUBMIT AND USE SUBMISSION WITH HIGHER SCORING ON LB ****
# **** PLAY WITH TEMPERATURE, USED TEMP=[0 (for run 1 and 2), 0.3 (for run 1)] ****
# **** CURRENT ADAPTER PATH IS SET TO THE ADAPTER I USED IN THE COMPETITION ****

In [None]:
! CUDA_VISIBLE_DEVICES=0 \
swift infer \
    --adapters 'output/v0-20250828-230749/checkpoint-2733/' \
    --infer_backend pt \
    --temperature 0 \
    --max_new_tokens 4096 \
    --val_dataset '../data/dataset/test_data.jsonl' \
    --use_hf true \
    --max_batch_size 16

In [None]:
# **** BE SURE TO CHANGE THE PREDICTION / OUTPUT PATH FOR SECOND RUN SUBMISSION FILE CREATION ****
# **** THE PREDICTION PATH IS AT THE END OF LAST CELL OUTPUT ****

In [None]:
test = pd.read_parquet("../data/raw/")
pred = pd.read_json("output/v0-20250828-230749/checkpoint-2733/infer_result/20250830-115559.jsonl", lines=True)
test["output"] = pred["response"]
test.index.name = "id"
test["output"].to_csv("../prediction/submission_1.csv")

In [None]:
send_email("Inference Done", "Check LB.")

# Simple ENV out

In [None]:
# ! pip3 freeze > ../requirements.txt