In [2]:
!pip install gdown
!pip install datasets



In [3]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import copy
import gdown

## Loading the test data

In [4]:
from google.colab import drive
drive.mount('/content/drive')
gdown.download('https://drive.google.com/uc?id=1hPNALVA0QQOuAAGx-YSsPXb3BwbJb8j7', 'data_hackaton_test.data', quiet=False)
gdown.download('https://drive.google.com/uc?id=1hPNALVA0QQOuAAGx-YSsPXb3BwbJb8j7', 'data_hackaton_train.data', quiet=False)

test_df = pd.read_pickle('/content/data_hackaton_test.data')
test_df.shape

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Downloading...
From: https://drive.google.com/uc?id=1hPNALVA0QQOuAAGx-YSsPXb3BwbJb8j7
To: /content/data_hackaton_test.data
100%|██████████| 24.5M/24.5M [00:00<00:00, 63.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hPNALVA0QQOuAAGx-YSsPXb3BwbJb8j7
To: /content/data_hackaton_train.data
100%|██████████| 24.5M/24.5M [00:00<00:00, 78.3MB/s]


(100000, 2)

In [5]:
from datasets import Dataset
dataset = Dataset.from_pandas(test_df)
dataset

Dataset({
    features: ['start', 'accepted_pair', '__index_level_0__'],
    num_rows: 100000
})

In [6]:
DICT_SYMBOLS = {
    'A': 1,
    'T': 2,
    'C': 3,
    'G': 4
}

## Calling the Models via Inference Endpoints on HuggingFace

> Since we fine-tuned fairly large models (flan-t5-small and Qwen2.5-0.5B-Instruct) we decided to serve these via HuggingFace! Here are the links and access to their weights.

flan-t5-small: https://huggingface.co/dantedgp/PharmaHacks2025-flan-t5-small

Qwen2.5-O.5B-Instruct: https://huggingface.co/dantedgp/PharmaHacks2025-Qwen2.5-0.5B-Instruct




In [7]:
from huggingface_hub import get_inference_endpoint, list_inference_endpoints

token = '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' # Please don't give this away!

t5_endpoint = get_inference_endpoint("pharmahacks2025-flan-t5-smal-bdh", token=token)
qwen_endpoint = get_inference_endpoint("pharmahacks2025-qwen2-5-0-5b-cdw", token=token)

t5_client = t5_endpoint.client
qwen_client = qwen_endpoint.client

In [8]:
qwen_client.text_generation(
    'Align DNA sequences:\n0: AGGA---\n1: TGTC---\n2: TTTC---\n3: CGAT---\n4: TGCG---\n5: TCTG---',
    max_new_tokens=128,
)

'()\n\n[2, 0], [3, 0], [4, 0], [5, 0], [6, 0], [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [2, 3], [3, 3], [4, 3], [5, 3], [6, 3], [6, 4], [6, 5], [6, 6] [1, 3], ['

## Setting up performance evaluation loop

In [9]:
import re

def generate_steps(start, text_generation):
    input_text = "Align DNA sequences:\n" + "\n".join(f"{i}: {seq}" for i, seq in enumerate(start))
    output_text  = text_generation(input_text, max_new_tokens=128)  # Either qwen or t5

    # Extract only the moves from the output (remove the input prompt)
    s = output_text.split("\n\n")[-1].rstrip(", '[")  # Remove unfinished elements at the end

    # Use regex to extract all tuples
    matches = re.findall(r"\[(\d+),\s*(\d+)\]", s)

    # Convert matches into a list of tuples
    tuple_list = [[int(x), int(y)] for x, y in matches]

    # Output the parsed list
    return tuple_list

## Assessing Qwen performance

In [10]:
import numpy as np
import copy
import matplotlib.pyplot as plt
import seaborn as sns

# Dictionary for visualization (adjust as needed)
DICT_SYMBOLS = {'A': 1, 'T': 2, 'G': 3, 'C': 4, '-': 0}

def build_puzzle_to_end(puzzle):
    """Pad each row with '-' to match the longest row for visualization."""
    max_len = max(len(row) for row in puzzle)
    return [row.ljust(max_len, '-') for row in puzzle]

def gearbox_score(puzzle, accepted_pair, bonus=1.15):
    """Calculate the gearbox score for the puzzle."""
    score = 0
    for col_ind in range(len(puzzle[0])):
        col_bonus = True
        col_tot = 0
        column_chars = [row[col_ind] for row in puzzle]
        for char in column_chars:
            if char == "-":
                col_bonus = False
                continue
            if char in accepted_pair[col_ind]:
                col_tot += 1
            else:
                col_bonus = False
        column_score = col_tot * bonus if col_bonus else col_tot
        score += column_score
    return score

def apply_step_to_puzzle(puzzle, step):
    """Apply a single step to the puzzle."""
    new_puzzle = puzzle.copy()
    row_index = step[0] - 1
    col_index = step[1]
    if row_index < 0 or row_index >= len(new_puzzle):
        return new_puzzle
    row_str = new_puzzle[row_index]
    if col_index < 0 or col_index > len(row_str):
        return new_puzzle
    new_row = row_str[:col_index] + '-' + row_str[col_index:]
    new_row = new_row[:len(row_str)]
    new_puzzle[row_index] = new_row
    return new_puzzle

def apply_all_steps(start, steps, accepted_pair):
    """Apply all steps on a copy of the start puzzle and return the final state and scores."""
    current_puzzle = list(start)
    updated_puzzles = []
    scores = []

    for step in steps:
        current_puzzle = apply_step_to_puzzle(current_puzzle, step)
        padded_current = build_puzzle_to_end(current_puzzle)
        score = gearbox_score(padded_current, accepted_pair)
        updated_puzzles.append(padded_current)
        scores.append(score)

    return updated_puzzles[-1], scores

In [26]:
random_indices = list(range(500, 1000))

solutions, scores = [], []

for i in random_indices:
  try:
    start = dataset[i]['start']
    accepted_pair = dataset[i]['accepted_pair']
    steps = generate_steps(start, qwen_client.text_generation)

    final_state, scores = apply_all_steps(start, steps, accepted_pair)
    best_index = scores.index(max(scores))
    best_steps = steps[:best_index]

    solution, scores = apply_all_steps(start, best_steps, accepted_pair)
    solutions.append(solution)
    scores.append(scores[-1])

    print(f'solution: {solution} | gearbox score: {scores[-1]}')
  except:
    print(f'failed on index {i}')
    solutions.append(None)
    scores.append(None)




solution: ['-TCTT--', '-TAGT--', '-CGGC--', '-AAGT--', '-AAGC--', 'TAGT---'] | gearbox score: 14
failed on index 501
failed on index 502
solution: ['-TAAG--', '-AAGT--', '-T-AAA-', '-TACA--', '-CGCG--', '-T--GTG'] | gearbox score: 12
failed on index 504
solution: ['-CGCTTCA----', '-CGTGTCA----', '-GAGTTCA----', '-TCTTTCA----', '-TGACCCCA---', '-CCGGGCA----', '-AGCGGCA----'] | gearbox score: 19
solution: ['-AGTCG--', '-GGTGC--', 'ATTCT---', 'ACTGT---', 'TCTGT---', 'ATTAT---'] | gearbox score: 18.9
failed on index 507
failed on index 508
solution: ['GTCCC--', 'TTT-C--', 'AG-C---', 'AAG-C--', 'AAG-C--', 'TGC-C--'] | gearbox score: 12
solution: ['-TTGG--', '-TTCA--', 'GTGA---', 'GCGA---', 'TGAA---', 'GGAC---'] | gearbox score: 11
solution: ['G-TT-AA-', 'T-AT-AA-', 'TTT--CC-', 'CCG--AG-', 'CCT--AC-', 'TCA-AA--'] | gearbox score: 24.9
solution: ['-CGTTTTC----', '-GCGTCCA----', '-GGCTTTA----', '-GGCCCTA----', '-GGGTCTA----', '-CTTCCGA----', 'CGTCCTA-----'] | gearbox score: 27.049999999999997


In [27]:
len(scores)

7

In [28]:
len(solutions)

500

In [29]:
old_dataset = new_dataset
old_dataset

Dataset({
    features: ['start', 'accepted_pair', '__index_level_0__', 'solution'],
    num_rows: 500
})

In [35]:
old_solutions = old_dataset['solution']

In [37]:
all_solutions = old_solutions + solutions

In [38]:
len(all_solutions)

1000

In [32]:
old_dataset[0]['solution']

['-GTGGACA----',
 '-CCTTTCC----',
 '-CGCCTCC----',
 '-CGGACCA----',
 '-CCCCCCA----',
 '-TAGGTCA----',
 '-GCGGGCA----',
 '-ACGTTCA----',
 '-ACCGTCA----',
 '-GAGGTCA----',
 '-AGCATCA----',
 '-ATTTTCA----',
 'AACGGCA-----',
 'CGCGGCA-----',
 'GTTGTCG-----',
 'TGTACCA-----',
 'CCCATCA-----',
 'TCTGTCA-----',
 'CCTGTCA-----']

In [15]:
new_dataset = dataset.select(range(500))
new_dataset


Dataset({
    features: ['start', 'accepted_pair', '__index_level_0__'],
    num_rows: 500
})

In [40]:
submission_dataset = dataset.select(range(1000))
submission_dataset

Dataset({
    features: ['start', 'accepted_pair', '__index_level_0__'],
    num_rows: 1000
})

In [41]:



submission_dataset = submission_dataset.map(lambda example, idx: {"solution": all_solutions[idx]}, with_indices=True)


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [42]:

submission_dataset

Dataset({
    features: ['start', 'accepted_pair', '__index_level_0__', 'solution'],
    num_rows: 1000
})

In [43]:
 submission_dataset.to_csv("submission_dataset.csv")


Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

516407