# IBIS + Simulated annealing (3)

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import numpy as np
import random
import time
import math

import numpy as np
import torch as T
from transformers import AutoTokenizer, AutoModelForCausalLM
import gc
from itertools import chain
from torch.nn.utils.rnn import pad_sequence
from math import exp

In [2]:
T.set_grad_enabled(False)

model_name = "/kaggle/input/gemma-2/transformers/gemma-2-9b/2" #"gpt2"
loss_fct = T.nn.CrossEntropyLoss(reduction='none')

DEVICE = T.device('cuda' if T.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype = T.float16 if DEVICE.type == 'cuda' else T.float32,
                device_map='auto')
model.eval()

# words from tokenizer
vocab = tokenizer.get_vocab()
vocab = {vocab[i]:i for i in vocab}
V = len(vocab)

# first symbol should be english letter
unbreakable = np.zeros((V,))
for v in range(V):
    unbreakable[v] = vocab[v][0].lower() in 'abcdefghijklmnopqrstuvwxyz'

print(f'Loaded model {model_name}')

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loaded model /kaggle/input/gemma-2/transformers/gemma-2-9b/2


In [3]:
b = 64 #16 #32 #64 #128
B = 512
max_steps = 1024
patience = 128

def shuffle(s, batch_size=b, start_from_zero=True):

    with T.no_grad():
        sentence = tokenizer(
                    s,
                    return_tensors='pt',
                    add_special_tokens=False,)
        before = tokenizer(tokenizer.bos_token, return_tensors='pt',
                    add_special_tokens=False,)
        after = tokenizer(tokenizer.eos_token, return_tensors='pt',
                    add_special_tokens=False,)
    
        mask = (1-unbreakable[sentence['input_ids']])
        mask[0] = 1

        sentence = {k: v.to(DEVICE) for k, v in sentence.items()}
        before = {k: v.to(DEVICE) for k, v in before.items()}
        after = {k: v.to(DEVICE) for k, v in after.items()}
                                        
        for nch, k in enumerate(ibis(model, DEVICE, before, sentence, after, batch_size, B, max_steps, patience, False, mask, start_from_zero=start_from_zero)):
            if nch==0: 
                starting = k.item()
                print('Original order NLL = ', starting)
            else:
                print(k[0], k[1], k[2], tokenizer.decode(k[3][1:-1], clean_up_tokenization_spaces=False))

In [4]:
def score(model, model_inputs, debug=False):
    with T.no_grad():
        if debug:
            print(model_inputs)
        # Forward pass through the model
        outputs = model(**model_inputs, use_cache=False)
        logits = outputs['logits'] # batch_size, 15, 256000

        lsm = -logits.log_softmax(dim=-1)
        preds = T.zeros_like(lsm)
        preds[:,1:] = lsm[:,:-1]

        shift_logits = logits[..., :-1, :].contiguous() 
        shift_labels = model_inputs['input_ids'][..., 1:].contiguous()  
        
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1)
        )
        loss = loss.view(len(logits), -1)
        sequence_loss = loss.mean(dim=1)
        ppl = T.exp(sequence_loss.float())

        word_scores = preds.gather(2, model_inputs['input_ids'].unsqueeze(2)).squeeze(2)
        sequence_scores = word_scores.sum(dim=1)
        # Return sequence scores, word-level scores, and negative log probabilities
        return ppl.cpu(), word_scores.cpu(), -preds.cpu()


cand_orders = { 3: [[1,3,2,4]], 
                4: [[1,4,3,2,5]], 
                5: [[1,3,2,5,4,6],[1,3,5,2,4,6],[1,3,5,4,2,6],[1,4,2,5,3,6],[1,4,3,5,2,6],[1,5,4,3,2,6],[1,5,2,4,3,6],[1,5,3,2,4,6]] }


def shuffle_proposals(mat, topk, bs, kopt):
    """
    mat: A 2D tensor representing pairwise scores or relationships between sequence elements.
    topk: The number of top candidate proposals to evaluate.
    bs: Batch size; the number of proposals to return.
    kopt: Number of tokens to shuffle. Determines the complexity of reshuffling.
    """
    # Length of the sequence (number of tokens).
    L = mat.shape[0]
    # tensor for kopt number of index permutations
    I = T.zeros((kopt,)+(L,)*(kopt)).long()
    for i in range(kopt):
        I[i] = T.arange(L).view((-1,) + (1,)*(kopt-1-i))
    # A boolean tensor that ensures the selected indices are in strictly increasing order
    # This avoids duplicate or invalid combinations, as each token can only appear once
    mask = (0 < I[0]) 
    for i in range(kopt-1):
        mask *= (I[i] < I[i+1])
    lv = mat.view(-1)
    orders = cand_orders[kopt]
    # randomly selected shuffle order from cand_orders
    o = np.array(orders[np.random.randint(len(orders))])
    # the score for the proposed new order (based on o)
    then = T.zeros((L,)*kopt)
    # The score for the current order of tokens
    now = T.zeros_like(then)
    for i in range(kopt):
        now += lv[ L*I[i] + I[i] ]
        then += lv[ L*I[o[i]-1] + I[o[i+1]-2] ]
    # The score improvement (positive indicates a better proposal).
    A = then - now
    # Invalid combinations (those not passing the mask) are heavily penalized
    A[~mask] = -1001

    # Finds the top topk reshuffling proposals based on score differences (A)
    topv, topi = A.view(-1).topk(min(A.numel(), topk))
    # Randomly selects bs indices from the top topk proposals for batching
    indices = np.random.randint(topi.shape[0],size=(bs,))
    topv = topv[indices]
    topi = topi[indices]
    
    orders = [o] * bs
    # Extracts the indices for each token
    imod = [(topi//L**(kopt-1-i))%L for i in range(kopt)]

    # Stacks the indices into a tensor of shape (bs, kopt) -> The indices for the shuffled proposals
    # Scores of the selected proposals
    # The reshuffling order used for each proposal.
    return T.stack(imod,-1), topv, orders


def ibis(model, device, before, sentence, after, bs, topk, its, patience, warminit=False, gluemask=None, start_from_zero=False):
    sent = sentence
    # bos + sentence + eos tokens
    padded = {
    'input_ids': T.cat([before['input_ids'], sent['input_ids'], after['input_ids']], dim=1),  
    'attention_mask': T.cat([before['attention_mask'], sent['attention_mask'], after['attention_mask']], dim=1)
    }
    # print(padded)
    zz = score(model, padded)
    # Total score for sentence
    # orscore = zz[0][0]
    orscore = zz[0]
    yield orscore

    # Total score for sentence
    # bestscore = zz[0][0] 
    bestscore = zz[0]
    # minus shifts the log probability scores by one position forward
    bestsc = zz[2][0]

    # indexes of the last token
    lfix,rfix,blanks=before['input_ids'].shape[0]-1,after['input_ids'].shape[0]-1,0
    # create bs number of lists with tokens: bos + sentence + eos
    permsents = [T.cat([before['input_ids'], sent['input_ids'], after['input_ids']], dim=1).cpu().squeeze() for _ in range(bs) ]
    # True for each token in bos + sentence + eos
    bestmask = np.full(permsents[0].shape, True)
    # put custom gluemask instead of bestmask for sentence tokens (not include eos and bos, they always True)
    if gluemask is not None: bestmask[lfix+1:-rfix-1] = gluemask
    # create bs number of bestmask
    permmasks = [ bestmask.copy() for _ in range(bs) ]

    if start_from_zero:
        first_idx = 0
    else:
        first_idx = 1
    # if warminit=False
    if not warminit:
        # all True tokens + last token in sentence
        seg = list(np.nonzero(bestmask[lfix+1:-rfix-1])[0]) + [ len(sent['input_ids'][0]) ]
        for b in range(first_idx, bs):
            # permutate seg -> tokens that we agreed to permutate is True
            perm = np.random.permutation(len(seg)-1)
            # permuted sentence segments
            ns = []
            # permuted mask segments
            nm = []
            for i in range(len(seg)-1):
                # Extracts the tokens corresponding to the ith randomly selected segment
                ns.append(sent['input_ids'][0].cpu()[seg[perm[i]]:seg[perm[i]+1]])
                # Extracts the mask for the same segment
                nm.append(bestmask[lfix+1:-rfix-1][seg[perm[i]]:seg[perm[i]+1]])
            # Updates the bth sentence and mask in the batch
            # Concatenates the permuted sentence segments (ns) into a single tensor.
            permsents[b][lfix+1:-rfix-1] = T.cat( ns, 0 )
            permmasks[b][lfix+1:-rfix-1] = np.concatenate( nm, 0 )
    # lists with permuted tokens, len = bs 
    padded = T.stack(permsents,0).to(device)
    bestsent = np.zeros(padded[0].shape)
    bestscore = 100000000000000000 
    movetype = 'init'
    # Counts the number of improvements made during the process
    nch = 0
    # Array to hold indices of tokens considered for modification
    candidates = np.array([1]*bs)
    # Keeps track of the iteration at which the last improvement occurred
    last_imp = 0

    # repeat for max_steps
    for it in range(its):  
        padded_batch = pad_sequence(padded, batch_first=True, padding_value=0)
        attention_mask = T.zeros_like(padded_batch)
        attention_mask[padded_batch != 0] = 1
    
        model_inputs = {
            'input_ids': padded_batch,
            'attention_mask': attention_mask
        }
        gc.collect()  
        # if current step minus last step with inprovement score is more than patience, break
        if it - last_imp > patience:
            print('patience limit')
            break

        sc, wsc, spr = score(model, model_inputs)
        # At the first iteration 0, the word scores (bestwsc) are saved for reference
        if it == 0: 
            bestwsc = wsc[0] 
        # list of scores for each token
        sc = sc.numpy()
        # sc = np.array([t.item() for t in sc], dtype=np.float16)
        # if score for any sequence of tokens lower than best score before
        if sc.min() < bestscore:
            # update best
            if it == 0 or np.any(permsents[sc.argmin()] != bestsent):
                nch += 1 
                # The sequence corresponding to the lowest score in the batch
                bestsent = permsents[sc.argmin()]
                bestscore = sc.min()
                bestsc = spr[sc.argmin()]
                bestwsc = wsc[sc.argmin()]
                bestmask = permmasks[sc.argmin()]
    
                if type(bestsent)==T.Tensor: 
                    bestsent = bestsent.numpy()
                
                last_imp = it
                # step, 'init', ...
                yield (it, movetype, bestscore, bestsent, bestmask)

        thespr = bestsc
        kopt = np.random.randint(3,6)
        # Probabilities used to select candidate tokens for modification
        cutprobs = np.ones_like(bestwsc)
        # Tokens outside the mask (bestmask == False) are not considered
        cutprobs[~bestmask] = 0.
        # Boundary tokens (e.g., bos and eos) have higher probabilities to ensure stability
        cutprobs[lfix] = 100
        cutprobs[-1-rfix] = 100

        # Global search for tokens to modify
        # if the sequence length exceeds 6 tokens
        if it%2 == 0 and len(bestsent)-lfix-rfix > 6:
            # Number of candidates to select
            ncand = bestmask[lfix:len(bestsent)-rfix].sum()
            # limit number of candidates to select to 40 or 20
            if kopt == 4: ncand = min(40,ncand)
            if kopt == 5: ncand = min(20,ncand)  
            l,r = lfix, len(bestsent)-rfix
            # Selects ncand indices from the range [lfix, len(bestsent)-rfix], probabilities are normalized from cutprobs
            candidates = np.random.choice(np.arange(l,r), replace=False, p=cutprobs[l:r]/cutprobs[l:r].sum(), size=(ncand,))
            candidates.sort()
            movetype=f'GS {kopt}'
        else: 
            # Local search focuses on a small portion of the sequence.
            # Performed on odd iterations (it%2 != 0) or when the sequence length <= 6
            # Randomly determines the length of the local window (7-15 tokens)
            ropt = np.random.randint(7,15)
            try:
                start = np.random.randint(lfix+1, len(bestsent)-ropt-rfix)
                l,r = start,start+ropt
                candidates = np.random.choice(np.arange(l,r), replace=False, p=cutprobs[l:r]/cutprobs[l:r].sum(), size=(min(ropt,(cutprobs[l:r]>0).sum()),))
            except:
                ropt = min(15,len(bestsent)-lfix-rfix-2)
                start = np.random.randint(lfix+1,max(lfix+2,len(bestsent)-ropt-rfix))
                l,r = start,start+ropt
                candidates = np.random.choice(np.arange(l,r), replace=False, p=cutprobs[l:r]/cutprobs[l:r].sum(), size=(min(ropt,(cutprobs[l:r]>0).sum()),))
            candidates.sort()
            movetype=f'LS {kopt}'

        # Calculates the pairwise scores between candidate tokens using the shifted probabilities.
        # thespr: The shifted probabilities (log probabilities shifted by one position) for the sequence. It is used to calculate pairwise scores between candidates.
        # bestsent[candidates]: Extracts the indices of tokens in bestsent that are selected as candidates.
        links = thespr[:,bestsent[candidates]][candidates]
        permsents = []
        permmasks = []
        # i: Indices of tokens for each proposal.
        # v: Scores for the proposals.
        # o: The shuffle orders applied to the candidates.
        i,v,o = shuffle_proposals(links, topk, bs, kopt)
        
        for j in range(bs):
            # bos + shuffled sequense + eos
            inds = [candidates[0]] + list(candidates[i[j]]) + [candidates[-1]]
            # Only process proposals with valid scores
            if v[j] > -1000:
                # Start with the portion of the sequence before the first candidate
                pieces = [bestsent[:inds[0]]]
                maskpieces = [bestmask[:inds[0]]]
                for k in range(kopt+1):
                    # Append shuffled segments
                    pieces.append(bestsent[inds[o[j][k]-1]:inds[o[j][k]]])
                    maskpieces.append(bestmask[inds[o[j][k]-1]:inds[o[j][k]]])
                # Add the portion of the sequence after the last candidate
                pieces.append(bestsent[inds[-1]:])
                newsent = np.concatenate(pieces,0)
                maskpieces.append(bestmask[inds[-1]:])
                newmask = np.concatenate(maskpieces,0)
            # If the proposal is invalid, keep the original sequence and mask
            else: newsent, newmask = bestsent, bestmask
                
            permsents.append(newsent)
            permmasks.append(newmask)

        padded = T.stack(list(map(T.from_numpy,permsents)),0).to(device)

In [5]:
df = pd.read_csv('/kaggle/input/ibis-sub/submission_ibis.csv')

In [7]:
shuffle(df.loc[0, 'text'])

Original order NLL =  498.1680603027344
0 init 69455.22  mist ornamentindeerre advent scletoe family chimneyroo fireplace elfge gingerbread
1 GS 3 32553.004  mist ornamentindeerge gingerbreadre advent scletoe family chimneyroo fireplace elf
3 GS 4 26987.389  mist ornament advent scletoe familyreindeerge gingerbread chimneyroo fireplace elf
5 GS 4 18118.477  mist ornament advent scletoe familyreindeer fireplace elf gingerbread chimneyrooge
7 GS 5 9774.185  mist ornament chimney adventletoe familyreindeer fireplace elf gingerbread scrooge
8 LS 4 7435.8076  mistletoe familyreindeer fireplace advent ornament chimney elf gingerbread scrooge
9 GS 4 2520.043 reindeer family mistletoe fireplace advent ornament chimney elf gingerbread scrooge
10 LS 5 1932.184 reindeer family mistletoe ornament fireplace chimney elf advent gingerbread scrooge
11 GS 5 1333.1669 reindeer family mistletoe gingerbread scrooge chimney ornament fireplace elf advent
12 LS 5 1317.635 reindeer scrooge chimney family ging

In [15]:
shuffle(df.loc[1, 'text'])

Original order NLL =  549.2720336914062
0 init 36032.89 indeer bake gingerbread fireplace nightletoe there family jump givege and sc elf mist ornament walk drive sleep advent laugh chimneyroo
1 GS 5 17020.734 indeer family jump give elf mist ornament walk drive sleep advent laugh chimney bake gingerbread fireplace nightletoe therege and scroo
2 LS 3 15497.555 indeer family jump give elf mist ornament walk drive sleep laugh chimney bake advent gingerbread fireplace nightletoe therege and scroo
3 GS 3 8491.959 indeer ornament walk drive sleep laugh chimney bake advent gingerbread fireplace night family jump give elf mistletoe therege and scroo
4 LS 5 7494.128 indeer ornament walk drive sleep laugh chimney bake advent gingerbread fireplace night mistletoe elf give family jump therege and scroo
5 GS 3 5314.1323 indeer ornament family jump walk drive sleep laugh chimney bake advent gingerbread fireplace night mistletoe elf give therege and scroo
6 LS 4 5031.3193 indeer ornament family jump 

In [22]:
shuffle(df.loc[2, 'text'])

Original order NLL =  308.1136779785156
0 init 34924.277  sleigh grinch decorations holly jingletidecracker giftsulema holiday naughty nut nice stocking y beard workshop polar chimneygi ornament cheer carol
1 GS 4 11516.805  sleigh grinch decorations holly jingle holiday naughty nut nice stocking yulematidecracker gifts beard workshop polar chimneygi ornament cheer carol
3 GS 3 4992.1655  sleigh grinch decorations holly jingle holiday naughty nutcracker gifts nice stocking yulematide beard workshop polar chimneygi ornament cheer carol
4 LS 3 4510.0454  sleigh grinch decorations holly jingle holiday naughty gifts nutcracker nice stocking yulematide beard workshop polar chimneygi ornament cheer carol
5 GS 5 2432.987  sleigh grinch decorations holly jingle holiday naughty stocking yulemagi ornament cheer caroltide beard workshop polar chimney gifts nutcracker nice
7 GS 5 1836.5149  grinch holly jingle holiday naughty stocking yulemagi ornament cheer caroltide beard workshop polar chimney 

In [6]:
shuffle(df.loc[3, 'text'], batch_size=32)

Original order NLL =  267.69378662109375
0 init 27412.379  carol the yuleinch holly un gifts ofor visit jingle nut sing stocking holiday and magi workshop sleigh eat nicewrap beard decorations grtide relax polar cheer naughty cheer isnamentcracker chimney
1 GS 5 17698.762  carol the yuleinch holly un gifts ofornamentcracker chimney beard visit jingle nut sing stocking holiday and magi workshop sleigh eat nicewrap decorations grtide relax polar cheer naughty cheer is
3 GS 4 8166.6377  carol the yuletide gifts ofornamentcracker chimney beard visit jingle nut sing stocking holiday and magi workshop sleigh eat nicewrap decorations grinch holly un relax polar cheer naughty cheer is
4 LS 3 7552.905  carol the yuletide gifts ofornamentcracker chimney beard visit jingle nut sing stocking holiday and magi workshop sleigh eatwrap decorations grinch holly un relax nice polar cheer naughty cheer is
5 GS 3 4953.316 ornamentcracker chimney beard visit jingle nut sing carol the yuletide gifts of stoc

In [15]:
shuffle(df.loc[4, 'text'], batch_size=32)

Original order NLL =  126.44959259033203
0 init 14903.853  toygleeg peppermint not peace chocolate bowhohocake angel milk cookie ho candle dream to papertia game believe that puzzle hope joy from candy nightgn the with poin season wonder of have doll merry kag snow wish fruit card fireplace youog and it we wrapping greeting starset wreath as in workshopglobe
1 GS 4 9399.742  toygleeg peppermint not peace chocolate bowhohocake angel milk cookie ho candle dream to papertia game believe that puzzle hope joy from candy nightgnog and it we wrapping greeting starset wreath as in workshop wish fruit card fireplace you the with poin season wonder of have doll merry kag snowglobe
2 LS 5 9039.644  toygleeg peppermint not peace chocolate bowhohocake angel milk cookie ho candle dream to papertia game believe that night from candy hope joy puzzlegnog and it we wrapping greeting starset wreath as in workshop wish fruit card fireplace you the with poin season wonder of have doll merry kag snowglobe
3

In [16]:
shuffle(df.loc[5, 'text'], batch_size=16)

Original order NLL =  50.89107131958008
0 init 7915.3765  naughty give it relax merry stocking candy nice toy advent sleep milktide ofgle gr to jingleinset ho snow wreath bake polar familyhoho eg fireplaceroo sleigh wrapping candle and visit y peppermint hope workshop cheer withletoe night card kag un and eat nutule season giftscake star joyinchog wish puzzle fruit game drive carol you night the decorations ornament have of chimney is in we elf the peacegn beard holly paperglobe cheer greeting believe magi mist walk laugh bow fireplace as doll sc not wonder gingerbread reindeer workshop cookietia angel jump dream ornament singpo fromcracker chimney chocolate thege and holidaywrap that
2 LS 3 7792.6597  naughty give it relax merry stocking candy nice toy advent sleep milktide ofgle gr to jingleinset ho snow wreath bake polar familyhoho eg fireplaceroo sleigh wrapping candle and visit y peppermint hope workshop cheer withletoe night card kag un and eat nutule season giftscake star joyinc

In [18]:
row_5 = 'naughty of and from the of and un and eat the have the that is as in with it we to not you magi wonder nice give candy toy star believe elf night beard hohohopoinsettia holly mistletoe snowglobe polar nightwrap sleep walk sing laugh jump relax dream wish hope joy peace gifts family merry yuletide season greeting scrooge grinch jingle carol sleigh drive reindeer workshop workshop visit bake cookie peppermint chocolate milk eggnog fruitcake wrapping paper bow doll puzzle game card ornament ornament decorations candle advent wreath angel nutcracker gingerbread chimney chimney stocking fireplace fireplace kaggle holiday cheer cheer'
shuffle(row_5, batch_size=16, start_from_zero=False)


Original order NLL =  111.15632629394531
0 init 111.59138 naughty of and from the of and un and eat the have the that is as in with it we to not you magi wonder nice give candy toy star believe elf night beard hohohopoinsettia holly mistletoe snowglobe polar nightwrap sleep walk sing laugh jump relax dream wish hope joy peace gifts family merry yuletide season greeting scrooge grinch jingle carol sleigh drive reindeer workshop workshop visit bake cookie peppermint chocolate milk eggnog fruitcake wrapping paper bow doll puzzle game card ornament ornament decorations candle advent wreath angel nutcracker gingerbread chimney chimney stocking fireplace fireplace kaggle holiday cheer cheer
13 GS 3 110.72296 naughty of and from the of and the have un and eat the that is as in with it we to not you magi wonder nice give candy toy star believe elf night beard hohohopoinsettia holly mistletoe snowglobe polar nightwrap sleep walk sing laugh jump relax dream wish hope joy peace gifts family merry

In [None]:
T.cuda.empty_cache()

In [19]:
df.loc[0, 'text'] = 'reindeer mistletoe elf gingerbread family advent scrooge chimney fireplace ornament'
df.loc[1, 'text'] = 'reindeer mistletoe elf scrooge gingerbread chimney fireplace ornament advent night sleep drive walk jump bake the family laugh and give'
df.loc[2, 'text'] = 'magi yuletide cheer grinch carol holiday holly jingle naughty nice nutcracker polar sleigh workshop beard chimney decorations ornament gifts stocking'
df.loc[3, 'text'] = 'ornament yuletide cheer cheer is the holiday of gifts unwrap and eat relax sing carol decorations holly jingle sleigh workshop chimney stocking nutcracker magi visit naughty nice grinch polar beard'
df.loc[4, 'text'] = 'eggnog milk chocolate peppermint candy fruitcake cookie snowglobe toy doll game puzzle wrapping paper bow candle fireplace poinsettia star angel wreath hohoho peace the night as it we believe in to dream with hope that have not wish you merry season of joy and wonder greeting card from kaggle workshop'
df.loc[5, 'text'] = 'poinsettia yuletide eggnog milk chocolate peppermint candy fruitcake mistletoe holly wreath gingerbread cookie stocking hohoho laugh cheer jump sing bake walk drive visit eat sleep relax unwrap give to and from the and the and the of of as in is you that it we with have not family holiday season decorations gifts greeting card wrapping paper bow toy doll game puzzle ornament ornament nutcracker scrooge grinch snowglobe sleigh reindeer polar beard elf workshop workshop naughty nice chimney fireplace night chimney fireplace night wish dream hope believe wonder magi star angel advent candle carol joy peace cheer merry jingle kaggle'

In [20]:
from metric import PerplexityCalculator
scorer = PerplexityCalculator('/kaggle/input/gemma-2/transformers/gemma-2-9b/2')

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [24]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import numpy as np
import random
import time
import math


temp_start = 6.0 #10.0    #how high a temperature we start with (prior 10)
temp_end = 0.3 #0.5       #final temperature (prior 0.2)
cooling_rate = 0.98 #0.95  #how quick we cool each time we drop temp (prior 0.95)
steps_per_temp = 4 #5  #steps at each temperature (prior 20)    <---- Increase this for a longer run (20 steps is about 3 hours)

def simulated_annealing_optimize(text: str, temp_start=temp_start, temp_end=temp_end, cooling_rate=cooling_rate, steps_per_temp=steps_per_temp, verbose=False):
    """Optimize word sequence using simulated annealing, handling NaN scores by randomizing.

    Args:
       text: Input string of space-separated words to optimize
       temp_start: Starting temperature - higher means more random exploration
       temp_end: Ending temperature - lower means more selective at end
       cooling_rate: How fast temperature decreases each step
       steps_per_temp: How many swaps to try at each temperature
       verbose: Whether to print detailed progress
    """
    
    words = text.split()

    current = words.copy()
    current_score = scorer.get_perplexity(' '.join(current))

    # Handling any NaNs...
    if math.isnan(current_score):
        # Keep shuffling until we find a valid sequence
        while True:
            current = words.copy()
            random.shuffle(current)
            current_score = scorer.get_perplexity(' '.join(current))
            if not math.isnan(current_score):
                break
            
    best = current.copy()
    best_score = current_score
    temp = temp_start
    print(f"Start Temperature: {temp:.2f}, Initial score: {current_score:.2f}")
    
    # Main annealing loop - keep trying until we've cooled down enough
    while temp > temp_end:
        for _ in range(steps_per_temp):  # Do multiple attempts at each temperature
            # Try improving sequence by swapping random pairs of words
            i, j = random.sample(range(len(words)), 2)
            neighbor = current.copy()
            neighbor[i], neighbor[j] = neighbor[j], neighbor[i]
            
            # Get score for this arrangement, skip if invalid
            neighbor_score = scorer.get_perplexity(' '.join(neighbor))
            if math.isnan(neighbor_score):
                continue
            
            # Accept better scores, sometimes accept worse ones based on temperature
            delta = neighbor_score - current_score
            if delta < 0 or random.random() < math.exp(-delta / temp):
                current = neighbor
                current_score = neighbor_score
                
                if current_score < best_score:
                    best = current.copy()
                    best_score = current_score
                    print(">", end="")
                else: print("<", end="")
            else:print("-", end="")

        
        # Reduce temperature according to cooling schedule (AFTER all steps at this temperature)
        temp *= cooling_rate
        if verbose: print(f"\nTemperature: {temp:.2f}, Current score: {current_score:.2f}")
    
    print(f"\nFinal score: {best_score:.2f}, {best}")
    
    return ' '.join(best), best_score

In [27]:
submission = pd.DataFrame(columns=['id', 'text'])
scores = []

# Process each sample
for idx, row in df.iterrows():
    if idx == 0:
        score = 469
        optimized = row.text
    else:
       print(f"\nProcessing sample {idx}...")
       optimized, score = simulated_annealing_optimize(row.text)
    scores.append(score)
   
    # Add to submission dataframe
    submission.loc[idx] = {
        'id': row.id,
        'text': optimized
    }
    print("-" * 50)

# Print summary statistics
print("\nScore Summary:")
print(f"Submission mean score: {np.mean(scores):.2f}")

# Save to CSV
submission.to_csv("submission_simulated_annealing_ibis_2.csv", index=False)
print("\nSubmission file created!")

submission

--------------------------------------------------

Processing sample 1...
Start Temperature: 6.00, Initial score: 511.98
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Final score: 511.98, ['reindeer', 'mistletoe', 'elf', 'scrooge', 'gingerbread', 'chimney', 'fireplace', 'ornament', 'advent', 'night', 'sleep', 'drive', 'walk', 'jump', 'bake', 'the', 'family', 'laugh', 'and', 'give']
--------------------------------------------------

Processing 

Unnamed: 0,id,text
0,0,reindeer mistletoe elf gingerbread family adve...
1,1,reindeer mistletoe elf scrooge gingerbread chi...
2,2,magi yuletide cheer grinch carol holiday holly...
3,3,ornament yuletide cheer cheer is the holiday o...
4,4,eggnog milk chocolate peppermint candy fruitca...
5,5,poinsettia yuletide eggnog milk chocolate pepp...
