### Load libraries

In [1]:
import torch
from llm2vec import LLM2Vec

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# standard libr
import pandas as pd
import numpy as np
import shutil
import os
from tqdm import tqdm
import glob
import pickle

### Load data

Dataset available at: https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md

For comparison we keep:

(1) **Abt-Buy**: Product data from Abt.com and Buy.com. The core
attribute is description, which is a long text blob describing the
product. We use no informative attribute (e.g. the title), but only
the noisy description attribute.

(2) **iTunes-Amazon** (Dirty): music data from iTunes and Amazon. The data has been
modified to simulate dirty data as done by.

(3) **DBLP-ACM** (Dirty): Bibliographic data from DBLP and ACM.
The data has been modified to simulate dirty data.

(4) **DBLP-Scholar** (Dirty): Bibliographic data from DBLP and Google Scholar.
The data has been modified to simulate dirty data.

(5) **Walmart-Amazon** (Dirty): Product data from Walmart and Amazon. The
data has been modified to simulate dirty data.

For making dirty data they suggest for each attribute other than "title" to randomly move each value to the attribute "title" in the same tuple with a probability of $p = 0.5$.


To evaluate our transformer architectures, we split all five
datasets into into three parts with a ratio of 3:1:1. We use the 60%
split of the data for training, and the two 20% splits for validation
and test. All reported numbers in this paper show results on the
test split.


In [3]:
### folder path in google drive where the data are located
data_dir_path = "data"
zipped_data_names = glob.glob('data/*.zip')
zipped_data_names

['data\\abt_buy_exp_data.zip',
 'data\\dirty_dblp_acm_exp_data.zip',
 'data\\dirty_dblp_scholar_exp_data.zip',
 'data\\dirty_itunes_amazon_exp_data.zip',
 'data\\dirty_walmart_amazon_exp_data.zip']

In [4]:
### unzip dataset
unzip_data_dir_path = "data/datasets"

### unzip dataset
#for dir_path_dataset in tqdm(zipped_data_names):
#  shutil.unpack_archive(dir_path_dataset, os.path.join(unzip_data_dir_path, dir_path_dataset.split('\\')[1].split('.')[0]))

In [5]:
### final dataset_path for each dataset
datasets_path = [os.path.join(dataset_name, 'exp_data') for dataset_name in os.listdir(unzip_data_dir_path)]
datasets_path

['abt_buy_exp_data\\exp_data',
 'dirty_dblp_acm_exp_data\\exp_data',
 'dirty_dblp_scholar_exp_data\\exp_data',
 'dirty_itunes_amazon_exp_data\\exp_data',
 'dirty_walmart_amazon_exp_data\\exp_data']

In [6]:
## example
tableA = pd.read_csv(os.path.join(unzip_data_dir_path, datasets_path[0], 'tableA.csv'))
tableB = pd.read_csv(os.path.join(unzip_data_dir_path, datasets_path[0], 'tableB.csv'))
test = pd.read_csv(os.path.join(unzip_data_dir_path, datasets_path[0], 'test.csv'))

In [7]:
test.head(1)

Unnamed: 0,ltable_id,rtable_id,label
0,445,910,0


In [8]:
tableA[tableA.id == 445]

Unnamed: 0,id,name,description,price
445,445,sony pink cyber-shot 7.2 megapixel digital cam...,sony pink cyber-shot 7.2 megapixel digital cam...,


In [9]:
tableA[tableA.id == 910]

Unnamed: 0,id,name,description,price
910,910,tivo hd xl black digital video recorder tcd658000,tivo hd xl black digital video recorder tcd658...,599.0


### Prepare dataset

In [10]:
def prepare_dataset(table, with_labels = True):
  '''This function take as input a table and add a columns with:
      - if schema agnostic: all attribute concatenated
      - otherwise: consider the values of one or two specific attributes per
                   dataset
  '''
  new_table = table.copy()
  if not with_labels:
    # we drop first attribute since is row_id
    new_table.loc[:, 'attribute'] = new_table.apply(lambda sample: ' '.join(str(x) for x in sample.dropna()[1:]), axis = 1)
  else:
    new_table.loc[:, 'attribute'] = new_table.apply(lambda sample: ' '.join(f'<{k}> {v} </{k}>' for k, v in sample.dropna()[1:].to_dict().items()),  axis = 1)
  return new_table

In [11]:
### example
filename = os.path.join(unzip_data_dir_path, datasets_path[0], 'tableA.csv')
table = pd.read_csv(filename)
table.head()

Unnamed: 0,id,name,description,price
0,0,sony turntable pslx350h,sony turntable pslx350h belt drive system 33-1...,
1,1,bose acoustimass 5 series iii speaker system a...,bose acoustimass 5 series iii speaker system a...,399.0
2,2,sony switcher sbv40s,sony switcher sbv40s eliminates disconnecting ...,49.0
3,3,sony 5 disc cd player cdpce375,sony 5 disc cd player - cdpce375 5 disc change...,
4,4,bose 27028 161 bookshelf pair speakers in whit...,bose 161 bookshelf speakers in white 161wh art...,158.0


In [12]:
prepare_dataset(table, with_labels = False).head()

Unnamed: 0,id,name,description,price,attribute
0,0,sony turntable pslx350h,sony turntable pslx350h belt drive system 33-1...,,sony turntable pslx350h sony turntable pslx350...
1,1,bose acoustimass 5 series iii speaker system a...,bose acoustimass 5 series iii speaker system a...,399.0,bose acoustimass 5 series iii speaker system a...
2,2,sony switcher sbv40s,sony switcher sbv40s eliminates disconnecting ...,49.0,sony switcher sbv40s sony switcher sbv40s elim...
3,3,sony 5 disc cd player cdpce375,sony 5 disc cd player - cdpce375 5 disc change...,,sony 5 disc cd player cdpce375 sony 5 disc cd ...
4,4,bose 27028 161 bookshelf pair speakers in whit...,bose 161 bookshelf speakers in white 161wh art...,158.0,bose 27028 161 bookshelf pair speakers in whit...


In [13]:
prepare_dataset(table, with_labels = True).head()

Unnamed: 0,id,name,description,price,attribute
0,0,sony turntable pslx350h,sony turntable pslx350h belt drive system 33-1...,,<name> sony turntable pslx350h </name> <descri...
1,1,bose acoustimass 5 series iii speaker system a...,bose acoustimass 5 series iii speaker system a...,399.0,<name> bose acoustimass 5 series iii speaker s...
2,2,sony switcher sbv40s,sony switcher sbv40s eliminates disconnecting ...,49.0,<name> sony switcher sbv40s </name> <descripti...
3,3,sony 5 disc cd player cdpce375,sony 5 disc cd player - cdpce375 5 disc change...,,<name> sony 5 disc cd player cdpce375 </name> ...
4,4,bose 27028 161 bookshelf pair speakers in whit...,bose 161 bookshelf speakers in white 161wh art...,158.0,<name> bose 27028 161 bookshelf pair speakers ...


In [14]:
def describe_dataset(dataset_path):
    '''Function for extract statistics from the dataset'''
    ## load dataset
    tableA = pd.read_csv(os.path.join(dataset_path, 'tableA.csv'))
    tableB = pd.read_csv(os.path.join(dataset_path, 'tableB.csv'))
    test = pd.read_csv(os.path.join(dataset_path, 'test.csv'))

    assert tableA.shape[1] == tableB.shape[1]

    ## save statistics 
    output = {
        'A_size': tableA.shape[0],
        'B_size': tableB.shape[0],
        'testing_pairs': test.shape[0],
        'pos_match': test[test.label==1].shape[0],
        'neg_match': test[test.label==0].shape[0],
        'attributes': len(tableA.columns[1:]),
        'name': dataset_path.split('/')[1].split('\\')[1].split('_exp')[0]
    }
    return output

In [15]:
all_datasets_stats = []
for dataset_path in datasets_path:
    out = describe_dataset(os.path.join(unzip_data_dir_path, dataset_path))
    all_datasets_stats.append(out)

In [16]:
all_datasets_stats = pd.DataFrame(all_datasets_stats)
all_datasets_stats = all_datasets_stats.set_index('name', drop = True)

In [20]:
all_datasets_stats

Unnamed: 0_level_0,A_size,B_size,testing_pairs,pos_match,neg_match,attributes
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
abt_buy,1081,1092,1916,206,1710,3
dirty_dblp_acm,2616,2294,2473,444,2029,4
dirty_dblp_scholar,2616,64263,5742,1070,4672,4
dirty_itunes_amazon,6907,55923,109,27,82,8
dirty_walmart_amazon,2554,22074,2049,193,1856,5


In [18]:
all_datasets_stats.to_pickle('dataset_stats_table.pickle')

In [19]:
for dataset_path in datasets_path:
    train = pd.read_csv(os.path.join(unzip_data_dir_path, dataset_path, 'train.csv'))
    valid = pd.read_csv(os.path.join(unzip_data_dir_path, dataset_path, 'valid.csv'))
    print(train.shape, valid.shape)

(5743, 3) (1916, 3)
(7417, 3) (2473, 3)
(17223, 3) (5742, 3)
(321, 3) (109, 3)
(6144, 3) (2049, 3)


### Subsetting testing records

In [20]:
### in order to speed up the compution, I will compute the embedding only of the id I will test
def load_all_data(dataset_path):
    '''Function for load the dataset'''
    tableA = pd.read_csv(os.path.join(dataset_path, 'tableA.csv'))
    tableB = pd.read_csv(os.path.join(dataset_path, 'tableB.csv'))
    test = pd.read_csv(os.path.join(dataset_path, 'test.csv'))

    return tableA, tableB, test

In [21]:
def retrieve_testing_sample_only(dataset_path):
    '''Function for retrieve only the rows in the tables that will be tested'''
    tableA, tableB, test = load_all_data(dataset_path)

    ## retrieve the testing rows
    tableA_testingId = test.ltable_id.to_list()
    tableB_testingId = test.rtable_id.to_list()
    ## subsetting the original dataframe
    testing_tableA = tableA[tableA.id.isin(tableA_testingId)].reset_index(drop=True)
    testing_tableB = tableB[tableB.id.isin(tableB_testingId)].reset_index(drop=True)

    return testing_tableA, testing_tableB

In [28]:
for dataset_path in datasets_path:
    file = os.path.join(unzip_data_dir_path, dataset_path)
    testing_tableA, testing_tableB = retrieve_testing_sample_only(file)
    ## save
    ds_name = dataset_path.split('\\')[0]
    testing_tableA.to_pickle(f'{unzip_data_dir_path}/{ds_name}_tableA_testrecord.pickle')
    testing_tableB.to_pickle(f'{unzip_data_dir_path}/{ds_name}_tableB_testrecord.pickle')

## Model

In [20]:
def load_model(model_name, login_token):
  '''Function for load a model base on model_name. Return the text encoder LLM.
  By default the LLM2Vec model uses the mean pooling strategy.
  You can change the pooling strategy by passing the pooling_mode argument to the from_pretrained method.
  Similarly, you can change the maximum sequence length by passing the max_length argument (default is 512).'''

  base_name, fine_tuned_name = model_name
  # Loading base MNTP model, along with custom code that enables bidirectional connections in decoder-only LLMs
  l2v = LLM2Vec.from_pretrained(
      base_name,
      peft_model_name_or_path=fine_tuned_name,
      device_map="cuda" if torch.cuda.is_available() else "cpu",
      torch_dtype=torch.bfloat16,
      token=login_token
  )
  return l2v

### Supervised Matching

Following: https://dl.acm.org/doi/abs/10.14778/3598581.3598594

This is considered a **binary classification
task**, classifying each candidate pair as match or non-match.

In [21]:
def append_instruction(instruction, records):
  '''This functions create input as [[instruction1, text1], [instruction2, text2]].'''
  new_records = []
  for r in records:
      new_records.append([instruction, r, 0])
  return new_records

def compute_embeddings(llm_encoder, instruction, records):
  '''This function serve for computing embedding of the record for different instruction prompt'''
  print(f"Encoding {len(records)} records...")
  new_records = append_instruction(instruction, records)
  corpus_embeddings = np.asarray(llm_encoder.encode(new_records))
  return corpus_embeddings

In [22]:
## select models
model_names = [
    ## (base_model_class, finetuned_version, save_file_name)
    ("McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp", "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-supervised", "Mistral_sup"),
    ("McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp", "McGill-NLP/LLM2Vec-Mistral-7B-Instruct-v2-mntp-unsup-simcse", "Mistral_sim"),
]

In [24]:
login_token = "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"

### model selection  -> done: [ 0, ]
idx = 1
selected_model, model_ID = model_names[idx][:2], model_names[idx][2]
llm_encoder = load_model(selected_model, login_token = login_token)

Downloading shards: 100%|████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  6.24it/s]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 3/3 [00:11<00:00,  3.70s/it]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [25]:
instruction_list = [(0, "Represent the text for finding another product description for the same product"),
                   (1, "Retrieve semantically similar text: ")]

In [26]:
names = ['tableA', 'tableB']
instr = "Represent the text for finding another product description for the same product"
istr_id = 0

In [27]:
### load table
for dataset_path in datasets_path:
    ## name
    ds_name = dataset_path.split('_exp')[0]
    ## retrieve tables
    testing_tableA, testing_tableB = retrieve_testing_sample_only(os.path.join(unzip_data_dir_path, dataset_path))
    data = [testing_tableA, testing_tableB]
    ### loop over tableA and tableB
    for i in range(2):
        ### loop over labels/no labels
        for j in range(2):
            print(f'Preprocessing: {ds_name}, {names[i]}, with_labels={j}, with model={model_ID}, with instr={istr_id}')
            table = prepare_dataset(data[i], with_labels=j)
            print(f'size: {table.shape}')  # n_attr = id + attr_merge + others
            corpus_embeddings = compute_embeddings(llm_encoder, instr, table.attribute.to_list())
            ## save emebddings
            torch.save(corpus_embeddings, f'embeddings_model_{model_ID}_instr_{istr_id}_ds_{ds_name}_table_{names[i]}_with_labels_{j}.pt')
            print(corpus_embeddings.shape)
            ## save mapping (indipendente dal modello e dall'istruzione)
            with open(f'mapping_ds_{ds_name}_table_{names[i]}_with_labels_{j}.pickle', 'wb') as handle:
                pickle.dump(table.id.to_dict(), handle, protocol=pickle.HIGHEST_PROTOCOL)

Preprocessing: abt_buy, tableA, with_labels=0, with model=Mistral_sim, with instr=0
size: (737, 5)
Encoding 737 records...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 24/24 [12:31<00:00, 31.32s/it]


(737, 4096)
Preprocessing: abt_buy, tableA, with_labels=1, with model=Mistral_sim, with instr=0
size: (737, 5)
Encoding 737 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 24/24 [14:03<00:00, 35.13s/it]


(737, 4096)
Preprocessing: abt_buy, tableB, with_labels=0, with model=Mistral_sim, with instr=0
size: (700, 5)
Encoding 700 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 22/22 [06:35<00:00, 17.97s/it]


(700, 4096)
Preprocessing: abt_buy, tableB, with_labels=1, with model=Mistral_sim, with instr=0
size: (700, 5)
Encoding 700 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 22/22 [07:41<00:00, 20.98s/it]


(700, 4096)
Preprocessing: dirty_dblp_acm, tableA, with_labels=0, with model=Mistral_sim, with instr=0
size: (1271, 6)
Encoding 1271 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 40/40 [10:33<00:00, 15.83s/it]


(1271, 4096)
Preprocessing: dirty_dblp_acm, tableA, with_labels=1, with model=Mistral_sim, with instr=0
size: (1271, 6)
Encoding 1271 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 40/40 [13:00<00:00, 19.51s/it]


(1271, 4096)
Preprocessing: dirty_dblp_acm, tableB, with_labels=0, with model=Mistral_sim, with instr=0
size: (1218, 6)
Encoding 1218 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 39/39 [10:49<00:00, 16.66s/it]


(1218, 4096)
Preprocessing: dirty_dblp_acm, tableB, with_labels=1, with model=Mistral_sim, with instr=0
size: (1218, 6)
Encoding 1218 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 39/39 [13:13<00:00, 20.36s/it]


(1218, 4096)
Preprocessing: dirty_dblp_scholar, tableA, with_labels=0, with model=Mistral_sim, with instr=0
size: (1708, 6)
Encoding 1708 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 54/54 [12:26<00:00, 13.83s/it]


(1708, 4096)
Preprocessing: dirty_dblp_scholar, tableA, with_labels=1, with model=Mistral_sim, with instr=0
size: (1708, 6)
Encoding 1708 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 54/54 [15:35<00:00, 17.33s/it]


(1708, 4096)
Preprocessing: dirty_dblp_scholar, tableB, with_labels=0, with model=Mistral_sim, with instr=0
size: (3938, 6)
Encoding 3938 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████| 124/124 [28:45<00:00, 13.91s/it]


(3938, 4096)
Preprocessing: dirty_dblp_scholar, tableB, with_labels=1, with model=Mistral_sim, with instr=0
size: (3938, 6)
Encoding 3938 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████| 124/124 [35:00<00:00, 16.94s/it]


(3938, 4096)
Preprocessing: dirty_itunes_amazon, tableA, with_labels=0, with model=Mistral_sim, with instr=0
size: (104, 10)
Encoding 104 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [01:27<00:00, 21.77s/it]


(104, 4096)
Preprocessing: dirty_itunes_amazon, tableA, with_labels=1, with model=Mistral_sim, with instr=0
size: (104, 10)
Encoding 104 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [01:56<00:00, 29.00s/it]


(104, 4096)
Preprocessing: dirty_itunes_amazon, tableB, with_labels=0, with model=Mistral_sim, with instr=0
size: (106, 10)
Encoding 106 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [01:16<00:00, 19.21s/it]


(106, 4096)
Preprocessing: dirty_itunes_amazon, tableB, with_labels=1, with model=Mistral_sim, with instr=0
size: (106, 10)
Encoding 106 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [01:47<00:00, 26.98s/it]


(106, 4096)
Preprocessing: dirty_walmart_amazon, tableA, with_labels=0, with model=Mistral_sim, with instr=0
size: (900, 7)
Encoding 900 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 29/29 [07:32<00:00, 15.60s/it]


(900, 4096)
Preprocessing: dirty_walmart_amazon, tableA, with_labels=1, with model=Mistral_sim, with instr=0
size: (900, 7)
Encoding 900 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 29/29 [09:36<00:00, 19.89s/it]


(900, 4096)
Preprocessing: dirty_walmart_amazon, tableB, with_labels=0, with model=Mistral_sim, with instr=0
size: (1584, 7)
Encoding 1584 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 50/50 [14:14<00:00, 17.10s/it]


(1584, 4096)
Preprocessing: dirty_walmart_amazon, tableB, with_labels=1, with model=Mistral_sim, with instr=0
size: (1584, 7)
Encoding 1584 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 50/50 [17:43<00:00, 21.26s/it]

(1584, 4096)





### For training

In [24]:
choosen_dataset = ['abt_buy_exp_data\\exp_data', 
                   'dirty_dblp_acm_exp_data\\exp_data', 
                   'dirty_itunes_amazon_exp_data\\exp_data']

In [25]:
login_token = "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"

### model selection  -> done: [ 0, ]
idx = 0
selected_model, model_ID = model_names[idx][:2], model_names[idx][2]
llm_encoder = load_model(selected_model, login_token = login_token)

Downloading shards: 100%|████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  4.83it/s]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 3/3 [00:06<00:00,  2.10s/it]


In [26]:
names = ['tableA', 'tableB']
instr = "Represent the text for finding another product description for the same product"
istr_id = 0

In [27]:
def retrieve_sample_only(dataset_path, split):
    '''Function for retrieve only the rows in the tables that will be tested'''
    tableA, tableB, _ = load_all_data(dataset_path)
    split_df = pd.read_csv(os.path.join(dataset_path, f'{split}.csv'))

    ## retrieve the testing rows
    tableA_Id = split_df.ltable_id.to_list()
    tableB_Id = split_df.rtable_id.to_list()
    ## subsetting the original dataframe
    split_tableA = tableA[tableA.id.isin(tableA_Id)].reset_index(drop=True)
    split_tableB = tableB[tableB.id.isin(tableB_Id)].reset_index(drop=True)

    return split_tableA, split_tableB

In [28]:
### load table
for dataset_path in choosen_dataset:
    ## name
    ds_name = dataset_path.split('_exp')[0]
    for split in ['train', 'valid']:
        ## retrieve tables
        tableA, tableB = retrieve_sample_only(os.path.join(unzip_data_dir_path, dataset_path), split = split)
        data = [tableA, tableB]
        ### loop over tableA and tableB
        for i in range(2):
            ### loop over labels/no labels
            for j in range(2):
                print(f'Preprocessing: {ds_name}, {names[i]}, with_labels={j}, with model={model_ID}, with instr={istr_id}, split={split}')
                table = prepare_dataset(data[i], with_labels=j)
                print(f'size: {table.shape}')  # n_attr = id + attr_merge + others
                corpus_embeddings = compute_embeddings(llm_encoder, instr, table.attribute.to_list())
                ## save emebddings
                torch.save(corpus_embeddings, f'embeddings_split_{split}_model_{model_ID}_instr_{istr_id}_ds_{ds_name}_table_{names[i]}_with_labels_{j}.pt')
                print(corpus_embeddings.shape)
                ## save mapping (indipendente dal modello e dall'istruzione)
                with open(f'mapping_split_{split}_ds_{ds_name}_table_{names[i]}_with_labels_{j}.pickle', 'wb') as handle:
                    pickle.dump(table.id.to_dict(), handle, protocol=pickle.HIGHEST_PROTOCOL)

Preprocessing: abt_buy, tableA, with_labels=0, with model=Mistral_sup, with instr=0, split=train
size: (973, 5)
Encoding 973 records...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Batches: 100%|█████████████████████████████████████████████████████████████████████████| 31/31 [16:10<00:00, 31.30s/it]


(973, 4096)
Preprocessing: abt_buy, tableA, with_labels=1, with model=Mistral_sup, with instr=0, split=train
size: (973, 5)
Encoding 973 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 31/31 [18:06<00:00, 35.06s/it]


(973, 4096)
Preprocessing: abt_buy, tableB, with_labels=0, with model=Mistral_sup, with instr=0, split=train
size: (956, 5)
Encoding 956 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 30/30 [08:37<00:00, 17.24s/it]


(956, 4096)
Preprocessing: abt_buy, tableB, with_labels=1, with model=Mistral_sup, with instr=0, split=train
size: (956, 5)
Encoding 956 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 30/30 [10:11<00:00, 20.38s/it]


(956, 4096)
Preprocessing: abt_buy, tableA, with_labels=0, with model=Mistral_sup, with instr=0, split=valid
size: (728, 5)
Encoding 728 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 23/23 [12:13<00:00, 31.88s/it]


(728, 4096)
Preprocessing: abt_buy, tableA, with_labels=1, with model=Mistral_sup, with instr=0, split=valid
size: (728, 5)
Encoding 728 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 23/23 [13:15<00:00, 34.61s/it]


(728, 4096)
Preprocessing: abt_buy, tableB, with_labels=0, with model=Mistral_sup, with instr=0, split=valid
size: (702, 5)
Encoding 702 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 22/22 [06:31<00:00, 17.80s/it]


(702, 4096)
Preprocessing: abt_buy, tableB, with_labels=1, with model=Mistral_sup, with instr=0, split=valid
size: (702, 5)
Encoding 702 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 22/22 [07:40<00:00, 20.92s/it]


(702, 4096)
Preprocessing: dirty_dblp_acm, tableA, with_labels=0, with model=Mistral_sup, with instr=0, split=train
size: (2093, 6)
Encoding 2093 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 66/66 [17:22<00:00, 15.79s/it]


(2093, 4096)
Preprocessing: dirty_dblp_acm, tableA, with_labels=1, with model=Mistral_sup, with instr=0, split=train
size: (2093, 6)
Encoding 2093 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 66/66 [21:20<00:00, 19.40s/it]


(2093, 4096)
Preprocessing: dirty_dblp_acm, tableB, with_labels=0, with model=Mistral_sup, with instr=0, split=train
size: (1927, 6)
Encoding 1927 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 61/61 [17:06<00:00, 16.82s/it]


(1927, 4096)
Preprocessing: dirty_dblp_acm, tableB, with_labels=1, with model=Mistral_sup, with instr=0, split=train
size: (1927, 6)
Encoding 1927 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 61/61 [20:50<00:00, 20.51s/it]


(1927, 4096)
Preprocessing: dirty_dblp_acm, tableA, with_labels=0, with model=Mistral_sup, with instr=0, split=valid
size: (1271, 6)
Encoding 1271 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 40/40 [10:33<00:00, 15.84s/it]


(1271, 4096)
Preprocessing: dirty_dblp_acm, tableA, with_labels=1, with model=Mistral_sup, with instr=0, split=valid
size: (1271, 6)
Encoding 1271 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 40/40 [13:03<00:00, 19.58s/it]


(1271, 4096)
Preprocessing: dirty_dblp_acm, tableB, with_labels=0, with model=Mistral_sup, with instr=0, split=valid
size: (1210, 6)
Encoding 1210 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 38/38 [10:43<00:00, 16.93s/it]


(1210, 4096)
Preprocessing: dirty_dblp_acm, tableB, with_labels=1, with model=Mistral_sup, with instr=0, split=valid
size: (1210, 6)
Encoding 1210 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 38/38 [13:07<00:00, 20.73s/it]


(1210, 4096)
Preprocessing: dirty_itunes_amazon, tableA, with_labels=0, with model=Mistral_sup, with instr=0, split=train
size: (288, 10)
Encoding 288 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 9/9 [03:46<00:00, 25.16s/it]


(288, 4096)
Preprocessing: dirty_itunes_amazon, tableA, with_labels=1, with model=Mistral_sup, with instr=0, split=train
size: (288, 10)
Encoding 288 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 9/9 [05:16<00:00, 35.22s/it]


(288, 4096)
Preprocessing: dirty_itunes_amazon, tableB, with_labels=0, with model=Mistral_sup, with instr=0, split=train
size: (297, 10)
Encoding 297 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 10/10 [03:29<00:00, 20.99s/it]


(297, 4096)
Preprocessing: dirty_itunes_amazon, tableB, with_labels=1, with model=Mistral_sup, with instr=0, split=train
size: (297, 10)
Encoding 297 records...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 10/10 [05:01<00:00, 30.13s/it]


(297, 4096)
Preprocessing: dirty_itunes_amazon, tableA, with_labels=0, with model=Mistral_sup, with instr=0, split=valid
size: (104, 10)
Encoding 104 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [01:28<00:00, 22.11s/it]


(104, 4096)
Preprocessing: dirty_itunes_amazon, tableA, with_labels=1, with model=Mistral_sup, with instr=0, split=valid
size: (104, 10)
Encoding 104 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [02:01<00:00, 30.38s/it]


(104, 4096)
Preprocessing: dirty_itunes_amazon, tableB, with_labels=0, with model=Mistral_sup, with instr=0, split=valid
size: (107, 10)
Encoding 107 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [01:16<00:00, 19.00s/it]


(107, 4096)
Preprocessing: dirty_itunes_amazon, tableB, with_labels=1, with model=Mistral_sup, with instr=0, split=valid
size: (107, 10)
Encoding 107 records...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 4/4 [01:46<00:00, 26.72s/it]

(107, 4096)



