## Clonar Repositório e Datasets

In [1]:
# Clonar Projeto
!git clone https://github.com/pauloh48/EM-Join.git

Cloning into 'EM-Join'...
remote: Enumerating objects: 25, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 25 (delta 2), reused 20 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (25/25), 180.79 KiB | 6.95 MiB/s, done.
Resolving deltas: 100% (2/2), done.


In [None]:
# Clonar conjunto de dados
import os
import gdown

# Definir o caminho para a pasta 'datasets'
datasets_folder = 'EM-Join/datasets'

# Verificar se a pasta já existe
if not os.path.exists(datasets_folder):
    # Instalar gdown
    !pip install gdown

    # Baixar o arquivo
    gdown.download('https://drive.google.com/uc?id=1UunGNGlkgDpl0FkFvFXWn2qYjJIZeiWW', 'file_name.zip', quiet=False)
    print("Arquivo baixado com sucesso.")
else:
    print("A pasta 'datasets' já existe. O download foi pulado.")




Downloading...
From (original): https://drive.google.com/uc?id=1UunGNGlkgDpl0FkFvFXWn2qYjJIZeiWW
From (redirected): https://drive.google.com/uc?id=1UunGNGlkgDpl0FkFvFXWn2qYjJIZeiWW&confirm=t&uuid=13df1559-c130-45be-98fb-745aac36cf24
To: /content/file_name.zip
100%|██████████| 409M/409M [00:09<00:00, 45.2MB/s]


'file_name.zip'

In [None]:
# Store the dataset inside the pipeline folder:
# extract the downloaded archive into EM-Join/, rename the extracted
# 'EM_Join_dataset' folder to 'datasets' and delete the archive.

import zipfile
import os

zip_path = os.path.join(os.getcwd(), 'file_name.zip')
extract_to = os.path.join(os.getcwd(), 'EM-Join')

current_folder = os.path.join(extract_to, 'EM_Join_dataset')
new_folder = os.path.join(extract_to, 'datasets')

if os.path.exists(new_folder):
    # The download cell skips the download when 'datasets' already exists and
    # this cell removes the archive after a successful run, so on a re-run
    # there is no ZIP to open and the rename target is already taken.
    print("A pasta 'datasets' já existe. A extração foi pulada.")
else:
    # Extract the ZIP archive
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

    print(f"Arquivo extraído para {os.path.abspath(extract_to)}")

    # Give the extracted folder the name used by the download cell's check,
    # then drop the archive, which is no longer needed.
    os.rename(current_folder, new_folder)
    os.remove(zip_path)

    print(f"Pasta renomeada de '{current_folder}' para '{new_folder}'")

Arquivo extraído para /content/EM-Join
Pasta renomeada de '/content/EM-Join/EM_Join_dataset' para '/content/EM-Join/datasets'


## Instalar bibliotecas

In [None]:
%cd {os.getcwd()}/EM-Join

# install requirements
!pip install -r requirements.txt
!apt install libomp-dev

/content/EM-Join
Collecting datasets (from -r requirements.txt (line 1))
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting faiss-gpu (from -r requirements.txt (line 2))
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting sumy (from -r requirements.txt (line 4))
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->-r requirements.txt (line 1))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->-r requirements.txt (line 1))
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets->-r requirements.txt (line 1))
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets->-r requirements.txt (line 1))
  Downloadi

## Pipeline

### Possíveis Datasets

In [5]:
# Possible selections for song-song and big-citations.
# Reference only: the triple-quoted string below is not executed as a command;
# being the cell's last expression, it is simply echoed back as the cell output.
"""
!CUDA_VISIBLE_DEVICES=0 python "pipeline_em_join.py" \
--collection 'sparkly-all' \
--type_dataset 'magellan' \
--dataset "song-song" \
# --dataset "big-citations" \
# --dataset "beer_teste" \
"""

'\n!CUDA_VISIBLE_DEVICES=0 python "pipeline_em_join.py" --collection \'sparkly-all\' --type_dataset \'magellan\' --dataset "song-song" # --dataset "big-citations" # --dataset "beer_teste" '

### Ajuda

In [6]:
# Help, if needed: uncomment the line below to show the script's --help output
# !python pipeline_em_join.py --help

### Execução

In [None]:
# Disable WANDB by setting WANDB_MODE to 'disabled' in this process's
# environment; commands launched from later cells inherit it.
import os

os.environ.update({'WANDB_MODE': 'disabled'})

### song-song

#### song-song: all-mpnet-base-v2

In [None]:
# Run pipeline_em_join.py on the "song-song" dataset with the
# "all-mpnet-base-v2" pre-trained model, exposing only GPU 0 to the process
# through CUDA_VISIBLE_DEVICES.
# Judging by the flag names (confirm with `--help`), this run generates training
# data, fine-tunes for 30 epochs, generates vectors, performs the join and
# shows/saves the results.
# NOTE(review): both --auto_threshold and --threshold 0.6 are passed; confirm
# in the script which one takes effect.
!CUDA_VISIBLE_DEVICES=0 python pipeline_em_join.py \
--collection 'sparkly-all' \
--type_dataset 'magellan' \
--dataset "song-song" \
--device 'cuda' \
--percent_summarize 0.2 \
--model_pre "all-mpnet-base-v2" \
--normalize_embedings \
--generate_train_data \
--save_train_data \
--perform_fine_tuning \
--num_epochs 30 \
--batch_size 64 \
--learning_rate 5e-5 \
--scheduler 'ConstantLR' \
--save_best_model_ft \
--overwrite_exist_model \
--generate_vectors \
--save_vectors \
--perform_join \
--auto_threshold \
--threshold 0.6 \
--index 'IndexFlatIP' \
--show_metrics \
--save_result

2024-11-07 04:46:21.051987: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-07 04:46:21.071796: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-07 04:46:21.078151: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-07 04:46:21.092441: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
[nltk_data] Downloading package punkt to /roo

#### song-song: all-MiniLM-L12-v2

In [None]:
# Run pipeline_em_join.py on the "song-song" dataset with the
# "all-MiniLM-L12-v2" pre-trained model, exposing only GPU 0 to the process
# through CUDA_VISIBLE_DEVICES.
# Judging by the flag names (confirm with `--help`), this run generates training
# data, fine-tunes for 30 epochs, generates vectors, performs the join and
# shows/saves the results.
# NOTE(review): both --auto_threshold and --threshold 0.6 are passed; confirm
# in the script which one takes effect.
!CUDA_VISIBLE_DEVICES=0 python pipeline_em_join.py \
--collection 'sparkly-all' \
--type_dataset 'magellan' \
--dataset "song-song" \
--device 'cuda' \
--percent_summarize 0.2 \
--model_pre "all-MiniLM-L12-v2" \
--normalize_embedings \
--generate_train_data \
--save_train_data \
--perform_fine_tuning \
--num_epochs 30 \
--batch_size 64 \
--learning_rate 5e-5 \
--scheduler 'ConstantLR' \
--save_best_model_ft \
--overwrite_exist_model \
--generate_vectors \
--save_vectors \
--perform_join \
--auto_threshold \
--threshold 0.6 \
--index 'IndexFlatIP' \
--show_metrics \
--save_result

### big-citations

#### big-citations: all-mpnet-base-v2


In [None]:
# Run pipeline_em_join.py on the "big-citations" dataset with the
# "all-mpnet-base-v2" pre-trained model, exposing only GPU 0 to the process
# through CUDA_VISIBLE_DEVICES.
# Judging by the flag names (confirm with `--help`), this run generates training
# data, fine-tunes for 30 epochs, generates vectors, performs the join and
# shows/saves the results.
# NOTE(review): both --auto_threshold and --threshold 0.6 are passed; confirm
# in the script which one takes effect.
!CUDA_VISIBLE_DEVICES=0 python pipeline_em_join.py \
--collection 'sparkly-all' \
--type_dataset 'magellan' \
--dataset "big-citations" \
--device 'cuda' \
--percent_summarize 0.2 \
--model_pre "all-mpnet-base-v2" \
--normalize_embedings \
--generate_train_data \
--save_train_data \
--perform_fine_tuning \
--num_epochs 30 \
--batch_size 64 \
--learning_rate 5e-5 \
--scheduler 'ConstantLR' \
--save_best_model_ft \
--overwrite_exist_model \
--generate_vectors \
--save_vectors \
--perform_join \
--auto_threshold \
--threshold 0.6 \
--index 'IndexFlatIP' \
--show_metrics \
--save_result

#### big-citations: all-MiniLM-L12-v2

In [None]:
# Run pipeline_em_join.py on the "big-citations" dataset with the
# "all-MiniLM-L12-v2" pre-trained model, exposing only GPU 0 to the process
# through CUDA_VISIBLE_DEVICES.
# Judging by the flag names (confirm with `--help`), this run generates training
# data, fine-tunes for 30 epochs, generates vectors, performs the join and
# shows/saves the results.
# NOTE(review): both --auto_threshold and --threshold 0.6 are passed; confirm
# in the script which one takes effect.
!CUDA_VISIBLE_DEVICES=0 python pipeline_em_join.py \
--collection 'sparkly-all' \
--type_dataset 'magellan' \
--dataset "big-citations" \
--device 'cuda' \
--percent_summarize 0.2 \
--model_pre "all-MiniLM-L12-v2" \
--normalize_embedings \
--generate_train_data \
--save_train_data \
--perform_fine_tuning \
--num_epochs 30 \
--batch_size 64 \
--learning_rate 5e-5 \
--scheduler 'ConstantLR' \
--save_best_model_ft \
--overwrite_exist_model \
--generate_vectors \
--save_vectors \
--perform_join \
--auto_threshold \
--threshold 0.6 \
--index 'IndexFlatIP' \
--show_metrics \
--save_result