In [34]:
import subprocess
import psycopg
import torch
import h5py
import gc
import os
import numpy as np
import pandas as pd
from PIL import Image
from transformers import pipeline
from tqdm.auto import tqdm
from psycopg.rows import dict_row
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import DataLoader, Dataset
from transformers import CLIPTokenizer

In [35]:
# Shared connection used by every query helper in this notebook.
# The password is intentionally NOT passed here: credentials must not live in
# the notebook. libpq resolves it from the PGPASSWORD environment variable or
# from the user's ~/.pgpass file, so set one of those before running this cell.
conn = psycopg.connect(
    host="localhost",
    port=5432,
    dbname="postgres",
    user="postgres",
)

In [36]:
def max_index() -> int:
    """Return the number of rows currently stored in ``arxiv_papers``.

    NOTE(review): callers use this count as the largest ``idx`` to process,
    which presumably relies on ``idx`` being contiguous and 1-based — confirm.
    """
    count_query = "SELECT COUNT(*) AS total_rows FROM arxiv_papers"
    with conn.cursor(row_factory=dict_row) as cursor:
        cursor.execute(count_query)
        count_row = cursor.fetchone()
        return count_row['total_rows']

def get_row(index: int) -> dict | None:
    """Fetch the ``arxiv_papers`` row whose ``idx`` equals ``index``.

    Returns the row as a dict keyed by column name, or ``None`` when no row
    matches.
    """
    lookup_query = "SELECT * FROM arxiv_papers WHERE idx = %s"
    with conn.cursor(row_factory=dict_row) as cursor:
        cursor.execute(lookup_query, (index,))
        row = cursor.fetchone()
    return row
    
def get_batch_rows(start_idx: int, end_idx: int) -> list[dict]:
    """Fetch ``idx``, ``id`` and ``abstract`` for rows in ``[start_idx, end_idx)``.

    The lower bound is inclusive and the upper bound exclusive; rows come back
    as dicts ordered by ascending ``idx``.
    """
    bounds = (start_idx, end_idx)
    with conn.cursor(row_factory=dict_row) as cursor:
        cursor.execute("""
            SELECT idx, id, abstract 
            FROM arxiv_papers
            WHERE idx >= %s AND idx < %s 
            ORDER BY idx
        """, bounds)
        rows = cursor.fetchall()
    return rows
    
def merge_HDF5_files(input_list, output_file):
    """Concatenate the ``urls`` and ``embeddings`` datasets of several HDF5 files.

    Args:
        input_list: Paths of the HDF5 files to merge, in the order their
            records should appear. Each file is expected to hold a ``urls``
            dataset and an ``embeddings`` dataset with one row per URL.
            Missing or unreadable files are reported and skipped.
        output_file: Path of the merged HDF5 file. It is created (and
            overwritten if it already exists) with resizable ``urls`` and
            ``embeddings`` datasets.

    Returns:
        None. When ``input_list`` is empty or none of its paths exist, an
        error is printed and no output file is written.
    """
    if not input_list:
        print("‚ùå Error: Input file list is empty.")
        return
    first_file = next((path for path in input_list if os.path.exists(path)), None)
    if not first_file:
        print("‚ùå Error: No valid input files found.")
        return

    # The first existing file defines the dtypes and the embedding width.
    # The width is taken from the dataset's shape metadata (product of every
    # axis after the record axis) instead of squeezing the loaded array: a
    # squeeze collapses the record axis when the file holds a single record,
    # which made ``.shape[1]`` raise IndexError.
    with h5py.File(first_file, 'r') as f_first:
        first_embeddings = f_first['embeddings']
        embed_shape = int(np.prod(first_embeddings.shape[1:]))
        embed_dtype = first_embeddings.dtype
        url_dtype = f_first['urls'].dtype

    total_records = 0
    with h5py.File(output_file, 'w') as f_output:
        dset_urls = f_output.create_dataset(
            'urls',
            shape=(0,),
            maxshape=(None,),
            dtype=url_dtype,
            chunks=True
        )
        dset_embeddings = f_output.create_dataset(
            'embeddings',
            shape=(0, embed_shape),
            maxshape=(None, embed_shape),
            dtype=embed_dtype,
            chunks=True
        )
        # The context manager closes the bar even if an unexpected error escapes.
        with tqdm(total=len(input_list), desc="Merging") as pbar:
            for file_path in input_list:
                try:
                    if not os.path.exists(file_path):
                        print(f"‚ö†Ô∏è File not found: {file_path}. Skipping.")
                        continue
                    with h5py.File(file_path, 'r') as f_input:
                        current_urls = f_input['urls'][:]
                        current_embeddings = f_input['embeddings'][:]
                    num_records = current_urls.shape[0]
                    if num_records == 0:
                        continue
                    if current_embeddings.shape[0] != num_records:
                        raise ValueError(
                            f"{num_records} urls but "
                            f"{current_embeddings.shape[0]} embeddings"
                        )
                    # Flatten any singleton axes (e.g. (N, 1, D)) while always
                    # keeping the record axis, even when N == 1.
                    current_embeddings = current_embeddings.reshape(num_records, -1)
                    if current_embeddings.shape[1] != embed_shape:
                        raise ValueError(
                            f"embedding width {current_embeddings.shape[1]} "
                            f"does not match expected {embed_shape}"
                        )
                    # Everything is validated before the output is resized, so
                    # a rejected file never leaves half-written rows behind.
                    new_size = total_records + num_records
                    dset_urls.resize(new_size, axis=0)
                    dset_embeddings.resize(new_size, axis=0)
                    dset_urls[total_records:new_size] = current_urls
                    dset_embeddings[total_records:new_size] = current_embeddings
                    total_records = new_size
                except Exception as e:
                    print(f"‚ùå Error processing file {file_path}: {e}. Skipping this file.")
                finally:
                    # Exactly one tick per input file, including skipped ones.
                    pbar.update(1)

In [37]:
# First paper idx to process and number of rows handled per chunk.
start, chunk = 1, 1000
# Last paper idx to process (inclusive), capped at what the table reports.
end = min(10000, max_index())

In [38]:
# CLIP checkpoints this notebook can embed with; MODEL picks the active one.
MODEL_LIST = [
    "openai/clip-vit-base-patch32",
    "openai/clip-vit-base-patch16",
    "openai/clip-vit-large-patch14",
    "openai/clip-vit-large-patch14-336",
]
MODEL = MODEL_LIST[0]

# Cache root and the per-model directory that receives the embedding files.
# MODEL[7:] drops the leading "openai/" (7 characters) from the checkpoint id.
CACHE = "../.cache"
OUTPUT = f"../.cache/{MODEL[7:]}/paper_embeddings"

# One worker per 200 rows of a chunk, capped at 16.
NUM_WORKERS = min(16, int(chunk / 200))
# Number of texts sent through a model per forward pass.
BATCH_SIZE = 8

In [39]:
# Collect garbage first: blocks still owned by unreachable-but-uncollected
# tensors count as occupied, so the caching allocator can only hand them back
# once the collector has freed them.
gc.collect()
torch.cuda.empty_cache()

857

In [40]:
# text = get_row(2)['abstract']
# summarizer = pipeline(
#     "summarization", 
#     model="t5-base", 
#     device=0
# )

# summary = summarizer(
#     text, 
#     max_new_tokens=60,
#     min_length=10, 
#     do_sample=False
# )

# tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")

# tokens = tokenizer.encode(summary[0]['summary_text'])
# print(f"Số lượng token: {len(tokens)}")

In [41]:
# subprocess.run(["mkdir", f"../.cache/{MODEL[7:]}"])
# subprocess.run(["mkdir", OUTPUT])
# summarizer = pipeline("summarization", model="t5-base", device=0)
# processor = CLIPProcessor.from_pretrained(MODEL, use_fast=True)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = CLIPModel.from_pretrained(MODEL).to(device)
# model.eval()

In [None]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def clear_gpu():
    """Free unreferenced Python objects, then release cached CUDA memory.

    Garbage collection runs first so that tensors kept alive only by
    reference cycles are destroyed before the caching allocator is asked to
    return its unoccupied blocks; in the opposite order those blocks are
    still occupied when ``empty_cache`` runs and stay reserved.
    """
    gc.collect()
    torch.cuda.empty_cache()

def _summarize_abstracts(texts):
    """Summarise every text with t5-base and return the summaries in input order.

    The pipeline is created and destroyed inside this call so that its weights
    are released before the CLIP model is loaded.
    """
    # Use the notebook-wide `device` rather than a hard-coded GPU index so the
    # CPU fallback chosen there is honoured.
    summarizer = pipeline(
        "summarization",
        model="t5-base",
        device=device
    )
    try:
        short_texts = []
        for batch_idx in tqdm(range(0, len(texts), BATCH_SIZE), desc="Summarizing"):
            batch_txt = texts[batch_idx : batch_idx + BATCH_SIZE]
            summaries = summarizer(
                batch_txt,
                max_new_tokens=60,
                min_length=10,
                do_sample=False,
                batch_size=BATCH_SIZE,
                truncation=True
            )
            short_texts.extend(s['summary_text'] for s in summaries)
        return short_texts
    finally:
        # Runs on success, error and KeyboardInterrupt alike.
        del summarizer
        clear_gpu()


def _embed_texts(texts, processor):
    """Return L2-normalised CLIP text embeddings for ``texts`` as one 2-D array.

    ``texts`` must be non-empty. The CLIP model is loaded and released inside
    this call.
    """
    model = CLIPModel.from_pretrained(MODEL).to(device)
    model.eval()
    try:
        vectors = []
        for batch_idx in tqdm(range(0, len(texts), BATCH_SIZE), desc="Embedding"):
            batch_txt = texts[batch_idx : batch_idx + BATCH_SIZE]
            inputs = processor(text=batch_txt, return_tensors="pt", padding=True, truncation=True)
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                text_features = model.get_text_features(**inputs)
            # Scale each vector to unit L2 norm.
            text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)
            vectors.append(text_features.cpu().numpy())
        return np.concatenate(vectors, axis=0)
    finally:
        del model
        clear_gpu()


# The processor is loop-invariant, so it is loaded once instead of once per
# chunk.
processor = CLIPProcessor.from_pretrained(MODEL, use_fast=True)

for i in range(start, end + 1, chunk):
    # The file name keeps the nominal chunk bounds so that names stay
    # compatible with files produced by earlier runs.
    output_path = OUTPUT + f"/{MODEL[7:]}_Papers_Embedded_{i}_to_{i + chunk - 1}.h5"
    # Skip finished chunks BEFORE doing any expensive work; previously the
    # check ran only after summarising and embedding the whole chunk.
    if os.path.exists(output_path):
        print(f"   ‚ö†Ô∏è File ƒë√£ t·ªìn t·∫°i.")
        continue

    # Clamp the exclusive upper bound so the last chunk never reads past `end`.
    batch_rows = get_batch_rows(i, min(i + chunk, end + 1))
    if not batch_rows:
        continue
    ids = [row['id'] for row in batch_rows]
    original_texts = [row["abstract"] for row in batch_rows]

    short_texts = _summarize_abstracts(original_texts)
    final_vectors = _embed_texts(short_texts, processor)

    current_urls = [f"https://arxiv.org/pdf/{id_val}.pdf" for id_val in ids]
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    # Write to a temporary name and rename on success: an interrupted run then
    # cannot leave a truncated file that later runs would treat as finished.
    tmp_path = output_path + ".tmp"
    with h5py.File(tmp_path, "w") as outfile:
        dt = h5py.string_dtype(encoding='utf-8')
        outfile.create_dataset("urls", data=current_urls, dtype=dt)
        outfile.create_dataset("embeddings", data=final_vectors)
    os.replace(tmp_path, output_path)
    print(f"   üíæ ƒê√£ l∆∞u file: {os.path.basename(output_path)}")

    del short_texts, final_vectors, original_texts, batch_rows
    clear_gpu()

Device set to use cuda:0


Summarizing:   0%|          | 0/125 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


KeyboardInterrupt: 