Phase 1: extract the text of every PDF into a .txt file

In [1]:
import os
import PyPDF2
from tqdm import tqdm

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at ``pdf_path``.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``.
    A page whose ``extract_text()`` result is falsy (``None`` or ``""``)
    contributes an empty string, so the result is always a ``str``.
    """
    with open(pdf_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extraction happens while the file is still open: the reader pulls
        # page data from the underlying stream.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "".join(page_texts)

def process_pdfs(folder_path, output_folder):
    """Extract the text of every PDF in ``folder_path`` into ``output_folder``.

    For each ``<name>.pdf`` entry a UTF-8 file ``<name>.txt`` is written to
    ``output_folder`` (created if missing). A PDF whose ``.txt`` output
    already exists is skipped, so an interrupted run can be resumed.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).
        output_folder: Directory that receives the ``.txt`` files.

    Entries without a ``.pdf`` extension are reported on stdout and ignored.
    The extension is matched case-insensitively, so ``REPORT.PDF`` is
    converted as well.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    for filename in tqdm(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            print(f"file type not supported ({filename})")
            continue

        # The output path is computed once and used for both the skip check and the write.
        output_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")
        if os.path.exists(output_path):
            continue

        text = extract_text_from_pdf(os.path.join(folder_path, filename))
        with open(output_path, "w", encoding="utf-8") as output_file:
            output_file.write(text)

# Source folder that is scanned for PDFs, and destination folder for the .txt files.
# output_folder is reused by the later cells as the dataset root.
input_folder = "./climate_text_dataset"
output_folder = "./dataset_txt_small"

# Run the extraction; PDFs whose .txt output already exists are skipped.
process_pdfs(input_folder, output_folder)

100%|██████████| 830/830 [00:00<00:00, 3088.52it/s]


Phase 2: shuffle the extracted files and split them 90/10 into train and test folders

In [2]:
import shutil
import random
random.seed(42)

# Fraction of the documents moved to "train"; the remainder goes to "test".
SPLIT_PCT = 0.9

# Only regular files are split: on a re-run the "train"/"test" directories
# already exist inside output_folder and must not be moved into themselves.
# Sorting first makes the seeded shuffle reproducible, because os.listdir
# returns entries in arbitrary (filesystem-dependent) order.
ds_txt = sorted(
    name for name in os.listdir(output_folder)
    if os.path.isfile(os.path.join(output_folder, name))
)
random.shuffle(ds_txt)

train_dir = os.path.join(output_folder, "train")
test_dir = os.path.join(output_folder, "test")
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Compute the boundary once so both slices are guaranteed to use the same index.
split_index = int(len(ds_txt) * SPLIT_PCT)

for filename in ds_txt[:split_index]:
    shutil.move(os.path.join(output_folder, filename), os.path.join(train_dir, filename))
for filename in ds_txt[split_index:]:
    shutil.move(os.path.join(output_folder, filename), os.path.join(test_dir, filename))

Phase 3 (deprecated): split each document into chunks of 1024 space-separated words

In [3]:
# SPLIT_DOC_SIZE = 1024

# for filename in tqdm(os.listdir(os.path.join(output_folder, "train"))):
#     with open(os.path.join(output_folder, "train", filename), "r", encoding="utf-8") as file:
#         text = file.read().split(" ")
#         for split in range(0, (len(text)-1)//SPLIT_DOC_SIZE+1):
#             with open(os.path.join(output_folder, "train", f"{os.path.splitext(filename)[0]}_split{split}.txt"), "w", encoding="utf-8") as output_file:
#                 output_file.write(" ".join(text[split*SPLIT_DOC_SIZE:(split+1)*SPLIT_DOC_SIZE]))
#         os.remove(os.path.join(output_folder, "train", filename))

# for filename in tqdm(os.listdir(os.path.join(output_folder, "test"))):
#     with open(os.path.join(output_folder, "test", filename), "r", encoding="utf-8") as file:
#         text = file.read().split(" ")
#         for split in range(0, (len(text)-1)//SPLIT_DOC_SIZE+1):
#             with open(os.path.join(output_folder, "test", f"{os.path.splitext(filename)[0]}_split{split}.txt"), "w", encoding="utf-8") as output_file:
#                 output_file.write(" ".join(text[split*SPLIT_DOC_SIZE:(split+1)*SPLIT_DOC_SIZE]))
#         os.remove(os.path.join(output_folder, "test", filename))

Phase 3: truncate each document to a 1024-token window after skipping its first 128 tokens

In [4]:
from transformers import AutoTokenizer

# Number of leading tokens discarded from every document before the kept window.
SKIP_INTRO_GRACE = 128
# Maximum number of tokens kept per document after the discarded prefix.
SPLIT_DOC_SIZE = 1024

model_name = "./Llama-3.2-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


def _shrink_split(split_name):
    """Replace every document in ``output_folder/<split_name>`` by a truncated copy.

    Each ``<name>.txt`` is read, newlines are replaced by spaces, and the text
    is tokenized and truncated to ``SKIP_INTRO_GRACE + SPLIT_DOC_SIZE`` tokens.
    The first ``SKIP_INTRO_GRACE`` tokens are dropped and the rest is decoded
    into ``<name>_small.txt``; the original file is then deleted. A document
    shorter than ``SKIP_INTRO_GRACE`` tokens therefore yields an empty file.
    """
    split_dir = os.path.join(output_folder, split_name)
    for filename in tqdm(os.listdir(split_dir)):
        # Files produced by a previous run are left alone; otherwise re-running
        # the cell would strip another SKIP_INTRO_GRACE tokens from them and
        # rename them to *_small_small.txt. A source document whose own name
        # ends in "_small" is skipped as well.
        if filename.endswith("_small.txt"):
            continue

        source_path = os.path.join(split_dir, filename)
        with open(source_path, "r", encoding="utf-8") as file:
            text = file.read().replace("\n", " ")

        # No padding: with pad_token == eos_token, padded positions would be
        # decoded below and written into the dataset as literal special-token text.
        token_ids = tokenizer(
            text,
            truncation=True,
            max_length=SPLIT_DOC_SIZE + SKIP_INTRO_GRACE,
            return_attention_mask=False,
        )["input_ids"]

        target_path = os.path.join(split_dir, f"{os.path.splitext(filename)[0]}_small.txt")
        with open(target_path, "w", encoding="utf-8") as output_file:
            output_file.write(tokenizer.decode(token_ids[SKIP_INTRO_GRACE:], skip_special_tokens=True))

        # The source is removed only after its handle is closed; deleting an
        # open file raises PermissionError on Windows.
        os.remove(source_path)


for split_name in ("train", "test"):
    _shrink_split(split_name)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 747/747 [01:16<00:00,  9.74it/s]
100%|██████████| 83/83 [00:37<00:00,  2.20it/s]
