#### setup

In [1]:
import torch

torch.random.manual_seed(42)

cuda_available = torch.cuda.is_available()

if cuda_available:
    dev_count = torch.cuda.device_count()
    dev_current = torch.cuda.current_device()
    dev_name = torch.cuda.get_device_name(dev_current)
    print(f'Device count: {dev_count}')
    print(f'Current device: {dev_current}')
    print(f'Device name: {dev_name}')

Device count: 1
Current device: 0
Device name: NVIDIA GeForce RTX 3060


#### data

In [15]:
import polars as pl
import os

data_path = 'D:\\data-science\\wikipedia\\parq'

all_files = os.listdir(data_path)

df_list = [pl.read_parquet(os.path.join(data_path, all_files[-1]))]

df = pl.concat(df_list)

In [16]:
df = df.drop(['id', 'url'])
df

title,text
str,str
"""The Angel of 8th Ave.""","""""The Angel of 8th Ave."" (styli…"
"""Hurricane Municipal Airport""","""Hurricane Municipal Airport , …"
"""Satin berrypecker""","""The satin berrypecker (Melanoc…"
"""Cassinia complanata""","""Cassinia complanata, commonly …"
"""Monoporella""","""Monoporella is a genus of bryo…"
…,…
"""Bianca Fernandez""","""Bianca Jolie Fernandez (born 2…"
"""Condons and Clangibbon""","""Condons and Clangibbon () is a…"
"""2022 Chattanooga Red Wolves SC…","""The 2022 Chattanooga Red Wolve…"
"""Nkiko Prosper""","""Turatsinze Nkiko Prosper (born…"


In [17]:
df = df.with_columns(
    pl.col('text')
    .str.replace_all(r'\n+', ' ')
    .str.replace_all(r'\s+', ' ')
)

#### saving tokens

In [18]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

special_tokens = {
    'pad_token': '[PAD]',
    'bos_token': '[BOS]',
    'eos_token': '[EOS]'
}

tokenizer.add_special_tokens(special_tokens)

2

In [36]:
df = df.with_columns(pl.col('text').str.split(' ').list.len().alias('length'))

In [47]:
df['length'].mean()

313.7274008430247

In [12]:
df_sample = df[:15_000]
chunk_size = 1_000
tokenized_chunks = []

for i in range(0, len(df), chunk_size):
    chunk = df[i:i+chunk_size]['text'].to_list()
    tokenized_chunk = tokenizer(chunk, return_tensors='pt', max_length=512, padding='longest', truncation=True)
    tokenized_chunks.append(tokenized_chunk['input_ids'])

In [13]:
tokenized_text = torch.concatenate(tokenized_chunks)
tokenized_text.shape

torch.Size([157528, 256])

#### dataloader

In [14]:
torch.save(tokenized_text, 'D:\\data-science\\wikipedia\\tokens\\tokenized_text.pt')