In [4]:
import os
import shlex
import subprocess

# Point the Hugging Face tooling at the hf-mirror.com endpoint. The variable is
# set on this process's environment, which the child process launched below
# inherits.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

# Command line that downloads the RedPajama-Data-1T dataset repository into
# data/RedPajama.
download_shell = (
    "huggingface-cli download --local-dir-use-symlinks False --resume-download  --repo-type dataset %s --local-dir %s"
    % ('togethercomputer/RedPajama-Data-1T', 'data/RedPajama')
)
# Run the command without an intermediate shell. check=True raises
# CalledProcessError on a non-zero exit status, so a failed download stops the
# notebook here instead of letting later steps run on missing or partial data
# (os.system only returned the status, which was being discarded).
subprocess.run(shlex.split(download_shell), check=True)

# Move every file ending in .jsonl into a subdirectory named after the first
# "_"-separated element of its file name.
import shutil

data_dir = 'data/RedPajama'
jsonl_suffix = '.jsonl'
for file in os.listdir(data_dir):
    if not file.endswith(jsonl_suffix):
        continue

    # Strip the extension before splitting. For a name without any underscore
    # (e.g. "book.jsonl") splitting the full name would yield the file name
    # itself as the directory name, which collides with the file and makes the
    # move fail; with the extension removed it maps to "book" instead. Names
    # that do contain an underscore resolve to the same directory as before.
    new_dir_name = file[:-len(jsonl_suffix)].split('_')[0]
    new_dir_path = os.path.join(data_dir, new_dir_name)
    # exist_ok=True: the directory is shared by all files with the same prefix
    # and may already exist from an earlier iteration or an earlier run.
    os.makedirs(new_dir_path, exist_ok=True)

    old_file_path = os.path.join(data_dir, file)
    new_file_path = os.path.join(new_dir_path, file)

    shutil.move(old_file_path, new_file_path)


### Step 1: NFC Normalization

In [13]:
# Runs preprocessing/normalize_text.py on the arxiv subset, with
# data/RedPajama/arxiv/ as --data_dir and data/RedPajama_norm/arxiv/ as --target_dir.
# NOTE(review): presumably reads the .jsonl shards under --data_dir and writes
# normalized copies under --target_dir — confirm against the script.
! python preprocessing/normalize_text.py --data_dir data/RedPajama/arxiv/ --target_dir data/RedPajama_norm/arxiv/

resetting to 1 for number of processes
Parsed 0 input files. Files written : 0it [00:00, ?it/s]  | 0/1 [00:00<?, ?it/s]
100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.34it/s]


### Step 2: Filter Short Documents

In [14]:
# Runs preprocessing/filter.py on data/RedPajama_norm/arxiv/ and names
# data/RedPajama_filtered.pickle as its second argument; that pickle is later
# passed to shuffle_holdout.py as --short_docs.
# NOTE(review): the positional arguments `64 arxiv 200` are not explained in this
# notebook — presumably a file/worker count, the dataset name and a length
# threshold; confirm against filter.py.
! python preprocessing/filter.py data/RedPajama_norm/arxiv/ data/RedPajama_filtered.pickle 64 arxiv 200

Finished processing, writing to disk!


### Step 3: Deduplication

#### Step 3.1: MinHash Generation

In [39]:
# Runs dedup/to_hash.py for the arxiv subset with data/RedPajama_norm/arxiv/ and
# data/RedPajama_minhash/arxiv/ as its two path arguments.
# NOTE(review): the positional values `64 0 0 0` and the options `-w 13` and
# `-k 10000` are not explained in this notebook — confirm their meaning against
# to_hash.py before changing them.
! python dedup/to_hash.py arxiv data/RedPajama_norm/arxiv/ data/RedPajama_minhash/arxiv/ 64 0 0 0 -w 13 -k 10000

  0%|                                                     | 0/1 [00:00<?, ?it/s]
1it [00:00, 51.63it/s]
100%|███████████████████████████████████████████| 64/64 [00:18<00:00,  3.48it/s]
100%|█████████████████████████████████████████████| 1/1 [00:18<00:00, 18.43s/it]


#### Step 3.2: Duplicate Pairs Generation

In [42]:
# Runs dedup/generate_duplicate_pairs.py with data/RedPajama_minhash/ as
# --input_dir and data/redpj_duplicates/duplicate_pairs.txt as --out_file.
# NOTE(review): --processes 45 is hard-coded — presumably it should be sized to
# the machine's CPU count; --range 13 / --bands 9 look like LSH banding
# parameters — confirm both against the script.
! python dedup/generate_duplicate_pairs.py --input_dir data/RedPajama_minhash/ --out_file data/redpj_duplicates/duplicate_pairs.txt --range 13 --bands 9 --processes 45

0: Processed 0.0%. 0.0003256797790527344
1: Processed 0.0%. 0.0003390312194824219
2: Processed 0.0%. 0.0002963542938232422
3: Processed 0.0%. 0.00045037269592285156
4: Processed 0.0%. 0.00028967857360839844
5: Processed 0.0%. 0.0002856254577636719
6: Processed 0.0%. 0.00029730796813964844
8: Processed 0.0%. 0.00031065940856933594
7: Processed 0.0%. 0.0004742145538330078
Total number of documents: 64
Total number of documents: 64
Total number of documents: 64
Total number of documents: 64
Total number of documents: 64
Total number of documents: 64
Total number of documents: 64
Total number of documents: 64
Total number of documents: 64


#### Step 3.3: Duplicate Graph Construction & Search for Connected Components

In [43]:
# Runs dedup/generate_connected_components.py with data/redpj_duplicates (the
# directory that holds the duplicate-pairs step's --out_file) as --input_dir and
# data/redpj_duplicates/connected_components.pickle as --out_file.
! python dedup/generate_connected_components.py --input_dir data/redpj_duplicates --out_file data/redpj_duplicates/connected_components.pickle

Started graph building
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
length of the set of duplicates: 0 0.003435373306274414
0it [00:00, ?it/s]
number of connected components: 1 0.0035572052001953125
Graph generated duplicates list!!! 0.0036547183990478516


#### Step 3.4: Generate Final List of Duplicates

In [44]:
# Runs dedup/generate_duplicates_dict.py on the connected_components.pickle
# named as --out_file by the connected-components step; its own --out_file is
# data/redpj_duplicates/duplicates.pickle, which shuffle_holdout.py later
# receives as --duplicates.
! python dedup/generate_duplicates_dict.py --input_file data/redpj_duplicates/connected_components.pickle --out_file data/redpj_duplicates/duplicates.pickle 

Processing duplicates!!!
100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 41120.63it/s]
number of duplicate documents that will be removed: 0


### Step 4: Interleave & Shuffle

In [12]:
# Runs preprocessing/shuffle_holdout.py with the `pass1` sub-command on
# data/RedPajama_norm/, passing the duplicates pickle as --duplicates and the
# filter.py pickle as --short_docs; --out_dir is data/SlimPajama/pass1.
# NOTE(review): presumably documents listed in either pickle are dropped from
# the output — confirm against shuffle_holdout.py.
! python preprocessing/shuffle_holdout.py pass1 --input_dir data/RedPajama_norm/ --duplicates data/redpj_duplicates/duplicates.pickle --short_docs data/RedPajama_filtered.pickle --out_dir data/SlimPajama/pass1

Sampling chunk of documents 0
Sampling chunk of documents 0
Finished sampling documents.
Finished sampling documents.
Total number of processed documents: 10  Total time: 0.03184103965759277
Total number of processed documents: 20  Total time: 0.04440808296203613
Total number of processed documents: 30  Total time: 0.0540311336517334
Total number of processed documents: 40  Total time: 0.0596623420715332
Total number of processed documents: 50  Total time: 0.06805253028869629
Total number of processed documents: 60  Total time: 0.07453274726867676
Finished writing documents.
Pass 1 finished...


### Step 5: Split Dataset into Train and Holdout

In [18]:
%%bash
# Launch 20 `pass2` runs of preprocessing/shuffle_holdout.py in parallel.
# Run j receives the positional arguments (j-1, j, j) and sends its stdout and
# stderr to "$j.log" in the current working directory.
# NOTE(review): the meaning of the three positional arguments is not shown in
# this notebook — confirm against shuffle_holdout.py.
for j in {1..20}
do
    python preprocessing/shuffle_holdout.py pass2 "$((j-1))" "$j" "$j" --input_dir data/SlimPajama/pass1 --train_dir data/SlimPajama/train --holdout_dir data/SlimPajama/holdout > "$j.log" 2>&1 &
done
# Block until every background job has exited. Without this the cell returns
# while the jobs are still running, so the following step could start reading
# incomplete train/holdout directories. `wait` with no arguments does not
# report job failures, so check the log files for errors.
wait

### Step 6: Deduplicate Train against Holdout

In [19]:
%%bash
# Run dedup/dedup_train.py for indices 1..20 with data/SlimPajama/train as
# --src_dir, data/SlimPajama/holdout as --tgt_dir and
# data/SlimPajama/train_deduped as --out_dir.
# Index 1 runs in the foreground with no redirection, so its output appears in
# the cell; indices 2..20 run in parallel in the background.
python dedup/dedup_train.py 1 --src_dir data/SlimPajama/train --tgt_dir data/SlimPajama/holdout --out_dir data/SlimPajama/train_deduped
for j in {2..20}
do
    # Log to "dedup_train_$j.log" rather than "$j.log": the train/holdout split
    # step already writes "$j.log" files, which this step would otherwise overwrite.
    python dedup/dedup_train.py "$j" --src_dir data/SlimPajama/train --tgt_dir data/SlimPajama/holdout --out_dir data/SlimPajama/train_deduped > "dedup_train_$j.log" 2>&1 &
done
# Block until every background job has exited so the cell only completes once
# train_deduped is fully written. `wait` with no arguments does not report job
# failures, so check the log files for errors.
wait

0it [00:00, ?it/s]
  0%|          | 0/9 [00:00<?, ?it/s]

Finished collecting hashes for eval 0


100%|██████████| 9/9 [00:00<00:00, 175.01it/s]


Total written: 50
