# Preprocessing

- Rename the Biber field (`encodings`) to `biber_tagged`
- Confirm it's present for every single row
- Compare against the original file and confirm everything is the same with the addition of new field(s)
- Add the new fields into the original file

Manually delete the tagged_data folders once sanity checking is complete.

## Setup

In [7]:
import json
import os
from tqdm import tqdm
import multiprocessing as mp
from itertools import islice

# Catalogue of every tagged dataset known to this notebook, keyed by dataset
# name. Each value is the path of the dataset's original JSONL file.
all_tagged_dataset_paths = {
    'amazon': '/shared/3/projects/hiatus/tagged_data/amazon/amazon.jsonl',
    'reddit': '/shared/3/projects/hiatus/tagged_data/reddit/reddit.jsonl',
    'book3corpus': '/shared/3/projects/hiatus/tagged_data/book3corpus/book3corpus.jsonl',
    'wiki': '/shared/3/projects/hiatus/tagged_data/wiki/wiki.jsonl',
    'wiki_discussions': '/shared/3/projects/hiatus/tagged_data/wiki_discussions/wiki_discussions.jsonl',
    'realnews': '/shared/3/projects/hiatus/tagged_data/realnews/realnews.jsonl',
    # NOTE(review): every other entry follows '<name>/<name>.jsonl', but this
    # one points at 'realnews.jsonl' inside the gmane folder. Looks like a
    # copy-paste slip -- confirm the real file name before processing gmane.
    'gmane': '/shared/3/projects/hiatus/tagged_data/gmane/realnews.jsonl'
}

# Datasets handled by this run. Edit this tuple to (re)process other entries
# of the catalogue instead of overwriting the catalogue itself.
datasets_to_process = ('wiki', 'wiki_discussions')

# Mapping actually consumed by the processing cell below; insertion order
# follows datasets_to_process.
tagged_dataset_paths = {
    name: all_tagged_dataset_paths[name] for name in datasets_to_process
}

def process_chunk(chunk):
    """Merge one batch of (original, tagged) JSONL line pairs.

    Both lines of a pair are parsed as JSON. A pair is kept only when the
    tagged record contains every key of the original record and also has an
    'encodings' key. For kept pairs, 'encodings' is renamed to
    'biber_tagged' and the tagged record is serialised with ``json.dumps``.

    Only key *presence* is verified; the values of the shared fields are
    not compared against the original record.

    Args:
        chunk: Iterable of ``(orig_line, lisa_line)`` string pairs.

    Returns:
        A ``(processed, skipped)`` tuple: the serialised records in input
        order, and the number of pairs that were dropped.
    """
    kept = []
    n_skipped = 0

    for original_raw, tagged_raw in chunk:
        try:
            original_record = json.loads(original_raw)
            tagged_record = json.loads(tagged_raw)

            # The tagged record must carry every field of the original one.
            if any(field not in tagged_record for field in original_record.keys()):
                raise KeyError("Missing original fields")

            if not ('encodings' in tagged_record):
                raise KeyError("'encodings' field is missing")

            # Rename 'encodings' -> 'biber_tagged' (the new key goes last).
            encodings = tagged_record.pop('encodings')
            tagged_record['biber_tagged'] = encodings
            serialised = json.dumps(tagged_record)
        except (KeyError, json.JSONDecodeError):
            # Expected failures (bad JSON, missing fields) are dropped quietly.
            n_skipped += 1
        except Exception as err:
            # Anything else is still dropped, but reported.
            print(f"Unexpected error processing line: {str(err)}")
            n_skipped += 1
        else:
            kept.append(serialised)

    return kept, n_skipped

def process_dataset(dataset_name, dataset_path, chunk_size=10000, num_processes=10):
    """Merge a dataset's original file with its sibling 'lisa.jsonl'.

    The file at ``dataset_path`` and the 'lisa.jsonl' file in the same
    directory are read in lockstep, line by line. Line pairs are handed to
    ``process_chunk`` in batches through a process pool, and the records it
    returns are written, in input order, to 'corpus.jsonl' in that same
    directory. When both inputs have the same number of lines, 'lisa.jsonl'
    is removed afterwards. A summary is printed at the end.

    Args:
        dataset_name: Label used in the progress bar and the summary.
        dataset_path: Path of the original JSONL file.
        chunk_size: Number of line pairs per work item (default 10000).
        num_processes: Number of worker processes (default 10).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        OSError: If one of the input/output files cannot be opened.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    dataset_dir = os.path.dirname(dataset_path)
    lisa_path = os.path.join(dataset_dir, 'lisa.jsonl')
    corpus_path = os.path.join(dataset_dir, 'corpus.jsonl')

    # Initialised up front so the summary below also works for empty inputs.
    processed_lines = 0
    skipped_lines = 0

    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(dataset_path, 'r', encoding='utf-8') as orig_file, \
         open(lisa_path, 'r', encoding='utf-8') as lisa_file, \
         open(corpus_path, 'w', encoding='utf-8') as corpus_file:

        # Count both inputs: zip() below silently stops at the shorter file,
        # so a length mismatch would otherwise go unnoticed.
        total_lines = sum(1 for _ in orig_file)
        lisa_total_lines = sum(1 for _ in lisa_file)
        orig_file.seek(0)  # Reset file pointers
        lisa_file.seek(0)

        paired_lines = min(total_lines, lisa_total_lines)
        unpaired_lines = abs(total_lines - lisa_total_lines)

        # Ceiling division: the last, partially filled chunk counts as well.
        total_chunks = -(-paired_lines // chunk_size)

        line_pairs = zip(orig_file, lisa_file)
        chunks = iter(lambda: list(islice(line_pairs, chunk_size)), [])

        with mp.Pool(processes=num_processes) as pool:
            # imap() yields results in submission order, so each chunk can be
            # written as soon as it arrives instead of being buffered.
            for processed, skipped in tqdm(pool.imap(process_chunk, chunks),
                                           total=total_chunks,
                                           desc=f"Processing {dataset_name}"):
                processed_lines += len(processed)
                skipped_lines += skipped
                if processed:  # avoid a blank line for fully skipped chunks
                    corpus_file.write('\n'.join(processed))
                    corpus_file.write('\n')

    if unpaired_lines:
        # Keep lisa.jsonl for inspection: part of the data was never merged.
        print(f"Warning: {dataset_path} has {total_lines} lines but {lisa_path} "
              f"has {lisa_total_lines}; {unpaired_lines} trailing lines were not "
              f"paired. Keeping {lisa_path}.")
    else:
        # Remove lisa.jsonl
        try:
            os.remove(lisa_path)
        except OSError as e:
            print(f"Error removing {lisa_path}: {str(e)}")

    print(f"Processed {dataset_name}")
    print(f"Lines processed: {processed_lines}")
    print(f"Lines skipped: {skipped_lines}")


In [8]:
# Process every selected dataset in turn; each call prints its own summary.
for dataset_name in tagged_dataset_paths:
    dataset_path = tagged_dataset_paths[dataset_name]
    process_dataset(dataset_name, dataset_path)

print("All datasets processed successfully.")

Processing wiki: 314it [16:20,  3.12s/it]                         


Processed wiki
Lines processed: 3134059
Lines skipped: 9


Processing wiki_discussions: 410it [19:54,  2.91s/it]                         


Processed wiki_discussions
Lines processed: 4093891
Lines skipped: 4
All datasets processed successfully.
