In [None]:
import hashlib

def get_line_hash(text):
    """
    Return the MD5 hex digest of a canonical form of `text`.

    The canonical form drops leading/trailing whitespace and lower-cases the
    rest, so 'Hello world' and 'hello world ' hash to the same value.
    Whitespace inside the text is left untouched.
    """
    canonical = text.strip().lower()
    digest = hashlib.md5()
    digest.update(canonical.encode('utf-8'))
    return digest.hexdigest()

def filter_disjoint_data(in_domain_path, ood_raw_path, ood_output_path):
    """
    Copy the out-of-domain (OOD) file to `ood_output_path`, leaving out every
    line that also occurs in the in-domain file.

    Lines are compared through `get_line_hash`, so the comparison follows that
    helper's normalization. Blank lines in either file are ignored and are
    counted neither as kept nor as removed. Kept lines are written exactly as
    they were read.

    Args:
        in_domain_path: UTF-8 text file whose lines must not appear in the output.
        ood_raw_path: UTF-8 text file to filter; it is read line by line.
        ood_output_path: destination file; overwritten if it already exists.
    """
    # Pass 1: collect the hash of every non-blank in-domain line.
    print("Loading in-domain data...")
    with open(in_domain_path, 'r', encoding='utf-8') as in_domain_file:
        blocked_hashes = {
            get_line_hash(row) for row in in_domain_file if row.strip()
        }
    print(f"Loaded {len(blocked_hashes)} unique in-domain sentences.")

    # Pass 2: stream the OOD file, writing only lines whose hash is not blocked.
    print("Filtering OOD data...")
    n_kept = 0
    n_removed = 0
    with open(ood_raw_path, 'r', encoding='utf-8') as raw_file, \
         open(ood_output_path, 'w', encoding='utf-8') as out_file:
        for row in raw_file:
            if not row.strip():
                continue
            if get_line_hash(row) in blocked_hashes:
                n_removed += 1
                continue
            out_file.write(row)
            n_kept += 1

    print(f"Done! Kept {n_kept} sentences. Removed {n_removed} overlaps.")

# Example Usage:
# filter_disjoint_data('in_domain_train.txt', 'common_crawl_raw.txt', 'ood_disjoint.txt')

In [10]:
# Sentence-length statistics (in whitespace-separated words) for the in-domain corpus.
# The encoding is explicit so the result does not depend on the machine's locale.
with open('/home/lionelhu/brainaudio/data/in_domain_ds_clean.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()

# One word count per non-empty line. Blank lines are not sentences, so they are
# excluded from max, min AND the average (dividing by len(lines) would let blank
# lines drag the mean down).
word_counts = [len(line.split()) for line in lines if line.strip()]

# default=0 keeps an empty corpus from raising or reporting `inf` as the minimum.
max_len = max(word_counts, default=0)
min_len = min(word_counts, default=0)
total_len = sum(word_counts)

# Guard the division so a file with no sentences reports 0.00 instead of crashing.
avg_len = total_len / len(word_counts) if word_counts else 0.0

print(f"The average sentence length of the in-domain corpus is {avg_len:.2f}")
print(f"The maximum sentence length of the in-domain corpus is {max_len}")
print(f"The minimum sentence length of the in-domain corpus is {min_len}")


The average sentence length of the in-domain corpus is 6.38
The maximum sentence length of the in-domain corpus is 22
The minimum sentence length of the in-domain corpus is 1


In [12]:
import spacy
import multiprocessing
from tqdm import tqdm # optional, for progress bar: pip install tqdm

# --- CONFIGURATION ---
# Paths: the raw input (noted by the author as a 20GB file) and the
# one-sentence-per-line output produced from it.
INPUT_FILE = "/data2/brain2text/lm_training_data/raw_c4.txt"
OUTPUT_FILE = "/data2/brain2text/lm_training_data/c4_formatted_sentences.txt"

# Sentences with fewer whitespace-separated words than this are dropped.
MIN_WORDS = 3
# Number of input lines handed to a worker in one task.
BATCH_SIZE = 10_000

# Per-process spaCy pipeline; stays None until init_worker() has run in that process.
nlp = None

def init_worker():
    """
    Build the spaCy pipeline for the current process and store it in the
    module-level `nlp`.

    The small English model is loaded with the components that sentence
    splitting does not need (ner, parser, lemmatizer, tagger) disabled, then
    the rule-based sentencizer is added to provide sentence boundaries.
    """
    global nlp
    model_name = "en_core_web_sm"
    skipped_components = ["ner", "parser", "lemmatizer", "tagger"]

    nlp = spacy.load(model_name, disable=skipped_components)
    # With the parser off, the sentencizer is what populates doc.sents.
    nlp.add_pipe("sentencizer")

def process_batch(lines_batch):
    """
    Split a batch of raw text lines into filtered sentences.

    Each line is run through the module-level `nlp` pipeline; every sentence it
    yields is stripped, and kept only if it is non-empty and has at least
    MIN_WORDS whitespace-separated tokens (a rough word count).

    Args:
        lines_batch: list of text lines to process.

    Returns:
        List of kept sentence strings, in the order they were produced.
    """
    kept = []

    # nlp.pipe streams the texts; batch_size here is spaCy's internal batching,
    # unrelated to the size of lines_batch.
    for parsed in nlp.pipe(lines_batch, batch_size=200):
        for span in parsed.sents:
            candidate = span.text.strip()
            # Keep only non-empty sentences that meet the minimum word count.
            if candidate and len(candidate.split()) >= MIN_WORDS:
                kept.append(candidate)

    return kept

def reformat_large_file():
    """
    Split INPUT_FILE into sentences and write them, one per line, to OUTPUT_FILE.

    The input is read lazily, grouped into batches of BATCH_SIZE non-empty
    lines, and each batch is handed to a pool worker running `process_batch`.
    Several batches are kept in flight at once so the workers actually run in
    parallel; results are written in submission order, so the output order
    matches the input order. The number of outstanding batches is capped to
    bound memory use.

    The pool is managed by a `with` block, so worker processes are cleaned up
    even if opening a file or processing a batch raises.
    """
    # Leave two cores free for the rest of the system, but always use at least one.
    num_cores = max(1, multiprocessing.cpu_count() - 2)
    # Cap on batches submitted but not yet written: enough to keep every worker
    # busy, small enough that pending input/output stays bounded.
    max_in_flight = 2 * num_cores

    with multiprocessing.Pool(processes=num_cores, initializer=init_worker) as pool:
        print(f"Processing with {num_cores} cores...")

        with open(INPUT_FILE, 'r', encoding='utf-8', errors='ignore') as f_in, \
             open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out:

            pending = []  # AsyncResult objects, oldest first
            batch = []

            # Iterating the file object reads it line by line (no full load into RAM).
            for line in tqdm(f_in):
                line = line.strip()
                if not line:
                    continue
                batch.append(line)

                if len(batch) >= BATCH_SIZE:
                    pending.append(pool.apply_async(process_batch, (batch,)))
                    batch = []

                    # Only block once the window is full; waiting on the oldest
                    # batch keeps output ordered while the others keep running.
                    if len(pending) >= max_in_flight:
                        oldest = pending.pop(0)
                        f_out.writelines(sent + "\n" for sent in oldest.get())

            # Submit the final, partially filled batch.
            if batch:
                pending.append(pool.apply_async(process_batch, (batch,)))

            # Drain everything still in flight, oldest first.
            for result in pending:
                f_out.writelines(sent + "\n" for sent in result.get())

        # All results are collected; shut the workers down cleanly.
        pool.close()
        pool.join()

    print("Done! Formatted file saved.")
    

**lionelhu@jarjar:~/brainaudio/scripts$** `wc -l /data2/brain2text/lm_training_data/c4_cleaned.txt`

$137582321$ `/data2/brain2text/lm_training_data/c4_cleaned.txt`