In [8]:
# Cell 0: Set working directory to project root

import os

# Change to project root (one level up from notebooks/)
os.chdir("..")

print(f"Working directory: {os.getcwd()}")

Working directory: c:\Users\arabe\nlp-pikogpt-funkyai


In [3]:
# Cell 1: Test Setup
print("Testing imports...")

from datasets import load_dataset, load_from_disk, Dataset
print("✓ datasets library works")

from langdetect import detect
import pandas as pd
import re
from collections import Counter

print("✓ All imports successful")
print("\nReady to start EDA!")

Testing imports...
✓ datasets library works
✓ All imports successful

Ready to start EDA!


In [4]:
# Cell 2: Load OpenWebText from HuggingFace (Optimized)

print("Loading OpenWebText from HuggingFace...")
print("Using streaming to avoid downloading full 60GB dataset\n")

dataset_stream = load_dataset(
    "Skylion007/openwebtext",
    split="train",
    streaming=True
)

print("Collecting 10,000 samples for EDA...")
SAMPLE_SIZE = 10000

samples = []
for i, sample in enumerate(dataset_stream):
    if i >= SAMPLE_SIZE:
        break
    samples.append(sample)
    if i % 2000 == 0:
        print(f"  Collected {i:,} / {SAMPLE_SIZE:,} samples...")

subset = Dataset.from_list(samples)

print(f"\n✓ Loaded successfully!")
print(f"Subset size: {len(subset):,} documents")
print(f"\nExample document (first 300 chars):")
print(subset[0]['text'][:300])

Loading OpenWebText from HuggingFace...
Using streaming to avoid downloading full 60GB dataset





Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/80 [00:00<?, ?it/s]

Collecting 10,000 samples for EDA...
  Collected 0 / 10,000 samples...
  Collected 2,000 / 10,000 samples...
  Collected 4,000 / 10,000 samples...
  Collected 6,000 / 10,000 samples...
  Collected 8,000 / 10,000 samples...

✓ Loaded successfully!
Subset size: 10,000 documents

Example document (first 300 chars):
Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched doctors and nurses walk away from a field hospital Friday night after a Belgian medical team evacuated the area, saying it was concerned about security.

The decision left CNN Chief Medical Corresponden


In [9]:
# Cell 3: Load the NLP26 Test Split

from datasets import load_from_disk

print("Loading NLP26 test split from disk...")

TEST_DATA_PATH = "data/raw/NLP26_OWT_eval/test"

try:
    test_dataset = load_from_disk(TEST_DATA_PATH)
    print(f"✓ Test dataset loaded!")
    print(f"Test dataset size: {len(test_dataset):,} documents")
    print(f"\nSample test document (first 300 chars):")
    print(test_dataset[0]['text'][:300])
except Exception as e:
    print(f"✗ Error loading test data: {e}")
    print(f"\nMake sure you downloaded the test folder to: {TEST_DATA_PATH}")

Loading NLP26 test split from disk...
✓ Test dataset loaded!
Test dataset size: 400,689 documents

Sample test document (first 300 chars):
Image caption A fragment of an iron vessel was found during the excavation

Archaeologists say they have confirmed the location of a meeting place of a medieval Norse parliament.

Called a "thing", evidence of the mound was uncovered during excavations of Dingwall's Cromartie Memorial car park.

Whe


In [10]:
# Cell 4: Compare training and test data structure

print("=" * 60)
print("TRAINING DATA (OpenWebText subset)")
print("=" * 60)
print(f"Number of samples: {len(subset):,}")
print(f"Features: {subset.features}")
print(f"\nExample document:\n{subset[0]['text'][:500]}...")

print("\n" + "=" * 60)
print("TEST DATA (NLP26 split - DO NOT TRAIN ON THIS)")
print("=" * 60)
print(f"Number of samples: {len(test_dataset):,}")
print(f"Features: {test_dataset.features}")
print(f"\nExample document:\n{test_dataset[0]['text'][:500]}...")

TRAINING DATA (OpenWebText subset)
Number of samples: 10,000
Features: {'text': Value('string')}

Example document:
Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched doctors and nurses walk away from a field hospital Friday night after a Belgian medical team evacuated the area, saying it was concerned about security.

The decision left CNN Chief Medical Correspondent Sanjay Gupta as the only doctor at the hospital to get the patients through the night.

CNN initially reported, based on conversations with some of the doctors, that the United Nations ordered the B...

TEST DATA (NLP26 split - DO NOT TRAIN ON THIS)
Number of samples: 400,689
Features: {'text': Value('string')}

Example document:
Image caption A fragment of an iron vessel was found during the excavation

Archaeologists say they have confirmed the location of a meeting place of a medieval Norse parliament.

Called a "thing", evidence of the mound was uncovered during excavatio

In [11]:
# Cell 5: Document Length Analysis

print("Analyzing document lengths...\n")

lengths = [len(doc['text']) for doc in subset]

print("Document Length Statistics:")
print(f"  Min:    {min(lengths):,} chars")
print(f"  Max:    {max(lengths):,} chars")
print(f"  Mean:   {sum(lengths)/len(lengths):,.0f} chars")
print(f"  Median: {sorted(lengths)[len(lengths)//2]:,} chars")

# Distribution buckets
print("\nLength Distribution:")
buckets = [0, 100, 500, 1000, 5000, 10000, 50000, float('inf')]
bucket_names = ['0-100', '100-500', '500-1K', '1K-5K', '5K-10K', '10K-50K', '50K+']

for i in range(len(buckets)-1):
    count = sum(1 for l in lengths if buckets[i] <= l < buckets[i+1])
    pct = count / len(lengths) * 100
    print(f"  {bucket_names[i]:>10}: {count:>5} ({pct:>5.1f}%)")

Analyzing document lengths...

Document Length Statistics:
  Min:    658 chars
  Max:    100,000 chars
  Mean:   4,859 chars
  Median: 3,169 chars

Length Distribution:
       0-100:     0 (  0.0%)
     100-500:     0 (  0.0%)
      500-1K:   580 (  5.8%)
       1K-5K:  6553 ( 65.5%)
      5K-10K:  2006 ( 20.1%)
     10K-50K:   828 (  8.3%)
        50K+:    33 (  0.3%)


In [12]:
# Cell 6: Language Detection

from langdetect import detect, LangDetectException

def detect_language_safe(text):
    """Detect language with error handling."""
    try:
        return detect(text[:1000])  # Use first 1000 chars for speed
    except LangDetectException:
        return "unknown"
    except:
        return "error"

print("Detecting languages (this takes 1-2 minutes)...\n")

# Check a sample of documents
LANG_SAMPLE_SIZE = 1000
languages = []

for i in range(LANG_SAMPLE_SIZE):
    lang = detect_language_safe(subset[i]['text'])
    languages.append(lang)
    if (i + 1) % 250 == 0:
        print(f"  Processed {i + 1}/{LANG_SAMPLE_SIZE}...")

# Count results
from collections import Counter
lang_counts = Counter(languages)

print("\nLanguage Distribution:")
for lang, count in lang_counts.most_common(10):
    pct = count / len(languages) * 100
    print(f"  {lang:>10}: {count:>5} ({pct:>5.1f}%)")

non_english = sum(c for l, c in lang_counts.items() if l != 'en')
print(f"\nNon-English content: {non_english}/{len(languages)} ({non_english/len(languages)*100:.1f}%)")
print("^ This content needs to be filtered out")

Detecting languages (this takes 1-2 minutes)...

  Processed 250/1000...
  Processed 500/1000...
  Processed 750/1000...
  Processed 1000/1000...

Language Distribution:
          en:   997 ( 99.7%)
          nl:     1 (  0.1%)
          es:     1 (  0.1%)
          de:     1 (  0.1%)

Non-English content: 3/1000 (0.3%)
^ This content needs to be filtered out


In [13]:
# Cell 7: HTML Detection

import re

def has_html(text):
    """Check if text contains HTML tags."""
    return bool(re.search(r'<[^>]+>', text))

def find_html_tags(text):
    """Find all HTML tags in text."""
    pattern = r'<([a-zA-Z][a-zA-Z0-9]*)\b[^>]*>'
    return re.findall(pattern, text)

print("Checking for HTML content...\n")

html_count = 0
all_tags = []

for doc in subset:
    text = doc['text']
    if has_html(text):
        html_count += 1
        tags = find_html_tags(text)
        all_tags.extend(tags)

print(f"Documents containing HTML: {html_count}/{len(subset)} ({html_count/len(subset)*100:.1f}%)")

if all_tags:
    print("\nMost common HTML tags:")
    tag_counts = Counter(all_tags)
    for tag, count in tag_counts.most_common(10):
        print(f"  <{tag}>: {count}")

# Show example if found
if html_count > 0:
    print("\n" + "=" * 60)
    print("EXAMPLE DOCUMENT WITH HTML:")
    print("=" * 60)
    for doc in subset:
        if has_html(doc['text']):
            print(doc['text'][:600])
            break

Checking for HTML content...

Documents containing HTML: 131/10000 (1.3%)

Most common HTML tags:
  <span>: 111
  <div>: 63
  <br>: 60
  <string>: 57
  <T>: 44
  <a>: 44
  <TestObject>: 37
  <char>: 26
  <key>: 22
  <typename>: 14

EXAMPLE DOCUMENT WITH HTML:
Over the years, I've learned to be cautious with C++ pointers. In particular, I'm always very careful about who owns a given pointer, and who's in charge of calling delete on it. But my caution often forces me to write deliberately inefficient functions. For example:

vector < string > tokenize_string ( const string & text );

Here, we have a large string text , and we want to split it into a vector of tokens. This function is nice and safe, but it allocates one string for every token in the input. Now, if we were feeling reckless, we could avoid these allocations by returning a vector of poin


In [14]:
# Cell 8: URL and Code Detection

import re

def count_urls(text):
    """Count URLs in text."""
    pattern = r'https?://\S+|www\.\S+'
    return len(re.findall(pattern, text))

def contains_code(text):
    """Detect if text contains code."""
    code_patterns = [
        r'```',                          # Markdown code blocks
        r'def \w+\s*\(',                 # Python function
        r'function\s+\w+\s*\(',          # JavaScript function
        r'class\s+\w+\s*[:\{]',          # Class definition
        r'import\s+[\w.]+',              # Import statement
        r'#include\s*<',                 # C/C++ include
    ]
    for pattern in code_patterns:
        if re.search(pattern, text):
            return True
    return False

print("Checking for URLs and code...\n")

docs_with_urls = 0
docs_with_code = 0
total_urls = 0

for doc in subset:
    text = doc['text']
    urls = count_urls(text)
    if urls > 0:
        docs_with_urls += 1
        total_urls += urls
    if contains_code(text):
        docs_with_code += 1

print(f"Documents with URLs:  {docs_with_urls}/{len(subset)} ({docs_with_urls/len(subset)*100:.1f}%)")
print(f"Documents with code:  {docs_with_code}/{len(subset)} ({docs_with_code/len(subset)*100:.1f}%)")
print(f"Total URLs found:     {total_urls}")

# Show example with code if found
if docs_with_code > 0:
    print("\n" + "=" * 60)
    print("EXAMPLE DOCUMENT WITH CODE:")
    print("=" * 60)
    for doc in subset:
        if contains_code(doc['text']):
            print(doc['text'][:800])
            break

Checking for URLs and code...

Documents with URLs:  724/10000 (7.2%)
Documents with code:  113/10000 (1.1%)
Total URLs found:     1707

EXAMPLE DOCUMENT WITH CODE:
NOTE: this website was build using Miraj. The source code is available at: miraj-project/homepage . Many other simple examples with commented code are available at miraj-project/demos/hello-world

Components can also be easily defined as one-off elements for use in a single page. Both page and components can be defined in the same project.

Miraj also makes it very easy to define and share component libraries. Multiple components may be defined across multiple namespaces; a deflibrary macro then assembles any combination of components into a library namespace, which is independent of the defining namespaces. Miraj can automatically generate a demo page for previewing/testing components under development.

Things get a little more complicated when you add web components. Miraj allows the pr


In [15]:
# Cell 9: Special Characters and Quality Issues

import re

def find_issues(text):
    """Find various text quality issues."""
    issues = []
    
    if '�' in text:
        issues.append('replacement_char')
    if re.search(r'(.)\1{15,}', text):
        issues.append('repeated_chars')
    if re.search(r'\n{5,}', text):
        issues.append('excessive_newlines')
    
    return issues

print("Checking for quality issues...\n")

issue_counts = Counter()
problematic_docs = 0

for doc in subset:
    issues = find_issues(doc['text'])
    if issues:
        problematic_docs += 1
        issue_counts.update(issues)

print(f"Documents with issues: {problematic_docs}/{len(subset)} ({problematic_docs/len(subset)*100:.1f}%)")

if issue_counts:
    print("\nIssue breakdown:")
    for issue, count in issue_counts.most_common():
        print(f"  {issue}: {count}")

Checking for quality issues...

Documents with issues: 100/10000 (1.0%)

Issue breakdown:
  repeated_chars: 85
  replacement_char: 15


In [16]:
# Cell 10: EDA Summary

summary = """
================================================================================
                           EDA FINDINGS SUMMARY
================================================================================

DATASET OVERVIEW
----------------
- Training data (OpenWebText): ~8 million documents total, using 10K sample for EDA
- Test data (NLP26 split): 400,689 documents - MUST BE EXCLUDED FROM TRAINING

DOCUMENT LENGTH
---------------
- Min: 658 chars | Max: 100,000 chars | Median: 3,169 chars
- Most documents (65.5%) are between 1K-5K characters
- No documents under 500 characters in this sample

LANGUAGE DISTRIBUTION
---------------------
- English: 99.7%
- Non-English: 0.3% (Dutch, Spanish, German detected)
- Action: Filter non-English content using langdetect

HTML CONTENT
------------
- Documents with HTML: 1.3%
- Common tags: <span>, <div>, <br>, <a>
- Action: Remove HTML tags with regex

URLs
----
- Documents with URLs: 7.2%
- Total URLs found: 1,707 in 10K sample
- Action: Remove URLs with regex

CODE SNIPPETS
-------------
- Documents with code: 1.1%
- Types: Python, JavaScript, XML/HTML code examples
- Action: Remove markdown code blocks, consider filtering heavy-code docs

QUALITY ISSUES
--------------
- Documents with issues: 1.0%
- Repeated characters: 85 cases
- Replacement characters (�): 15 cases
- Action: Clean or filter problematic documents

================================================================================
                         PREPROCESSING PIPELINE
================================================================================

Based on these findings, the preprocessing script should:

1. LANGUAGE FILTER    → Keep only English (langdetect == 'en')
2. TEST SET FILTER    → Remove sentences appearing in NLP26 test split (CRITICAL!)
3. HTML REMOVAL       → Strip all HTML tags
4. URL REMOVAL        → Remove http://, https://, www. links
5. CODE REMOVAL       → Remove markdown code blocks (```)
6. SPECIAL CHARS      → Remove replacement characters (�), normalize whitespace
7. QUALITY FILTER     → Remove docs with excessive repetition
8. LENGTH FILTER      → Consider minimum length threshold (e.g., 100 chars after cleaning)

================================================================================
"""

print(summary)


                           EDA FINDINGS SUMMARY

DATASET OVERVIEW
----------------
- Training data (OpenWebText): ~8 million documents total, using 10K sample for EDA
- Test data (NLP26 split): 400,689 documents - MUST BE EXCLUDED FROM TRAINING

DOCUMENT LENGTH
---------------
- Min: 658 chars | Max: 100,000 chars | Median: 3,169 chars
- Most documents (65.5%) are between 1K-5K characters
- No documents under 500 characters in this sample

LANGUAGE DISTRIBUTION
---------------------
- English: 99.7%
- Non-English: 0.3% (Dutch, Spanish, German detected)
- Action: Filter non-English content using langdetect

HTML CONTENT
------------
- Documents with HTML: 1.3%
- Common tags: <span>, <div>, <br>, <a>
- Action: Remove HTML tags with regex

URLs
----
- Documents with URLs: 7.2%
- Total URLs found: 1,707 in 10K sample
- Action: Remove URLs with regex

CODE SNIPPETS
-------------
- Documents with code: 1.1%
- Types: Python, JavaScript, XML/HTML code examples
- Action: Remove markdown code b