In [2]:
import re
import os
from xml.etree import ElementTree as ET

def parse_reuters_document(text):
    """Return every ``<REUTERS ...>...</REUTERS>`` span found in *text*.

    Each list item is the full matched string, opening and closing tag
    included, in the order the spans occur. An opening tag with no closing
    tag after it yields nothing.
    """
    # DOTALL lets '.' cross newlines, since one article spans many lines;
    # the lazy '.*?' stops at the first closing tag so articles stay separate.
    article_re = re.compile(r'<REUTERS.*?</REUTERS>', re.DOTALL)
    return [found.group(0) for found in article_re.finditer(text)]

def clean_xml(xml_text):
    """Strip numeric character references that would break or pollute XML parsing.

    A reference (decimal ``&#3;`` or hexadecimal ``&#x1F;``) is removed when
    the code point it names is a control character other than tab, line feed
    or carriage return, or lies outside the XML 1.0 character range.
    References to ordinary characters (for example ``&#38;`` for ``&``) are
    left in place so the parser can resolve them instead of the text being
    silently lost.

    Args:
        xml_text: Markup text that may contain numeric character references.

    Returns:
        The text with the offending references removed.
    """
    def _is_keepable(code):
        # Whitespace controls that XML 1.0 explicitly allows.
        if code in (0x9, 0xA, 0xD):
            return True
        # Printable ranges of the XML 1.0 Char production; C0 controls, DEL,
        # C1 controls, surrogates and U+FFFE/U+FFFF are all excluded.
        return (
            0x20 <= code <= 0x7E
            or 0xA0 <= code <= 0xD7FF
            or 0xE000 <= code <= 0xFFFD
            or 0x10000 <= code <= 0x10FFFF
        )

    def _filter_reference(match):
        digits = match.group(1)
        try:
            if digits[0] in 'xX':
                code = int(digits[1:], 16)
            else:
                code = int(digits)
        except ValueError:
            # Absurdly long digit runs cannot name a real character.
            return ''
        return match.group(0) if _is_keepable(code) else ''

    return re.sub(r'&#([xX][0-9A-Fa-f]+|[0-9]+);', _filter_reference, xml_text)

def extract_fields(reuters_xml):
    """Parse one ``<REUTERS>`` article string into a dict of its fields.

    The text is passed through ``clean_xml`` and parsed with ElementTree.
    If parsing fails, the parser's message is printed and ``None`` is
    returned.

    The returned dict has these keys, in this order:
    ``newid`` (the root's ``NEWID`` attribute, or ``None`` if absent),
    ``date``, ``title``, ``dateline``, ``body`` (stripped strings, ``''``
    when the element is missing) and ``topics``, ``places``, ``people``,
    ``orgs``, ``companies`` (lists of the non-empty ``<D>`` texts under the
    matching element, ``[]`` when the element is missing).
    """
    try:
        root = ET.fromstring(clean_xml(reuters_xml))
    except ET.ParseError as e:
        print(f"Parse error: {e}")
        return None

    # TITLE, DATELINE and BODY are looked up under <TEXT> only.
    text_elem = root.find('TEXT')

    def text_field(tag):
        if text_elem is None:
            return ''
        return text_elem.findtext(tag, '').strip()

    def d_values(tag):
        # Texts of the <D> children of the root's <tag> child, skipping
        # children with no text.
        container = root.find(tag)
        if container is None:
            return []
        return [item.text for item in container.findall('D') if item.text]

    return {
        'newid': root.get('NEWID'),
        'date': root.findtext('DATE', '').strip(),
        'title': text_field('TITLE'),
        'dateline': text_field('DATELINE'),
        'body': text_field('BODY'),
        'topics': d_values('TOPICS'),
        'places': d_values('PLACES'),
        'people': d_values('PEOPLE'),
        'orgs': d_values('ORGS'),
        'companies': d_values('COMPANIES'),
    }

# main execution
data_path = r"data"

# Raw <REUTERS>...</REUTERS> strings collected from every file read below.
all_documents = []

# find all sgm files
# os.listdir() returns names in arbitrary order, so sort them: otherwise the
# [:3] slice below could select different files from one run to the next.
sgm_files = sorted(f for f in os.listdir(data_path) if f.endswith('.sgm'))
print(f"Found {len(sgm_files)} SGM files\n")

# read each file (only the first three are loaded)
for filename in sgm_files[:3]:
    filepath = os.path.join(data_path, filename)
    print(f"Reading {filename}...")

    # NOTE(review): errors='ignore' silently drops any byte that is not valid
    # UTF-8 -- confirm the corpus encoding (possibly latin-1) before relying
    # on non-ASCII text.
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        content = f.read()

    docs = parse_reuters_document(content)
    all_documents.extend(docs)
    print(f"  Found {len(docs)} documents")

print(f"\nTotal documents: {len(all_documents)}")
print("=" * 80)

# show sample documents
print("\nSample Documents:")

# (printed label, doc key) pairs, in the order the category lines appear.
category_lines = (
    ("Places", 'places'),
    ("Topics", 'topics'),
    ("People", 'people'),
    ("Organizations", 'orgs'),
    ("Companies", 'companies'),
)

for index, doc_xml in enumerate(all_documents[:5]):
    doc = extract_fields(doc_xml)
    if doc is None:
        # extract_fields() has already printed the parse error.
        continue

    print(f"\n[Document {index+1}] ID: {doc['newid']}")
    print("-" * 80)
    print(f"Date: {doc['date']}")
    print(f"Title: {doc['title']}")
    print(f"Dateline: {doc['dateline']}")

    # Cut long bodies to 150 characters and mark the cut with an ellipsis.
    body_text = doc['body']
    if len(body_text) > 150:
        body_preview = body_text[:150] + "..."
    else:
        body_preview = body_text
    print(f"Body: {body_preview}")

    # Category lines are printed only when the list is non-empty.
    for label, key in category_lines:
        values = doc[key]
        if values:
            print(f"{label}: {', '.join(values)}")

print("\n" + "=" * 80)

# calculate statistics
print("\nDataset Statistics:")
print("-" * 80)

# Parse every collected article; extract_fields() returns None for an article
# that fails to parse, and those are left out of the statistics.
all_docs = [extract_fields(doc_xml) for doc_xml in all_documents]
all_docs = [d for d in all_docs if d is not None]

# Number of documents whose given field is non-empty.
docs_with_topics = sum(1 for d in all_docs if d['topics'])
docs_with_places = sum(1 for d in all_docs if d['places'])
docs_with_people = sum(1 for d in all_docs if d['people'])
docs_with_orgs = sum(1 for d in all_docs if d['orgs'])
docs_with_companies = sum(1 for d in all_docs if d['companies'])
docs_with_body = sum(1 for d in all_docs if d['body'])

total = len(all_docs)
# With no parsed documents every count is 0; dividing by 1 instead of 0 then
# reports 0.0% rather than raising ZeroDivisionError.
denominator = total or 1
print(f"Documents with body text: {docs_with_body}/{total} ({100*docs_with_body/denominator:.1f}%)")
print(f"Documents with places: {docs_with_places}/{total} ({100*docs_with_places/denominator:.1f}%)")
print(f"Documents with topics: {docs_with_topics}/{total} ({100*docs_with_topics/denominator:.1f}%)")
print(f"Documents with people: {docs_with_people}/{total} ({100*docs_with_people/denominator:.1f}%)")
print(f"Documents with organizations: {docs_with_orgs}/{total} ({100*docs_with_orgs/denominator:.1f}%)")
print(f"Documents with companies: {docs_with_companies}/{total} ({100*docs_with_companies/denominator:.1f}%)")

# show unique values
all_places = set()
all_topics = set()
for d in all_docs:
    all_places.update(d['places'])
    all_topics.update(d['topics'])

# Only the first 20 places (alphabetically) are listed; the rest are counted.
print(f"\nUnique places ({len(all_places)}): {sorted(all_places)[:20]}")
if len(all_places) > 20:
    print(f"  ... and {len(all_places) - 20} more")

print(f"\nUnique topics ({len(all_topics)}): {sorted(all_topics)}")

# important fields summary
# One entry per printed line; print() joins them with newlines.
summary_lines = (
    "\n" + "=" * 80,
    "\nKey Fields for the Project:",
    "-" * 80,
    "Title - for autocomplete and search",
    "Body - main content for text analysis",
    "Date - temporal information (needs parsing)",
    "Places - geographic tags (need geocoding to lat/lon)",
    "Dateline - contains city names (extra location info)",
    "Topics - subject categories",
)
print(*summary_lines, sep="\n")


Found 22 SGM files

Reading reut2-000.sgm...
  Found 1000 documents
Reading reut2-001.sgm...
  Found 1000 documents
Reading reut2-002.sgm...
  Found 1000 documents

Total documents: 3000

Sample Documents:

[Document 1] ID: 1
--------------------------------------------------------------------------------
Date: 26-FEB-1987 15:01:01.79
Title: BAHIA COCOA REVIEW
Dateline: SALVADOR, Feb 26 -
Body: Showers continued throughout the week in
the Bahia cocoa zone, alleviating the drought since early
January and improving prospects for the coming temp...
Places: el-salvador, usa, uruguay
Topics: cocoa

[Document 2] ID: 2
--------------------------------------------------------------------------------
Date: 26-FEB-1987 15:02:20.00
Title: STANDARD OIL <SRD> TO FORM FINANCIAL UNIT
Dateline: CLEVELAND, Feb 26 -
Body: Standard Oil Co and BP North America
Inc said they plan to form a venture to manage the money market
borrowing and investment activities of both compa...
Places: usa

[Document 3] ID: 