# Text Pre-Processing

The goal of Milestone 1 is to pre-process the text data for our word-order analysis task. We chose three languages from the Leipzig Corpora Collection (provided in the task description) and classified the corpora for the following years into two periods, Pre-ChatGPT and Post-ChatGPT:
1. English: 2018–2020 (PRE); 2023–2024 (POST)
2. German: 2019 – Nov 2022 (PRE); Dec 2022 – 2024 (POST)
3. Russian: 2013, 2019, 2020, Jan 2022 – Nov 2022 (PRE); 2024 (POST)

### Import dependencies.

In [1]:
import re
import pandas as pd
import numpy as np
from pathlib import Path
import csv

### Pre-processing

We define a function that cleans individual sentences by removing unwanted characters (URLs, HTML tags, enclosing quotation marks) and normalizing whitespace. In addition, sentences that are too short (30 characters or fewer after cleaning) are removed from the dataset.
The sentences from the "sentences" text files are joined with the dates from the "sources" metadata text files via the "inv_so" files, which map each sentence ID to its corresponding source ID.
Finally, for each corpus a dataframe is built that combines the data from the three text files into the columns "Language", "Year", "Period", "Date" and "Sentence". These dataframes are stored as TSV files.

Functions for cleaning a corpus and merging its files into a single dataframe (the per-corpus dataframes are combined into one dataframe covering all languages in the cell after this one):

In [2]:
def clean_sentence(sentence):
    """Return *sentence* with URLs, HTML tags and enclosing quotes removed.

    Steps, in order:
      1. delete URL-like tokens (anything starting with ``http`` or ``www``
         up to the next whitespace);
      2. delete HTML tags (shortest ``<...>`` spans);
      3. collapse every run of whitespace into a single space and trim;
      4. strip straight double quotes (``"``) from both ends.

    Quotes inside the sentence are kept; only leading/trailing ones go.

    Args:
        sentence: Raw sentence text.

    Returns:
        The cleaned sentence (possibly an empty string).
    """
    # Remove URLs. ``http\S+`` already covers ``https://...``.
    sentence = re.sub(r'http\S+|www\S+', '', sentence)

    # Remove HTML tags
    sentence = re.sub(r'<.*?>', '', sentence)

    # Normalize spaces *before* stripping quotes: removing a URL or tag can
    # leave whitespace after a closing quote, which would otherwise shield
    # that quote from ``strip('"')`` and leave an unbalanced quote behind.
    sentence = re.sub(r"\s+", " ", sentence).strip()

    # Remove enclosing quotation marks, then any space they were guarding
    sentence = sentence.strip('"').strip()

    return sentence

def process_corpus(base_path: Path, language: str):
    """
    Read one Leipzig-style corpus and return a cleaned, period-labelled dataframe.

    Three tab-separated files sharing the prefix of ``base_path`` are used:
    ``-sentences.txt`` (sentence_id, sentence), ``-sources.txt``
    (source_id, url, date) and ``-inv_so.txt`` (source_id, sentence_id).
    Sentences are joined to their source date through ``-inv_so.txt``.

    Period labelling:
      * rows with a parseable date: "Pre-ChatGPT" if the date is on or
        before 2022-11-30, otherwise "Post-ChatGPT";
      * rows without a parseable date, or corpora whose ``-sources.txt`` /
        ``-inv_so.txt`` file is missing: the first four-digit number in the
        file name is taken as the year ("Pre-ChatGPT" if it is below 2023,
        otherwise "Post-ChatGPT");
      * if neither a date nor a file-name year is available, ``year`` and
        ``period`` are left missing rather than guessed.

    Cleaning: rows without a sentence are dropped, each sentence is passed
    through ``clean_sentence``, and sentences of 30 characters or fewer are
    discarded. A before/after row count is printed.

    Args:
        base_path: Path to the corpus' ``*-sentences.txt`` file.
        language: Value written to the "language" column.

    Returns:
        DataFrame with the columns "language", "year", "period", "date"
        and "sentence".

    Raises:
        FileNotFoundError: If the ``-sentences.txt`` file itself is missing.
    """
    base = str(base_path).replace("-sentences.txt", "")

    # Options shared by every Leipzig file: raw TSV, no header, no quoting.
    read_opts = dict(sep="\t", header=None, quoting=csv.QUOTE_NONE,
                     encoding="utf-8-sig", on_bad_lines="skip", engine="python")

    # Year encoded in the file name; used wherever a real date is unavailable.
    year_match = re.search(r"(\d{4})", base_path.name)
    file_year = int(year_match.group(1)) if year_match else None
    if file_year is None:
        file_period = None  # nothing to base a label on
    else:
        file_period = "Pre-ChatGPT" if file_year < 2023 else "Post-ChatGPT"

    # Load core file
    sentences = pd.read_csv(f"{base}-sentences.txt",
                            names=["sentence_id", "sentence"], **read_opts)

    # Only the two metadata reads are expected to raise FileNotFoundError,
    # so keep the try body limited to them.
    try:
        sources = pd.read_csv(f"{base}-sources.txt",
                              names=["source_id", "url", "date"], **read_opts)
        inv_so = pd.read_csv(f"{base}-inv_so.txt",
                             names=["source_id", "sentence_id"], **read_opts)
    except FileNotFoundError:
        # Fall back: no metadata available
        print(f"⚠️ No date metadata for {language}. Using filename year only.")
        df = sentences.copy()  # already loaded above; no need to re-read
        df["year"] = file_year
        df["period"] = file_period
        df["date"] = None
    else:
        # Merge to attach dates
        df = (inv_so.merge(sources, on="source_id", how="left")
                    .merge(sentences, on="sentence_id", how="left"))
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
        cutoff = pd.Timestamp("2022-11-30")

        # Undated rows (NaT) must not be silently counted as Post-ChatGPT:
        # start from the file-name based label and overwrite only dated rows.
        has_date = df["date"].notna()
        df["period"] = file_period
        df.loc[has_date & (df["date"] <= cutoff), "period"] = "Pre-ChatGPT"
        df.loc[has_date & (df["date"] > cutoff), "period"] = "Post-ChatGPT"

        df["year"] = df["date"].dt.year
        if file_year is not None:
            # Keep year consistent with the period chosen for undated rows.
            df["year"] = df["year"].fillna(file_year)

    total_before = len(df)  # Track length before cleaning
    # Add language and clean
    df["language"] = language
    # .copy() so the column assignment below works on an independent frame
    df = df.dropna(subset=["sentence"]).copy()
    df["sentence"] = df["sentence"].astype(str).str.strip().apply(clean_sentence)
    df = df[df["sentence"].str.len() > 30]
    total_after = len(df)  # Tracking length after cleaning
    print(f"Processed {language}: {total_before} sentences before cleaning, {total_after} after cleaning.")

    # Return an independent frame so callers can add/modify columns safely.
    return df[["language", "year", "period", "date", "sentence"]].copy()

In [3]:
# --- Auto-discover and process all datasets ---
# Expected layout: data/raw/<language>/**/<corpus>-sentences.txt
# Each corpus is written to data/clean/<language>_<year>.tsv and all of them
# are concatenated into data/clean/all_languages_clean.tsv.
base_dir = Path("data/raw")
output_dir = Path("data/clean")
output_dir.mkdir(parents=True, exist_ok=True)

all_dfs = []

# Directory listing order is filesystem-dependent; sort so that the
# processing order (and the row order of the merged file) is reproducible.
for lang_dir in sorted(base_dir.iterdir()):
    if not lang_dir.is_dir():  # only sub-folders, e.g. english, german, russian
        continue

    language = lang_dir.name.capitalize()
    print(f"\n Processing language: {language}")

    for file in sorted(lang_dir.rglob("*-sentences.txt")):
        print(f"Processing {file.name}...")

        df = process_corpus(file, language)

        # Nullable integer dtype so missing years stay <NA> instead of
        # forcing the column to float; assign() avoids mutating a slice.
        df = df.assign(year=df["year"].astype("Int64"))

        # Output file name based on the most frequent year, or a fallback
        years = df["year"].dropna()
        year = int(years.mode()[0]) if not years.empty else "unknown"

        # NOTE: the name depends only on language and year, so two corpora of
        # the same language with the same dominant year overwrite each other.
        out_path = output_dir / f"{language.lower()}_{year}.tsv"
        df.to_csv(out_path, sep="\t", index=False)
        all_dfs.append(df)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that points at the actual problem instead.
if not all_dfs:
    raise FileNotFoundError(f"No '*-sentences.txt' files found under {base_dir}")

# Combine everything
merged = pd.concat(all_dfs, ignore_index=True)
merged.to_csv(output_dir / "all_languages_clean.tsv", index=False, sep="\t")

print("\n All corpora processed and saved in 'data/clean/'")


 Processing language: English
Processing eng_news_2020_100K-sentences.txt...
Processed English: 100000 sentences before cleaning, 96901 after cleaning.
Processing eng_news_2024_100K-sentences.txt...
Processed English: 100000 sentences before cleaning, 97382 after cleaning.

 Processing language: German
Processing deu_news_2024_100K-sentences.txt...
Processed German: 100000 sentences before cleaning, 98165 after cleaning.

 Processing language: Russian
Processing rus_news_2020_100K-sentences.txt...
Processed Russian: 100000 sentences before cleaning, 96429 after cleaning.
Processing rus_news_2024_100K-sentences.txt...
Processed Russian: 100000 sentences before cleaning, 97646 after cleaning.

 All corpora processed and saved in 'data/clean/'
