This script performs the complete text-processing pipeline for our project:
1. DATA CLEANING:
   - Loads raw News and Opinion CSV files.
   - Keeps only the 'title' and 'section' columns.
   - Cleans titles by lowercasing, removing every character other than a-z and whitespace (punctuation, digits), and tokenizing.
   - Removes English stopwords and lemmatizes remaining words.
   - Saves cleaned News and Opinion datasets as separate CSV files.


In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Read the two raw exports, keeping only the headline text and its section
# label from each one.
news = pd.read_csv(r"../DATA/News_Uncleaned.csv").loc[:, ['title', 'section']]
opinion = pd.read_csv(r"../DATA/Opinion_Uncleaned.csv").loc[:, ['title', 'section']]

# Stack both sources into one frame with a fresh 0..n-1 index so the cleaning
# steps below run once over everything.
df = pd.concat((news, opinion), ignore_index=True)

# Clean titles
def clean_title(text):
    """Normalise a headline before tokenisation.

    Lowercases the text and deletes every character that is not an ASCII
    letter ``a-z`` or whitespace, so punctuation, digits and accented
    letters are all dropped. Whitespace is left as-is.

    Args:
        text: Raw title. Any non-string value (``None``, the float ``NaN``
            pandas uses for empty CSV cells, ...) is treated as an empty
            title.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # Missing CSV cells arrive as float NaN, which has no .lower(); map them
    # to an empty title instead of aborting the whole run.
    if not isinstance(text, str):
        return ''
    return re.sub(r'[^a-z\s]', '', text.lower())

# Normalise every headline, then split it on whitespace into word tokens.
df['clean_title'] = df['title'].map(clean_title)
df['tokens'] = df['clean_title'].str.split()

# Stopwords: fetch the NLTK corpus and keep the English list as a set so the
# per-token membership test is a hash lookup.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in the English stopword set."""
    return [tok for tok in tokens if tok not in stop_words]


df['tokens'] = df['tokens'].map(_without_stopwords)

# Lemmatization: download the WordNet data (and the omw-1.4 package) used by
# WordNetLemmatizer, then lemmatize each remaining token.
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()


def _lemmatized(tokens):
    """Return the tokens with each one passed through the lemmatizer."""
    return list(map(lemmatizer.lemmatize, tokens))


df['tokens'] = df['tokens'].map(_lemmatized)

# Split the combined frame back out by section label and write one CSV per
# section into the DATA folder (row index is not written).
news_clean = df[df['section'].eq('News')].copy()
opinion_clean = df[df['section'].eq('Opinion')].copy()

news_out = r"../DATA/News_cleaned.csv"
opinion_out = r"../DATA/Opinion_cleaned.csv"
news_clean.to_csv(news_out, index=False)
opinion_clean.to_csv(opinion_out, index=False)

print("Saved files:")
for saved_path in (news_out, opinion_out):
    print(f" - {saved_path}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stangutur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\stangutur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\stangutur\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Saved files:
 - ../DATA/News_cleaned.csv
 - ../DATA/Opinion_cleaned.csv
