In [2]:
import pandas as pd
import re
from urllib.parse import urlparse

# Location of the tab-separated news dataset
file_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/news.tsv'

# Load the raw rows; header=None because the file has no header line
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
)

# Column labels, in file order, attached to the freshly loaded frame
header = [
    "News ID", "Category", "Subcategory", "Title",
    "Abstract", "URL", "Entities Mentioned", "Entities in Abstract",
]
df.columns = header

# 1. Handle Missing Data (Drop for Critical Fields)
# -------------------------------------------------
# A row is unusable without a title, an abstract and a URL, so drop it
df.dropna(subset=['Title', 'Abstract', 'URL'], inplace=True)

# 2. Impute Missing Values for Entities and Category Fields
# ---------------------------------------------------------
# Every remaining gap in these columns is filled with the 'unknown' placeholder
for column_to_fill in (
    'Entities Mentioned',
    'Entities in Abstract',
    'Category',
    'Subcategory',
):
    df[column_to_fill] = df[column_to_fill].fillna('unknown')

# 3. Convert All Text to Lowercase
# --------------------------------
for column_to_lower in ('Title', 'Abstract', 'Category', 'Subcategory'):
    df[column_to_lower] = df[column_to_lower].str.lower()

# 4. Clean 'Subcategory' to Remove Numbers, Spaces, and Special Characters
# ------------------------------------------------------------------------
def clean_subcategory(subcategory):
    """Return ``subcategory`` with every character outside A-Z / a-z removed.

    Digits, whitespace, punctuation and non-ASCII letters are all dropped;
    the remaining ASCII letters keep their original order and case.
    """
    return re.sub(r'[^a-zA-Z]', '', subcategory)

# Strip the non-letter characters from every subcategory label
df['Subcategory'] = (
    df['Subcategory']
    .apply(clean_subcategory)
)

# 5. Collapse Runs of Whitespace in Title and Abstract to a Single Space
# ----------------------------------------------------------------------
def clean_text(text):
    """Normalize whitespace in ``text``.

    Each run of whitespace (spaces, tabs, newlines, ...) is collapsed to one
    space, and leading/trailing whitespace is removed from the result.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Normalize whitespace in both free-text columns
for text_column in ('Title', 'Abstract'):
    df[text_column] = df[text_column].apply(clean_text)

# 6. Validate URLs
# ----------------
def validate_url(url):
    """Return True when ``url`` is an absolute http(s) URL that names a host.

    Args:
        url: Candidate URL. Non-string values (e.g. ``None`` or NaN) are
            reported as invalid instead of raising.

    Returns:
        bool: True if the scheme is ``http`` or ``https`` and a network
        location is present; False otherwise, including for strings that
        ``urlparse`` itself rejects.
    """
    if not isinstance(url, str):
        return False
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError for a malformed authority, such as an
        # unbalanced IPv6 bracket; one bad cell must not abort the whole run.
        return False
    # A scheme alone is not enough: "https://" has no host to fetch from.
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)

# Keep only the rows whose URL passes validation. The explicit .copy() makes
# the filtered frame an independent object, so the column assignments and the
# in-place replace performed later in this script do not trigger pandas'
# SettingWithCopyWarning.
df = df[df['URL'].apply(validate_url)].copy()

# 7. Format 'Entities Mentioned' and 'Entities in Abstract' as Lists
# ------------------------------------------------------------------
import json

def convert_to_list(entity_column):
    """Parse one JSON-encoded entity cell into a Python list.

    Args:
        entity_column: The value of a single cell (despite the name, not a
            whole column); expected to be a JSON array string or the
            placeholder ``"unknown"``.

    Returns:
        list: The decoded list when the cell holds a JSON array;
        ``["unknown"]`` when the cell is exactly ``"unknown"``; ``[]`` for
        anything else, including unparseable text, non-string values and
        valid JSON that is not an array.
    """
    try:
        parsed = json.loads(entity_column)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError; TypeError covers inputs
        # that are not str/bytes (None, NaN, ...).
        return ["unknown"] if entity_column == "unknown" else []
    # json.loads also accepts scalars and objects ("123", '{"a": 1}');
    # callers rely on always getting a list back.
    return parsed if isinstance(parsed, list) else []

# Parse the JSON text in both entity columns into Python lists
for entity_field in ('Entities Mentioned', 'Entities in Abstract'):
    df[entity_field] = df[entity_field].apply(convert_to_list)

# 8. Final Check for Missing Values and Data Cleaning
# ---------------------------------------------------
# Blank strings (and any literal "[]" strings) become missing values.
df = df.replace({"": pd.NA, "[]": pd.NA})

# The entity columns were parsed into Python lists in step 7, so an empty
# entity list is the object [] rather than the string "[]" and is not matched
# by the replacement above; mark those cells as missing explicitly.
for entity_column_name in ('Entities Mentioned', 'Entities in Abstract'):
    df[entity_column_name] = df[entity_column_name].apply(
        lambda cell: pd.NA if isinstance(cell, list) and not cell else cell
    )

# 9. Save the Cleaned Data
# -------------------------
# Destination CSV for the cleaned frame; the DataFrame index is not written.
# NOTE(review): list-valued cells are written in their Python string form,
# not as JSON — confirm downstream readers expect that.
new_file_path = '/Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/News_cleaned.csv'
df.to_csv(
    new_file_path,
    index=False,
)

print(f"Cleaned dataset saved to {new_file_path}")

Cleaned dataset saved to /Users/n7/Desktop/ie University SAMBD Acadamics/Capstone Project/Data/MINDlarge_train/News_cleaned.csv
