In [1]:
import pandas as pd
import numpy as np
import re
import string
from pathlib import Path

In [2]:
# Paths
RAW_DATA_DIR = Path("../data/raw")
PROCESSED_DATA_DIR = Path("../data/processed")
PROCESSED_DATA_DIR.mkdir(exist_ok=True)

In [3]:
# Text cleaning function
def clean_news_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [4]:
# Load and clean
news_file = RAW_DATA_DIR / "news_data.csv"

if news_file.exists():
    news_df = pd.read_csv(news_file)
    print(f"Original shape: {news_df.shape}")

    # Drop duplicates by title + published date
    if 'title' in news_df.columns and 'publishedAt' in news_df.columns:
        news_df = news_df.drop_duplicates(subset=['title', 'publishedAt'], keep='first')

    # Drop rows with missing titles
    news_df = news_df.dropna(subset=['title'])

    # Clean title and description
    news_df['clean_title'] = news_df['title'].apply(clean_news_text)
    if 'description' in news_df.columns:
        news_df['clean_description'] = news_df['description'].apply(clean_news_text)
    else:
        news_df['clean_description'] = ""

    # Save cleaned news
    output_path = PROCESSED_DATA_DIR / "news_data_clean.csv"
    news_df.to_csv(output_path, index=False)
    print(f"Saved cleaned news to {output_path}")
else:
    print("news_data.csv not found")

Original shape: (1247, 5)
Saved cleaned news to ..\data\processed\news_data_clean.csv


In [5]:
# Quick check
news_df[['title', 'clean_title', 'clean_description']].head(10)

Unnamed: 0,title,clean_title,clean_description
0,Apple (AAPL) iPhone Production Soars as India ...,apple aapl iphone production soars as india ga...,apple inc nasdaqaapl is one of the most profit...
1,Apple Inc. (AAPL): Investor AI Worries Still H...,apple inc aapl investor ai worries still haven...,we recently published 10 stocks on jim cramer’...
2,JP Morgan will increase AAPL share target pric...,jp morgan will increase aapl share target pric...,investment analyst firm jp morgan is predictin...
3,"'Magnificent Seven' stocks: Buy Nvidia, skip A...",magnificent seven stocks buy nvidia skip apple,the magnificent seven nvidia nvda alphabet goo...
4,"AAPL Q3 2025: Analysts expect low growth, iPho...",aapl q3 2025 analysts expect low growth iphone...,we’re just a day away from the aapl q3 2025 ea...
5,Apple Inc. (AAPL): People Are Tired Of The Sto...,apple inc aapl people are tired of the stock b...,we recently published jim cramer recently disc...
6,Nvidia beats Apple to a $4T valuation as it ri...,nvidia beats apple to a 4t valuation as it rid...,nvidia the us company specializing in highperf...
7,"Apple Needs 'Product-Focused CEO,' Say Analyst...",apple needs productfocused ceo say analysts as...,analysts from lightshed partners have suggeste...
8,"Apple Is Going To Surprise Wall Street, Says T...",apple is going to surprise wall street says to...,as wall street has been rallying behind the ip...
9,Here's How to Listen to Apple's Upcoming Earni...,heres how to listen to apples upcoming earning...,apple has announced that it will share earning...
