In [1]:
import pandas as pd
import numpy as np
import re
import string
import os
from pathlib import Path

In [2]:
# Paths
RAW_DATA_DIR = Path("../data/raw")
PROCESSED_DATA_DIR = Path("../data/processed")
PROCESSED_DATA_DIR.mkdir(exist_ok=True)

In [3]:
# Text cleaning function
def clean_tweet_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # URLs
    text = re.sub(r"@\w+", "", text)  # Mentions
    text = re.sub(r"#", "", text)  # Keep word but remove '#'
    text = re.sub(r"\$\w+", "", text)  # Stock tickers
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [4]:
# Load and clean
tweets_file = RAW_DATA_DIR / "tweet_finance.csv"

if tweets_file.exists():
    tweets_df = pd.read_csv(tweets_file)
    print(f"Original shape: {tweets_df.shape}")
    print(f"Columns: {tweets_df.columns.tolist()}")

    # Remove duplicates based on ticker + date + text
    tweets_df = tweets_df.drop_duplicates(subset=['ticker', 'date', 'text'], keep='first')

    # Drop NA text
    tweets_df = tweets_df.dropna(subset=['text'])

    # Clean text
    tweets_df['clean_text'] = tweets_df['text'].apply(clean_tweet_text)

    # Drop rows with empty clean_text
    tweets_df = tweets_df[tweets_df['clean_text'].str.strip() != ""]

    # Save cleaned tweets
    output_path = PROCESSED_DATA_DIR / "tweet_finance_clean.csv"
    tweets_df.to_csv(output_path, index=False)
    print(f"Saved cleaned tweets to {output_path}")
else:
    print("tweet_finance.csv not found")

Original shape: (100, 5)
Columns: ['ticker', 'date', 'text', 'likes', 'retweets']
Saved cleaned tweets to ..\data\processed\tweet_finance_clean.csv


In [5]:
# Check result
tweets_df[['text', 'clean_text']].head(10)

Unnamed: 0,text,clean_text
0,Now its Apple's turn to pick a horse in the ra...,now its apples turn to pick a horse in the rac...
1,See that line for the AAPL stock? That’s how ...,see that line for the aapl stock that’s how mu...
2,AAPL stock dips pre-earnings—AI hype or cash t...,aapl stock dips preearnings—ai hype or cash tr...
3,Apple Earnings Recap: iPhone Sales Beat Estima...,apple earnings recap iphone sales beat estimat...
4,Barclays maintains Apple $AAPL at Underweight ...,barclays maintains apple at underweight and ra...
5,Apple (AAPL) Shares Jump Following Earnings Re...,apple aapl shares jump following earnings report
6,@Abecrombietrade @munster_gene I've owned $AAP...,ive owned stock for 30 years ive enjoyed aroun...
7,$TSLA Which other company has a CEO publicly a...,which other company has a ceo publicly asking ...
8,Market Insight : Apple (AAPL) shows cautious o...,market insight apple aapl shows cautious optim...
9,B of A Securities maintains Apple $AAPL at Buy...,b of a securities maintains apple at buy and r...
