In [3]:
import re
import difflib
from typing import Dict
import pandas as pd
from bs4 import BeautifulSoup

# Embedded stopwords based on typical English stopwords.
# Kept inline as a plain set literal of lowercase words. Contractions are
# stored with their apostrophes (e.g. "don't", "it's"), so a membership test
# only matches text that still contains the apostrophe.
EMBEDDED_STOPWORDS = {
    "a","about","above","after","again","against","all","am","an","and","any","are","aren't",
    "as","at","be","because","been","before","being","below","between","both","but","by",
    "can't","cannot","could","couldn't","did","didn't","do","does","doesn't","doing",
    "don't","down","during","each","few","for","from","further","had","hadn't","has",
    "hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's",
    "hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've",
    "if","in","into","is","isn't","it","it's","its","itself","let's","me","more","most",
    "mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other",
    "ought","our","ours","ourselves","out","over","own","same","shan't","she","she'd",
    "she'll","she's","should","shouldn't","so","some","such","than","that","that's",
    "the","their","theirs","them","themselves","then","there","there's","these","they",
    "they'd","they'll","they're","they've","this","those","through","to","too","under",
    "until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were",
    "weren't","what","what's","when","when's","where","where's","which","while","who",
    "who's","whom","why","why's","with","won't","would","wouldn't","you","you'd",
    "you'll","you're","you've","your","yours","yourself","yourselves",
}

def build_acronym_dictionary() -> Dict[str, str]:
    """Build the abbreviation-to-expansion lookup used for headline cleaning.

    Every call returns a new dictionary, so callers may extend or modify the
    result freely. Keys are lowercase; insertion order is fixed (currency and
    magnitude terms, then organisation suffixes, then country/exchange names).
    The dotted keys ("u.s.", "u.s") can only match text that still contains
    its periods.

    Returns:
        A dict mapping each lowercase abbreviation to its expanded phrase.
    """
    money_terms = (
        ("usd", "us dollars"),
        ("eur", "euros"),
        ("gbp", "british pounds"),
        ("mn", "million"),
        ("bn", "billion"),
        ("pct", "percent"),
    )
    organisation_terms = (
        ("inc", "incorporated"),
        ("co", "company"),
        ("corp", "corporation"),
        ("ltd", "limited"),
        ("dept", "department"),
    )
    place_and_exchange_terms = (
        ("u.s.", "us"),
        ("u.s", "us"),
        ("nyse", "new york stock exchange"),
        ("ftse", "financial times stock exchange"),
    )
    return dict(money_terms + organisation_terms + place_and_exchange_terms)

def clean_text(text: str, stop_words: set,
               english_vocab: set, acronym_dict: Dict[str, str]) -> str:
    """Normalize one headline into a space-separated string of cleaned tokens.

    Processing order: remove URLs, remove HTML markup, delete every character
    that is not an ASCII letter, digit, whitespace or apostrophe, lowercase,
    drop contraction stop words, delete the remaining apostrophes, expand
    acronyms, remove stop words, and finally fuzzy-correct tokens that are
    not in ``english_vocab``.

    Args:
        text: Raw headline. Any non-string value (e.g. NaN) yields "".
        stop_words: Lowercase words to discard. Entries may contain
            apostrophes ("don't"); those are matched before apostrophes are
            stripped from the text.
        english_vocab: Known lowercase tokens. Tokens found here are kept
            as-is; others are replaced by their closest vocabulary entry
            when one reaches a similarity ratio of 0.8.
        acronym_dict: Lowercase abbreviation -> expansion, applied in
            dictionary order on whole-word matches.

    Returns:
        The cleaned tokens joined by single spaces ("" if nothing is left).
    """
    if not isinstance(text, str):
        return ""

    # Remove URLs
    text = re.sub(r"http\S+|www\.\S+", "", text)

    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Remove unwanted characters. Apostrophes survive this step on purpose:
    # stripping them first would turn "don't" into "dont", which can never
    # match the contraction entries in stop_words.
    text = text.replace("\u2019", "'")  # typographic apostrophe -> ASCII
    text = re.sub(r"[^A-Za-z0-9'\s]", "", text)

    # Lowercase
    text = text.lower()

    # Drop contraction stop words while the apostrophe is still present, then
    # strip apostrophes from everything else. Matching on the original form
    # (rather than adding "shell"/"well"/"ill" to the stop list) keeps real
    # words such as the company name "Shell" intact.
    kept = []
    for raw in text.split():
        if "'" in raw and raw in stop_words:
            continue
        bare = raw.replace("'", "")
        if bare:
            kept.append(bare)
    text = " ".join(kept)

    # Expand acronyms (whole-word matches, in dictionary order)
    for acr, expansion in acronym_dict.items():
        text = re.sub(r"\b" + re.escape(acr) + r"\b", expansion, text)

    # Tokenize & remove stopwords
    tokens = [t for t in text.split() if t not in stop_words]

    # Simple spell correction (fuzzy match). The candidate list is built
    # lazily so headlines whose tokens are all known pay nothing for it.
    vocab_list = None
    corrected = []
    for tok in tokens:
        # Very short tokens, pure numbers and known words are left untouched.
        if len(tok) <= 2 or tok.isdigit() or tok in english_vocab:
            corrected.append(tok)
            continue
        if vocab_list is None:
            vocab_list = list(english_vocab)
        matches = difflib.get_close_matches(tok, vocab_list, n=1, cutoff=0.8)
        corrected.append(matches[0] if matches else tok)

    return " ".join(corrected)


In [4]:
from google.colab import files

# Upload your CSV manually. The code below treats the keys of the returned
# mapping as the uploaded file names.
uploaded = files.upload()

# A cancelled dialog leaves nothing to read; fail with an actionable message
# instead of an opaque IndexError from indexing an empty list.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the CSV file.")

csv_path = next(iter(uploaded))  # Automatically get filename (first upload)

# The file has no header row, so the two column names are supplied here.
df = pd.read_csv(csv_path, header=None,
                 names=["Sentiment", "News Headline"],
                 encoding="latin1")

print("DataFrame shape:", df.shape)
print("Columns:", list(df.columns))
print("Class distribution:")
print(df["Sentiment"].value_counts())


Saving all-data.csv to all-data (1).csv
DataFrame shape: (4846, 2)
Columns: ['Sentiment', 'News Headline']
Class distribution:
Sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64


In [5]:
# Stop words: the embedded list plus a few extra high-frequency terms.
stop_words = EMBEDDED_STOPWORDS | {
    "said", "mr", "mrs", "ms", "also",
    "would", "could", "one", "two", "three",
}

# Acronym lookup used during cleaning.
acronym_dict = build_acronym_dictionary()

# Vocabulary: every lowercase token that appears in the raw headlines once
# all characters other than ASCII letters, digits and whitespace are removed.
english_vocab = {
    word
    for headline in df["News Headline"]
    for word in re.sub(r"[^A-Za-z0-9\s]", "", str(headline)).lower().split()
}

# Run the full cleaning pipeline over every headline, in row order.
df["Cleaned Headline"] = list(
    map(
        lambda headline: clean_text(headline, stop_words, english_vocab, acronym_dict),
        df["News Headline"],
    )
)


In [6]:
print("\nBefore/After Examples:\n")

# First five rows, restricted to the raw and cleaned text columns.
sample = df[["News Headline", "Cleaned Headline"]].head(5)

# Walk the index and both columns in parallel; labels are shown 1-based.
for idx, before, after in zip(
    sample.index, sample["News Headline"], sample["Cleaned Headline"]
):
    print(f"Row {idx+1}:")
    print("Original :", before)
    print("Cleaned  :", after)
    print()



Before/After Examples:

Row 1:
Original : According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Cleaned  : according gran company plans move production russia although company growing

Row 2:
Original : Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .
Cleaned  : technopolis plans develop stages area less 100000 square meters order host companies working computer technologies telecommunications statement

Row 3:
Original : The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .
Cleaned  : international electronic industry company elcoteq laid tens employees tallinn facility contrary earlier layoffs compa

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode sentiment labels (positive/negative/neutral -> integers)
label_encoder = LabelEncoder()
df["Label"] = label_encoder.fit_transform(df["Sentiment"])

# Features are the cleaned headlines; targets are the encoded labels.
X, y = df["Cleaned Headline"], df["Label"]

# Train-test split (80/20), stratified on the label with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.20,
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))


Train size: 3876
Test size: 970


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: unigrams + bigrams, vocabulary capped at 5000 terms.
bow_vectorizer = CountVectorizer(ngram_range=(1,2), max_features=5000)

# The vocabulary is learned from the training split only; the test split is
# projected onto that same vocabulary.
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

for label, matrix in (
    ("BOW Train shape:", X_train_bow),
    ("BOW Test shape:", X_test_bow),
):
    print(label, matrix.shape)


BOW Train shape: (3876, 5000)
BOW Test shape: (970, 5000)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: unigrams + bigrams, vocabulary capped at 5000 terms.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)

# Fit on the training split only, then apply the fitted vectorizer to the
# test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

for label, matrix in (
    ("TFIDF Train shape:", X_train_tfidf),
    ("TFIDF Test shape:", X_test_tfidf),
):
    print(label, matrix.shape)


TFIDF Train shape: (3876, 5000)
TFIDF Test shape: (970, 5000)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random forest on the bag-of-words features: 300 unrestricted-depth trees,
# fixed seed, all CPU cores. fit() returns the estimator itself.
rf_bow = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    max_depth=None,
    n_estimators=300,
).fit(X_train_bow, y_train)

# Predictions for the held-out split
y_pred_bow = rf_bow.predict(X_test_bow)

# Evaluation: compute the metrics first, then print them in order.
bow_accuracy = accuracy_score(y_test, y_pred_bow)
bow_report = classification_report(y_test, y_pred_bow)
bow_confusion = confusion_matrix(y_test, y_pred_bow)

print("=== RandomForest (BOW) ===")
print("Accuracy:", bow_accuracy)
print("\nClassification Report:\n", bow_report)

print("Confusion Matrix:\n", bow_confusion)


=== RandomForest (BOW) ===
Accuracy: 0.7309278350515463

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.39      0.49       121
           1       0.76      0.88      0.81       576
           2       0.67      0.56      0.61       273

    accuracy                           0.73       970
   macro avg       0.70      0.61      0.64       970
weighted avg       0.72      0.73      0.72       970

Confusion Matrix:
 [[ 47  54  20]
 [ 12 508  56]
 [ 10 109 154]]


In [11]:
# Random forest on the TF-IDF features, with the same hyperparameters as the
# bag-of-words model: 300 unrestricted-depth trees, fixed seed, all CPU cores.
# fit() returns the estimator itself.
rf_tfidf = RandomForestClassifier(
    n_jobs=-1,
    random_state=42,
    max_depth=None,
    n_estimators=300,
).fit(X_train_tfidf, y_train)

# Predictions for the held-out split
y_pred_tfidf = rf_tfidf.predict(X_test_tfidf)

# Evaluation: compute the metrics first, then print them in order.
tfidf_accuracy = accuracy_score(y_test, y_pred_tfidf)
tfidf_report = classification_report(y_test, y_pred_tfidf)
tfidf_confusion = confusion_matrix(y_test, y_pred_tfidf)

print("=== RandomForest (TF-IDF) ===")
print("Accuracy:", tfidf_accuracy)
print("\nClassification Report:\n", tfidf_report)

print("Confusion Matrix:\n", tfidf_confusion)


=== RandomForest (TF-IDF) ===
Accuracy: 0.7319587628865979

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.37      0.48       121
           1       0.75      0.91      0.82       576
           2       0.68      0.52      0.59       273

    accuracy                           0.73       970
   macro avg       0.70      0.60      0.63       970
weighted avg       0.72      0.73      0.71       970

Confusion Matrix:
 [[ 45  52  24]
 [ 11 522  43]
 [ 11 119 143]]


In [12]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd

# Test-set predictions keyed by the label shown in the table.
predictions = {
    "RandomForest (BOW)": y_pred_bow,
    "RandomForest (TF-IDF)": y_pred_tfidf,
}

# One row per model; precision, recall and F1 are macro-averaged.
comparison = pd.DataFrame(
    [
        {
            "Model": model_name,
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision (macro)": precision_score(y_test, y_pred, average="macro"),
            "Recall (macro)": recall_score(y_test, y_pred, average="macro"),
            "F1 Score (macro)": f1_score(y_test, y_pred, average="macro"),
        }
        for model_name, y_pred in predictions.items()
    ]
)

comparison


Unnamed: 0,Model,Accuracy,Precision (macro),Recall (macro),F1 Score (macro)
0,RandomForest (BOW),0.730928,0.702601,0.611492,0.640606
1,RandomForest (TF-IDF),0.731959,0.701947,0.600653,0.631184
