In [1]:
# Install extra NLP libraries
!pip install nltk textblob contractions

#Imports
import pandas as pd
import numpy as np
import re
import nltk
from bs4 import BeautifulSoup
from textblob import TextBlob
import contractions
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from keras import layers, models

from google.colab import files

# Download NLTK resources
nltk.download('stopwords')


Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Upload the CSV file
uploaded = files.upload()

# Get the uploaded filename
filename = list(uploaded.keys())[0]

# Load WITHOUT header row because the dataset's header is corrupted
try:
    df = pd.read_csv(filename, encoding='utf-8', header=None)
except UnicodeDecodeError:
    df = pd.read_csv(filename, encoding='latin1', header=None)

# Assign correct column names for a 2-column dataset
df.columns = ["Sentiment", "News"]

print(df.head())
print("\nColumn names:", df.columns.tolist())
print("\nDataset shape:", df.shape)


Saving all-data (1).csv to all-data (1).csv
  Sentiment                                               News
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...

Column names: ['Sentiment', 'News']

Dataset shape: (4846, 2)


In [5]:
# Load stopwords
stop_words = set(stopwords.words("english"))

# Common financial acronyms
acronym_dict = {
    "fii": "foreign institutional investor",
    "dii": "domestic institutional investor",
    "fed": "federal reserve",
    "ecb": "european central bank",
    "eps": "earnings per share",
}

def clean_text(text):
    text = str(text)

    # Remove URLs
    text = re.sub(r"http\S+|www\S+", "", text)

    # Remove HTML tags
    text = BeautifulSoup(text, "lxml").get_text()

    # Expand contractions
    text = contractions.fix(text)

    # Lowercase
    text = text.lower()

    # Remove unwanted characters
    text = re.sub(r"[^a-zA-Z\s]", " ", text)

    # Replace acronyms
    for ac, full in acronym_dict.items():
        text = re.sub(rf"\b{ac}\b", full, text)

    # Tokenize
    tokens = text.split()

    # Remove stopwords
    tokens = [w for w in tokens if w not in stop_words]

    return " ".join(tokens)



In [6]:
df["clean_text"] = df["News"].apply(clean_text)
df[["Sentiment", "News", "clean_text"]].head()


Unnamed: 0,Sentiment,News,clean_text
0,neutral,"According to Gran , the company has no plans t...",according gran company plans move production r...
1,neutral,Technopolis plans to develop in stages an area...,technopolis plans develop stages area less squ...
2,negative,The international electronic industry company ...,international electronic industry company elco...
3,positive,With the new production plant the company woul...,new production plant company would increase ca...
4,positive,According to the company 's updated strategy f...,according company updated strategy years baswa...


In [7]:
# Encode sentiment labels into numeric form
le = LabelEncoder()
df["label"] = le.fit_transform(df["Sentiment"])

# Prepare input and target
X = df["clean_text"]
y = df["label"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", len(X_train))
print("Test size:", len(X_test))
print("Classes:", le.classes_)


Train size: 3876
Test size: 970
Classes: ['negative' 'neutral' 'positive']


In [8]:
#Bag of Words (BOW)
bow_vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_bow = bow_vectorizer.fit_transform(X_train).toarray()
X_test_bow = bow_vectorizer.transform(X_test).toarray()

print("BOW Train Shape:", X_train_bow.shape)
print("BOW Test Shape:", X_test_bow.shape)


#TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

print("TF-IDF Train Shape:", X_train_tfidf.shape)
print("TF-IDF Test Shape:", X_test_tfidf.shape)


BOW Train Shape: (3876, 5000)
BOW Test Shape: (970, 5000)
TF-IDF Train Shape: (3876, 5000)
TF-IDF Test Shape: (970, 5000)


In [9]:
def build_dnn(input_dim):
    model = models.Sequential([
        layers.Dense(128, activation="relu", input_shape=(input_dim,)),
        layers.Dense(64, activation="relu"),
        layers.Dense(3, activation="softmax")
    ])

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=["accuracy"]
    )

    return model


In [10]:
model_bow = build_dnn(X_train_bow.shape[1])

history_bow = model_bow.fit(
    X_train_bow,
    y_train,
    epochs=8,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 33ms/step - accuracy: 0.5478 - loss: 0.9669 - val_accuracy: 0.7126 - val_loss: 0.6758
Epoch 2/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 30ms/step - accuracy: 0.8350 - loss: 0.4315 - val_accuracy: 0.7423 - val_loss: 0.6838
Epoch 3/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step - accuracy: 0.9531 - loss: 0.1525 - val_accuracy: 0.7332 - val_loss: 0.8475
Epoch 4/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.9827 - loss: 0.0620 - val_accuracy: 0.7423 - val_loss: 1.0436
Epoch 5/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9915 - loss: 0.0306 - val_accuracy: 0.7410 - val_loss: 1.1531
Epoch 6/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9951 - loss: 0.0228 - val_accuracy: 0.7268 - val_loss: 1.1821
Epoch 7/8
[1m97/97[0m [32m━━━━━━━━━━━

In [11]:
# Predict on test data
pred_bow = model_bow.predict(X_test_bow)
pred_bow = np.argmax(pred_bow, axis=1)

print("=== BOW Classification Report ===\n")
print(classification_report(y_test, pred_bow, target_names=le.classes_))

print("\n=== BOW Confusion Matrix ===")
print(confusion_matrix(y_test, pred_bow))


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
=== BOW Classification Report ===

              precision    recall  f1-score   support

    negative       0.60      0.59      0.59       121
     neutral       0.78      0.84      0.81       576
    positive       0.63      0.53      0.58       273

    accuracy                           0.72       970
   macro avg       0.67      0.65      0.66       970
weighted avg       0.72      0.72      0.72       970


=== BOW Confusion Matrix ===
[[ 71  33  17]
 [ 23 486  67]
 [ 25 103 145]]


In [12]:
model_tfidf = build_dnn(X_train_tfidf.shape[1])

history_tfidf = model_tfidf.fit(
    X_train_tfidf,
    y_train,
    epochs=8,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 27ms/step - accuracy: 0.5922 - loss: 0.9690 - val_accuracy: 0.6972 - val_loss: 0.7138
Epoch 2/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 24ms/step - accuracy: 0.7592 - loss: 0.5372 - val_accuracy: 0.7371 - val_loss: 0.6515
Epoch 3/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 28ms/step - accuracy: 0.9448 - loss: 0.2063 - val_accuracy: 0.7345 - val_loss: 0.7794
Epoch 4/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.9832 - loss: 0.0750 - val_accuracy: 0.7255 - val_loss: 0.9088
Epoch 5/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9922 - loss: 0.0343 - val_accuracy: 0.7255 - val_loss: 1.0258
Epoch 6/8
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9930 - loss: 0.0224 - val_accuracy: 0.7229 - val_loss: 1.1197
Epoch 7/8
[1m97/97[0m [32m━━━━━━━━━━━

In [13]:
# Predict on TF-IDF test data
pred_tfidf = model_tfidf.predict(X_test_tfidf)
pred_tfidf = np.argmax(pred_tfidf, axis=1)

print("=== TF-IDF Classification Report ===\n")
print(classification_report(y_test, pred_tfidf, target_names=le.classes_))

print("\n=== TF-IDF Confusion Matrix ===")
print(confusion_matrix(y_test, pred_tfidf))


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
=== TF-IDF Classification Report ===

              precision    recall  f1-score   support

    negative       0.58      0.51      0.55       121
     neutral       0.78      0.81      0.79       576
    positive       0.57      0.56      0.57       273

    accuracy                           0.70       970
   macro avg       0.65      0.63      0.63       970
weighted avg       0.70      0.70      0.70       970


=== TF-IDF Confusion Matrix ===
[[ 62  32  27]
 [ 25 464  87]
 [ 19 101 153]]


In [14]:
# Calculate accuracy for both models
bow_accuracy = np.mean(pred_bow == y_test)
tfidf_accuracy = np.mean(pred_tfidf == y_test)

print("=== PERFORMANCE COMPARISON ===\n")
print(f"BOW Accuracy:   {bow_accuracy:.4f}")
print(f"TF-IDF Accuracy:{tfidf_accuracy:.4f}")

print("""
=== INTERPRETATION GUIDE ===

• BOW (CountVectorizer) focuses on raw word frequency.
  - Works well when financial sentiment depends on the presence
    of specific strong words (e.g., "gain", "loss", "cut", "hike").

• TF-IDF gives more weight to rare but meaningful terms.
  - Helps when nuanced context matters (e.g., "unexpected slowdown",
    "strong forward guidance").

• If BOW > TF-IDF:
      → Headlines rely heavily on common sentiment-heavy words.

• If TF-IDF > BOW:
      → Rarer words offer stronger sentiment cues.

Either model can outperform the other depending on dataset characteristics.
""")


=== PERFORMANCE COMPARISON ===

BOW Accuracy:   0.7237
TF-IDF Accuracy:0.7000

=== INTERPRETATION GUIDE ===

• BOW (CountVectorizer) focuses on raw word frequency.
  - Works well when financial sentiment depends on the presence
    of specific strong words (e.g., "gain", "loss", "cut", "hike").

• TF-IDF gives more weight to rare but meaningful terms.
  - Helps when nuanced context matters (e.g., "unexpected slowdown", 
    "strong forward guidance").

• If BOW > TF-IDF:
      → Headlines rely heavily on common sentiment-heavy words.

• If TF-IDF > BOW:
      → Rarer words offer stronger sentiment cues.

Either model can outperform the other depending on dataset characteristics.



In [15]:
def predict_sentiment(text, model, vectorizer):
    # Clean the text using same preprocessing
    cleaned = clean_text(text)

    # Vectorize
    vect = vectorizer.transform([cleaned]).toarray()

    # Predict
    pred = model.predict(vect)
    pred_label = np.argmax(pred, axis=1)[0]

    # Convert numeric label back to original class
    return le.inverse_transform([pred_label])[0]


# Example usage:
print(predict_sentiment(
    "Fed signals possible rate cut next quarter",
    model_tfidf,         # or model_bow
    tfidf_vectorizer     # or bow_vectorizer
))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
negative
