In [54]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split


In [55]:
df  = pd.read_csv('../data/all-data.csv',encoding='ISO-8859-1', header=None)

df.columns = ['Sentiment', 'Headline']
df.head()

Unnamed: 0,Sentiment,Headline
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [56]:
print(df['Sentiment'].value_counts())

Sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64


In [57]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation/numbers
    return text

df['News'] = df['Headline'].apply(clean_text)

In [58]:
# from sklearn.preprocessing import LabelEncoder

# le = LabelEncoder()
# df['label'] = le.fit_transform(df['Sentiment'])
label_map = {'negative':0,'neutral':1,'positive':2}
df['label'] = df['Sentiment'].map(label_map)

In [59]:
print("Total samples:", len(df))

Total samples: 4846


In [60]:
train_df, test_df = train_test_split(
    df[['News','label']], test_size=0.2, random_state=42, stratify=df['label']
)

In [61]:
# ## 3. TF–IDF + Traditional ML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Vectorize
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X_train = vectorizer.fit_transform(train_df['News'])
X_test  = vectorizer.transform(test_df['News'])
y_train = train_df['label']
y_test  = test_df['label']

# Define models
models = {
    'LogisticRegression': LogisticRegression(),
    'MultinomialNB': MultinomialNB(),
    'LinearSVC': LinearSVC(max_iter=10000),
    'RandomForest': RandomForestClassifier(n_estimators=100, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}

results = {}
for name, m in models.items():
    m.fit(X_train, y_train)
    preds = m.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, preds, target_names=label_map.keys()))
    results[name] = acc


LogisticRegression Accuracy: 0.7567
              precision    recall  f1-score   support

    negative       0.79      0.46      0.58       121
     neutral       0.75      0.93      0.83       576
    positive       0.75      0.52      0.61       273

    accuracy                           0.76       970
   macro avg       0.76      0.64      0.68       970
weighted avg       0.76      0.76      0.74       970

MultinomialNB Accuracy: 0.7289
              precision    recall  f1-score   support

    negative       0.86      0.30      0.44       121
     neutral       0.73      0.95      0.83       576
    positive       0.71      0.44      0.55       273

    accuracy                           0.73       970
   macro avg       0.76      0.57      0.60       970
weighted avg       0.74      0.73      0.70       970

LinearSVC Accuracy: 0.7598
              precision    recall  f1-score   support

    negative       0.73      0.62      0.67       121
     neutral       0.80      0.85  

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.7608
              precision    recall  f1-score   support

    negative       0.73      0.55      0.62       121
     neutral       0.77      0.91      0.83       576
    positive       0.74      0.55      0.63       273

    accuracy                           0.76       970
   macro avg       0.74      0.67      0.70       970
weighted avg       0.76      0.76      0.75       970



In [62]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# Tokenize and pad
MAX_VOCAB = 10000
MAX_LEN = 100
EMB_DIM = 100

tok = Tokenizer(num_words=MAX_VOCAB, oov_token='<OOV>')
tok.fit_on_texts(train_df['News'])

seq_train = tok.texts_to_sequences(train_df['News'])
seq_test  = tok.texts_to_sequences(test_df['News'])

X_train_seq = pad_sequences(seq_train, maxlen=MAX_LEN, padding='post')
X_test_seq  = pad_sequences(seq_test, maxlen=MAX_LEN, padding='post')

y_train_oh = pd.get_dummies(y_train).values
y_test_oh  = pd.get_dummies(y_test).values

# Build LSTM model
gdlstm = Sequential([
    Embedding(MAX_VOCAB, EMB_DIM, input_length=MAX_LEN),
    LSTM(128, return_sequences=False),
    Dropout(0.5),
    Dense(3, activation='softmax')
])

gdlstm.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

gdlstm.summary()

# Train with early stopping
early = EarlyStopping(patience=3, restore_best_weights=True)

gdlstm.fit(
    X_train_seq, y_train_oh,
    validation_split=0.1,
    epochs=10,
    batch_size=128,
    callbacks=[early]
)

# Evaluate on test set
loss, acc_lstm = gdlstm.evaluate(X_test_seq, y_test_oh)
print(f"LSTM Test Accuracy: {acc_lstm:.4f}")
results['LSTM'] = acc_lstm



Epoch 1/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 64ms/step - accuracy: 0.5482 - loss: 0.9752 - val_accuracy: 0.5954 - val_loss: 0.9299
Epoch 2/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 57ms/step - accuracy: 0.5930 - loss: 0.9287 - val_accuracy: 0.5954 - val_loss: 0.9296
Epoch 3/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 60ms/step - accuracy: 0.6022 - loss: 0.9244 - val_accuracy: 0.5954 - val_loss: 0.9289
Epoch 4/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 63ms/step - accuracy: 0.6092 - loss: 0.9218 - val_accuracy: 0.5954 - val_loss: 0.9367
Epoch 5/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 63ms/step - accuracy: 0.6012 - loss: 0.9283 - val_accuracy: 0.5954 - val_loss: 0.9301
Epoch 6/10
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 59ms/step - accuracy: 0.5953 - loss: 0.9226 - val_accuracy: 0.5954 - val_loss: 0.9371
[1m31/31[0m [32m━━━━━━━━━━━━━━━

In [None]:
#credit: https://www.youtube.com/watch?v=JrtXX4cHgBI&ab_channel=GabrielAtkin
#mainly for learning, XGboost still have slightly better accuracy
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import tensorflow as tf


In [63]:
import pandas as pd
res_df = pd.DataFrame(
    list(results.items()),
    columns=['Model','Accuracy']
).sort_values('Accuracy', ascending=False)
res_df

Unnamed: 0,Model,Accuracy
4,XGBoost,0.760825
2,LinearSVC,0.759794
0,LogisticRegression,0.756701
3,RandomForest,0.745361
1,MultinomialNB,0.728866
5,LSTM,0.593814
