In [1]:
import pandas as pd

In [None]:
df  = pd.read_csv('../data/all-data.csv',encoding='ISO-8859-1', header=None)


In [3]:
df.columns = ['Sentiment', 'Headline']
print(df.head())

  Sentiment                                           Headline
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...


In [4]:
print(df['Sentiment'].value_counts())

Sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64


In [5]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # remove punctuation/numbers
    return text

df['cleaned'] = df['Headline'].apply(clean_text)

In [6]:
df.head()

Unnamed: 0,Sentiment,Headline,cleaned
0,neutral,"According to Gran , the company has no plans t...",according to gran the company has no plans to...
1,neutral,Technopolis plans to develop in stages an area...,technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...,the international electronic industry company ...
3,positive,With the new production plant the company woul...,with the new production plant the company woul...
4,positive,According to the company 's updated strategy f...,according to the company s updated strategy fo...


In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['label'] = le.fit_transform(df['Sentiment'])
print(le.classes_)  # ['negative', 'neutral', 'positive']

['negative' 'neutral' 'positive']


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['cleaned'], df['label'], test_size=0.2, random_state=42)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

print(classification_report(y_test, y_pred, target_names=le.classes_))


              precision    recall  f1-score   support

    negative       0.92      0.51      0.65       110
     neutral       0.75      0.94      0.84       571
    positive       0.79      0.53      0.64       289

    accuracy                           0.77       970
   macro avg       0.82      0.66      0.71       970
weighted avg       0.78      0.77      0.76       970



In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline    import Pipeline
from sklearn.metrics     import classification_report

nb_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),
    ("nb",     MultinomialNB()),
])
nb_pipeline.fit(X_train, y_train)
y_pred_nb = nb_pipeline.predict(X_test)

print(classification_report(y_test, y_pred_nb, target_names=le.classes_))


              precision    recall  f1-score   support

    negative       1.00      0.12      0.21       110
     neutral       0.69      0.98      0.81       571
    positive       0.69      0.35      0.47       289

    accuracy                           0.70       970
   macro avg       0.80      0.49      0.50       970
weighted avg       0.73      0.70      0.64       970



In [12]:
from sklearn.svm import LinearSVC

svm_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000)),
    ("svc",    LinearSVC(C=1.0, max_iter=10000)),
])
svm_pipeline.fit(X_train, y_train)
y_pred_svm = svm_pipeline.predict(X_test)

print(classification_report(y_test, y_pred_svm, target_names=le.classes_))


              precision    recall  f1-score   support

    negative       0.80      0.60      0.69       110
     neutral       0.79      0.89      0.84       571
    positive       0.74      0.63      0.68       289

    accuracy                           0.78       970
   macro avg       0.78      0.71      0.74       970
weighted avg       0.78      0.78      0.78       970



In [14]:
import numpy as np
from tensorflow.keras.preprocessing.text   import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models                import Sequential
from tensorflow.keras.layers               import Embedding, Bidirectional, LSTM, Dense

# 1) Tokenize & pad
max_words   = 10000
max_len     = 50

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
seq_train = tokenizer.texts_to_sequences(X_train)
seq_test  = tokenizer.texts_to_sequences(X_test)

X_tr = pad_sequences(seq_train, maxlen=max_len)
X_te = pad_sequences(seq_test,  maxlen=max_len)

# 2) Build model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dense(3, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# 3) Train
model.fit(X_tr, y_train, validation_split=0.1, epochs=20, batch_size=32)

# 4) Evaluate
loss, acc = model.evaluate(X_te, y_test)
print(f"Bi-LSTM accuracy: {acc:.3f}")


Epoch 1/20




[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.6192 - loss: 0.9074 - val_accuracy: 0.6521 - val_loss: 0.7677
Epoch 2/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7759 - loss: 0.5370 - val_accuracy: 0.7268 - val_loss: 0.7099
Epoch 3/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9293 - loss: 0.2086 - val_accuracy: 0.7320 - val_loss: 0.7976
Epoch 4/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9725 - loss: 0.0978 - val_accuracy: 0.7371 - val_loss: 0.8925
Epoch 5/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9808 - loss: 0.0639 - val_accuracy: 0.7423 - val_loss: 0.9121
Epoch 6/20
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9860 - loss: 0.0436 - val_accuracy: 0.7294 - val_loss: 1.0881
Epoch 7/20
[1m109/109[0m [32m━

In [None]:
from transformers import BertTokenizerFast, TFBertForSequenceClassification
import tensorflow as tf

# 1) Load tokenizer & model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model     = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# 2) Tokenize the dataset
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=64)
test_encodings  = tokenizer(list(X_test),  truncation=True, padding=True, max_length=64)

train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
)).shuffle(1000).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    y_test
)).batch(16)

# 3) Compile & train
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer,
              loss=model.compute_loss,  # handles logits + labels
              metrics=["accuracy"])
model.fit(train_dataset, validation_data=test_dataset, epochs=3)

# 4) Predict & report
y_pred_bert = model.predict(test_dataset).logits.argmax(axis=-1)
print(classification_report(y_test, y_pred_bert, target_names=le.classes_))


ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.