In [1]:
!pip install transformers pandas numpy scikit-learn tensorflow -q

In [2]:
from google.colab import drive
from google.colab import files
# Mount Google Drive; the dataset and the saved model artifacts are read from
# and written to paths under this mount point.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification
import os

In [4]:
# Seed NumPy's and TensorFlow's global random generators with the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [5]:
def load_and_validate_data(file_path="/content/drive/My Drive/Project and Coding/BERT AND ELECTRA/stock_data.csv"):
    """Load the sentiment CSV, validate its schema and binarise the labels.

    Rows with a missing 'Text' or 'Sentiment' value are dropped. The raw labels
    must be -1 or 1 and are re-encoded as 0 and 1 respectively. A short summary
    (frame info, first rows, label counts) is printed.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The cleaned DataFrame with 'Sentiment' encoded as int64 0/1, or None
        when the file does not exist.

    Raises:
        ValueError: If a required column is missing, or if 'Sentiment' holds a
            value other than -1 or 1.
    """
    required_columns = ['Text', 'Sentiment']
    raw_labels = [-1, 1]

    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        # Report the path that was actually requested, not a fixed file name.
        print(f"Error: dataset file not found: {file_path}")
        return None

    if any(column not in df.columns for column in required_columns):
        raise ValueError("Dataset must contain 'Text' and 'Sentiment' columns")

    # .copy() so the column assignment below never writes into a view.
    df = df.dropna(subset=required_columns).copy()

    # Mapping "anything that is not -1" to 1 would silently mislabel data that
    # uses another encoding (e.g. 0/1), so reject unexpected values instead.
    unexpected = df.loc[~df['Sentiment'].isin(raw_labels), 'Sentiment']
    if not unexpected.empty:
        bad_values = sorted(unexpected.unique().tolist(), key=str)
        raise ValueError(
            f"'Sentiment' must contain only -1 or 1, found: {bad_values}"
        )
    df['Sentiment'] = (df['Sentiment'] == 1).astype('int64')

    print("Dataset Info:")
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() would emit a stray "None" line.
    df.info()
    print("\nSample Data:")
    print(df.head())
    print("\nLabel Distribution:")
    print(df['Sentiment'].value_counts())

    return df


In [6]:
# Load the dataset from its default location; the loader returns None when the
# CSV cannot be found, in which case the run is stopped here.
df = load_and_validate_data()
if df is None:
    raise SystemExit("Failed to load dataset. Exiting.")

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5791 entries, 0 to 5790
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       5791 non-null   object
 1   Sentiment  5791 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 90.6+ KB
None

Sample Data:
                                                Text  Sentiment
0  Kickers on my watchlist XIDE TIT SOQ PNK CPW B...          1
1  user: AAP MOVIE. 55% return for the FEA/GEED i...          1
2  user I'd be afraid to short AMZN - they are lo...          1
3                                  MNTA Over 12.00            1
4                                   OI  Over 21.37            1

Label Distribution:
Sentiment
1    3685
0    2106
Name: count, dtype: int64


In [7]:
# Hold out 20% of the rows as a test split, stratified on the label column.
features, targets = df['Text'], df['Sentiment']
train_texts, test_texts, train_labels, test_labels = train_test_split(
    features, targets, test_size=0.2, random_state=42, stratify=targets
)


In [8]:
# Tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# Tokenize data
def tokenize_data(texts, max_length=128):
    """Encode an iterable of texts with the notebook-level `tokenizer`.

    Args:
        texts: Iterable of strings; it is materialised into a list first.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        The tokenizer output, with truncation and padding enabled and
        TensorFlow tensors requested.
    """
    encode_options = {
        'truncation': True,
        'padding': True,
        'max_length': max_length,
        'return_tensors': 'tf',
    }
    batch = list(texts)
    return tokenizer(batch, **encode_options)

# Encode the training split and then the test split with identical settings.
train_encodings, test_encodings = (
    tokenize_data(split_texts) for split_texts in (train_texts, test_texts)
)

In [10]:
# Create TensorFlow datasets
def create_dataset(encodings, labels):
    """Build an unbatched tf.data.Dataset of (features, label) pairs.

    Args:
        encodings: Mapping that provides 'input_ids', 'attention_mask' and
            'token_type_ids'.
        labels: Labels sliced alongside the encodings.

    Returns:
        A dataset whose elements are (feature dict, label) tuples.
    """
    feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    features = {key: encodings[key] for key in feature_keys}
    return tf.data.Dataset.from_tensor_slices((features, labels))

# Convert each split's labels to a tensor, then pair them with the encodings.
train_label_tensor = tf.convert_to_tensor(list(train_labels))
test_label_tensor = tf.convert_to_tensor(list(test_labels))
train_dataset = create_dataset(train_encodings, train_label_tensor)
test_dataset = create_dataset(test_encodings, test_label_tensor)

In [11]:
# Load the 'bert-base-uncased' weights into a sequence classifier configured
# for two labels.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Optimiser and loss for fine-tuning. The loss is configured with
# from_logits=True because it is fed the model's raw logits.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-8)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# A single accuracy metric is tracked. SparseTopKCategoricalAccuracy with k=1
# measures the same thing as SparseCategoricalAccuracy, so keeping both only
# duplicated computation and log columns. The metric is named 'accuracy' so
# that its validation counterpart is reported as 'val_accuracy'.
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [13]:
# Training callbacks
# Stop when 'val_loss' has not improved for one epoch and roll back to the
# best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=1, restore_best_weights=True
)
# Keep only the checkpoint with the highest 'val_accuracy'.
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='/content/drive/MyDrive/Project and Coding/BERT AND ELECTRA/best_bert_model',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
)
callbacks = [early_stopping, best_checkpoint]

In [14]:
print("\nTraining BERT model...")
# Batching is done on the datasets themselves, so `batch_size` is not passed to
# fit(): Keras documents that it must not be specified for dataset inputs,
# which already yield batches.
# NOTE(review): validation runs on the held-out test split, so early stopping
# and checkpoint selection see the same data the final metrics are reported
# on. Consider carving a separate validation split out of the training data.
BATCH_SIZE = 16
history = model.fit(
    train_dataset.shuffle(1000).batch(BATCH_SIZE),
    epochs=3,
    validation_data=test_dataset.batch(BATCH_SIZE),
    callbacks=callbacks
)


Training BERT model...
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:
print("\nEvaluating model...")
# Predict on the test split in batches of 16, then pick the class with the
# largest logit for every row.
test_batches = test_dataset.batch(16)
predictions = model.predict(test_batches)
pred_labels = np.argmax(predictions.logits, axis=1)


Evaluating model...


In [16]:
# Report per-class precision/recall, then overall accuracy and F1, comparing
# the test labels with the predicted labels.
print("\nClassification Report:")
report = classification_report(
    test_labels, pred_labels, target_names=['Negative', 'Positive']
)
print(report)
accuracy = accuracy_score(test_labels, pred_labels)
print("Accuracy:", accuracy)
f1 = f1_score(test_labels, pred_labels)
print("F1 Score:", f1)


Classification Report:
              precision    recall  f1-score   support

    Negative       0.82      0.67      0.73       421
    Positive       0.83      0.91      0.87       738

    accuracy                           0.82      1159
   macro avg       0.82      0.79      0.80      1159
weighted avg       0.82      0.82      0.82      1159

Accuracy: 0.8248490077653149
F1 Score: 0.8692852543464262


In [17]:
# Save the model first and then the tokenizer into the same directory.
save_path = "/content/drive/MyDrive/Project and Coding/BERT AND ELECTRA/bert_sentiment_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)
print(f"\nModel saved to {save_path}")


Model saved to /content/drive/MyDrive/Project and Coding/BERT AND ELECTRA/bert_sentiment_model


In [18]:
# Prediction function
def predict_sentiment(text):
    """Classify `text` with the notebook-level `tokenizer` and `model`.

    Args:
        text: Input passed to the tokenizer.

    Returns:
        A (label, confidence) tuple: label is "Positive" when the predicted
        class index is 1 and "Negative" otherwise; confidence is the softmax
        probability of the predicted class for the first row. If any exception
        is raised, the error is printed and (None, None) is returned.
    """
    try:
        encoded = tokenizer(
            text,
            truncation=True,
            padding=True,
            max_length=128,
            return_tensors="tf",
        )
        logits = model(encoded).logits
        probs = tf.nn.softmax(logits, axis=-1)
        # Only the first row of the batch is reported.
        pred = np.argmax(probs, axis=1)[0]
        confidence = probs[0][pred].numpy()
    except Exception as e:
        print(f"Error in prediction: {str(e)}")
        return None, None
    label = "Positive" if pred == 1 else "Negative"
    return (label, confidence)

In [19]:
# Two hand-written sentences used as a quick smoke test of predict_sentiment.
sample_texts = [
    "The stock market is showing strong growth this quarter",
    "Company profits are declining due to market conditions",
]

print("\nExample Predictions:")
for text in sample_texts:
    sentiment, confidence = predict_sentiment(text)
    print(f"Text: {text}")
    # predict_sentiment returns (None, None) when it fails.
    if sentiment is None or confidence is None:
        print(f"Error predicting sentiment for text: {text}\n")
        continue
    print(f"Sentiment: {sentiment}, Confidence: {confidence:.4f}\n")


Example Predictions:
Text: The stock market is showing strong growth this quarter
Sentiment: Positive, Confidence: 0.9782

Text: Company profits are declining due to market conditions
Sentiment: Negative, Confidence: 0.9107

