In [1]:
!pip install transformers pandas numpy scikit-learn tensorflow -q

In [2]:
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,classification_report,accuracy_score
from transformers import ElectraTokenizer,TFElectraForSequenceClassification
import os

In [4]:
tf.random.set_seed(42)
np.random.seed(42)


In [5]:
def load_and_validate_data(file_path="/content/drive/My Drive/Project and Coding/BERT AND ELECTRA/Apple_tweets_setiment.csv"):
    try:
        df = pd.read_csv(file_path, on_bad_lines='skip', quoting=3)
        if 'tweets' not in df.columns or 'labels' not in df.columns:
            raise ValueError("Dataset must contain 'tweets' and 'labels' columns")
        df = df.dropna(subset=['tweets', 'labels'])

        # Filter out neutral (0.0) labels
        df = df[df['labels'] != 0.0]

        df['labels'] = df['labels'].apply(lambda x: 0 if x == -1.0 else 1)

        print("Dataset Info:")
        print(df.info())
        print("\nSample Data:")
        print(df.head())
        print("\nLabel Distribution:")
        print(df['labels'].value_counts())

        return df

    except FileNotFoundError:
        print("Error: Dataset file not found.")
        return None


In [6]:
df=load_and_validate_data()
if df is None:
  raise SystemExit("Failed to load dataset. Exiting.")

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2149 entries, 5 to 4532
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   labels  2149 non-null   int64 
 1   tweets  2149 non-null   object
dtypes: int64(1), object(1)
memory usage: 50.4+ KB
None

Sample Data:
    labels                                             tweets
5        1            top 3 all @apple #tablets. damn right! 
6        1  cnbctv: #apple's margins better than expected?...
9        0  wtf my battery was 31% one second ago and now ...
11       1  rt @peterpham: bought my @augustsmartlock at t...
12       0  @apple contact sync between yosemite and ios8 ...

Label Distribution:
labels
0    1648
1     501
Name: count, dtype: int64


In [7]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['tweets'],
    df['labels'],
    test_size=0.2,
    random_state=42,
    stratify=df['labels']
)


In [8]:
tokenizer=ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
def tokenize_data(texts):
  return tokenizer(
      list(texts),
      truncation=True,
      padding=True,
      max_length=128,
      return_tensors='tf'
  )
train_encoding=tokenize_data(train_texts)
test_encoding=tokenize_data(test_texts)

In [10]:
def create_dataset(encoding,labels):
  dataset=tf.data.Dataset.from_tensor_slices((
      {
          'input_ids': encoding['input_ids'],
          'attention_mask':encoding['attention_mask']
      },
      tf.convert_to_tensor(list(labels))
  ))
  return dataset
train_dataset=create_dataset(train_encoding,train_labels)
test_dataset=create_dataset(test_encoding,test_labels)


In [11]:
model = TFElectraForSequenceClassification.from_pretrained('google/electra-small-discriminator', num_labels=2)

Some layers from the model checkpoint at google/electra-small-discriminator were not used when initializing TFElectraForSequenceClassification: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5,epsilon=1e-8)
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics=[
    tf.keras.metrics.SparseCategoricalAccuracy('accuracy'),
    tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1,name='top_1_accuracy')
]
model.compile(optimizer=optimizer,loss=loss,metrics=metrics)

In [13]:
callbacks=[
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=1,
        restore_best_weights=True
    ),
    tf.keras.callbacks.ModelCheckpoint(
        '/content/drive/My Drive/Project and Coding/BERT AND ELECTRA/best_electra_model_2',
        monitor='val_accuracy',
        save_best_only=True,
        mode='max'
    )
]

In [14]:
print('\n Training Electra Model...')
history=model.fit(
    train_dataset.shuffle(1000).batch(16),
    epochs=3,
    validation_data=test_dataset.batch(16),
    callbacks=callbacks
)


 Training Electra Model...
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:
print("\nEvaluating model...")
predictions = model.predict(test_dataset.batch(16))
pred_labels = np.argmax(predictions.logits, axis=1)


Evaluating model...


In [16]:
print("\nClassification Report:")
print(classification_report(test_labels, pred_labels, labels=[0, 1], target_names=['negative', 'positive']))
print("Accuracy:", accuracy_score(test_labels, pred_labels))
print("F1 Score:", f1_score(test_labels, pred_labels))



Classification Report:
              precision    recall  f1-score   support

    negative       0.96      0.96      0.96       330
    positive       0.87      0.86      0.86       100

    accuracy                           0.94       430
   macro avg       0.91      0.91      0.91       430
weighted avg       0.94      0.94      0.94       430

Accuracy: 0.9372093023255814
F1 Score: 0.864321608040201


In [17]:
save_path = "/content/drive/My Drive/Project and Coding/BERT AND ELECTRA/electra_sentiment_model_2"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"\nModel saved to {save_path}")


Model saved to /content/drive/My Drive/Project and Coding/BERT AND ELECTRA/electra_sentiment_model_2


In [18]:
def predict_sentiment(text):
    try:
        inputs = tokenizer(
            text,
            return_tensors="tf",
            truncation=True,
            padding=True,
            max_length=128
        )

        outputs = model(inputs)
        probs = tf.nn.softmax(outputs.logits, axis=-1)
        probs_np = probs.numpy()
        pred = np.argmax(probs_np, axis=1)[0]
        confidence = probs_np[0][pred]

        return ("Positive" if pred == 1 else "Negative", confidence)
    except Exception as e:
        print(f"Error in prediction: {str(e)}")
        return None, None

In [19]:
sample_texts = [
    "Absolutely love the new iPhone! Apple nailed it again.",
    "Apple stock is dropping after the disappointing earnings report.",
    "Thinking of switching to Android, tired of Apple's constant price hikes.",
    "The new iOS update is so smooth and fast. Great job Apple!",
    "Apple's customer service was incredibly helpful today.",
    "My MacBook just crashed again, so frustrating!",
    "Apple Pay makes checkout so much easier. Love this feature!",
    "Battery life on my iPhone is terrible after the latest update.",
    "AirPods Pro sound amazing! Totally worth the price.",
    "Apple Watch saved my life by detecting my irregular heartbeat."
]


print("\nExample Predictions:")
for text in sample_texts:
    sentiment, confidence = predict_sentiment(text)
    print(f"Text: {text}")
    if sentiment is not None and confidence is not None:
        print(f"Sentiment: {sentiment}, Confidence: {confidence:.4f}\n")
    else:
        print("Error...\n")


Example Predictions:
Text: Absolutely love the new iPhone! Apple nailed it again.
Sentiment: Positive, Confidence: 0.9732

Text: Apple stock is dropping after the disappointing earnings report.
Sentiment: Negative, Confidence: 0.9508

Text: Thinking of switching to Android, tired of Apple's constant price hikes.
Sentiment: Negative, Confidence: 0.9787

Text: The new iOS update is so smooth and fast. Great job Apple!
Sentiment: Positive, Confidence: 0.9730

Text: Apple's customer service was incredibly helpful today.
Sentiment: Positive, Confidence: 0.9737

Text: My MacBook just crashed again, so frustrating!
Sentiment: Negative, Confidence: 0.9768

Text: Apple Pay makes checkout so much easier. Love this feature!
Sentiment: Positive, Confidence: 0.9723

Text: Battery life on my iPhone is terrible after the latest update.
Sentiment: Negative, Confidence: 0.9843

Text: AirPods Pro sound amazing! Totally worth the price.
Sentiment: Positive, Confidence: 0.9739

Text: Apple Watch saved my