In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import re

In [2]:
# Step 1: Load and preprocess the dataset
def load_and_preprocess(file_path, max_words=10000, max_len=50):
    """Load a labelled tweet CSV and convert it into model-ready arrays.

    The CSV must contain a 'text' column and an 'airline_sentiment' column.
    Text is lower-cased and stripped of URLs, @mentions and every character
    that is not a-z or whitespace. Rows whose sentiment is not one of
    'positive', 'negative', 'neutral' or 'irrelevant' are dropped.

    Args:
        file_path: Path of the CSV file to read.
        max_words: ``num_words`` passed to the Keras ``Tokenizer``.
        max_len: Length every token sequence is padded/truncated to
            (both applied at the end of the sequence).

    Returns:
        A tuple ``(padded_sequences, labels, tokenizer)`` where
        ``padded_sequences`` is the output of ``pad_sequences``, ``labels``
        is the one-hot encoding (4 classes) of the mapped sentiment, and
        ``tokenizer`` is the fitted ``Tokenizer``.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If a required column is missing, or if no row carries a
            recognised sentiment label.
    """
    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(
            f"Dataset file not found at: {file_path}"
        )

    # Read the file the caller asked for. The path used to be hard-coded to
    # "data.csv", which silently ignored the (already validated) argument.
    df = pd.read_csv(file_path)

    # Validate required columns
    required_cols = {'text', 'airline_sentiment'}
    if not required_cols.issubset(df.columns):
        missing = required_cols - set(df.columns)
        raise ValueError(
            f"Missing required columns: {missing}. "
            f"Available columns: {list(df.columns)}"
        )

    # Keep required columns; .copy() makes the column assignments below act
    # on an independent frame instead of a slice of the original.
    df = df[['text', 'airline_sentiment']].copy()

    # Clean tweet text
    def clean_text(text):
        text = str(text).lower()
        text = re.sub(r"http\S+", "", text)       # URLs
        text = re.sub(r"@\w+", "", text)          # @mentions
        text = re.sub(r"[^a-z\s]", "", text)      # digits, punctuation, emoji
        text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace
        return text

    # Missing text becomes an empty string; otherwise str(NaN) would leak the
    # spurious token "nan" into the vocabulary.
    df['text'] = df['text'].fillna("").apply(clean_text)

    # 4-class label mapping
    label_mapping = {
        'positive': 0,
        'negative': 1,
        'neutral': 2,
        'irrelevant': 3
    }
    df['label'] = df['airline_sentiment'].map(label_mapping)
    # Unmapped sentiments become NaN and are discarded here.
    df = df.dropna(subset=['label']).copy()
    df['label'] = df['label'].astype(int)

    if df.empty:
        raise ValueError(
            "No rows with a recognised sentiment label "
            f"({list(label_mapping)}) were found in: {file_path}"
        )

    # Tokenization
    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
    tokenizer.fit_on_texts(df['text'])

    sequences = tokenizer.texts_to_sequences(df['text'])
    padded_sequences = pad_sequences(
        sequences, maxlen=max_len, padding='post', truncating='post'
    )

    # One-hot encode labels
    labels = tf.keras.utils.to_categorical(df['label'], num_classes=4)

    return padded_sequences, labels, tokenizer

In [3]:
# Step 2: Build the RNN (LSTM) model
def build_rnn(input_length, vocab_size, embedding_dim=100):
    """Assemble and compile the LSTM sentiment classifier.

    The network stacks, in order: an Embedding layer, a 130-unit LSTM,
    a Dropout layer with rate 0.5, and a 4-unit softmax Dense output.

    Args:
        input_length: Forwarded to the Embedding layer as ``input_length``.
        vocab_size: Input dimension of the Embedding layer.
        embedding_dim: Output dimension of the Embedding layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    network = Sequential()
    network.add(
        Embedding(vocab_size, embedding_dim, input_length=input_length)
    )
    network.add(LSTM(130))
    network.add(Dropout(0.5))
    network.add(Dense(4, activation='softmax'))

    compile_options = {
        'optimizer': 'adam',
        'loss': 'categorical_crossentropy',
        'metrics': ['accuracy'],
    }
    network.compile(**compile_options)
    return network

In [4]:
# Step 3: Train the model
def train_model(model, X_train, y_train, X_val, y_val,
                epochs=7, batch_size=32):
    """Fit ``model`` on the training split, validating after every epoch.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_train: Training inputs.
        y_train: Training targets.
        X_val: Validation inputs.
        y_val: Validation targets.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per gradient update.

    Returns:
        Whatever ``model.fit`` returns, so callers can inspect or plot the
        training run. Previously the result was discarded and ``None`` was
        returned.
    """
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=epochs,
        batch_size=batch_size,
    )
    return history


In [5]:
# Step 4: Evaluate the model
def evaluate_model(model, X_test, y_test):
    """Print a classification report and confusion matrix for the test set.

    Args:
        model: Object whose ``predict`` returns per-class scores per sample.
        X_test: Test inputs passed to ``model.predict``.
        y_test: One-hot encoded true labels; the class index is recovered
            with ``argmax`` along axis 1.

    Returns:
        None. Both reports are written to stdout.
    """
    class_ids = [0, 1, 2, 3]
    class_names = ['Positive', 'Negative', 'Neutral', 'Irrelevant']

    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)

    print("\nClassification Report:\n")
    print(classification_report(
        y_true,
        y_pred_classes,
        labels=class_ids,
        target_names=class_names,
        # A class with no samples or predictions scores 0 without a warning.
        zero_division=0))

    print("\nConfusion Matrix:\n")
    # Pin the label set so the matrix is always 4x4 and its rows/columns line
    # up with the report above, even when a class is absent from the data.
    print(confusion_matrix(y_true, y_pred_classes, labels=class_ids))


In [6]:
from google.colab import files
import warnings
warnings.filterwarnings("ignore")
# Main function
def main():
    """Run the full pipeline: load data, split, build, train and evaluate.

    The data is split 80/20 into train/test, and 10% of the training part
    is then held out for validation. The same seed drives both splits and
    the TensorFlow/NumPy/Python random generators so a rerun reproduces the
    same splits and weight initialisation.
    """
    FILE_PATH = "data.csv"  # put file in same folder OR give full path
    MAX_WORDS = 10000
    MAX_LEN = 50
    EMBEDDING_DIM = 100
    SEED = 50

    # Seed Python, NumPy and TensorFlow in one call; without this the weight
    # initialisation and batch shuffling differ on every run.
    tf.keras.utils.set_random_seed(SEED)

    X, y, tokenizer = load_and_preprocess(
        FILE_PATH, MAX_WORDS, MAX_LEN
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=SEED
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=SEED
    )

    # Cap the embedding size at MAX_WORDS; +1 accounts for index 0, which
    # word_index never assigns to a word.
    vocab_size = min(MAX_WORDS, len(tokenizer.word_index) + 1)
    model = build_rnn(MAX_LEN, vocab_size, EMBEDDING_DIM)

    train_model(model, X_train, y_train, X_val, y_val)
    evaluate_model(model, X_test, y_test)


if __name__ == "__main__":
    main()

Epoch 1/7
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 92ms/step - accuracy: 0.6079 - loss: 0.9880 - val_accuracy: 0.6331 - val_loss: 0.9134
Epoch 2/7
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 91ms/step - accuracy: 0.6330 - loss: 0.9159 - val_accuracy: 0.6988 - val_loss: 0.6940
Epoch 3/7
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 95ms/step - accuracy: 0.7187 - loss: 0.6422 - val_accuracy: 0.7654 - val_loss: 0.5941
Epoch 4/7
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 91ms/step - accuracy: 0.7816 - loss: 0.5252 - val_accuracy: 0.7730 - val_loss: 0.5825
Epoch 5/7
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 91ms/step - accuracy: 0.8639 - loss: 0.3893 - val_accuracy: 0.7986 - val_loss: 0.5423
Epoch 6/7
[1m330/330[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 95ms/step - accuracy: 0.8860 - loss: 0.3466 - val_accuracy: 0.7696 - val_loss: 0.6512
Epoch 7/7
[1m330/330