In [27]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

In [28]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
DATA_FILE = "/content/sample_data/all-data.csv"
stop_words = set(stopwords.words('english'))

In [30]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    words = text.lower().split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

In [31]:
df = pd.read_csv("/content/sample_data/all-data.csv", encoding='latin-1', header=None)
df.columns = ['Sentiment', 'Sentence']
df['Sentence'] = df['Sentence'].apply(clean_text)
print("Sample cleaned sentence:", df['Sentence'].iloc[0])
print("Unique sentiment classes:", df['Sentiment'].unique())

Sample cleaned sentence: according gran company plans move production russia although company growing
Unique sentiment classes: ['neutral' 'negative' 'positive']


In [12]:
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
df['Sentiment'] = df['Sentiment'].map(label_map)
y = to_categorical(df['Sentiment'])

In [13]:
x_train, x_test, y_train, y_test = train_test_split(
    df['Sentence'], y, test_size=0.2, random_state=42, stratify=df['Sentiment']
)

In [14]:
MAX_VOCAB = 10000

tokenizer = Tokenizer(num_words=MAX_VOCAB)
tokenizer.fit_on_texts(x_train)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_test_seq = tokenizer.texts_to_sequences(x_test)

# Dynamic max length
max_len = int(np.mean([len(seq) for seq in x_train_seq])) + 5

x_train_pad = pad_sequences(x_train_seq, maxlen=max_len, padding='post', truncating='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=max_len, padding='post', truncating='post')

print("Vocabulary size:", len(tokenizer.word_index))
print("Max sequence length:", max_len)

Vocabulary size: 8323
Max sequence length: 16


In [15]:
EMBED_SIZE = 64
LSTM_UNITS = 64
NUM_CLASSES = 3  # negative, neutral, positive

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_SIZE, input_length=max_len),
    LSTM(LSTM_UNITS, dropout=0.3, recurrent_dropout=0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(NUM_CLASSES, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()



In [16]:
checkpoint = ModelCheckpoint(
    filepath="models/LSTM_multiclass.h5",
    monitor="val_accuracy",
    save_best_only=True,
    verbose=1
)

history = model.fit(
    x_train_pad,
    y_train,
    batch_size=64,
    epochs=7,
    validation_split=0.2,
    callbacks=[checkpoint]
)

Epoch 1/7
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.5348 - loss: 1.0301
Epoch 1: val_accuracy improved from -inf to 0.60696, saving model to models/LSTM_multiclass.h5




[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 66ms/step - accuracy: 0.5357 - loss: 1.0291 - val_accuracy: 0.6070 - val_loss: 0.8701
Epoch 2/7
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.6108 - loss: 0.8446
Epoch 2: val_accuracy improved from 0.60696 to 0.67655, saving model to models/LSTM_multiclass.h5




[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 58ms/step - accuracy: 0.6114 - loss: 0.8437 - val_accuracy: 0.6765 - val_loss: 0.7293
Epoch 3/7
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.7492 - loss: 0.6106
Epoch 3: val_accuracy did not improve from 0.67655
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 54ms/step - accuracy: 0.7495 - loss: 0.6101 - val_accuracy: 0.6753 - val_loss: 0.7168
Epoch 4/7
[1m48/49[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 64ms/step - accuracy: 0.8034 - loss: 0.4805
Epoch 4: val_accuracy did not improve from 0.67655
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 71ms/step - accuracy: 0.8038 - loss: 0.4800 - val_accuracy: 0.6727 - val_loss: 0.8401
Epoch 5/7
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.8333 - loss: 0.3805
Epoch 5: val_accurac



[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 59ms/step - accuracy: 0.8335 - loss: 0.3805 - val_accuracy: 0.6869 - val_loss: 0.9148
Epoch 6/7
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step - accuracy: 0.8613 - loss: 0.3235
Epoch 6: val_accuracy improved from 0.68686 to 0.69974, saving model to models/LSTM_multiclass.h5




[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 92ms/step - accuracy: 0.8612 - loss: 0.3237 - val_accuracy: 0.6997 - val_loss: 0.8399
Epoch 7/7
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.8891 - loss: 0.2700
Epoch 7: val_accuracy improved from 0.69974 to 0.72294, saving model to models/LSTM_multiclass.h5




[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 62ms/step - accuracy: 0.8893 - loss: 0.2698 - val_accuracy: 0.7229 - val_loss: 0.8529


In [17]:
loss, acc = model.evaluate(x_test_pad, y_test)
print(f"\nTest Accuracy: {acc*100:.2f}%")

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7000 - loss: 1.0142

Test Accuracy: 70.62%


In [34]:
reverse_label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

def predict_sentiment(sentence):
    cleaned = clean_text(sentence)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len, padding='post')

    prediction = model.predict(padded)
    predicted_class = np.argmax(prediction)

    print(f"\nCleaned Sentence: {cleaned}")
    print("Prediction Scores:", prediction)
    return reverse_label_map[predicted_class]

# Example prediction 1
sample_sentence = "The company's revenue grew rapidly this year and the future looks very promising."
result = predict_sentiment(sample_sentence)
print("Predicted Sentiment:", result)

# Example prediction 2
sample_sentence = "The company announced its quarterly report during a press conference today."
result = predict_sentiment(sample_sentence)
print("Predicted Sentiment:", result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step

Cleaned Sentence: companys revenue grew rapidly year future looks promising
Prediction Scores: [[4.5183967e-03 4.2997814e-05 9.9543864e-01]]
Predicted Sentiment: positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step

Cleaned Sentence: company announced quarterly report press conference today
Prediction Scores: [[0.13363299 0.70913047 0.15723649]]
Predicted Sentiment: neutral
