In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [3]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Collecting tensorflow-intel==2.16.1 (from tensorflow)
  Downloading tensorflow_intel-2.16.1-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=23.5.26 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading flatbuffers-24.3.7-py2.py3-none-any.whl.metadata (849 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.16.1->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow-i

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [5]:
# Load data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [6]:
# Text Preprocessing
# (You may need to install nltk and download its data using nltk.download() for stopwords and WordNet)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [7]:
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
def preprocess_text(text):
    tokens = text.split()
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)

In [10]:
df_train['clean_text'] = df_train['safe_text'].apply(preprocess_text)
df_test['clean_text'] = df_test['safe_text'].astype(str).apply(preprocess_text)

In [11]:
# Tokenization and Padding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['clean_text'])

In [12]:
X_train = tokenizer.texts_to_sequences(df_train['clean_text'])
X_test = tokenizer.texts_to_sequences(df_test['clean_text'])

In [13]:
maxlen = 100  # Choose the maximum length of sequences
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [14]:
# Splitting the training data into training and validation sets
y_train = df_train['label']
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [16]:
# Model Training
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

In [17]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_val, y_val))

Epoch 1/5
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 203ms/step - accuracy: 0.4881 - loss: nan - val_accuracy: 0.4938 - val_loss: nan
Epoch 2/5
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 195ms/step - accuracy: 0.4926 - loss: nan - val_accuracy: 0.4938 - val_loss: nan
Epoch 3/5
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 205ms/step - accuracy: 0.4903 - loss: nan - val_accuracy: 0.4938 - val_loss: nan
Epoch 4/5
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 241ms/step - accuracy: 0.4916 - loss: nan - val_accuracy: 0.4938 - val_loss: nan
Epoch 5/5
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 200ms/step - accuracy: 0.4878 - loss: nan - val_accuracy: 0.4938 - val_loss: nan


<keras.src.callbacks.history.History at 0x2e43ec13bd0>

In [18]:
# Model Evaluation
y_pred = (model.predict(X_val) > 0.5).astype(int)
print("Validation Set Performance: ")
print(classification_report(y_val, y_pred))

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 60ms/step
Validation Set Performance: 
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00       231
         0.0       0.49      1.00      0.66       988
         1.0       0.00      0.00      0.00       782

    accuracy                           0.49      2001
   macro avg       0.16      0.33      0.22      2001
weighted avg       0.24      0.49      0.33      2001



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# model accuracy score
accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy*100)

In [None]:
# Model Deployment: Predict sentiments of test data
y_pred_test = (model.predict(X_test) > 0.5).astype(int)

# Save predictions to submission.csv
submission_df = pd.DataFrame({
    'tweet_id': df_test['tweet_id'],
    'label': y_pred_test.flatten()
})
submission_df.to_csv('submission_lstm.csv', index=False)
print("Submission file saved successfully.")