<a href="https://colab.research.google.com/github/ellyasiml/CNN-LSTM-sentiment-analysis/blob/main/ta_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install symspellpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from symspellpy import SymSpell, Verbosity

In [None]:
# Load the GloVe vectors into a {token: float32 vector} lookup table.
embedding_dim = 100
embedding_dict = {}
with open("drive/MyDrive/dataset/dataset TA/glove.twitter.27B.100d.txt", 'r', encoding='utf-8') as f:
    for line in f:
        # Split from the right on single spaces so that exactly `embedding_dim`
        # trailing fields form the vector. A bare split() also splits on other
        # Unicode whitespace (e.g. U+00A0, U+0085), which would push a token
        # containing such a character into the numeric fields.
        values = line.rstrip().rsplit(" ", embedding_dim)
        if len(values) != embedding_dim + 1:
            # Blank or truncated line: no complete vector to store.
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_dict[word] = vector

In [None]:
# Restore the saved Keras models from disk: the CNN first, then the LSTM.
cnn_model, lstm_model = (
    tf.keras.models.load_model(model_path)
    for model_path in (
        'drive/MyDrive/dataset/dataset TA/cnn_model.h5',
        'drive/MyDrive/dataset/dataset TA/lstm_model.h5',
    )
)

# Read the tweets to score and keep the "tweet" column's values for preprocessing.
tweets_df = pd.read_csv('drive/MyDrive/dataset/dataset TA/data_combined.csv')
tweets = tweets_df["tweet"].values

  tweets_df = pd.read_csv('drive/MyDrive/dataset/dataset TA/data_combined.csv')


In [None]:
# Create the SymSpell spell checker and load its term-frequency dictionary.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# Same "drive/MyDrive" root as the other data paths in this notebook.
dictionary_path = "drive/MyDrive/dataset/dataset TA/wiki-id-formatted-1000.txt"
term_index = 0   # column of the dictionary file holding the term
count_index = 1  # column of the dictionary file holding the frequency count
# load_dictionary() returns False when the file cannot be found; execution then
# continues with an empty dictionary, so watch for this message.
if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
    print("Dictionary file not found")

# function to correct spelling errors in tweets
def correct_spellings(text):
    """Return ``text`` with SymSpell compound spelling correction applied.

    Args:
        text: Raw tweet text. Non-string values (for example ``None`` or the
            float NaN pandas uses for missing CSV cells) are accepted.

    Returns:
        The term of the first suggestion from ``sym_spell.lookup_compound``,
        ``text`` unchanged when no suggestion is returned, or ``""`` when
        ``text`` is not a string or contains only whitespace.
    """
    # lookup_compound() expects a string; guard so one missing or blank row
    # does not abort correction of the whole dataset.
    if not isinstance(text, str) or not text.strip():
        return ""
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    return suggestions[0].term if suggestions else text

# Run every tweet through the spelling corrector and keep the results as a list.
tweets = list(map(correct_spellings, tweets))

In [None]:
# Convert the tweets to sequences of integer word indices, keeping only the
# 5000 most frequent words.
# The spell-corrected `tweets` are tokenized here; tokenizing the raw
# tweets_df['tweet'] column would leave the spelling correction without effect.
# NOTE(review): this tokenizer is fitted in this notebook, not loaded from
# training. Its word indices only match the trained models if training fitted
# on the same texts with the same preprocessing - confirm against the training
# notebook.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)

In [None]:
# Define a function to compute the maximum input shape across multiple models
def get_max_input_shape(models):
    """Return the largest fixed input length (``input_shape[1]``) among ``models``.

    Args:
        models: Iterable of objects exposing an ``input_shape`` sequence whose
            element at index 1 is the input length, or ``None`` when that axis
            has no fixed length.

    Returns:
        The largest non-``None`` ``input_shape[1]``, never less than 0.
        Returns 0 when ``models`` is empty or no model has a fixed length.
    """
    # A None length cannot be compared with an int (TypeError), so such models
    # are left out of the maximum.
    fixed_lengths = [
        model.input_shape[1]
        for model in models
        if model.input_shape[1] is not None
    ]
    # The leading 0 provides the floor for empty input.
    return max([0, *fixed_lengths])
  
# Both models are fed the same batch, so pad every sequence to the larger of
# the two models' input lengths.
max_length = get_max_input_shape(models=[cnn_model, lstm_model])
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=max_length,
)

In [None]:
# Build the (num_words x embedding_dim) matrix of GloVe vectors, indexed by the
# tokenizer's word index; rows without a GloVe entry stay all-zero.
num_words = min(5000, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))
for token, index in tokenizer.word_index.items():
    glove_vector = embedding_dict.get(token)
    # Fill a row only for indices under the 5000-word cap that have a vector.
    if index < 5000 and glove_vector is not None:
        embedding_matrix[index] = glove_vector

In [None]:
# Score the padded tweets with each network: CNN first, then LSTM.
cnn_predictions = cnn_model.predict(padded_sequences)
lstm_predictions = lstm_model.predict(padded_sequences)

# Round every predicted score to the nearest integer and store it as int.
cnn_labels = cnn_predictions.round().astype(int)
lstm_labels = lstm_predictions.round().astype(int)



In [None]:
# Display the current value of cnn_labels for a quick visual check.
cnn_labels

array([[1, 0],
       [1, 0],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]])

In [None]:
# Display the current value of lstm_labels for a quick visual check.
lstm_labels

array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]])

In [None]:
# Convert the predictions to 1D arrays of 0/1 labels: a row is labelled 1 when
# its first output column is below 0.5, otherwise 0.
# The comparison is vectorised instead of looping over rows in Python, and
# astype(int) keeps an integer dtype even when there are no rows.
cnn_labels = (np.asarray(cnn_predictions)[:, 0] < 0.5).astype(int)
lstm_labels = (np.asarray(lstm_predictions)[:, 0] < 0.5).astype(int)

In [None]:
# Display the current value of cnn_labels for a quick visual check.
cnn_labels

array([0, 0, 1, ..., 0, 0, 0])

In [None]:
# Display the current value of lstm_labels for a quick visual check.
lstm_labels

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Attach each model's labels to tweets_df as a new column (CNN, then LSTM).
for column_name, model_labels in (
    ("cnn_prediction", cnn_labels),
    ("lstm_prediction", lstm_labels),
):
    tweets_df[column_name] = model_labels

In [None]:
# Write the tweet text together with both models' labels to one CSV file,
# without the DataFrame index.
output_columns = ["tweet", "cnn_prediction", "lstm_prediction"]
combined_df = tweets_df[output_columns]
combined_df.to_csv("combined_predictions.csv", index=False)