In [None]:
# Preprocess the Twitter data and create a CSV file with tweet text and sentiment labels for use in text embedding through an LSTM:
import pandas as pd
import re
from textblob import TextBlob

# Load the Twitter data
data = pd.read_csv('covid_data.csv')

# Remove exact duplicate rows
data.drop_duplicates(inplace=True)

# Extract unique users: the author handle is in 'screen_name'
# (the 'text' column holds the tweet body, not the user).
unique_users = data['screen_name'].unique()

# Keep only the columns with tweet text and author metadata.
# Take an explicit copy so the column assignments further down modify an
# independent frame rather than a slice of the raw data (this is what
# triggers pandas' SettingWithCopyWarning).
data = data[['screen_name','text','location','friends_count']].copy()

# Define a function to clean the tweet text
def clean_tweet(text):
    """Normalize a raw tweet into lowercase, space-separated word tokens.

    URLs and @mentions are removed, the '#' marker is stripped from hashtags
    (the tag word itself is kept), every run of non-word characters is
    collapsed to a single space, and the result is lowercased.

    Args:
        text: Raw tweet text. Non-string values (for example the float NaN
            that pandas uses for a missing cell) are treated as an empty
            tweet.

    Returns:
        The cleaned text. It may be empty and may carry a leading or
        trailing space where punctuation or a removed token used to be.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings such as NaN.
        return ''
    # Remove each URL up to the next whitespace so that query strings,
    # dashes, underscores, etc. do not survive as stray tokens.
    text = re.sub(r'https?://\S+', '', text)
    # Remove mentions
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Drop the hashtag marker but keep the tag word
    text = re.sub(r'#', '', text)
    # Replace runs of non-alphanumeric characters with a single space
    text = re.sub(r'\W+', ' ', text)
    # Convert to lowercase
    return text.lower()

# Run every tweet through clean_tweet and store the result back in 'text'
data['text'] = data['text'].map(clean_tweet)

# Define a function to get the sentiment label
def get_sentiment(text):
    """Return the sign of the TextBlob polarity of ``text``.

    Returns:
        1 when the polarity is greater than zero (positive),
        -1 when it is less than zero (negative),
        0 otherwise (neutral).
    """
    polarity = TextBlob(text).sentiment.polarity
    # At most one comparison is true, so the difference is 1, -1 or 0.
    return int(polarity > 0) - int(polarity < 0)

# Label every cleaned tweet with its sentiment class
data['sentimental_value'] = data['text'].map(get_sentiment)

# Restrict the frame to the columns that are written out, in output order
output_columns = ['screen_name', 'text', 'sentimental_value', 'location', 'friends_count']
data = data[output_columns]

# Write the preprocessed tweets to CSV without the index column
data.to_csv('preprocessed_twitter_data.csv', index=False)


In [None]:
# Number of rows in the preprocessed frame
len(data.index)

In [None]:
#implementation of text embedding through LSTM for Twitter data using Keras

import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import logging
plt.style.use('fivethirtyeight')
from keras.models import Model

# Load the Twitter data
data = pd.read_csv('preprocessed_twitter_data.csv')

# A tweet that was cleaned down to an empty string is read back from the CSV
# as NaN (a float); the Tokenizer needs str, so restore those as ''.
data['text'] = data['text'].fillna('').astype(str)

print("processing data...")
sns.countplot(x='sentimental_value',data=data)

# The stored labels are -1 / 0 / 1 (negative / neutral / positive). Shift them
# to 0 / 1 / 2 because sparse_categorical_crossentropy expects class indices
# that start at 0.
num_classes = 3
labels = data['sentimental_value'].astype(int) + 1

# Define the maximum number of words to keep in the vocabulary.
# NOTE(review): this uses the number of tweets as the cap, which is a
# heuristic rather than a measured vocabulary size - confirm it is intended.
vocab_size = len(data)

# Define the maximum length of a tweet (in tokens)
max_length = 1000

# Define the embedding dimension
embedding_dim = 100

# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(data['text'], labels, test_size=0.25, random_state=42)

# Tokenize the training data (the vocabulary is fitted on the training split only)
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(train_data)
train_sequences = tokenizer.texts_to_sequences(train_data)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')

# Tokenize the testing data with the vocabulary fitted above
test_sequences = tokenizer.texts_to_sequences(test_data)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Define the LSTM model
model = Sequential()
# mask_zero=True lets the LSTM skip the zero padding appended after each
# tweet; otherwise the final state is computed over hundreds of padding steps.
# The Tokenizer never assigns index 0 to a word, so 0 is free to be the mask.
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length, mask_zero=True))
model.add(Dropout(0.5))
model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5))
# One output unit per sentiment class. A single softmax unit would always
# output 1.0 and could not learn anything.
model.add(Dense(num_classes, activation='softmax'))

# summary() prints the table itself and returns None, so do not wrap it in print()
model.summary()

# Compile the model for multi-class classification with integer labels
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 10

# Train the model
history = model.fit(train_padded, train_labels, validation_data=(test_padded, test_labels), epochs=epochs, batch_size=32)



def plot_model_output(history, epochs):
    """Plot training vs. validation loss and accuracy in two figures.

    Args:
        history: Mapping with the keys 'loss', 'val_loss', 'accuracy' and
            'val_accuracy', each holding one value per epoch.
        epochs: Number of epochs; the x-axis runs from 1 to ``epochs``.
    """
    epoch_axis = range(1, epochs + 1)
    # One figure per entry: (y-axis label, ((history key, legend label), ...))
    panels = (
        ('Loss', (('loss', 'Training Loss'), ('val_loss', 'Validation Loss'))),
        ('Accuracy', (('accuracy', 'Training Accuracy'), ('val_accuracy', 'Validation Accuracy'))),
    )
    for y_label, curves in panels:
        plt.figure()
        for key, legend_label in curves:
            plt.plot(epoch_axis, history[key], label=legend_label)
        plt.legend()
        plt.xlabel('Epochs')
        plt.ylabel(y_label)

    plt.show()


# Score the trained model on the held-out split and report accuracy in percent
evaluation = model.evaluate(test_padded, test_labels, verbose=0)
loss, accuracy = evaluation
print('Test accuracy: %f' % (100 * accuracy))

# History keys and their legend labels, then draw the training curves
plot_labels = ['Training Loss', 'Validation Loss', 'Training Accuracy', 'Validation Accuracy']
metric_names = ['loss', 'val_loss', 'accuracy', 'val_accuracy']
plot_model_output(history.history, epochs)




In [None]:
# The first layer of the model is the Embedding layer; its first weight
# array is the embedding matrix.
embedding_layer = model.layers[0]
lstm_embeddings = embedding_layer.get_weights()[0]

In [None]:
# Show the embedding matrix, then its row count and the length of its first row
print(lstm_embeddings)
row_count = len(lstm_embeddings)
print(row_count)
first_row = lstm_embeddings[0]
print(len(first_row))

In [None]:
# Write the embedding matrix to a plain-text file
np.savetxt(fname="lstm_embedding.txt", X=lstm_embeddings)

This code performs the following steps:

1. Loads the Twitter data from a CSV file.
2. Defines the maximum number of words to keep in the vocabulary, the maximum length of a tweet, and the embedding dimension.
3. Splits the data into training and testing sets.
4. Tokenizes the training and testing data using Keras' Tokenizer class.
5. Pads the sequences to a fixed length using Keras' pad_sequences function.
6. Defines an LSTM model using Keras' Sequential API.
7. Compiles the model with a cross-entropy loss and the Adam optimizer.
8. Trains the model on the training data.
9. Evaluates the model on the testing data and prints the accuracy.