In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Load the dataset

In [None]:
!curl -O https://raw.githubusercontent.com/akshayjoshii/COVID19-Tweet-Sentiment-Analysis-and-EDA/master/finalSentimentdata2.csv

In [None]:
# Parse the downloaded CSV into the working DataFrame.
csv_path = "finalSentimentdata2.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Bar chart of the number of tweets per value of the `sentiment` column.
sns.countplot(data=df, x='sentiment');

# Data Cleaning

In [None]:
import nltk
import re
import string

In [None]:
def clean_text(text):
    """Normalize a raw tweet before tokenization.

    Steps, in order: lowercase; drop ``[...]`` spans; drop URLs; drop
    ``<...>`` spans; strip ASCII punctuation; turn newlines into spaces; drop
    every token that contains a digit.

    Args:
        text: The raw tweet. Non-string values (``None``, or the ``NaN`` that
            pandas uses for a missing cell) are treated as an empty tweet.

    Returns:
        The cleaned, lowercased string. Whitespace is not collapsed, so removed
        spans can leave consecutive spaces behind.
    """
    # A missing cell is not a str; `.lower()` would raise on it.
    if not isinstance(text, str):
        return ''
    # Make text lowercase
    text = text.lower()
    # Remove text within square brackets
    text = re.sub(r'\[.*?\]', '', text)
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove text within <>
    text = re.sub(r'<.*?>+', '', text)
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space so words on adjacent lines are not glued together
    text = re.sub(r'\n', ' ', text)
    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    # TODO: unicode emojis are left in place; they may carry sentiment.
    return text

In [None]:
# Clean every tweet by running the `text` column through `clean_text`

df['text'] = df['text'].apply(clean_text)

In [None]:
# Count the whitespace-separated words of each tweet, then show the largest count

df['n_words'] = df['text'].map(
    lambda tweet: len(str(tweet).split())
)
df['n_words'].max()

In [None]:
# Caution: we have empty tweets (see the smallest word count below)

df['n_words'].min()

In [None]:
# Distribution of the per-tweet word counts.
sns.displot(data=df, x='n_words');

# Create a Model

In [None]:
import tensorflow as tf

In [None]:
from sklearn import model_selection

In [None]:
# Hold out 30% of the tweets for testing. The fixed random_state keeps the split,
# and everything derived from it, identical across "Restart & Run All".
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df['text'], df['sentiment'], test_size=0.30, random_state=42
)

In [None]:
# Convert each split from a pandas Series into a NumPy array built from its values.
X_train, X_test, y_train, y_test = (
    np.array(split.values.tolist())
    for split in (X_train, X_test, y_train, y_test)
)

# Binarizing labels

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# MultiLabelBinarizer expects one iterable of labels PER SAMPLE. Wrapping the whole
# array in a single list encodes the training set as one sample that carries every
# label (a single all-ones row), which cannot serve as a per-tweet target. Passing the
# bare array is no better when labels are strings: each string would be iterated
# character by character. One single-element list per tweet yields one indicator row
# per tweet, with columns ordered as in `mlb.classes_`.
mlb = MultiLabelBinarizer()
y_train_enc = mlb.fit_transform([[label] for label in y_train])

#mlb.classes_

In [None]:
# Show the encoded labels next to the distinct raw label values.
y_train_enc, set(y_train)

# Tokenizing tweets

In [None]:
# Build the word index from the training tweets only.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

# One more than the number of indexed words (index 0 is presumably left for padding).
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

In [None]:
# Replace every training tweet with its sequence of word indices.
X_train = tokenizer.texts_to_sequences(texts=X_train)

In [None]:
# Encode the test tweets with the index learned from the training tweets.
X_test = tokenizer.texts_to_sequences(texts=X_test)

In [None]:
# Length of the longest encoded training tweet.
max(map(len, X_train))

In [None]:
# Every sequence is padded to this many tokens before being fed to the model.
maxlen = 100

model = tf.keras.Sequential([
    # 20-dimensional word embeddings. mask_zero=True marks index 0 as padding so the
    # GRU skips it: the sequences are post-padded to `maxlen`, and without a mask the
    # GRU's final state would be computed after a long run of padding steps.
    tf.keras.layers.Embedding(vocab_size, 20, input_length=maxlen, mask_zero=True),
    # Single recurrent layer; dropout on both the inputs and the recurrent state.
    tf.keras.layers.GRU(units=32, dropout=0.2, recurrent_dropout=0.2),
    # NOTE(review): one sigmoid unit models a binary target. If `sentiment` has more
    # than two classes, this needs one unit per class with a softmax (and a matching
    # categorical loss) -- confirm against the label encoding and the training call.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [None]:
# Print a summary of the model's layers.
model.summary()

In [None]:
# Bring both splits to a fixed length of `maxlen`, padding at the end of each sequence.
X_train_pad, X_test_pad = (
    tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=maxlen, padding='post')
    for seqs in (X_train, X_test)
)

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)