In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

# Load the dataset

In [None]:
# Download the tweet-sentiment CSV into the current working directory.
# `-O` stores it under its remote file name (finalSentimentdata2.csv).
!curl -O https://raw.githubusercontent.com/akshayjoshii/COVID19-Tweet-Sentiment-Analysis-and-EDA/master/finalSentimentdata2.csv

In [None]:
# Read the downloaded CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="finalSentimentdata2.csv")

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Class balance: number of tweets per sentiment label
sns.countplot(data=df, x="sentiment");

# Data Cleaning

In [None]:
import re
import string

In [None]:
def clean_text(text):
    """Normalize a raw tweet before tokenization.

    The steps run in this order: lowercase, drop ``[...]`` spans, drop URLs,
    drop ``<...>`` spans, drop ASCII punctuation, turn newlines into spaces,
    and drop every word that contains a digit.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example the NaN that
        pandas uses for a missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so removed spans may
        leave consecutive spaces behind.
    """
    # Missing values arrive as float NaN; calling .lower() on them would raise
    if not isinstance(text, str):
        return ""
    # Make text lowercase
    text = text.lower()
    # Remove text within square brackets
    text = re.sub(r"\[.*?\]", "", text)
    # Remove URLs
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    # Remove text within <>
    text = re.sub(r"<.*?>+", "", text)
    # Remove punctuation
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    # Replace newlines with a space so the words on either side stay separate
    text = re.sub(r"\n", " ", text)
    # Remove words containing numbers
    text = re.sub(r"\w*\d\w*", "", text)
    # TODO: unicode emojis are left untouched; they may carry sentiment
    return text

In [None]:
# Run every tweet in the "text" column through `clean_text`

df["text"] = df["text"].apply(clean_text)

In [None]:
# Count whitespace-separated words per tweet, then look at the longest one

df["n_words"] = [len(str(tweet).split()) for tweet in df["text"]]
df["n_words"].max()

In [None]:
# Caution: a minimum of 0 here means some tweets have no words left
# (empty tweets)

df["n_words"].min()

In [None]:
# Distribution of tweet lengths, measured in words
sns.displot(data=df, x="n_words");

# Create a Model

In [None]:
import tensorflow as tf

In [None]:
from sklearn import model_selection

In [None]:
# Hold out 30% of the tweets for testing. The fixed seed makes the split
# (and every score computed from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df["text"], df["sentiment"], test_size=0.30, random_state=42
)

In [None]:
# Convert the pandas splits to NumPy arrays: texts stay 1-D, labels become
# single-column 2-D arrays of shape (n, 1)
X_train, X_test = (
    np.array(split.values.tolist()) for split in (X_train, X_test)
)
y_train, y_test = (
    np.array(split.values.tolist()).reshape(-1, 1) for split in (y_train, y_test)
)

In [None]:
# Note: tf.keras.utils.to_categorical(df.sentiment) cannot be used directly here because it needs integer inputs

# Encoding labels

In [None]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OneHotEncoder

In [None]:
# Show the first five training labels and the shape of the test labels
print(y_train[:5], y_test.shape)

## LabelBinarizer

In [None]:
# Learn the label set on the training labels only, then encode both splits
lb = LabelBinarizer().fit(y_train)
y_train_lb = lb.transform(y_train)

print(lb.classes_)

y_test_lb = lb.transform(list(y_test))

## OneHotEncoder

In [None]:
# Dense (non-sparse) one-hot targets are needed for TF.
# (Passing drop="first" as well would remove the first column; not used here.)
# `sparse` was renamed to `sparse_output` in newer scikit-learn releases, so
# try the new keyword first and fall back to the old one.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)
y_train_ohe = ohe.fit_transform(y_train)  # input must be 2-D, i.e. shape (n, 1)

y_test_ohe = ohe.transform(y_test)

In [None]:
# Shape of the one-hot encoded test labels
y_test_ohe.shape

## LabelEncoder

In [None]:
le = LabelEncoder()
# LabelEncoder works on 1-D label vectors. The labels are stored as (n, 1)
# columns, so flatten them explicitly instead of relying on scikit-learn's
# implicit conversion (which emits a DataConversionWarning).
y_train_le = le.fit_transform(y_train.ravel())

print(le.classes_)

y_test_le = le.transform(y_test.ravel())

In [None]:
# LabelBinarizer and OneHotEncoder should produce the same matrix.
# np.array_equal also requires equal shapes, so a mismatch cannot be hidden
# by broadcasting.
assert np.array_equal(
    y_test_lb, y_test_ohe
), "LabelBinarizer and OneHotEncoder encodings of y_test differ"

# Tokenizing tweets

In [None]:
# Build the vocabulary from the training tweets only
tokenizer = tf.keras.preprocessing.text.Tokenizer()

tokenizer.fit_on_texts(X_train)

# One more than the number of known words
# NOTE(review): presumably the extra slot is index 0, used for padding - confirm
vocab_size = len(tokenizer.word_index) + 1
vocab_size

In [None]:
# Convert each training tweet into a sequence using the fitted tokenizer.
# NOTE(review): X_train is overwritten, so this cell should not be run twice
# without re-running the cells that create X_train.
X_train = tokenizer.texts_to_sequences(X_train)

In [None]:
# Convert each test tweet into a sequence using the fitted tokenizer.
# NOTE(review): X_test is overwritten, so this cell should not be run twice
# without re-running the cells that create X_test.
X_test = tokenizer.texts_to_sequences(X_test)

In [None]:
# Length of the longest tokenized training tweet
max(len(sequence) for sequence in X_train)

In [None]:
maxlen = 100

# The last layer reflects the problem we are solving: one softmax unit per
# class, taken from the width of the one-hot encoded training labels.
n_classes = y_train_ohe.shape[1]

model = tf.keras.Sequential(
    [
        # An explicit input spec replaces Embedding's deprecated `input_length`
        # argument and still lets the model be built before training.
        tf.keras.Input(shape=(maxlen,), dtype="int32"),
        tf.keras.layers.Embedding(vocab_size, output_dim=16, mask_zero=True),
        tf.keras.layers.GRU(units=32, dropout=0.2, recurrent_dropout=0.2),
        tf.keras.layers.Dense(n_classes, activation="softmax"),
    ]
)

In [None]:
# Print the layer-by-layer summary of the model
model.summary()

In [None]:
# Bring every sequence to exactly `maxlen` entries, padding at the end;
# the same options are used for both splits
pad_options = {"padding": "post", "maxlen": maxlen}
X_train_pad = tf.keras.preprocessing.sequence.pad_sequences(X_train, **pad_options)
X_test_pad = tf.keras.preprocessing.sequence.pad_sequences(X_test, **pad_options)

In [None]:
# One-hot targets with a softmax output: categorical cross-entropy loss
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Sanity check before training: shapes of the padded inputs and one-hot targets
X_train_pad.shape, y_train_ohe.shape

In [None]:
# Early stopping with a patience of 10 epochs; the best weights seen are
# restored when training stops
es = tf.keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# 20% of the training data is held out for validation during fitting
fit_options = {
    "batch_size": 8,
    "epochs": 1_000,
    "validation_split": 0.2,
    "shuffle": True,
    "callbacks": [es],
}
history = model.fit(X_train_pad, y_train_ohe, **fit_options)

In [None]:
import matplotlib.pyplot as plt

# The plotted series are the accuracy histories, so the legend, title and
# axis label say "accuracy" (they previously said "loss").
fig, ax = plt.subplots(figsize=(10, 7))

ax.plot(
    history.epoch, history.history["accuracy"], label="accuracy"
)  # Accuracy curve for training set
ax.plot(
    history.epoch, history.history["val_accuracy"], label="val_accuracy"
)  # Accuracy curve for validation set

ax.set_title("Accuracy Curve", fontsize=18)
ax.set_xlabel("Epochs", fontsize=15)
ax.set_ylabel("Accuracy", fontsize=15)
ax.grid(alpha=0.3)
ax.legend();

In [None]:
# Score the trained network on the held-out test split
test_loss, test_acc = model.evaluate(x=X_test_pad, y=y_test_ohe)

# Multi-class classification with scikit-learn

In [None]:
# logistic regression for multi-class classification using built-in one-vs-rest
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# define dataset
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=5,
    n_classes=3,
    random_state=1,
)
# define model
model = LogisticRegression(multi_class="ovr")
# fit model
model.fit(X, y)
# make predictions
yhat = model.predict(X)

| Problem                           | Loss function              | Metrics           |
| ---                               | ---                        | ---               |
| Two exclusive classes             | `binary_crossentropy`      | `accuracy`        |
| More than 2 exclusive classes     | `categorical_crossentropy` | `accuracy`        |
| More than 2 non exclusive classes | `binary_crossentropy`      | `binary_accuracy` |