In [None]:
%pip install contractions

# ------------------------------------------------------------------------------------------- #

import pandas as pd
import numpy as np
import contractions
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet

# ------------------------------------------------------------------------------------------- #

import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# ------------------------------------------------------------------------------------------- #

from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, LSTM, Bidirectional
from tensorflow.keras.models import Sequential

# ------------------------------------------------------------------------------------------- #

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# ------------------------------------------------------------------------------------------- #

import matplotlib.pyplot as plt

# ------------------------------------------------------------------------------------------- #

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Load the train/test splits from ';'-delimited files; column names are
# supplied explicitly through `names`.
# NOTE: the `____` placeholders (reader function and file paths) are
# intentionally left for the reader to fill in.
train_data = pd.____("____", delimiter = ";", names = ["text", "sentiment"])
test_data = pd.read_csv("____", delimiter = ";", names = ["text", "sentiment"])
display(train_data.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping
# it in display() only added a stray "None" to the cell output.
test_data.info()

In [None]:
def expand_contractions(df_series):
    """ Expands contractions from text in pandas series.
        (Eg: can't --> cannot)

    The input series is not modified: a new series carrying the same index
    is returned, so the function works with any index and not only with the
    default 0..n-1 one.

    Args:
        df_series (pd.Series): Pandas series containing text data.

    Returns:
        (pd.Series): New pandas series containing text data after
                     expanding contractions.
    """

    # Element-wise map instead of `df_series[i] = ...`: integer keys are
    # resolved by label, which raises KeyError on a non-default index, and
    # assigning through them changed the caller's data as a side effect.
    return df_series.map(contractions.fix)


def get_pos(token):
    """ Returns "part of speech" of the token which is understandable 
        by WordNetLemmatizer.
        
    Args:
        token (str): Single token whose POS to be identified.
    
    Returns:
        (str): POS tag of the token in a format understandable by WordNetLemmatizer.
               Falls back to wordnet.NOUN when the tag has no WordNet equivalent.
    """
    
    # nltk.pos_tag expects a sequence of tokens. Passing the bare string makes
    # it iterate over the characters, so the tag would describe the first
    # letter instead of the word; wrap the token in a one-element list.
    treebank_tag = nltk.pos_tag([token])[0][1]

    # Only the first letter of the tag is needed to pick the WordNet category
    pos_tag = treebank_tag[0].upper()
    pos_tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
    
    # Returns wordnet.NOUN as default if it can't find exact POS 
    return pos_tag_dict.get(pos_tag, wordnet.NOUN)


def lemmatize_series(df_series, remove_stopwords=False):
    """ Lemmatizes text data in pandas series and optionally removes stopwords.

    Each text is split on whitespace, every remaining word is lemmatized
    with the POS returned by `get_pos`, and the words are joined back with
    single spaces. The input series is not modified; the result carries the
    same index.
        
    Args:
        df_series (pd.Series): Pandas series containing text data.
        remove_stopwords (bool): Removes stopwords from the text if True. 
                                 Defaults to False.
                                 
    Returns:
        (pd.Series): New pandas series containing lemmatized text data 
                     without stopwords if specified.
    """
    
    lm = WordNetLemmatizer()

    # An empty set filters nothing, so a single code path serves both modes
    # (stopword matching is case-insensitive via word.lower()).
    stop_words = set(stopwords.words("english")) if remove_stopwords else set()

    def _lemmatize_text(text):
        # Lemmatize one text, dropping the words found in stop_words
        return ' '.join(
            lm.lemmatize(word, get_pos(word))
            for word in text.split()
            if word.lower() not in stop_words
        )

    # Element-wise map instead of `df_series[i] = ...`: integer keys are
    # resolved by label (KeyError on a non-default index) and the assignment
    # changed the caller's data as a side effect.
    return df_series.map(_lemmatize_text)


def preprocess_text(df_series, remove_stopwords=True):
    """ Cleans text data: expands contractions, replaces every character that
        is not an ASCII letter, a digit or a space with a space, and
        lemmatizes the result.
        
    Args:
        df_series (pd.series): Pandas series object containing text.
        remove_stopwords (bool): Removes stopwords from text if True. Defaults to True. 
        
    Returns:
        df_series (pd.series): Pandas series object containing preprocessed text. 
    """
    
    # Expand contractions (Eg: can't --> cannot).
    # A copy is passed on so the caller's column is never modified by the
    # processing steps.
    df_series = expand_contractions(df_series.copy())
    
    # Removes non alphanumeric characters.
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave the text unchanged.
    df_series = df_series.str.replace("[^a-zA-Z0-9 ]", " ", regex=True)
    
    # Lemmatize text
    df_series = lemmatize_series(df_series, remove_stopwords = remove_stopwords)
    
    return df_series

In [None]:
# Run the same cleaning pipeline over the text column of both splits
# (train first, then test) and show the processed training frame.
for split_df in (train_data, test_data):
    split_df["text"] = preprocess_text(split_df["text"])
display(train_data)

In [None]:
# Boxplot of the number of whitespace-separated tokens per observation
tokens_per_text = train_data["text"].str.split().map(len)
ax = tokens_per_text.plot.box(figsize=(6,8))
ax.set_ylabel("( Number of Tokens )")

In [None]:
# Preprocessing targets: sentiment labels are encoded as integers and then
# converted with to_categorical before training.
# NOTE: the `____` placeholders are intentionally left for the reader to fill in.

# The encoder is fitted on the training labels only
le = LabelEncoder()
le.fit(train_data["sentiment"])

train_targets = le.____(train_data["sentiment"])
train_targets = to_categorical(np.asarray(train_targets))

# The test labels reuse the encoder fitted on the training labels
test_targets = le.transform(test_data["sentiment"])
test_targets = ____(np.asarray(test_targets))

In [None]:
# Defining parameters
# NOTE: the `____` placeholders are intentionally left for the reader to fill in.
vocab_size = 10000       # passed to Tokenizer(num_words=...) and to the Embedding layer
embedding_dim = 32
max_len = 25             # `maxlen` handed to pad_sequences
trunc_type = "____"      # `truncating` argument of pad_sequences
padding_type = "____"    # `padding` argument of pad_sequences
oov_token = "<OOV>"      # `oov_token` argument of the Tokenizer

In [None]:
# Build the tokenizer and fit its vocabulary on the training texts
tokenizer = Tokenizer(
    num_words = vocab_size,
    oov_token = oov_token
)
tokenizer.fit_on_texts(train_data["text"])

In [None]:
# Converting texts to sequences and padding them to make
# them compatible with embedding layers.
# Both splits use the tokenizer fitted on the training texts and the same
# truncating / padding / maxlen settings.
# NOTE: the `____` placeholders are intentionally left for the reader to fill in.

training_seq = tokenizer.texts_to_sequences(train_data["text"])
training_padded = pad_sequences(
    training_seq,
    truncating = trunc_type,
    padding = padding_type,
    maxlen = max_len
)

test_seq = tokenizer.____(test_data["text"])
test_padded = ____(
    test_seq,
    truncating = trunc_type,
    padding = padding_type,
    maxlen = max_len
)

In [None]:
# Inspecting the distinct sentiment labels present in the training data
train_data["sentiment"].unique()

In [None]:
# Model: embedding -> bidirectional LSTM (64 units) -> dense hidden layer
# (256 units) -> dense output layer (6 units).
# NOTE: the `____` placeholders are intentionally left for the reader to fill in.
# NOTE(review): the output size 6 is hard-coded; presumably it equals the
# number of sentiment classes -- confirm against `le.classes_`.
lstm_model = Sequential([
        Embedding(vocab_size, ____, input_length = ____),
        Bidirectional(LSTM(64)),
        Dense(256, activation = '____'),
        Dense(6, activation = '____')
    ])

In [None]:
# Configure loss, optimizer and metrics, then print the layer summary.
# NOTE: the `____` placeholders are intentionally left for the reader to fill in.
# NOTE(review): the ModelCheckpoint callback monitors 'val_accuracy', so the
# metric chosen here presumably needs to be reported under that name -- confirm.
lstm_model.compile(loss = "____", optimizer = "____", metrics = ["____"])
lstm_model.summary()

In [None]:
# Training schedule handed to `lstm_model.fit`
NUM_EPOCHS: int = 10   # value of the `epochs` argument
BATCH_SIZE: int = 32   # value of the `batch_size` argument

In [None]:
# Stop training once the validation loss has not improved for 3 epochs.
# `verbose` is documented as 0 (silent) or 1 (print a message); the previous
# value 100 was outside that range.
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)

# Save a checkpoint only when the validation accuracy reaches a new maximum
mc = ModelCheckpoint(
    filepath = "./checkpoint",
    monitor = 'val_accuracy',
    mode = 'max',
    save_best_only = True
)

In [None]:
# Train the model; `history` keeps the per-epoch metrics returned by fit.
# NOTE(review): the test split is passed as validation data, so early
# stopping and checkpoint selection are driven by the same data that is
# later used for the final evaluation -- consider a separate validation split.
history = lstm_model.fit(
    training_padded,
    train_targets,
    validation_data = (test_padded, test_targets),
    epochs = NUM_EPOCHS,
    batch_size = BATCH_SIZE,
    callbacks = [es, mc]
)

In [None]:
# Plot the per-epoch training and validation loss recorded during fit.
history_dict = history.history
# These are loss values, so they are named accordingly (previously *_acc)
train_loss = history_dict['loss']
val_loss = history_dict['val_loss']
epochs = range(1, len(train_loss) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, train_loss, 'b', label='Training error')
# Only the keyword colour is given here: combining the 'b' format string
# with color="orange" defined the colour twice and made matplotlib warn.
ax.plot(epochs, val_loss, color="orange", label='Validation error')
ax.set_title('Training and Validation error')
ax.set_xlabel('Epochs')
ax.set_ylabel('Error')
ax.legend()
plt.show()

In [None]:
# Class predictions for the test set: index of the highest score per row.
# NOTE: the `____` placeholder is intentionally left for the reader to fill in.
y_preds = lstm_model.____(test_padded)
y_preds = np.argmax(y_preds, axis = 1)

# Collapse the 2-D encoded targets to class indices. The ndim guard keeps the
# cell re-runnable: once `test_targets` is 1-D, a second argmax over axis 1
# would raise an error.
# NOTE(review): `test_targets` is overwritten here, so re-running the fit
# cell afterwards would receive class indices instead of the encoded matrix.
if np.ndim(test_targets) > 1:
    test_targets = np.argmax(test_targets, axis=1)

In [None]:
# Evaluate on the test set: confusion matrix plus per-class metrics.
# The full set of encoded class ids is passed explicitly so that the matrix
# and the report always line up with `le.classes_`, even when a class never
# occurs in `test_targets` / `y_preds`.
class_ids = np.arange(len(le.classes_))

cm = confusion_matrix(test_targets, y_preds, labels = class_ids)

disp = ConfusionMatrixDisplay(
    confusion_matrix = cm,
    display_labels = le.classes_
)
disp.plot()
plt.show()

# target_names labels the report rows with the sentiment names instead of
# the bare integer ids.
print(
    classification_report(
        test_targets,
        y_preds,
        labels = class_ids,
        target_names = le.classes_
    )
)

In [None]:
# Persist the trained model under "./model".
# NOTE: the `____` placeholder is intentionally left for the reader to fill in.
lstm_model.____("./model")