In [None]:
import warnings
warnings.filterwarnings("ignore")

import json
import re
import string
from string import punctuation

import matplotlib.pyplot as plt
import nltk
import numpy as np
import optuna
import pandas as pd
import spacy
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense ,LSTM, Bidirectional
from tensorflow.keras.layers import Dropout,Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2
from keras.layers import MaxPooling1D,Conv1D

# English spaCy pipeline; text_pre_process uses it to tokenise and lemmatise reviews.
spacy_nlp = spacy.load('en_core_web_sm')

# Fix the NumPy and TensorFlow global seeds once, before any stochastic step runs.
np.random.seed(6)
tf.random.set_seed(66)

In [None]:
df_train_val = pd.read_csv('review_train.csv')
dic = pd.read_json('review_metadata.json',orient='index').reset_index()
dic

# data inspection

## missing values

In [None]:
df_train_val.isnull().sum()

In [None]:
df_train_val.info()

### negative_reason                

In [None]:
missing_in_reason_s = df_train_val.loc[(pd.isnull(df_train_val['negative_reason_confidence']))]
missing_in_reason_s

In [None]:
null_counts = df_train_val.groupby('airline_sentiment')['negative_reason'].apply(lambda x: x.isna().sum())
null_counts

In [None]:
all_counts =df_train_val['airline_sentiment'].value_counts()
all_counts

In [None]:
df_train_val['negative_reason_confidence'] = df_train_val['negative_reason_confidence'].fillna(0)

In [None]:
df_train_val.info()

In [None]:
df_train_val['review_city'].isnull().value_counts()

# EDA


In [None]:
from dataprep.eda import plot
plot(df_train_val, 'airline_sentiment')

In [None]:
plot(df_train_val, 'airline_sentiment','sentiment_confidence')

In [None]:
plot(df_train_val,'airline_name')

In [None]:
plot(df_train_val, 'airline_sentiment','airline_name')

In [None]:
plot(df_train_val, 'review_text')

In [None]:
plot(df_train_val, 'negative_reason')

In [None]:
from wordcloud import WordCloud

positive_reviews = df_train_val[df_train_val['airline_sentiment'] == 'positive']

# Combine the rows' review_text values into one long string.
# Missing values are dropped and the rest cast to str, so a NaN review
# cannot make str.join raise TypeError.
text = ' '.join(positive_reviews['review_text'].dropna().astype(str))

# Generate and display the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white', min_font_size=10).generate(text)

# Explicit figure/axes; the title is set before tight_layout so the layout
# pass takes it into account.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wordcloud)
ax.axis("off")
ax.set_title("Positive WordCloud")
fig.tight_layout(pad=0)
plt.show()

In [None]:
negative_reviews = df_train_val[df_train_val['airline_sentiment'] == 'negative']

# Combine the rows' review_text values into one long string.
# Missing values are dropped and the rest cast to str, so a NaN review
# cannot make str.join raise TypeError.
text = ' '.join(negative_reviews['review_text'].dropna().astype(str))

# Generate and display the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white', min_font_size=10).generate(text)

# Explicit figure/axes; the title is set before tight_layout so the layout
# pass takes it into account.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wordcloud)
ax.axis("off")
ax.set_title("Negative WordCloud")
fig.tight_layout(pad=0)
plt.show()

In [None]:
neutral_reviews = df_train_val[df_train_val['airline_sentiment'] == 'neutral']

# Combine the rows' review_text values into one long string.
# Missing values are dropped and the rest cast to str, so a NaN review
# cannot make str.join raise TypeError.
text = ' '.join(neutral_reviews['review_text'].dropna().astype(str))

# Generate and display the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white', min_font_size=10).generate(text)

# Explicit figure/axes; the title is set before tight_layout so the layout
# pass takes it into account.
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wordcloud)
ax.axis("off")
ax.set_title("Neutral WordCloud")
fig.tight_layout(pad=0)
plt.show()

# part B

### target variable

In [None]:
y = df_train_val['airline_sentiment']
y

### review text

In [None]:
review = df_train_val['review_text'].str.lower().tolist()
review

Text preprocessing

Stop words removal

In [None]:
punctuations = list(punctuation)
punctuations

In [None]:
## drop URL
df_train_val['review_text'] = df_train_val['review_text'].str.replace(r'http\S+', '', regex=True)

In [None]:
#reference:https://medium.com/coinmonks/text-classifier-with-keras-tensorflow-using-recurrent-neural-networks-ad63dd5fc316
spacy_nlp = spacy.load("en_core_web_sm")
def text_pre_process(sent, stemming=True, lemmatising=True,stop_words = None):
    """Normalise one review into a space-joined string of tokens.

    Steps: cast to str, lowercase and strip; delete every character sequence
    listed in the module-level ``punctuations`` (skipped when that is None);
    tokenise with the module-level ``spacy_nlp`` pipeline; optionally replace
    each token by its lemma; optionally drop stop words.

    Args:
        sent: Raw text; any object is accepted and passed through ``str()``.
        stemming: Accepted for backward compatibility; the function never
            reads it, so no stemming is performed.
        lemmatising: When truthy, tokens are replaced by ``token.lemma_``;
            otherwise the surface form ``token.text`` is kept.
        stop_words: Optional iterable of tokens to discard after
            tokenising/lemmatising. ``None`` disables the filter.

    Returns:
        The processed tokens joined by single spaces.
    """
    sent = str(sent).lower().strip()

    # Only the punctuation stripping is conditional. Tokenising and the
    # return statement used to be nested under this check, so the function
    # returned None whenever the branch was skipped.
    if punctuations is not None:
        for mark in punctuations:
            sent = sent.replace(mark, '')

    # Tokenize
    doc = spacy_nlp(sent)

    # Lemmatizing
    if lemmatising:
        sent_list = [token.lemma_ for token in doc]
    else:
        sent_list = [token.text for token in doc]

    # Stop word removal; a set gives constant-time membership tests
    if stop_words is not None:
        blocked = set(stop_words)
        sent_list = [tok for tok in sent_list if tok not in blocked]

    return " ".join(sent_list)

In [None]:
stop_words = nltk.corpus.stopwords.words("english")

In [None]:
df_train_val['processed_text_review'] = df_train_val['review_text'].apply(
    lambda x: text_pre_process(x,stemming=True, lemmatising=True,stop_words = stop_words))
df_train_val.head()

In [None]:
## Train / validation split
### performed before any encoder or vectoriser is fitted, to mitigate data leakage
all_row_labels = df_train_val.index.to_numpy()
train_indices, val_indices = train_test_split(all_row_labels, test_size=0.2, random_state=6)

# Independent copies of the two partitions
df_train, df_val = (df_train_val.loc[rows].copy() for rows in (train_indices, val_indices))

In [None]:
print('shape of train set:', df_train.shape)
print('shape of validation set:', df_val.shape)

In [None]:
x_train = df_train['processed_text_review']
x_val = df_val['processed_text_review']

In [None]:
x_train

In [None]:
y_train = df_train['airline_sentiment']
y_val = df_val['airline_sentiment']
y_train

In [None]:
from sklearn.preprocessing import OneHotEncoder

# The dense-output switch was renamed from `sparse` to `sparse_output`
# (scikit-learn 1.2) and the old name was later removed; try the new
# spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Read the labels from the split frames rather than from y_train / y_val:
# those names are overwritten below, so re-running this cell would otherwise
# one-hot encode already-encoded arrays.
y_train_array = df_train['airline_sentiment'].to_numpy().reshape(-1, 1)
y_val_array = df_val['airline_sentiment'].to_numpy().reshape(-1, 1)

# Fit on the training labels only, then apply the same mapping to validation
y_train = encoder.fit_transform(y_train_array)
y_val = encoder.transform(y_val_array)

In [None]:
y_train

### TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer_tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))

x_tfidf_train = vectorizer_tfidf.fit_transform(x_train)
x_tfidf_val = vectorizer_tfidf.transform(x_val)

#### random forest

In [None]:
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, random_state=6, shuffle=True)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from optuna.samplers import TPESampler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score


def objective(trial):
    """Optuna objective for the TF-IDF random forest.

    Samples max_depth, n_estimators and min_samples_leaf, scores the forest
    with cross-validated accuracy on the TF-IDF training matrix, and returns
    ``1 - mean accuracy`` so that the study can minimise it.
    """
    # Dict literals evaluate in order, so the suggest calls run in the same
    # sequence as before: max_depth, n_estimators, min_samples_leaf.
    sampled = {
        'max_depth': trial.suggest_int('max_depth', 1, 20),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500, step = 10 ),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
    }
    forest = RandomForestClassifier(criterion='gini', random_state=0, **sampled)

    fold_scores = cross_val_score(forest, x_tfidf_train, y_train, cv=cv, scoring='accuracy')
    return 1 - np.mean(fold_scores)

sampler = TPESampler(seed=0)
study = optuna.create_study(sampler=sampler, direction='minimize')
study.optimize(objective, n_trials = 100, timeout = 7200, n_jobs= -1)

In [None]:
params_rf = study.best_params
params_rf
#{'max_depth': 5, 'n_estimators': 280, 'min_samples_leaf': 7}

In [None]:
rf = RandomForestClassifier(**params_rf,n_jobs = -1,random_state=0)
rf.fit(x_tfidf_train, y_train)

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

y_pred_rf = rf.predict(x_tfidf_val)

# rf was fitted on one-hot targets, so scikit-learn treats it as a
# multi-output model: predict() yields one 0/1 decision per label and a row
# can come back all zeros. argmax over such a row silently picks class 0.
# Rank the classes by each label's probability of being 1 instead.
label_scores_rf = np.column_stack([
    proba[:, list(classes).index(1)]
    for proba, classes in zip(rf.predict_proba(x_tfidf_val), rf.classes_)
])
y_pred_classes_rf = label_scores_rf.argmax(axis=-1)  # For multi-class classification

report = classification_report(y_val.argmax(axis=-1), y_pred_classes_rf, digits=3)
print(report)

In [None]:
y_pred_tfidf_rf = rf.predict(x_tfidf_val)
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float (which has no .round method).
# y_val is one-hot, so this is exact-match accuracy over the three indicator columns.
accuracy_tfidf_rf = round(accuracy_score(y_val, y_pred_tfidf_rf), 3)
print("Accuracy:", accuracy_tfidf_rf)
# Accuracy: 0.604

#### gradient boosting

In [None]:
# Import the estimator class directly: the name `xgb` is bound to the fitted
# search object below (later cells call xgb.predict / xgb.best_params_), so it
# should not also be used as the module alias.
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV


xgb_classifier = XGBClassifier()

# Candidate values sampled by the randomised search
param_grid = {
    'learning_rate': [0.1, 0.01, 0.05],
    'n_estimators': np.arange(50, 500, 10),
    'max_depth': np.arange(1, 20, 1),
}

xgb = RandomizedSearchCV(
    estimator=xgb_classifier,
    param_distributions=param_grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    verbose=5,
    return_train_score=True,
    random_state = 6
)

xgb.fit(x_tfidf_train, y_train)

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

y_pred_xgb = xgb.predict(x_tfidf_val)
y_pred_classes_xgb = y_pred_xgb.argmax(axis=-1)  # For multi-class classification

report = classification_report(y_val.argmax(axis=-1), y_pred_classes_xgb, digits=3)
print(report)

In [None]:
params_xgb = xgb.best_params_
print("Hyperparameters:", params_xgb)

y_pred_tfidf_xgb = xgb.predict(x_tfidf_val)
# Built-in round() works whether accuracy_score returns a NumPy scalar or a
# plain Python float (which has no .round method).
accuracy_tfidf_xgb = round(accuracy_score(y_val, y_pred_tfidf_xgb), 3)
print("Accuracy:", accuracy_tfidf_xgb)
#Hyperparameters: {'n_estimators': 430, 'max_depth': 19, 'learning_rate': 0.1}

TASK C RNN


In [None]:
from torchtext.data.utils import get_tokenizer

# build a tokenizer with spacy
tokenizer = get_tokenizer('spacy')

In [None]:
from torchtext.vocab import vocab
from collections import Counter

# Count token frequencies on the TRAINING reviews only, so that the
# vocabulary does not leak information from the validation split.
counter = Counter()
for doc in x_train:
  counter.update(tokenizer(doc))

# Keep the tokens that occur at least three times (min_freq is inclusive)
vocabulary = vocab(counter, min_freq=3)

# padding() fills short sequences with zeros, so index 0 must not belong to
# a real word: reserve it for an explicit padding token.
if '<pad>' not in vocabulary:
  vocabulary.insert_token('<pad>', 0)
vocabulary.set_default_index(0)

len(vocabulary)

In [None]:
def sent_encode(docs,vocabulary):
    """Convert documents into lists of vocabulary indices.

    Each document is split with the module-level ``tokenizer``; tokens that
    are not contained in ``vocabulary`` are dropped, the rest are replaced by
    ``vocabulary[token]``.

    Args:
        docs: Iterable of text documents.
        vocabulary: Mapping-like object supporting ``in`` and ``[...]`` lookup.

    Returns:
        A list with one list of indices per document, in input order.
    """
    return [
        [vocabulary[tok] for tok in tokenizer(text) if tok in vocabulary]
        for text in docs
    ]

In [None]:
x_rnn_tokenized_train = sent_encode(x_train,vocabulary)
x_rnn_tokenized_val = sent_encode(x_val,vocabulary)

In [None]:
sequence_length= np.max([len(s) for s in x_rnn_tokenized_train])

In [None]:
# padding output as np array
def padding(sents, sequence_length):
    """Right-pad with zeros (or truncate) every sequence to a fixed length.

    Args:
        sents: Iterable of index sequences; each item may be a 1-D torch
            tensor or a plain sequence of ints.
        sequence_length: Target length of every row.

    Returns:
        An int64 NumPy array of shape ``(len(sents), sequence_length)``.
        Sequences shorter than ``sequence_length`` are followed by zeros,
        longer ones keep only their first ``sequence_length`` entries. An
        empty ``sents`` yields an array of shape ``(0, sequence_length)``.
    """
    sents = list(sents)
    sequence_length = int(sequence_length)  # np.max() hands over a NumPy integer

    # Pre-filled with zeros, so only the real tokens have to be copied in
    padded_array = np.zeros((len(sents), sequence_length), dtype=np.int64)
    for row, sent in enumerate(sents):
        ids = torch.as_tensor(sent, dtype=torch.long)[:sequence_length]
        padded_array[row, :len(ids)] = ids.numpy()

    return padded_array

In [None]:
x_rnn_tokenized_train = [torch.tensor(sent, dtype=torch.long) for sent in x_rnn_tokenized_train]
x_rnn_tokenized_val = [torch.tensor(sent, dtype=torch.long) for sent in x_rnn_tokenized_val]

In [None]:
x_rnn_encoded_train=padding(x_rnn_tokenized_train,sequence_length)
x_rnn_encoded_val=padding(x_rnn_tokenized_val,sequence_length)

In [None]:
sequence_length = sequence_length
vocab_size = len(vocabulary)#
n_classes = 3

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

def objective(trial):
    """Optuna objective for the SimpleRNN classifier with a learned embedding.

    Samples ``rnn_units``, ``embedding_dim`` and ``learning_rate``, trains an
    Embedding -> SimpleRNN -> softmax model for at most 20 epochs with early
    stopping on ``val_loss``, and returns the lowest validation loss reached.
    """
    rnn_units = int(trial.suggest_categorical('rnn_units', [2**i for i in range(5, 9)]))# 32 ... 256
    embedding_dim = int(trial.suggest_categorical('embedding_dim', [2**i for i in range(5, 9)]))# 32 ... 256
    learning_rate = float(trial.suggest_categorical('learning_rate', [10**i for i in range(-6, -1)]))

    early_stopping = EarlyStopping( monitor='val_loss', patience=5, restore_best_weights=True)
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length))
    model.add(SimpleRNN(rnn_units))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    history = model.fit(x_rnn_encoded_train, y_train, epochs=20, validation_data=(x_rnn_encoded_val, y_val), batch_size=64, callbacks = [early_stopping], verbose=0)

    # Early stopping keeps the weights of the best epoch, so score the trial
    # by the best validation loss instead of the loss of the final epoch
    # (which is up to `patience` epochs past the optimum).
    val_loss = min(history.history['val_loss'])

    return val_loss


study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=6))
# Trials run one at a time: with parallel workers the seeded sampler no longer
# gives a repeatable trial sequence, and several Keras fits would share one process.
study.optimize(objective, n_trials= 50, timeout = 1200000, n_jobs=1, show_progress_bar=True)

In [None]:
best_params = study.best_params
best_params
#{'rnn_units': 256, 'embedding_dim': 128, 'learning_rate': 1e-05}

In [None]:
best_rnn_units = best_params['rnn_units']
best_learning_rate = best_params['learning_rate']
best_embedding_dim = best_params['embedding_dim']

In [None]:
# Retrain the RNN with the tuned hyperparameters, adding L2 penalties and dropout.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

best_rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=best_embedding_dim, input_length=sequence_length),
    SimpleRNN(best_rnn_units, kernel_regularizer=l2(0.01)),
    Dropout(0.2),
    Dense(n_classes, activation='softmax', kernel_regularizer=l2(0.01)),
])
best_rnn_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=best_learning_rate),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history = best_rnn_model.fit(
    x_rnn_encoded_train, y_train,
    epochs=150,
    validation_data=(x_rnn_encoded_val, y_val),
    batch_size=64,
    callbacks=[early_stopping],
)


In [None]:
best_rnn_model.summary()

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax[0].set(title='Loss')
ax[0].plot(history.history['loss'], label='Training')
ax[0].plot(history.history['val_loss'], label='Validation')
ax[0].legend(loc="upper right")

ax[1].set(title='Accuracy')
ax[1].plot(history.history['accuracy'], label='Training')
ax[1].plot(history.history['val_accuracy'], label='Validation')
ax[1].legend(loc="lower right")

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

y_pred = best_rnn_model.predict(x_rnn_encoded_val)
y_pred_classes = y_pred.argmax(axis=-1)  # For multi-class classification


report = classification_report(y_val.argmax(axis=-1), y_pred_classes, digits=3)
print(report)

In [None]:
# The model was trained with restore_best_weights=True on val_accuracy, so
# report the epoch with the highest validation accuracy rather than the last
# epoch (assumes early stopping actually triggered and restored that epoch).
best_epoch = int(np.argmax(history.history['val_accuracy']))
accuracy_rnn = history.history['val_accuracy'][best_epoch]
print('Accuracy Training data: {:.1%}'.format(history.history['accuracy'][best_epoch]))
# These figures come from the validation split passed to fit(), not from a test set
print('Accuracy Validation data: {:.1%}'.format(accuracy_rnn))

### Glove

In [None]:
# read glove vector
# reference: https://towardsdatascience.com/sentiment-analysis-using-lstm-and-glove-embeddings-99223a87fe8e
def glove_emb(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Every non-empty line is expected to hold a word followed by its
    whitespace-separated vector components.

    Args:
        glove_file: Path to the UTF-8 encoded GloVe file.

    Returns:
        dict mapping each word to a 1-D float NumPy array.
    """
    glove_embeddings = {}

    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            # Skip blank lines (e.g. a trailing newline) and lines without
            # any vector component instead of failing on parts[0].
            if len(parts) < 2:
                continue
            glove_embeddings[parts[0]] = np.array(parts[1:], dtype=float)
    return glove_embeddings

In [None]:
glove_vac = glove_emb('glove.6B.300d.txt')

In [None]:
# Build the GloVe lookup table, one row per vocabulary entry: len(vocabulary) x 300
glove_dim = 300 # dimension of glove
itos = vocabulary.get_itos()
vocab_lenth = len(vocabulary)

# Start from zeros; tokens without a GloVe vector simply keep their zero row
emb_matrix = np.zeros((vocab_lenth, glove_dim))
for row, token in enumerate(itos):
  vector = glove_vac.get(token)
  if vector is not None:
    emb_matrix[row] = vector

emb_matrix = torch.FloatTensor(emb_matrix)
emb_matrix

In [None]:
x_glove_train = x_rnn_encoded_train
x_glove_val = x_rnn_encoded_val

In [None]:
### hyperparameter settings for the GloVe-initialised RNN
n_classes = 3
rnn_units = 16

# Vocabulary-derived sizes
vocab_size = len(vocabulary)#
input_dim = vocab_lenth

# Pretrained embedding: dimensionality and the matrix wrapped in a list
embedding_dim = glove_dim
weight = [emb_matrix]

In [None]:
# SimpleRNN on top of frozen GloVe vectors
early_stopping = EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True)

glove_rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=weight, trainable=False),
    SimpleRNN(rnn_units),
    Dropout(0.2),
    Dense(n_classes, activation='softmax'),
])
glove_rnn_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
history_glove = glove_rnn_model.fit(
    x_glove_train, y_train,
    epochs=150,
    validation_data=(x_glove_val, y_val),
    batch_size=64,
    callbacks=[early_stopping],
)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax[0].set(title='Loss')
ax[0].plot(history_glove.history['loss'], label='Training')
ax[0].plot(history_glove.history['val_loss'], label='Validation')
ax[0].legend(loc="upper right")

ax[1].set(title='Accuracy')
ax[1].plot(history_glove.history['accuracy'], label='Training')
ax[1].plot(history_glove.history['val_accuracy'], label='Validation')
ax[1].legend(loc="lower right")

In [None]:
y_pred_glove = glove_rnn_model.predict(x_glove_val)
y_pred_classes_glove = y_pred_glove.argmax(axis=-1) # For multi-class classification


report_glove = classification_report(y_val.argmax(axis=-1), y_pred_classes_glove, digits=3)
print(report_glove)

Task E


In [None]:
embedding_dim = glove_dim
weight = [emb_matrix]

rnn_units = 128
vocab_size = len(vocabulary)#
n_classes = 3

In [None]:
# Early-stopping callback for the Task E experiments: watches validation
# accuracy, waits 15 epochs without improvement, then restores the best weights.
early_stopping = EarlyStopping(monitor='val_accuracy', patience=15, restore_best_weights=True, verbose=1)

def leaky_relu_e(x):
    """Leaky ReLU activation with a negative-side slope of 0.01."""
    return tf.nn.leaky_relu(features=x, alpha=0.01)

In [None]:
# try1_model = Sequential()
# try1_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=weight, trainable=False))
# try1_model.add(SimpleRNN(rnn_units,activation = leaky_relu_e))
# try1_model.add(Dropout(0.2))
# try1_model.add(Dense(n_classes, activation='softmax'))
# try1_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),
#               loss='categorical_crossentropy',
#               metrics=['accuracy'])
# history_try1 = try1_model.fit(x_glove_train, y_train, epochs=100, validation_data=(x_glove_val, y_val), batch_size=16,
#                               callbacks = [early_stopping]
#                                              )

In [None]:
# y_pred_try1 = try1_model.predict(x_glove_val)
# y_pred_classes_try1 = y_pred_try1.argmax(axis=-1)  # For multi-class classification


# report_try1 = classification_report(y_val.argmax(axis=-1), y_pred_classes_try1, digits=3)
# print(report_try1)
'''
87/87 [==============================] - 4s 50ms/step
              precision    recall  f1-score   support

           0      0.814     0.879     0.845      1662
           1      0.530     0.502     0.516       625
           2      0.740     0.579     0.650       466

    accuracy                          0.743      2753
   macro avg      0.694     0.654     0.670      2753
weighted avg      0.737     0.743     0.737      2753
'''

Trial 2: LSTM

In [None]:
# ### lstm
# try2_model = Sequential()
# try2_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=weight, trainable=False))
# try2_model.add(LSTM(rnn_units))
# try2_model.add(Dropout(0.2))
# try2_model.add(Dense(n_classes, activation='softmax'))
# try2_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),
#               loss='categorical_crossentropy',
#               metrics=['accuracy'])
# history_try2 = try2_model.fit(x_glove_train, y_train, epochs=100,
#                       validation_data=(x_glove_val, y_val), batch_size=16,
#                       callbacks = [early_stopping]
# )

In [None]:
# y_pred_try2 = try2_model.predict(x_glove_val)
# y_pred_classes_try2 = y_pred_try2.argmax(axis=-1)  # For multi-class classification


# report_try2 = classification_report(y_val.argmax(axis=-1), y_pred_classes_try2, digits=3)
# print(report_try2)
'''
87/87 [==============================] - 5s 52ms/step
              precision    recall  f1-score   support

           0      0.738     0.945     0.829      1662
           1      0.661     0.293     0.406       625
           2      0.772     0.573     0.658       466

    accuracy                          0.734      2753
   macro avg      0.723     0.604     0.631      2753
weighted avg      0.726     0.734     0.704      2753
'''

Trial 3: bidirectional LSTM

In [None]:
#### bid lstm

# Bidirectional LSTM on top of the frozen GloVe embedding
try3_model = Sequential()
try3_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=weight, trainable=False))
# No input_shape here: the layer is not the first one in the Sequential model,
# so its input shape comes from the Embedding output and the argument was dead.
try3_model.add(Bidirectional(LSTM(rnn_units, return_sequences=False)))
try3_model.add(Dropout(0.2))
try3_model.add(Dense(n_classes, activation='softmax'))
try3_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
history_try3 = try3_model.fit(x_glove_train, y_train, epochs=100, validation_data=(x_glove_val, y_val), batch_size=16,
                              callbacks = [early_stopping]
                              )

In [None]:
y_pred_try3 = try3_model.predict(x_glove_val)
y_pred_classes_try3 = y_pred_try3.argmax(axis=-1)  # For multi-class classification

report_try3 = classification_report(y_val.argmax(axis=-1), y_pred_classes_try3, digits=3)
print(report_try3)

Trial 4: bidirectional LSTM with a CNN front end

In [None]:
# try4_model = Sequential()
# try4_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=weight, trainable=False))
# try4_model.add(Conv1D(filters=32, kernel_size=3, padding='same'))
# try4_model.add(MaxPooling1D(pool_size=2))
# try4_model.add(Bidirectional(LSTM(rnn_units, return_sequences=False),
#                             input_shape=(sequence_length, embedding_dim)))
# try4_model.add(Dropout(0.2))
# try4_model.add(Dense(n_classes, activation='softmax'))
# try4_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),
#                    loss='categorical_crossentropy',
#                    metrics=['accuracy'])
# history_try4 = try4_model.fit(x_glove_train, y_train, epochs=100, validation_data=(x_glove_val, y_val), batch_size=16,
#                               callbacks=[early_stopping])

In [None]:
# try4_model.summary()

In [None]:
# fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
# ax[0].set(title='Loss')
# ax[0].plot(history_try4.history['loss'], label='Training')
# ax[0].plot(history_try4.history['val_loss'], label='Validation')
# ax[0].legend(loc="upper right")

# ax[1].set(title='Accuracy')
# ax[1].plot(history_try4.history['accuracy'], label='Training')
# ax[1].plot(history_try4.history['val_accuracy'], label='Validation')
# ax[1].legend(loc="lower right")

In [None]:
# y_pred_try4 = try4_model.predict(x_glove_val)
# y_pred_classes_try4 = y_pred_try4.argmax(axis=-1)  # For multi-class classification


# report_try4 = classification_report(y_val.argmax(axis=-1), y_pred_classes_try4, digits=3)
# print(report_try4)

In [None]:
# from sklearn.metrics import f1_score, accuracy_score

# # Compare and find the best model from task E

# models = [("rnn_leakyrelu", try1_model), ("LSTM", try2_model), ("BiLSTM", try3_model), ("BiLSTM_CNN", try4_model)]

# for model_name, model in models:

#     predictions = model.predict(x_glove_val)
#     y_val_indices = y_val.argmax(axis=1)
#     predictions_indices = predictions.argmax(axis=1)

#     f1 = round(f1_score(y_val_indices, predictions_indices, average='macro'), 3)
#     accuracy = round(accuracy_score(y_val_indices, predictions_indices), 3)

#     print(f"Model {model_name}:")
#     print(f"Macro Avg F1 Score: {f1}")
#     print(f"Validation Accuracy: {accuracy}\n")

In [None]:
# # Compare and find the best model from all tasks

# models_task_b = [("Random Forest", rf), ("XGBoost", xgb)]
# models_task_cde = [("Vanilla RNN", best_rnn_model), ("RNN with GloVe", glove_rnn_model), ("BiLSTM", try3_model)]

# for model_name, model in models_task_b:

#     predictions = model.predict(x_tfidf_val)
#     y_val_indices = y_val.argmax(axis=1)
#     predictions_indices = predictions.argmax(axis=1)

#     f1 = round(f1_score(y_val_indices, predictions_indices, average='macro'), 3)
#     accuracy = round(accuracy_score(y_val_indices, predictions_indices), 3)

#     print(f"Model {model_name}:")
#     print(f"Macro Avg F1 Score: {f1}")
#     print(f"Validation Accuracy: {accuracy}\n")

# for model_name, model in models_task_cde:

#     predictions = model.predict(x_glove_val)
#     y_val_indices = y_val.argmax(axis=1)
#     predictions_indices = predictions.argmax(axis=1)

#     f1 = round(f1_score(y_val_indices, predictions_indices, average='macro'), 3)
#     accuracy = round(accuracy_score(y_val_indices, predictions_indices), 3)

#     print(f"Model {model_name}:")
#     print(f"Macro Avg F1 Score: {f1}")
#     print(f"Validation Accuracy: {accuracy}\n")

In [None]:
# best_model = try3_model

In [None]:
# # untune able
# embedding_dim = glove_dim
# weight = [emb_matrix]
# vocab_size = len(vocabulary)
# n_classes = 3

# #tune able
# drop_rate = 0.2
# batch_size = 16

In [None]:
# def objective(trial):
#     rnn_units = int(trial.suggest_categorical('rnn_units', [2**i for i in range(5, 9)]))
#     learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)

#     best_model = Sequential()
#     best_model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=weight, trainable=False))
#     best_model.add(Bidirectional(LSTM(rnn_units, return_sequences=False), input_shape=(sequence_length, embedding_dim)))
#     best_model.add(Dropout(drop_rate))
#     best_model.add(Dense(n_classes, activation='softmax'))
#     best_model.compile(
#         optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
#         loss='categorical_crossentropy',
#         metrics=['accuracy']
#     )


#     history_try3 = best_model.fit(x_glove_train, y_train, epochs=100, validation_data=(x_glove_val, y_val),
#                                   batch_size=batch_size, callbacks=[early_stopping])

#     acc = history_try3.history['val_accuracy'][-1]

#     return acc

# study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=6))
# study.optimize(objective, n_trials=20, timeout=36000, n_jobs=-1, show_progress_bar=True)

In [None]:
# best_params = study.best_params
# best_params
# 'rnn_units': 64, 'learning_rate': 0.0008752767781788608

In [None]:
# best_units = best_params['rnn_units']
# best_learning_rate = best_params['learning_rate']

In [None]:
# best_model1 = Sequential()
# best_model1.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=weight, trainable=False))
# best_model1.add(Bidirectional(LSTM(best_units, return_sequences=False),
#                             input_shape=(sequence_length, embedding_dim)))
# best_model1.add(Dropout(drop_rate))
# best_model1.add(Dense(n_classes, activation='softmax'))
# best_model1.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=best_learning_rate),
#               loss='categorical_crossentropy',
#               metrics=['accuracy'])
# history_best = best_model1.fit(x_glove_train, y_train, epochs=100, validation_data=(x_glove_val, y_val), batch_size=batch_size,
#                               callbacks = [early_stopping]
#                               )

In [None]:
# y_pred_best = best_model1.predict(x_glove_val)
# y_pred_classes_best = y_pred_best.argmax(axis=-1)  # For multi-class classification


# report_best = classification_report(y_val.argmax(axis=-1), y_pred_classes_best, digits=3)
# print(report_best)

Task F

In [None]:
# df_test = pd.read_csv('review_challenge.csv')

In [None]:
# df_test['review_text'] = df_test['review_text'].str.replace(r'http\S+', '', regex=True)

In [None]:
# df_test['processed_text_review'] = df_test['review_text'].apply(
#     lambda x: text_pre_process(x,lemmatising=True,stop_words = stop_words))

In [None]:
# x_test = df_test['processed_text_review']

In [None]:
# for i in df_test['processed_text_review']:
#   counter.update(tokenizer(i))

# vocabulary_test = vocab(counter, min_freq=3)
# vocabulary_test.set_default_index(0)

In [None]:
# len(vocabulary_test)

In [None]:
# x_tokenized_test = sent_encode(x_test,vocabulary)

In [None]:
# sequence_length_test= np.max([len(s) for s in x_tokenized_test])

In [None]:
# x_tokenized_test = [torch.tensor(sent, dtype=torch.long) for sent in x_tokenized_test]
# x_encoded_test =padding(x_tokenized_test,sequence_length_test)

In [None]:
# glove_dim = 300
# vocab_lenth_test = len(vocabulary_test)
# emb_matrix_test = np.zeros((vocab_lenth_test, glove_dim))

# itos = vocabulary_test.get_itos()
# for i, word in enumerate(itos):
#   if word in glove_vac:
#     emb_matrix_test[i] = glove_vac[word]
#   else:
#   # If a word is not in GloVe, you can initialize it with random values or zeros
#     emb_matrix_test[i] = [0]*300

# emb_matrix_test = torch.FloatTensor(emb_matrix_test)

In [None]:
# predictions

In [None]:
# label_names = ['negative', 'neutral', 'positive']
# predictions = best_model1.predict(x_encoded_test)
# predicted_labels = [label_names[np.argmax(prediction)] for prediction in predictions]
# df_test['airline_sentiment'] = predicted_labels
# result_df = df_test[['review_id', 'airline_sentiment']]
# result_df

In [None]:
# df_test

In [None]:
# result_df.to_csv('Group3_QBUS6850_2023S2.csv', index=False)