**NLP Coursework - Fancy Model - Bi-LSTM**

In [1]:
#install the dataset
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
Coll

In [2]:
#PACKAGES USED 

#baisc
import numpy as np
import pandas as pd

#to load dataset & split
from datasets import load_dataset
from sklearn.model_selection import train_test_split

#for pre-processing
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

#for feature extraction
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

from keras.preprocessing.text import Tokenizer
from google.colab import drive
from gensim.models import KeyedVectors
import gensim.downloader as api

#for Bi-LSTM model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

#performance metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [3]:
#Step 1: Load the dataset and split into train, validation and test dataset
dataset = load_dataset('rotten_tomatoes')

x_data = dataset['train']['text'] + dataset['validation']['text'] + dataset['test']['text']

train_dataset = dataset['train']
val_dataset = dataset['validation']
test_dataset = dataset['test']

print(train_dataset)
print(val_dataset)
print(test_dataset)

# Split the dataset into reviews and labels as x and y. 
x_train = train_dataset['text']
y_train = train_dataset['label']

x_val = val_dataset['text']
y_val = val_dataset['label']

x_test = val_dataset['text']
y_test = val_dataset['label']

Downloading builder script:   0%|          | 0.00/5.03k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

Downloading and preparing dataset rotten_tomatoes/default to /root/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46...


Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Dataset rotten_tomatoes downloaded and prepared to /root/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})
Dataset({
    features: ['text', 'label'],
    num_rows: 1066
})


PRE-PROCESSING

In [4]:
# Step 2: Pre-processing

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

def preprocess_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www.\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text.lower())
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Stem the tokens
    #stemmer = PorterStemmer()
    #stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    #since stemming is creating some spelling mistakes which may affect the contextual representation, we have not performed this. eg: Centurys ----stemming---> centuri

    #lemmatizing the tokens
    lemmatizer = WordNetLemmatizer()
    lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the stemmed tokens
    preprocessed_text = ' '.join(lemma_tokens)
    
    return preprocessed_text

# apply this in x_train, x_val, and x_test,
x_train_preprocessed = [preprocess_text(text) for text in x_train]
x_val_preprocessed = [preprocess_text(text) for text in x_val]
x_test_preprocessed = [preprocess_text(text) for text in x_test]

print(x_train_preprocessed[0:2])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


['rock destined st century new conan he going make splash even greater arnold schwarzenegger jeanclaud van damme steven segal', 'gorgeously elaborate continuation lord ring trilogy huge column word adequately describe cowriterdirector peter jackson expanded vision j r r tolkien middleearth']


FEATURE EXTRACTION & MODELING - MODEL 3

In [5]:
#this code build with the support of tensorflow keras Bi-LSTM page.
# Step 3: Feature extraction
# 3.1 Using onehot encoding  

# vocab size and pad length 
vocab_size = 5000
pad_len = 50

# One hot encoding texts - indexing: 
onehotencoded_text=[one_hot(words,vocab_size) for words in x_train_preprocessed]
print('onehotencoded_text', onehotencoded_text)

#pass onehot to embedding layer - Embeeding Representation
padded_text = pad_sequences(onehotencoded_text, padding='pre', maxlen=pad_len)
print('padded_text', padded_text)

#Creating Model
Embedding_dimensions = 100
model = Sequential()
model.add(Embedding(vocab_size, Embedding_dimensions, input_length=pad_len))
model.add(Bidirectional(LSTM(100)))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print('model summary', model.summary())

#converting to array
x_train_pad = np.array(padded_text)
y_train = np.array(y_train)

#traintest split
x_trainsplit, x_testsplit, y_trainsplit, y_testsplit = train_test_split(x_train_pad, y_train, test_size=0.2, random_state=42)

# Model Training
model.fit(x_trainsplit, y_trainsplit, epochs=5, batch_size=32, validation_data=(x_testsplit, y_testsplit))

#performance metrics
test_pred_prob = model.predict(x_testsplit)
test_pred = (test_pred_prob > 0.5).astype(int)
test_pred
# Print the classification report and confusion matrix
print('classification report', classification_report(y_testsplit, test_pred))
print('confusion matrix', confusion_matrix(y_testsplit, test_pred))


onehotencoded_text [[2971, 4845, 4588, 4375, 2418, 1580, 2541, 745, 253, 1355, 2113, 3097, 2380, 3456, 1036, 2106, 3128, 3560, 4706], [3740, 3622, 4221, 356, 1216, 2352, 916, 1359, 568, 3911, 2626, 1795, 3690, 2251, 1655, 3543, 2693, 2072, 2072, 141, 3727], [2367, 3955, 121], [3359, 4418, 125, 2306, 1282, 3892, 1966, 3586, 243], [1136, 3112, 1440, 1400, 2306, 1843, 3930, 4416, 4106, 3873, 4543, 4418, 221], [3197, 1598, 3204, 2272, 1496, 4656, 684, 2113, 4845, 2947, 3024, 1227], [2692, 1440, 1555, 370, 2320], [1648, 2505, 767, 4053, 2395, 1940, 2671, 2972, 1004, 1966, 675], [2805, 2966, 1864, 4466, 3197, 3170, 1868, 2558, 4299, 2571, 937], [234, 2136, 4819, 2692, 4485, 295, 879, 1327, 435], [3197, 1361, 1997, 3928, 1115, 1499, 3724], [1927, 1192, 456, 3959, 3899, 2995, 2654], [4696, 125, 435, 678, 606, 1779, 1240], [221, 3211, 4297, 791, 2306, 767], [3452, 39, 414, 4636, 2683, 2362], [955, 894, 2393, 2580, 2774, 2606, 767, 660, 2524, 1268], [821, 3470, 786, 3149], [1419, 3970, 4610, 124

In [7]:
# 3.2 trying with pre-trained Word2Vec feature selection

#tokenizing and sequencing 
tokenizer = Tokenizer(nb_words=10000)
tokenizer.fit_on_texts(x_train_preprocessed)
tokenizer.fit_on_texts(x_val_preprocessed)
tokenizer.fit_on_texts(x_test_preprocessed)

train_sequences = tokenizer.texts_to_sequences(x_train_preprocessed)
val_sequences = tokenizer.texts_to_sequences(x_val_preprocessed)
test_sequences = tokenizer.texts_to_sequences(x_test_preprocessed)


word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

pad_len = 50

#train data into padded input
x_train_preprocessed_pad = pad_sequences(train_sequences, maxlen=pad_len)
x_val_preprocessed_pad = pad_sequences(val_sequences, maxlen=pad_len)
x_test_preprocessed_pad = pad_sequences(test_sequences, maxlen=pad_len)

word2vec_model = api.load('word2vec-google-news-300')
#to import GoogleNews-vectors-negative300.bin.gz from online - took 10 mins, so for trial and errors mounting drive and executing, 

# Mount Google Drive
#drive.mount('/content/drive')
# Set path to the file in your Drive
#path = '/content/drive/MyDrive/NLP/Coursework-Drive/GoogleNews-vectors-negative300.bin.gz'
# Load the Word2Vec model
#word2vec_model = KeyedVectors.load_word2vec_format(path, binary=True)

#embedding dimension
embed_dim = word2vec_model.vector_size

# Create a dictionary of word embeddings
embeddings_index = {}
for word in word2vec_model.key_to_index:
    embeddings_index[word] = word2vec_model.get_vector(word)

print('Found %s word vectors.' % len(embeddings_index))

#create embeding matrix
embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector


#Creating Model
model_wv = Sequential()
model_wv.add(Embedding(len(word_index) + 1,
                            embed_dim,
                            weights=[embedding_matrix],
                            input_length=pad_len,
                            trainable=False))
model_wv.add(Bidirectional(LSTM(100)))
model_wv.add(Dense(1, activation="sigmoid"))
model_wv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print('model_wv summary', model_wv.summary())

#convert to array
x_train_final = np.array(x_train_preprocessed_pad)
x_val_final = np.array(x_val_preprocessed_pad)
x_test_final = np.array(x_test_preprocessed_pad)
y_train_final = np.array(y_train)
y_val_final = np.array(y_val)
y_test_final = np.array(y_test)

#traintest split
#from sklearn.model_selection import train_test_split
#x_trainsplit, x_testsplit, y_trainsplit, y_testsplit = train_test_split(x_train_pad, y_train, test_size=0.2, random_state=42)

# Model Training
model_wv.fit(x_train_final, y_train_final, epochs=5, batch_size=32, validation_data=(x_val_final, y_val_final))

#performance metrics
test_pred_prob = model_wv.predict(x_test_final)
test_pred = (test_pred_prob > 0.5).astype(int)
test_pred
# Print the classification report and confusion matrix
print('classification_report', classification_report(y_test_final, test_pred))
print('confusion_matrix', confusion_matrix(y_test_final, test_pred))


Found 17239 unique tokens.
Found 3000000 word vectors.
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 300)           5172000   
                                                                 
 bidirectional_1 (Bidirectio  (None, 200)              320800    
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 1)                 201       
                                                                 
Total params: 5,493,001
Trainable params: 321,001
Non-trainable params: 5,172,000
_________________________________________________________________
model_wv summary None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
classification_report               precision    recall  f1-score   support

           0       0.76      0.8

In [8]:
#plotting incorrect predictions for analysis
# Get indices of incorrect predictions
incorrect_indices = np.nonzero(test_pred.squeeze() != y_test_final.squeeze())[0]

# Get incorrect predictions in text format
incorrect_pred_text = [tokenizer.sequences_to_texts([x_test_final[i]])[0] for i in incorrect_indices]

# Create DataFrame with incorrect predictions
incorrect_predictions_BILSTM = pd.DataFrame({'Text': incorrect_pred_text, 'True Label': y_test_final[incorrect_indices], 'Predicted Label': test_pred[incorrect_indices].squeeze()})

print('Incorrect Predictions:')
print(incorrect_predictions_BILSTM)

# Save the DataFrame to a CSV file
incorrect_predictions_BILSTM.to_csv('incorrect_predictions_BILSTM.csv', index=False)
from google.colab import files

# Download the CSV file
files.download('incorrect_predictions_BILSTM.csv')


Incorrect Predictions:
                                                  Text  True Label  \
0    importance earnest thick wit play like reading...           1   
1    moviegoer would automatically bypass hiphop do...           1   
2    babyfaced renner eerily convincing bland blank...           1   
3    competent unpretentious entertainment destined...           1   
4    janice beard falter recycled aspect implausibi...           1   
..                                                 ...         ...   
234                                intriguing nearmiss           0   
235               there comedic moment romantic comedy           0   
236  unlike trey parker sandler doesnt understand i...           0   
237  andunders looking caddyshack adopt generationa...           0   
238  feel like nothing quite much middleaged moviem...           0   

     Predicted Label  
0                  0  
1                  0  
2                  0  
3                  0  
4                  0 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>