In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing Packages

In [2]:
import pandas as pd
import numpy as np
import random

import os
import gc
from PIL import Image
from textwrap import wrap
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl


from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

from tensorflow.python.keras import models, layers, optimizers
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.convolutional import Conv1D
from keras.layers import Embedding
from keras.layers import GlobalMaxPooling1D
from keras.layers.core import Activation, Dropout, Dense
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
import bz2
import re

%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory



# Any results you write to the current directory are saved as output.

In [3]:
train = "/content/drive/MyDrive/IR/IR Dataset Project/train.ft.txt.bz2"
test = "/content/drive/MyDrive/IR/IR Dataset Project/test.ft.txt.bz2"

In [4]:
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

In [5]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m94.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m96.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [6]:
import transformers
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification

In [7]:
tf.random.set_seed(18)
np.random.seed(18)

In [8]:
def get_labels_and_texts(file):
    labels = []
    texts = []
    for line in bz2.BZ2File(file):
        x = line.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    labels = labels[:int(len(labels)*0.01)]
    texts = texts[:int(len(texts)*0.01)]
    return np.array(labels), texts
train_labels, train_texts = get_labels_and_texts(train)
test_labels, test_texts = get_labels_and_texts(test)

In [9]:
NON_ALPHANUM = re.compile(r'[\W]')
NON_ASCII = re.compile(r'[^a-z0-1\s]')
def normalize_texts(texts):
    normalized_texts = []
    for text in texts:
        lower = text.lower()
        no_punctuation = NON_ALPHANUM.sub(r' ', lower)
        no_non_ascii = NON_ASCII.sub(r'', no_punctuation)
        normalized_texts.append(no_non_ascii)
    return normalized_texts
        
train_texts = normalize_texts(train_texts)
test_texts = normalize_texts(test_texts)

In [10]:
train_texts_Bert = train_texts
train_labels_Bert = train_labels
test_texts_Bert = test_texts
test_labels_Bert = test_labels
train_reviews = train_texts

#### Train/Validation Split
We split the dataset, keeping 20% of the training set for validation.

In [11]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, random_state=57643892, test_size=0.2)

In [12]:
MAX_FEATURES = 12000
#Tokenize texts
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(train_texts)
train_texts = tokenizer.texts_to_sequences(train_texts)
val_texts = tokenizer.texts_to_sequences(val_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)


In [None]:
print(train_texts[0])

[669, 1583, 49, 2177, 59, 2, 105, 700, 8, 97, 51, 49, 65, 9454, 57, 234, 59, 4, 178, 1, 2749, 20, 1, 202, 9, 74, 23, 1148, 45, 22, 5, 27, 26, 362, 255, 1727, 256, 508, 3100, 1878, 4168, 3, 47, 2, 227, 1, 153, 2, 145, 21, 65, 509, 12, 25, 1519, 2, 66, 21, 34, 11, 202, 5413, 2, 152, 1, 202, 45, 27, 222, 3, 465, 9, 104, 74, 205, 3, 25, 687, 2, 2792, 21, 700, 8, 202, 20, 8, 97]


In [13]:
MAX_LENGTH = max(len(train_ex) for train_ex in train_texts)
#Add padding
train_texts = pad_sequences(train_texts, maxlen=MAX_LENGTH)
val_texts = pad_sequences(val_texts, maxlen=MAX_LENGTH)
test_texts = pad_sequences(test_texts, maxlen=MAX_LENGTH)

In [None]:
train_texts[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        669, 1583,   49, 2177,   59,    2,  105,  700,    8,   97,   51,
         49,   65, 9454,   57,  234,   59,    4,  178,    1, 2749,   20,
          1,  202,    9,   74,   23, 1148,   45,   

### Long Short-Term Memory (LSTM)

In [23]:
def build_lstm_model():
    sequences = layers.Input(shape=(MAX_LENGTH,))
    embedded = layers.Embedding(MAX_FEATURES, 64)(sequences)
    x = layers.LSTM(32, return_sequences=True)(embedded)
    x = layers.GlobalMaxPool1D()(x)
    x = layers.Dense(16, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    predictions = layers.Dense(1, activation='sigmoid')(x)
    model = models.Model(inputs=sequences, outputs=predictions)
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model
    
lstm_model = build_lstm_model()

In [24]:
print(lstm_model.summary())

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 208)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 208, 64)           768000    
_________________________________________________________________
lstm (LSTM)                  (None, 208, 32)           12416     
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                528       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17    

In [25]:
lstm_model.fit(
    train_texts, 
    train_labels, 
    batch_size=128,
    epochs=2,
    validation_data=(val_texts, val_labels), )

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f79a3460be0>

In [26]:
preds = lstm_model.predict(test_texts)
print('Accuracy score: {:0.4}'.format(accuracy_score(test_labels, 1 * (preds > 0.5))))
print('F1 score: {:0.4}'.format(f1_score(test_labels, 1 * (preds > 0.5))))
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, preds)))

Accuracy score: 0.8855
F1 score: 0.8914
ROC AUC score: 0.9537


### BERT

For BERT we do same split on train set (20% for validation)

In [14]:
train_texts_Bert, val_texts_Bert, train_labels_Bert, val_labels_Bert = train_test_split(
    train_texts_Bert, train_labels_Bert, random_state=57643892, test_size=0.2)

In [16]:
#Assign tokenizer object to the tokenizer class
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [17]:
train_encodings = tokenizer(train_texts_Bert,
                            truncation=True,
                            padding=True,
                            max_length=256)
val_encodings = tokenizer(val_texts_Bert,
                          truncation=True,
                          padding=True,
                          max_length=256)
train_dataset = tf.data.Dataset.from_tensor_slices((
                            dict(train_encodings),
                            train_labels_Bert
                            ))
val_dataset = tf.data.Dataset.from_tensor_slices((
                            dict(val_encodings),
                            val_labels_Bert
                            ))
test_encodings = tokenizer(test_texts_Bert,
                          truncation=True,
                          padding=True,
                          max_length=256)
test_dataset = tf.data.Dataset.from_tensor_slices((
                            dict(test_encodings),
                            test_labels_Bert
                            ))

In [19]:
bert_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=2)

Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_projector', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_19', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [20]:
print(bert_model.summary())

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
_________________________________________________________________
None


In [21]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
bert_model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=["binary_accuracy"])
bert_model.fit(train_dataset.shuffle(100).batch(16),
          epochs=2,
          batch_size=16,
          validation_data=val_dataset.shuffle(100).batch(16))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f79ade8f760>

In [22]:
results_bert = bert_model.evaluate(test_dataset.batch(8))

