In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 9.1MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 50.7MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     

In [2]:
from transformers import InputExample, InputFeatures
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
import tensorflow as tf
import pandas as pd
import os
import shutil
from sklearn.metrics import classification_report
import numpy as np

In [6]:
URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz", 
                                  origin=URL,
                                  untar=True,
                                  cache_dir='.',
                                  cache_subdir='')

In [7]:
# Create main directory path ("/aclImdb")
main_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
# Create sub directory path ("/aclImdb/train")
train_dir = os.path.join(main_dir, 'train')
# Remove unsup folder since this is a supervised learning task
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)
# View the final train folder
print(os.listdir(train_dir))

['urls_unsup.txt', 'labeledBow.feat', 'urls_pos.txt', 'unsupBow.feat', 'neg', 'urls_neg.txt', 'pos']


In [8]:
train = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='training', seed=123)
test = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', batch_size=30000, validation_split=0.2, 
    subset='validation', seed=123)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [11]:
for i in train.take(1):
  train_feat = i[0].numpy()
  train_lab = i[1].numpy()

train = pd.DataFrame([train_feat, train_lab]).T
train.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
train['DATA_COLUMN'] = train['DATA_COLUMN'].str.decode("utf-8")

AssertionError: ignored

In [13]:
for j in test.take(1):
  test_feat = j[0].numpy()
  test_lab = j[1].numpy()

test = pd.DataFrame([test_feat, test_lab]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
test['DATA_COLUMN'] = test['DATA_COLUMN'].str.decode("utf-8")
test.head()

AssertionError: ignored

In [15]:
train_sentences = train['DATA_COLUMN']
train_labels = train['LABEL_COLUMN']

validation_sentences = test['DATA_COLUMN'][:4500]
validation_labels = test['LABEL_COLUMN'][:4500]

In [16]:
vocab_size = 20000
max_length = 500
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

tokenizer.fit_on_texts(train_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(train_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

valid_sequences = tokenizer.texts_to_sequences(validation_sentences)
valid_padded = pad_sequences(valid_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_padded = np.array(training_padded,dtype=int)
train_labels = np.array(train_labels,dtype=int)
valid_padded = np.array(valid_padded,dtype=int)
valid_labels = np.array(validation_labels,dtype=int)

In [None]:
optim = tf.keras.optimizers.Adam(learning_rate=0.001)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size,64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(200,return_sequences=True),input_shape=(64,)),
    tf.keras.layers.Dense(50, input_shape=(150,), activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),

])

model.summary()

model.compile(loss='binary_crossentropy',optimizer=optim, metrics=['accuracy']) 

num_epochs = 30
history = model.fit(training_padded, train_labels, epochs=num_epochs, validation_data=(valid_padded, valid_labels), verbose=2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          1280000   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 400)         424000    
_________________________________________________________________
dense (Dense)                (None, None, 50)          20050     
_________________________________________________________________
dropout (Dropout)            (None, None, 50)          0         
_________________________________________________________________
dense_1 (Dense)              (None, None, 1)           51        
Total params: 1,724,101
Trainable params: 1,724,101
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
625/625 - 46s - loss: 0.6319 - accuracy: 0.6056 - val_loss: 0.5796 - val_accuracy: 0.6984
Epoch 2/30


In [None]:
custom_sentences = ['Nice phone','Very bad model','It is working fine','Better architecture','Not so good']
custom_labels = [1,0,1,1,0]


test_sequences = tokenizer.texts_to_sequences(custom_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

test_padded = np.array(test_padded)
test_labels = np.array(custom_labels)

predicted = model.predict(test_padded)

y_pred = []
for i in range(len(predicted)):
  sum=0
  for j in range(len(predicted[i])):
    sum+=predicted[i][j]
  sum = sum/len(predicted[i])
  if sum>0.5:
    y_pred.append(1)
  else:
    y_pred.append(0)

NameError: ignored

In [None]:
print(classification_report(test_labels, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.75      1.00      0.86         3

    accuracy                           0.80         5
   macro avg       0.88      0.75      0.76         5
weighted avg       0.85      0.80      0.78         5



In [None]:
model.evaluate(test_padded,test_labels,verbose=2)

1/1 - 1s - loss: 0.5172 - accuracy: 0.8012


[0.5172228217124939, 0.8012000322341919]