In [1]:
import tensorflow as tf
device_list = tf.test.gpu_device_name()
device_list

''

In [2]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
import re
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


# Importing sklearn.metrics libraries

# Libraries required for hyper-parameter tuning

In [3]:
data = pd.read_csv("deceptive-opinion.csv")

In [4]:
data.head()

Unnamed: 0,deceptive,hotel,polarity,source,text
0,truthful,conrad,positive,TripAdvisor,We stayed for a one night getaway with family ...
1,truthful,hyatt,positive,TripAdvisor,Triple A rate with upgrade to view room was le...
2,truthful,hyatt,positive,TripAdvisor,This comes a little late as I'm finally catchi...
3,truthful,omni,positive,TripAdvisor,The Omni Chicago really delivers on all fronts...
4,truthful,hyatt,positive,TripAdvisor,I asked for a high floor away from the elevato...


In [5]:
data.shape

(1600, 5)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600 entries, 0 to 1599
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   deceptive  1600 non-null   object
 1   hotel      1600 non-null   object
 2   polarity   1600 non-null   object
 3   source     1600 non-null   object
 4   text       1600 non-null   object
dtypes: object(5)
memory usage: 62.6+ KB


In [7]:
data.drop(['hotel', 'source'], axis=1, inplace=True)

In [8]:
data.deceptive.value_counts()

deceptive
truthful     800
deceptive    800
Name: count, dtype: int64

In [9]:
data.loc[data["deceptive"] == "truthful", "LABEL"] = 1
data.loc[data["deceptive"] == "deceptive", "LABEL"] = 0

In [10]:
data.head()

Unnamed: 0,deceptive,polarity,text,LABEL
0,truthful,positive,We stayed for a one night getaway with family ...,1.0
1,truthful,positive,Triple A rate with upgrade to view room was le...,1.0
2,truthful,positive,This comes a little late as I'm finally catchi...,1.0
3,truthful,positive,The Omni Chicago really delivers on all fronts...,1.0
4,truthful,positive,I asked for a high floor away from the elevato...,1.0


In [11]:
data.tail()

Unnamed: 0,deceptive,polarity,text,LABEL
1595,deceptive,negative,Problems started when I booked the InterContin...,0.0
1596,deceptive,negative,The Amalfi Hotel has a beautiful website and i...,0.0
1597,deceptive,negative,The Intercontinental Chicago Magnificent Mile ...,0.0
1598,deceptive,negative,"The Palmer House Hilton, while it looks good i...",0.0
1599,deceptive,negative,"As a former Chicagoan, I'm appalled at the Ama...",0.0


In [12]:
data.polarity.value_counts()

polarity
positive    800
negative    800
Name: count, dtype: int64

In [13]:
data['review_length'] = data['text'].map(len)

In [14]:
data.head()

Unnamed: 0,deceptive,polarity,text,LABEL,review_length
0,truthful,positive,We stayed for a one night getaway with family ...,1.0,572
1,truthful,positive,Triple A rate with upgrade to view room was le...,1.0,286
2,truthful,positive,This comes a little late as I'm finally catchi...,1.0,1104
3,truthful,positive,The Omni Chicago really delivers on all fronts...,1.0,707
4,truthful,positive,I asked for a high floor away from the elevato...,1.0,384


In [15]:
max(data['review_length'])

4159

In [16]:
min(data['review_length'])

151

In [17]:
data['combined'] = data['text'] + ' ' + data['polarity']

In [18]:
df_subset = pd.DataFrame(data[["combined", "LABEL"]])

In [19]:
data['LABEL'] = data['LABEL'].astype(int)

In [20]:
print(df_subset.dtypes)

combined     object
LABEL       float64
dtype: object


In [21]:
from transformers import BertTokenizer
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased', do_lower_case=True)

Loading BERT tokenizer...


In [22]:
combined = df_subset.combined.values
LABEL = df_subset.LABEL.values

In [23]:
import re
from string import punctuation


def preprocess(data):
    # remove url and hashtag
    for i in range(data.shape[0]):
        text = data[i].lower()
        text1 = ''.join([word+" " for word in text.split()])
        data[i] = text1
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
                       '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    space_pattern = '\s+'

    for i in range(data.shape[0]):
        text_string = data[i]
        parsed_text = re.sub(hashtag_regex, '', text_string)
        parsed_text = re.sub(giant_url_regex, '', parsed_text)
        parsed_text = re.sub(mention_regex, '', parsed_text)
        # remove punctuation
        parsed_text = re.sub(r"[{}]+".format(punctuation), '', parsed_text)
        parsed_text = re.sub(space_pattern, ' ', parsed_text)
        data[i] = parsed_text
    return data


combined = preprocess(combined)

In [24]:
combined

array(['we stayed for a one night getaway with family on a thursday triple aaa rate of 173 was a steal 7th floor room complete with 44in plasma tv bose stereo voss and evian water and gorgeous bathroomno tub but was fine for us concierge was very helpful you cannot beat this location only flaw was breakfast was pricey and service was very very slow2hours for four kids and four adults on a friday morning even though there were only two other tables in the restaurant food was very good so it was worth the wait i would return in a heartbeat a gem in chicago positive ',
       'triple a rate with upgrade to view room was less than 200 which also included breakfast vouchers had a great view of river lake wrigley bldg tribune bldg most major restaurants shopping sightseeing attractions within walking distance large room with a very comfortable bed positive ',
       'this comes a little late as im finally catching up on my reviews from the past several months a dear friend and i stayed at th

In [25]:
# Print the original sentence.
print(' Original: ', combined[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(combined[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(
    tokenizer.tokenize(combined[0])))

 Original:  we stayed for a one night getaway with family on a thursday triple aaa rate of 173 was a steal 7th floor room complete with 44in plasma tv bose stereo voss and evian water and gorgeous bathroomno tub but was fine for us concierge was very helpful you cannot beat this location only flaw was breakfast was pricey and service was very very slow2hours for four kids and four adults on a friday morning even though there were only two other tables in the restaurant food was very good so it was worth the wait i would return in a heartbeat a gem in chicago positive 
Tokenized:  ['we', 'stayed', 'for', 'a', 'one', 'night', 'get', '##away', 'with', 'family', 'on', 'a', 'thursday', 'triple', 'aaa', 'rate', 'of', '173', 'was', 'a', 'steal', '7th', 'floor', 'room', 'complete', 'with', '44', '##in', 'plasma', 'tv', 'bose', 'stereo', 'voss', 'and', 'ev', '##ian', 'water', 'and', 'gorgeous', 'bathroom', '##no', 'tub', 'but', 'was', 'fine', 'for', 'us', 'con', '##cier', '##ge', 'was', 'very',

In [26]:
import torch
import torch.nn as nn
from transformers import BertModel

import torch.optim as optim
from sklearn.metrics import roc_auc_score, f1_score
import time
input_ids = []
attention_masks = []
for tweet in combined:
    encoded_dict = tokenizer.encode_plus(
        tweet,                      # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=512,           # Pad & truncate all sentences.
        pad_to_max_length=True,
        return_attention_mask=True,   # Construct attn. masks.
        return_tensors='pt',     # Return pytorch tensors.
    )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(LABEL)

# Print sentence 0, now as a list of IDs.
print('Original: ', combined[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  we stayed for a one night getaway with family on a thursday triple aaa rate of 173 was a steal 7th floor room complete with 44in plasma tv bose stereo voss and evian water and gorgeous bathroomno tub but was fine for us concierge was very helpful you cannot beat this location only flaw was breakfast was pricey and service was very very slow2hours for four kids and four adults on a friday morning even though there were only two other tables in the restaurant food was very good so it was worth the wait i would return in a heartbeat a gem in chicago positive 
Token IDs: tensor([  101,  2057,  4370,  2005,  1037,  2028,  2305,  2131,  9497,  2007,
         2155,  2006,  1037,  9432,  6420, 13360,  3446,  1997, 19410,  2001,
         1037,  8954,  5504,  2723,  2282,  3143,  2007,  4008,  2378, 12123,
         2694, 21299, 12991, 24878,  1998, 23408,  2937,  2300,  1998,  9882,
         5723,  3630, 14366,  2021,  2001,  2986,  2005,  2149,  9530, 19562,
         3351,  2001,  22

In [27]:

from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

1,440 training samples
  160 validation samples


In [28]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 16

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=batch_size
)

validation_dataloader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=batch_size
)

In [29]:
def format_time(elapsed):
    import datetime
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [30]:
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')

In [31]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [32]:

class BertBiLSTMClassifier(nn.Module):
    def __init__(self, bert_model, embed_dim=768, hidden_dim=128, num_layers=2, num_labels=2, bidirectional=True):
        super().__init__()
        self.num_labels = num_labels
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.softmax = nn.functional.softmax
        self.bert = bert_model  # Assign the BertModel directly
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim,
                            num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(
            hidden_dim * (2 if bidirectional else 1), num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask,
                           token_type_ids=token_type_ids)[0]  # Get the BERT output
        lstm_out, _ = self.lstm(output)

        # Use the last hidden state of the LSTM as input to the classifier
        if self.bidirectional:
            lstm_out = lstm_out[:, -1, :]
        else:
            lstm_out = lstm_out[:, -1, :]

        output = self.dropout(lstm_out)
        logits = self.classifier(output)
        return self.softmax(logits, 1)

In [33]:
# Load pre-trained BERT model
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [34]:

# Pass the loaded BERT model to your classifier
model_bilstm = BertBiLSTMClassifier(bert_model)

In [35]:

# Set parameters
epochs = 4
learning_rate = 5e-5
optimizer = optim.AdamW(model_bilstm.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [36]:

# Training loop
training_stats = []
total_t0 = time.time()
best_accuracy = 0

device = torch.device("cpu")  # Use CPU

In [41]:
import torch
import time
from sklearn.metrics import roc_auc_score, f1_score

# Assuming you have defined your model, criterion, optimizer, and data loaders
# model_bilstm, criterion, optimizer, train_dataloader, validation_dataloader

# Initialize variables for tracking best accuracy
best_accuracy = 0
best_model = None

# Training loop
epochs = 3  # Set the number of epochs
total_t0 = time.time()

training_stats = []

for epoch_i in range(0, epochs):
    # Training
    print("")
    print('Epoch {:} / {:}'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0
    model_bilstm.train()

    for step, batch in enumerate(train_dataloader):
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        # Ensure labels are of type torch.long
        labels = batch[2].to(device).long()

        model_bilstm.zero_grad()
        out = model_bilstm(input_ids=input_ids,
                           attention_mask=input_mask, token_type_ids=None)
        loss = criterion(out, labels)
        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_bilstm.parameters(), 1.0)
        optimizer.step()

        pred = torch.argmax(out, dim=1)
        total_train_accuracy += torch.sum(pred == labels).item()

    avg_train_accuracy = total_train_accuracy / len(train_dataloader.dataset)
    avg_train_loss = total_train_loss / len(train_dataloader.dataset)

    print("  Accuracy: {}".format(avg_train_accuracy))
    print("  Training loss: {}".format(avg_train_loss))

    # Validation
    print("")
    print("Validation...")
    model_bilstm.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    y_true = []
    y_pred = []

    for batch in validation_dataloader:
        input_ids = batch[0].to(device)
        input_mask = batch[1].to(device)
        # Ensure labels are of type torch.long
        labels = batch[2].to(device).long()

        with torch.no_grad():
            out = model_bilstm(input_ids=input_ids,
                               attention_mask=input_mask, token_type_ids=None)
            loss = criterion(out, labels)
            total_eval_loss += loss.item()

            pred = torch.argmax(out, dim=1)
            total_eval_accuracy += torch.sum(pred == labels).item()

            y_true.append(labels.flatten())
            y_pred.append(pred.flatten())

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader.dataset)
    print("  Accuracy: {}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader.dataset)
    print("  Validation loss: {}".format(avg_val_loss))

    training_time = format_time(time.time() - t0)
    print("  This epoch took: {:}".format(training_time))
    print()

    y_true = torch.cat(y_true).tolist()
    y_pred = torch.cat(y_pred).tolist()
    print('  roc_auc score: ', roc_auc_score(y_true, y_pred))
    print('  F1 score:', f1_score(y_true, y_pred))

    training_stats.append({
        'epoch': epoch_i + 1,
        'Train Accur.': avg_train_accuracy,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Valid. Accur.': avg_val_accuracy,
        'Training Time': training_time,
    })

    if avg_val_accuracy > best_accuracy:
        best_accuracy = avg_val_accuracy
        best_model = model_bilstm

print("===")
print("Summary")
print("Total time {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
print('best acc:', best_accuracy)


Epoch 1 / 3
Training...
  Accuracy: 0.6972222222222222
  Training loss: 0.037298186019890836

Validation...
  Accuracy: 0.8625
  Validation loss: 0.029519927129149436
  This epoch took: 3:20:16

  roc_auc score:  0.8510101010101011
  F1 score: 0.828125

Epoch 2 / 3
Training...
  Accuracy: 0.8527777777777777
  Training loss: 0.02921379158894221

Validation...
  Accuracy: 0.9275
  Validation loss: 0.029978754557669164
  This epoch took: 0:57:37

  roc_auc score:  0.8207070707070706
  F1 score: 0.7833333333333333

Epoch 3 / 3
Training...
  Accuracy: 0.9090277777777778
  Training loss: 0.025512815680768755

Validation...
  Accuracy: 0.9445
  Validation loss: 0.029277219623327255
  This epoch took: 0:53:25

  roc_auc score:  0.9409090909090909
  F1 score: 0.9481818181818182
===
Summary
Total time 5:11:18 (h:mm:ss)
best acc: 0.9481




In [38]:
PATH_BiLSTM = "bilstm_model.pt"
torch.save(best_model, PATH_BiLSTM)

In [39]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel


class BertBiLSTMClassifier(nn.Module):
    def __init__(self, bert_model, embed_dim=768, hidden_dim=128, num_layers=2, num_labels=2, bidirectional=True):
        super().__init__()
        self.num_labels = num_labels
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.softmax = nn.functional.softmax
        self.bert = bert_model  # Assign the BertModel directly
        self.lstm = nn.LSTM(input_size=embed_dim, hidden_size=hidden_dim,
                            num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(
            hidden_dim * (2 if bidirectional else 1), num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask,
                           token_type_ids=token_type_ids)[0]  # Get the BERT output
        lstm_out, _ = self.lstm(output)

        # Use the last hidden state of the LSTM as input to the classifier
        if self.bidirectional:
            lstm_out = lstm_out[:, -1, :]
        else:
            lstm_out = lstm_out[:, -1, :]

        output = self.dropout(lstm_out)
        logits = self.classifier(output)
        return self.softmax(logits, 1)


def checkbilstm(text_input):
    # Load the saved model
    PATH_BiLSTM = "bilstm_model.pt"
    model = torch.load(PATH_BiLSTM)

    # Load pre-trained BERT model and tokenizer
    bert_model = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Create an instance of your custom classifier
    model_bilstm = BertBiLSTMClassifier(bert_model)

    # Load the saved model weights into your custom classifier
    model_bilstm.load_state_dict(model.state_dict())

    # Set the model to evaluation mode
    model_bilstm.eval()

    # Define a function to preprocess text input

    def preprocess_text(text):
        # Tokenize the text and convert it into input IDs, attention mask, and token type IDs
        inputs = tokenizer(text, return_tensors='pt',
                           max_length=512, truncation=True, padding=True)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']
        return input_ids, attention_mask, token_type_ids

    # Define a function to predict

    def predict_single_text(text):
        # Preprocess the text input
        input_ids, attention_mask, token_type_ids = preprocess_text(text)

        # Perform the prediction
        with torch.no_grad():
            output = model_bilstm(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # Get the predicted labels
        predicted_labels = torch.argmax(output, dim=1)
        return predicted_labels.item()

    # Perform prediction
    predicted_label = predict_single_text(text_input)
    return predicted_label

In [40]:
v = "my german shepherd absolutely loves this my stuff has been officially tooth markfree for the first time in a very long time good no pet products"
x = checkbilstm(v)
print(x)

0
