# Transformers embeddings and models
- Daniel Branco, 20220599
- Fernando Cruz, 20220646
- Inês Ventura, 20220612
- Maria Mendonça, 20220625

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from keras.layers import LSTM, Dense, TimeDistributed, Bidirectional, Masking
from keras.models import Model, Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split
from transformers import DistilBertForSequenceClassification, DistilBertModel, DistilBertTokenizer

tf.random.set_seed(221)

In [None]:
# Colab only: (re)mount Google Drive so the project files under /content/drive are reachable.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [None]:
# Switch the working directory to the project folder on Drive so the relative
# file names used by the following data-loading cell resolve.
%cd /content/drive/MyDrive/Text Mining/

/content/drive/.shortcut-targets-by-id/1N9Y-dcZIQh9qUPRXgB1grO-NTXypIRl8/Text Mining


In [None]:
# Listing tables (train / test) followed by the review tables (train / test),
# all read from Excel files in the current working directory.
airbnb_df_train, airbnb_df_test = (
    pd.read_excel(file_name) for file_name in ('train.xlsx', 'test.xlsx')
)

reviews_df_train, reviews_df_test = (
    pd.read_excel(file_name) for file_name in ('train_reviews.xlsx', 'test_reviews.xlsx')
)

In [None]:
import pickle
# Load the pickled train frames (file names suggest they hold the preprocessed data).
# These assignments replace the `reviews_df_train` / `airbnb_df_train` objects that
# were read from Excel in the previous cell.
# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
with open('/content/drive/MyDrive/Text Mining/preprocessed_train_df.pickle', 'rb') as data:
    reviews_df_train = pickle.load(data)

with open('/content/drive/MyDrive/Text Mining/preprocessed_airbnb_df_train.pickle', 'rb') as data:
    airbnb_df_train = pickle.load(data)

In [None]:
airbnb_df_train

Unnamed: 0,description,description_language,host_about,host_about_language,index,unlisted
0,"[joli, appartemer, bordure, mer, min, pied, pl...",fr,"[être, famille, enfant, an, habiter, année, po...",fr,5.0,0.0
1,"[apartamento, charmoso, chiado, largo, carmo, ...",pt,"[hello, m, portuguese, love, meet, people, wor...",en,4.0,0.0
2,"[dear, guest, pleaser, receive, home, guest, r...",en,"[quiet, person, like, cinema, reading, writing...",en,11.0,0.0
3,"[walk, original, wooden, stair, entrance, apar...",en,"[service, holiday, apartment, casa, azenhas, m...",en,10.0,0.0
4,"[espaço, ficar, parque, eduardo, vii, saldanha...",pt,"[friendly, host, try, need, quit, strict, hous...",en,2.0,1.0
...,...,...,...,...,...,...
11965,"[beautifully, locate, heart, lisbon, historic,...",en,"[found, travel, enthusiast, like, bnbird, want...",en,12493.0,0.0
11966,"[welcome, lapa, garden, luxurious, bedroom, ap...",en,"[bear, spain, grow, study, portugal, spain, fr...",en,12486.0,0.0
11967,"[national, tourism, board, create, clean, safe...",en,"[hello, my, name, is, pedro, and, lover, to, t...",fr,12488.0,1.0
11968,"[cat, bairro, altothis, cozy, apartment, lisbo...",en,"[travel, favorite, hobby, visit, country, euro...",en,12492.0,0.0


In [None]:
# NOTE(review): `final_df` is first assigned in the merge cell that follows this one,
# so on a fresh kernel (Restart & Run All) this line raises NameError. Run it after
# the merge cell.
final_df.dtypes

apartment_id            int64
comments               object
description            object
host_about             object
word_count              int64
language               object
sentiment              object
unlisted              float64
normal_comments        object
normal_description     object
normal_host_about      object
dtype: object

In [None]:
def _tokens_to_text(tokens):
    """Return a token container as one space-separated string.

    Lists/tuples of tokens are joined with single spaces, a plain string is
    returned unchanged, and anything else (e.g. NaN from an unmatched left
    join) becomes the empty string.
    """
    if isinstance(tokens, str):
        # Joining a str would insert a space between every character.
        return tokens
    if isinstance(tokens, (list, tuple)):
        return ' '.join(tokens)
    return ''


# Merge the two dataframes on the apartment index. The left join keeps every
# review row; listing fields are NaN when no listing matches the apartment_id.
final_df = pd.merge(reviews_df_train, airbnb_df_train, left_on='apartment_id', right_on='index', how='left')
# .copy() makes the column selection an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
final_df = final_df[['apartment_id', 'comments', 'description', 'host_about', 'word_count', 'language', 'sentiment', 'unlisted']].copy()

# One plain-text column per token column, all built with the same missing-value handling.
for source_column in ('comments', 'description', 'host_about'):
    final_df[f'normal_{source_column}'] = final_df[source_column].apply(_tokens_to_text)

# Vectorised equivalent of ' '.join([comments, description, host_about]) per row.
final_df['combined_text'] = (
    final_df['normal_comments'] + ' ' + final_df['normal_description'] + ' ' + final_df['normal_host_about']
)

In [None]:
final_df

Unnamed: 0,apartment_id,comments,description,host_about,word_count,language,sentiment,unlisted,normal_comments,normal_description,normal_host_about
0,1,"[cozy, comfortable, house, stay, in.never, wor...","[share, mixed, room, hostel, share, bathroom, ...","[alojamento, registro, /al]",28,en,positive,0.0,cozy comfortable house stay in.never worry saf...,share mixed room hostel share bathroom locate ...,alojamento registro /al
1,1,"[hostel, experience, pretty, hard, beat, place...","[share, mixed, room, hostel, share, bathroom, ...","[alojamento, registro, /al]",261,en,positive,0.0,hostel experience pretty hard beat place book ...,share mixed room hostel share bathroom locate ...,alojamento registro /al
2,1,"[hostel, neu, funktionieren, tadellos, schnell...","[share, mixed, room, hostel, share, bathroom, ...","[alojamento, registro, /al]",81,de,positive,0.0,hostel neu funktionieren tadellos schnelle w-l...,share mixed room hostel share bathroom locate ...,alojamento registro /al
3,1,"[fine, dorm, think, people, stay, far, bathroo...","[share, mixed, room, hostel, share, bathroom, ...","[alojamento, registro, /al]",27,en,positive,0.0,fine dorm think people stay far bathroom toile...,share mixed room hostel share bathroom locate ...,alojamento registro /al
4,1,"[stay, lisbon, tip, hostel, good, mixed, room,...","[share, mixed, room, hostel, share, bathroom, ...","[alojamento, registro, /al]",42,en,positive,0.0,stay lisbon tip hostel good mixed room clean c...,share mixed room hostel share bathroom locate ...,alojamento registro /al
...,...,...,...,...,...,...,...,...,...,...,...
55161,12494,"[apto, excelente, foto, cama, confortável, chu...","[enjoy, breakfast, sleek, kitchen, freestande,...","[i´m, portugal, love, dance, travel, pleasure,...",53,pt,positive,0.0,apto excelente foto cama confortável chuveiror...,enjoy breakfast sleek kitchen freestande knott...,i´m portugal love dance travel pleasure welcom...
55162,12494,"[lovely, stay, lisbon, apartment, great, clean...","[enjoy, breakfast, sleek, kitchen, freestande,...","[i´m, portugal, love, dance, travel, pleasure,...",61,en,positive,0.0,lovely stay lisbon apartment great clean tidy ...,enjoy breakfast sleek kitchen freestande knott...,i´m portugal love dance travel pleasure welcom...
55163,12494,"[liliana, apartment, beautiful, perfect, locat...","[enjoy, breakfast, sleek, kitchen, freestande,...","[i´m, portugal, love, dance, travel, pleasure,...",25,en,positive,0.0,liliana apartment beautiful perfect location e...,enjoy breakfast sleek kitchen freestande knott...,i´m portugal love dance travel pleasure welcom...
55164,12494,"[great, apartment, spacious, modern, centrally...","[enjoy, breakfast, sleek, kitchen, freestande,...","[i´m, portugal, love, dance, travel, pleasure,...",33,en,positive,0.0,great apartment spacious modern centrally loca...,enjoy breakfast sleek kitchen freestande knott...,i´m portugal love dance travel pleasure welcom...


# Transformer embedding

In [None]:
def transformer_embedding(df=None, batch_size=32, max_length=None, seed=None, device=None):
    """Embed ``combined_text`` with DistilBERT and split the result 70/30.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame providing the ``combined_text`` and ``unlisted`` columns.
        Defaults to the notebook-level ``final_df``.
    batch_size : int, default 32
        Number of rows sent through DistilBERT per forward pass.
    max_length : int, optional
        Token cap forwarded to the tokenizer. ``None`` keeps the tokenizer's
        own truncation limit.
    seed : int, optional
        Seeds the train/validation split so it is reproducible. ``None`` uses
        torch's global RNG.
    device : torch.device or str, optional
        Where DistilBERT runs. Defaults to CUDA when available, else CPU.

    Returns
    -------
    list
        ``[embeddings_train, embeddings_test, labels_train, labels_test]``.
        The embeddings are the per-token ``last_hidden_state`` tensors,
        concatenated along the row axis and kept on the CPU; the labels are the
        matching ``unlisted`` values in the same row order.

    Notes
    -----
    Every token vector of every row is kept in memory, so the result grows with
    rows x padded sequence length. Pass a smaller ``max_length`` (or a subset
    ``df``) if memory is tight.
    """
    if df is None:
        df = final_df
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertModel.from_pretrained('distilbert-base-uncased')
    model.to(device)
    model.eval()  # inference only: make sure dropout is disabled

    # Extract labels
    labels = torch.tensor(df['unlisted'].to_list())

    # Tokenize the whole corpus at once so every row is padded to the same length
    # (needed for the torch.cat over batches below).
    text = df['combined_text'].to_list()
    tokens = tokenizer(text, return_tensors='pt', add_special_tokens=True,
                       padding=True, truncation=True, max_length=max_length)

    dataset = TensorDataset(tokens['input_ids'], tokens['attention_mask'], labels)

    # Split the dataset into train and validation parts.
    split_kwargs = {} if seed is None else {'generator': torch.Generator().manual_seed(seed)}
    train_dataset, val_dataset = random_split(dataset, [0.7, 0.3], **split_kwargs)

    def _embed(subset):
        """Run DistilBERT over ``subset`` and return (hidden_states, labels)."""
        # No shuffling: random_split already permuted the rows, and nothing is
        # being trained here.
        loader = DataLoader(subset, batch_size=batch_size, shuffle=False)
        hidden_states, targets = [], []
        with torch.no_grad():
            for input_ids, attention_mask, batch_labels in loader:
                outputs = model(input_ids=input_ids.to(device),
                                attention_mask=attention_mask.to(device))
                # Move results off the accelerator right away to free its memory.
                hidden_states.append(outputs.last_hidden_state.cpu())
                targets.append(batch_labels)
        return torch.cat(hidden_states, dim=0), torch.cat(targets, dim=0)

    embeddings_train, labels_train = _embed(train_dataset)
    embeddings_test, labels_test = _embed(val_dataset)

    return [embeddings_train, embeddings_test, labels_train, labels_test]

# Training

In [None]:
# Fixed random_state so the 70/30 row split is reproducible between runs.
# NOTE: this split is drawn independently of the random_split performed inside
# transformer_embedding(), so its rows do not line up with the embeddings that
# function returns.
train, test = train_test_split(final_df, test_size=0.3, random_state=221)

In [None]:
len(train), len(test)

(38616, 16550)

In [None]:
# transformer_embedding() returns four items; the labels must come from the same
# call because it performs its own random split. Labels taken from any other
# split would not match the embedding rows.
embeddings_train, embeddings_test, labels_train, labels_test = transformer_embedding()

X = tf.convert_to_tensor(embeddings_train.numpy(), dtype='float32')
y = labels_train.numpy()

X_test = tf.convert_to_tensor(embeddings_test.numpy(), dtype='float32')
y_test = labels_test.numpy()

# Features and targets must stay row-aligned.
assert X.shape[0] == y.shape[0], "train embeddings and labels differ in length"
assert X_test.shape[0] == y_test.shape[0], "test embeddings and labels differ in length"


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## LSTM

In [None]:
# X is indexed as (samples, time_steps, features); the last two fix the LSTM input shape.
time_steps, features = X.shape[1], X.shape[2]

model = Sequential()
# NOTE(review): Masking skips a timestep only when all of its features equal
# mask_value. The vectors at padded positions are presumably non-zero, so this
# mask may never trigger - TODO confirm, or zero out padded positions upstream.
model.add(Masking(mask_value=0, input_shape=(time_steps, features)))
model.add(Bidirectional(LSTM(units=4)))
# A single sigmoid unit matches binary_crossentropy with 1-D 0/1 targets.
model.add(Dense(1, activation='sigmoid'))

# input_shape on the first layer already builds the model, so no forward pass
# over the full training tensor is needed before compiling.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["binary_accuracy"])
model.summary()


In [None]:
# Fit the LSTM classifier; validation_split reserves 20% of X / y for validation.
history = model.fit(
    X,
    y,
    validation_split=0.20,
    epochs=20,
    batch_size=16,
    verbose=1,
)

## Evaluate

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch, read from the Keras History object.
fig, ax = plt.subplots(figsize=(10, 6))
for metric_key in ('binary_accuracy', 'val_binary_accuracy'):
    ax.plot(history.history[metric_key])
ax.set_title('Model Accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['train', 'val'], loc='upper left')
plt.show()

# Transformer model

In [None]:
def distilbert_transformer_model(num_epochs=10):
    """Train a DistilBERT sequence classifier on the embeddings and report accuracy.

    The embeddings and labels come from ``transformer_embedding()``. They are
    fed to ``DistilBertForSequenceClassification`` through ``inputs_embeds``,
    because they are float vectors rather than token ids.

    Parameters
    ----------
    num_epochs : int, default 10
        Number of passes over the training split.

    Returns
    -------
    float
        Accuracy on the held-out split (also printed). 0.0 if that split is empty.

    Notes
    -----
    NOTE(review): ``transformer_embedding()`` does not return the attention
    masks, so padded positions are attended to here. Feeding contextual
    embeddings into a second DistilBERT is also unusual; fine-tuning directly
    on token ids may be what is really wanted - TODO confirm.
    """
    embeddings_train, embeddings_test, labels_train, labels_test = transformer_embedding()

    num_classes = 2
    # num_labels creates a freshly initialised (hidden_size -> num_classes) head.
    prediction_model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased', num_labels=num_classes)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    prediction_model.to(device)

    # CrossEntropyLoss needs integer class indices; the labels arrive as floats.
    train_dataset = TensorDataset(embeddings_train, labels_train.long())
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    test_dataset = TensorDataset(embeddings_test, labels_test.long())
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Define the loss function and optimizer for training
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(prediction_model.parameters(), lr=1e-5)

    for epoch in range(num_epochs):
        prediction_model.train()
        for embeddings, labels in train_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            optimizer.zero_grad()
            # Keyword is required: the first positional argument is input_ids.
            outputs = prediction_model(inputs_embeds=embeddings)
            # The model returns an output object; the loss is computed on its logits.
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            optimizer.step()

    # Evaluate the trained model on the test set
    prediction_model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for embeddings, labels in test_loader:
            embeddings, labels = embeddings.to(device), labels.to(device)
            outputs = prediction_model(inputs_embeds=embeddings)
            _, predicted = torch.max(outputs.logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Guard against an empty evaluation split.
    accuracy = correct / total if total else 0.0
    print(f"Test Accuracy: {accuracy}")
    return accuracy
