<a href="https://colab.research.google.com/github/ferygood/LLM_behavior_prediction/blob/main/03_bert_model_development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
import pandas as pd
import numpy as np

web_visit_data = pd.DataFrame({
    'user_id': np.random.randint(1, 1000, size=1000),
    'visit_time': pd.date_range(start='2023-01-01', periods=1000, freq='T'),
    'page_url': np.random.choice(['home', 'product', 'cart', 'checkout'], size=1000),
    'referrer_url': np.random.choice(['google', 'facebook', 'twitter', 'direct'], size=1000)
})

purchase_data = pd.DataFrame({
    'user_id': np.random.randint(1, 1000, size=500),
    'purchase_time': pd.date_range(start='2023-01-01', periods=500, freq='2T'),
    'product_id': np.random.randint(1, 100, size=500),
    'amount': np.random.uniform(10, 500, size=500)
})

social_interaction_data = pd.DataFrame({
    'user_id': np.random.randint(1, 1000, size=300),
    'interaction_time': pd.date_range(start='2023-01-01', periods=300, freq='5T'),
    'platform': np.random.choice(['facebook', 'twitter', 'instagram'], size=300),
    'action': np.random.choice(['like', 'share', 'comment'], size=300)
})

In [3]:
# load user data
#web_visit_data = pd.read_csv('web_visit_data.csv')
#purchase_data = pd.read_csv('purchase_data.csv')
#social_interaction_data = pd.read_csv('social_interaction_data.csv')

# combine data (need to check column and user ID)
data = pd.concat([web_visit_data, purchase_data, social_interaction_data], ignore_index=True)


In [4]:
# create label feature, our goal is to predict if a user will be a certain product
data['label'] = data['amount'].apply(lambda x: 1 if x > 0 else 0)

# select related features
features = data[['page_url', 'referrer_url', 'platform', 'action']]
labels = data['label']

# train & test split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2, random_state=42)

Then we start developing our model and train our model

In [5]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

class UserBehaviorDataset(Dataset):
    def __init__(self, features, labels, tokenizer, max_len):
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        feature = self.features.iloc[idx]
        label = self.labels.iloc[idx]

        # combine all features as one sentence
        text = ' '.join([str(value) for value in feature])

        # tokenize
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# model parameters
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 4
LEARNING_RATE = 2e-5

# load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# create training data and data loader
train_dataset = UserBehaviorDataset(train_features, train_labels, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = UserBehaviorDataset(test_features, test_labels, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# use AdamW optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# train model
def train_model(model, data_loader, optimizer, device, epochs):
    model = model.train()

    for epoch in range(epochs):
        total_loss = 0

        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        avg_loss = total_loss / len(data_loader)
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}')

# model training
train_model(model, train_loader, optimizer, 'cuda' if torch.cuda.is_available() else 'cpu', EPOCHS)

# save model
model.save_pretrained('bert_user_behavior_model')
tokenizer.save_pretrained('bert_user_behavior_tokenizer')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4, Loss: 0.0580
Epoch 2/4, Loss: 0.0019
Epoch 3/4, Loss: 0.0009
Epoch 4/4, Loss: 0.0005


('bert_user_behavior_tokenizer/tokenizer_config.json',
 'bert_user_behavior_tokenizer/special_tokens_map.json',
 'bert_user_behavior_tokenizer/vocab.txt',
 'bert_user_behavior_tokenizer/added_tokens.json')

evaluate if the model is good or bad

In [6]:
from sklearn.metrics import accuracy_score, classification_report

def evaluate_model(model, data_loader, device):
    model = model.eval()

    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)

    return accuracy, report

# 評估模型
accuracy, report = evaluate_model(model, test_loader, 'cuda' if torch.cuda.is_available() else 'cpu')
print(f'Accuracy: {accuracy:.4f}')
print(report)


Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       260
           1       1.00      1.00      1.00       100

    accuracy                           1.00       360
   macro avg       1.00      1.00      1.00       360
weighted avg       1.00      1.00      1.00       360



For reproducible, you can load the model and tokenizer into the workspace.

In [7]:
from transformers import BertTokenizer, BertForSequenceClassification

# load model and tokenizer
model_load_path = 'bert_user_behavior_model'
tokenizer_load_path = 'bert_user_behavior_tokenizer'

model = BertForSequenceClassification.from_pretrained(model_load_path)
tokenizer = BertTokenizer.from_pretrained(tokenizer_load_path)

# evaluate model
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')
model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Then we can use the bert model as:

In [9]:
import torch

# define a prediction function
def predict_user_behavior(model, tokenizer, text, max_len=128):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

    input_ids = encoding['input_ids'].to('cuda' if torch.cuda.is_available() else 'cpu')
    attention_mask = encoding['attention_mask'].to('cuda' if torch.cuda.is_available() else 'cpu')

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        prediction = torch.argmax(logits, dim=1).cpu().item()

    return prediction

# simulate a one-time sample
sample_feature = {
    'page_url': 'home',
    'referrer_url': 'google',
    'platform': 'facebook',
    'action': 'comment'
}

# merge features into a sentence as an input for the model
sample_text = ' '.join([str(value) for value in sample_feature.values()])

# do prediction
prediction = predict_user_behavior(model, tokenizer, sample_text)
print(f'Predicted user behavior: {prediction}')


Predicted user behavior: 0
