<a href="https://colab.research.google.com/github/ferygood/LLM_behavior_prediction/blob/main/03_bert_model_development.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Read the three behaviour logs from CSV files in the working directory.
web_visit_data, purchase_data, social_interaction_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'web_visit_data.csv',
        'purchase_data.csv',
        'social_interaction_data.csv',
    )
)

# Stack the three tables row-wise under a fresh 0..n-1 index.
# NOTE(review): concat appends rows rather than joining on a key, so a column
# missing from one table is NaN for that table's rows -- confirm the column
# layout and user ID handling before relying on the combined frame.
data = pd.concat(
    [web_visit_data, purchase_data, social_interaction_data],
    ignore_index=True,
)


In [None]:
# Binary target: 1 when the row has a positive purchase amount, otherwise 0
# (a missing amount compares as False and therefore also yields 0).
data['label'] = (data['amount'] > 0).astype('int64')

# Columns used as model inputs, and the target column.
feature_columns = ['page_url', 'referrer_url', 'platform', 'action']
features = data[feature_columns]
labels = data['label']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

Next, we build the BERT classification model and train it on the training split.

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

class UserBehaviorDataset(Dataset):
    """Dataset that turns one row of behaviour features into BERT inputs.

    The values of a row are converted to strings and joined with spaces into
    a single sentence, which is then tokenized, truncated and padded to
    ``max_len`` tokens.

    Args:
        features: Table of input columns, read positionally through ``.iloc``.
        labels: Targets aligned with ``features``, read through ``.iloc``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Token length every example is truncated or padded to.
    """

    def __init__(self, features, labels, tokenizer, max_len):
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in ``features``."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the tokenized inputs and the label tensor for row ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (flattened
            tokenizer outputs) and ``'labels'`` (a ``torch.long`` scalar).
        """
        row = self.features.iloc[idx]
        label = self.labels.iloc[idx]

        # Combine all feature values into one space-separated sentence.
        text = ' '.join(str(value) for value in row)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Without truncation, a row longer than max_len keeps its full
            # length, and examples of different lengths cannot be stacked
            # into one batch by the default DataLoader collation.
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # Flatten the tokenizer output to a 1-D tensor per example.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# model parameters
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 4
LEARNING_RATE = 2e-5

# load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create the datasets and data loaders; only the training loader is shuffled.
train_dataset = UserBehaviorDataset(train_features, train_labels, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = UserBehaviorDataset(test_features, test_labels, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Load BERT with a two-class classification head and move it to the GPU
# when one is available.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Set up the AdamW optimizer.
# PyTorch's implementation is used because the AdamW class exported by
# transformers is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are pinned to the defaults of the transformers class so the
# configuration stays close to the previous one.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    eps=1e-6,
    weight_decay=0.0,
)

# 訓練模型
def train_model(model, data_loader, optimizer, device, epochs):
    """Train ``model`` on ``data_loader`` and print the mean loss of each epoch.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments; its output must expose ``.loss``.
        data_loader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the updates.
        device: Device the batch tensors are moved to before the forward pass.
        epochs: Number of passes over ``data_loader``.

    Returns:
        list[float]: Mean training loss of each epoch, in order.

    Raises:
        ValueError: If ``data_loader`` yields no batches, since no average
            loss can be computed.
    """
    model = model.train()
    epoch_losses = []

    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches while iterating so loaders without __len__ also work.
        num_batches = 0

        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()
            num_batches += 1

            loss.backward()
            optimizer.step()

        if num_batches == 0:
            raise ValueError('data_loader yielded no batches; cannot compute an average loss')

        avg_loss = total_loss / num_batches
        epoch_losses.append(avg_loss)
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}')

    return epoch_losses

# Train the model, using the GPU when one is available.
training_device = 'cuda' if torch.cuda.is_available() else 'cpu'
train_model(model, train_loader, optimizer, training_device, EPOCHS)

# Save the trained model and the tokenizer, each to its own directory.
model.save_pretrained('bert_user_behavior_model')
tokenizer.save_pretrained('bert_user_behavior_tokenizer')


Finally, we evaluate how well the trained model performs on the test split.

In [None]:
from sklearn.metrics import accuracy_score, classification_report

def evaluate_model(model, data_loader, device):
    """Run ``model`` over ``data_loader`` and score its predictions.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``
            keyword arguments; its output must expose ``.logits``.
        data_loader: Iterable of batches, each a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, report)`` holding the results of
        ``accuracy_score`` and ``classification_report`` computed over all
        batches.
    """
    model = model.eval()

    predicted = []
    expected = []

    # Gradients are not needed for scoring, so tracking is switched off.
    with torch.no_grad():
        for batch in data_loader:
            batch_ids = batch['input_ids'].to(device)
            batch_mask = batch['attention_mask'].to(device)
            batch_labels = batch['labels'].to(device)

            outputs = model(input_ids=batch_ids, attention_mask=batch_mask)

            # The predicted class is the index of the largest logit per row.
            batch_preds = outputs.logits.argmax(dim=1)

            predicted.extend(batch_preds.cpu().numpy())
            expected.extend(batch_labels.cpu().numpy())

    return (
        accuracy_score(expected, predicted),
        classification_report(expected, predicted),
    )

# Evaluate the model on the test loader and print the metrics.
eval_device = 'cuda' if torch.cuda.is_available() else 'cpu'
accuracy, report = evaluate_model(model, test_loader, eval_device)
print(f'Accuracy: {accuracy:.4f}')
print(report)
