<a href="https://colab.research.google.com/github/dongjaeseo/NLP_study/blob/main/courtsbot2_try_differentiating_timedateplacesports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
from torchsummary import summary
import torch
import torch.nn as nn
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm

In [3]:
# Path of the CSV file that holds the text/label pairs
csv_path = '/content/drive/MyDrive/data/courtsbot2_try_differentiating.csv'

# header=None makes pandas treat every row as data; the two columns are
# named explicitly instead
df = pd.read_csv(
    csv_path,
    names=['texts', 'labels'],
    header=None,
)

# Preview the top rows of the loaded frame
df.head()

Unnamed: 0,texts,labels
0,"On the 20th at 7pm at Mumbai Stadium, which sp...",0
1,Can you tell me which sports are available to ...,0
2,What sports are open for play on the next Thur...,0
3,"On the 15th at 7pm at Kolkata Stadium, which s...",0
4,Which sports can I play on the next Monday at ...,0


In [4]:
# Turn each DataFrame column into a plain Python list
texts, labels = (df[column].tolist() for column in ('texts', 'labels'))

# Sanity check: show the first five entries of each list
print("Texts:", texts[:5])
print("Labels:", labels[:5])

Texts: ['On the 20th at 7pm at Mumbai Stadium, which sports can I play?', 'Can you tell me which sports are available to play on the 25th at 8 in the evening at Bangalore Sports Complex?', 'What sports are open for play on the next Thursday at six in the morning at Delhi Sports Arena?', 'On the 15th at 7pm at Kolkata Stadium, which sports can I participate in?', 'Which sports can I play on the next Monday at 8 in the evening at Hyderabad Sports Facility?']
Labels: [0, 0, 0, 0, 0]


In [5]:
# Dataset that tokenizes raw text on access for sequence classification
class CustomDataset(Dataset):
    """Pair each text with its integer label and tokenize it when indexed.

    Args:
        texts: Sequence of raw input strings.
        labels: Sequence of class ids (anything ``int()`` accepts), aligned
            index-by-index with ``texts``.
        max_length: Length every example is padded or truncated to.
        tokenizer: Optional ready-made tokenizer. When omitted, the
            ``bert-base-uncased`` ``BertTokenizer`` is loaded.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, max_length=32, tokenizer=None):
        # Fail fast: a mismatch would otherwise only surface mid-epoch as an
        # IndexError (or silently drop the surplus labels).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )

        self.texts = texts
        self.labels = labels
        self.max_length = max_length
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of (text, label) examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (each with the
            tokenizer's leading batch axis removed) and ``labels`` (a scalar
            ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = int(self.labels[idx])

        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        # return_tensors='pt' yields a leading batch axis of size 1. Drop only
        # that axis: a bare squeeze() would also collapse the sequence axis
        # when max_length == 1 and hand the DataLoader 0-d tensors.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }


# Wrap the raw lists in the dataset and serve them as shuffled mini-batches
batch_size = 2
custom_dataset = CustomDataset(texts=texts, labels=labels)
custom_dataloader = DataLoader(
    dataset=custom_dataset,
    shuffle=True,
    batch_size=batch_size,
)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
# Load pre-trained BERT with a freshly initialised classification head
model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=4)  # 4 output classes

# Move the model to its device before building the optimizer, as the
# torch.optim documentation recommends
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up optimizer and loss function.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are pinned to the values that implementation used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
num_epochs = 30  # Adjust as needed

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(custom_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # CrossEntropyLoss expects integer class indices of dtype long
        batch_labels = batch['labels'].to(device=device, dtype=torch.long)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask=attention_mask).logits
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    average_loss = total_loss / len(custom_dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}")

# Save the fine-tuned model
model.save_pretrained('/content/drive/MyDrive/courtsbot/model/courtsbot2')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/30: 100%|██████████| 20/20 [00:04<00:00,  4.64it/s]


Epoch 1/30, Average Loss: 1.4221


Epoch 2/30: 100%|██████████| 20/20 [00:01<00:00, 12.73it/s]


Epoch 2/30, Average Loss: 1.2540


Epoch 3/30: 100%|██████████| 20/20 [00:01<00:00, 12.55it/s]


Epoch 3/30, Average Loss: 1.1425


Epoch 4/30: 100%|██████████| 20/20 [00:01<00:00, 12.38it/s]


Epoch 4/30, Average Loss: 0.8717


Epoch 5/30: 100%|██████████| 20/20 [00:01<00:00, 12.90it/s]


Epoch 5/30, Average Loss: 0.6417


Epoch 6/30: 100%|██████████| 20/20 [00:01<00:00, 15.16it/s]


Epoch 6/30, Average Loss: 0.3893


Epoch 7/30: 100%|██████████| 20/20 [00:01<00:00, 15.47it/s]


Epoch 7/30, Average Loss: 0.2638


Epoch 8/30: 100%|██████████| 20/20 [00:01<00:00, 15.22it/s]


Epoch 8/30, Average Loss: 0.1978


Epoch 9/30: 100%|██████████| 20/20 [00:01<00:00, 15.32it/s]


Epoch 9/30, Average Loss: 0.1348


Epoch 10/30: 100%|██████████| 20/20 [00:01<00:00, 15.16it/s]


Epoch 10/30, Average Loss: 0.1027


Epoch 11/30: 100%|██████████| 20/20 [00:01<00:00, 15.12it/s]


Epoch 11/30, Average Loss: 0.0882


Epoch 12/30: 100%|██████████| 20/20 [00:01<00:00, 15.28it/s]


Epoch 12/30, Average Loss: 0.0718


Epoch 13/30: 100%|██████████| 20/20 [00:01<00:00, 13.44it/s]


Epoch 13/30, Average Loss: 0.0612


Epoch 14/30: 100%|██████████| 20/20 [00:01<00:00, 12.40it/s]


Epoch 14/30, Average Loss: 0.0502


Epoch 15/30: 100%|██████████| 20/20 [00:01<00:00, 12.78it/s]


Epoch 15/30, Average Loss: 0.0455


Epoch 16/30: 100%|██████████| 20/20 [00:01<00:00, 12.86it/s]


Epoch 16/30, Average Loss: 0.0381


Epoch 17/30: 100%|██████████| 20/20 [00:01<00:00, 14.36it/s]


Epoch 17/30, Average Loss: 0.0355


Epoch 18/30: 100%|██████████| 20/20 [00:01<00:00, 15.42it/s]


Epoch 18/30, Average Loss: 0.0322


Epoch 19/30: 100%|██████████| 20/20 [00:01<00:00, 15.18it/s]


Epoch 19/30, Average Loss: 0.0300


Epoch 20/30: 100%|██████████| 20/20 [00:01<00:00, 15.31it/s]


Epoch 20/30, Average Loss: 0.0256


Epoch 21/30: 100%|██████████| 20/20 [00:01<00:00, 15.34it/s]


Epoch 21/30, Average Loss: 0.0240


Epoch 22/30: 100%|██████████| 20/20 [00:01<00:00, 15.60it/s]


Epoch 22/30, Average Loss: 0.0228


Epoch 23/30: 100%|██████████| 20/20 [00:01<00:00, 15.32it/s]


Epoch 23/30, Average Loss: 0.0209


Epoch 24/30: 100%|██████████| 20/20 [00:01<00:00, 14.83it/s]


Epoch 24/30, Average Loss: 0.0197


Epoch 25/30: 100%|██████████| 20/20 [00:01<00:00, 12.65it/s]


Epoch 25/30, Average Loss: 0.0185


Epoch 26/30: 100%|██████████| 20/20 [00:01<00:00, 12.63it/s]


Epoch 26/30, Average Loss: 0.0170


Epoch 27/30: 100%|██████████| 20/20 [00:01<00:00, 12.69it/s]


Epoch 27/30, Average Loss: 0.0161


Epoch 28/30: 100%|██████████| 20/20 [00:01<00:00, 13.20it/s]


Epoch 28/30, Average Loss: 0.0146


Epoch 29/30: 100%|██████████| 20/20 [00:01<00:00, 15.44it/s]


Epoch 29/30, Average Loss: 0.0149


Epoch 30/30: 100%|██████████| 20/20 [00:01<00:00, 15.15it/s]


Epoch 30/30, Average Loss: 0.0140


In [7]:
# Restore the fine-tuned classifier, place it on the target device and
# switch it to evaluation mode
fine_tuned_model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/courtsbot/model/courtsbot2', num_labels=4)
fine_tuned_model.to(device)
fine_tuned_model.eval()

# Sentences to classify
example_texts = [
    'On the 17th at 6 pm at Seoul Stadium, which sports can I play?',
    'Next Thursday at 8 in the evening at the local sports center, which sports are available for playing?',
    'This Friday at 7 am at the community park, which sports can I engage in?',
    'On the 17th at 7 pm, I want to play soccer. Where can I play?',
    'Next Friday at 8 in the evening, Im interested in playing basketball. Where can I play?',
    'This Sunday at 6 in the morning, I plan to play tennis. Where can I find a suitable venue?',
    'I want to play soccer at 8 in the evening in the city park. Which days are available?',
    'Planning to play basketball at 6 in the morning at the sports complex. Which days are open?',
    'Id like to play tennis at 7 pm at the local sports center. What days are available?',
    'I want to play soccer on the 9th at Seoul Stadium. What timings are available?',
    'Planning to play basketball on the next Friday at the city sports complex. What slots are open?',
    'Id like to play tennis this Sunday at the local sports center. Which timings are available?'
]

# Encode the whole list as one padded batch and move every tensor to the
# device the model lives on
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(
    example_texts,
    padding=True,
    truncation=True,
    max_length=32,
    return_tensors='pt',
)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Forward pass without gradient tracking
with torch.no_grad():
    logits = fine_tuned_model(**inputs).logits

# Class probabilities and the most likely class for every sentence
probs = logits.softmax(dim=-1)
predicted_labels = probs.argmax(dim=1).cpu().numpy()

# Report one block of lines per sentence
for text, predicted, class_probs in zip(example_texts, predicted_labels, probs):
    print(f"Text: {text}")
    print(f"Predicted Label: {predicted}")
    print(f"Probabilities: {class_probs}")
    print()

Text: On the 17th at 6 pm at Seoul Stadium, which sports can I play?
Predicted Label: 0
Probabilities: tensor([0.9755, 0.0130, 0.0029, 0.0086], device='cuda:0')

Text: Next Thursday at 8 in the evening at the local sports center, which sports are available for playing?
Predicted Label: 0
Probabilities: tensor([0.9171, 0.0650, 0.0059, 0.0120], device='cuda:0')

Text: This Friday at 7 am at the community park, which sports can I engage in?
Predicted Label: 0
Probabilities: tensor([0.8003, 0.1826, 0.0084, 0.0087], device='cuda:0')

Text: On the 17th at 7 pm, I want to play soccer. Where can I play?
Predicted Label: 1
Probabilities: tensor([0.0029, 0.9922, 0.0025, 0.0024], device='cuda:0')

Text: Next Friday at 8 in the evening, Im interested in playing basketball. Where can I play?
Predicted Label: 1
Probabilities: tensor([0.0033, 0.9901, 0.0032, 0.0034], device='cuda:0')

Text: This Sunday at 6 in the morning, I plan to play tennis. Where can I find a suitable venue?
Predicted Label: 1
P

Also test the fine-tuned model with ambiguously phrased questions.

In [8]:
# Restore the fine-tuned classifier, place it on the target device and
# switch it to evaluation mode
fine_tuned_model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/courtsbot/model/courtsbot2', num_labels=4)
fine_tuned_model.to(device)
fine_tuned_model.eval()

# Less explicitly worded sentences to classify
example_texts = [
    "Interested in activities at Seoul Stadium on the next Friday at 7 pm. What options are there?",
    "Checking out sports options at 7 pm next Friday in Seoul Stadium. Any recommendations?",
    "Exploring possibilities for play at Seoul Stadium on the upcoming Friday at 7 pm. What's available?",
    "Curious about things to do at Seoul Stadium on the next Friday at 7 pm. Suggestions?",
    "Planning for activities next Friday at 7 pm in Seoul Stadium. Any sports to consider?",

    "Planning to engage in tennis next Friday at 7 pm. Any venues for playing tennis at that time?",
    "Exploring options for playing tennis on the upcoming Friday at 7 pm. Any locations suitable for tennis?",
    "Considering tennis on the next Friday at 7 pm. Suggestions for places to play tennis?",
    "Thinking about a tennis game next Friday at 7 pm. Any recommendations for tennis-friendly locations?",
    "Interested in playing tennis next Friday at 7 pm. Where can I find tennis facilities?",

    "Planning to play tennis at 7 pm in Seoul Stadium. Are there available slots during the week?",
    "Interested in tennis at 7 pm in Seoul Stadium. Which days have openings for tennis?",
    "Considering a game of tennis at Seoul Stadium at 7 pm. Days with available tennis slots?",
    "Exploring tennis options at 7 pm in Seoul Stadium. Any particular days for tennis?",
    "Looking for available tennis slots at Seoul Stadium at 7 pm. Days when tennis is possible?",

    "Planning to play tennis at Seoul Stadium on the 9th. Which slots are available?",
    "Considering a game of tennis at Seoul Stadium on the 9th. Availability of slots?",
    "Exploring tennis options at Seoul Stadium on the 9th. What timings are open?",
    "Interested in tennis at Seoul Stadium on the 9th. Availability of playing slots?",
    "Looking for available tennis slots at Seoul Stadium on the 9th. Any specific timings?"
]

# Encode the whole list as one padded batch and move every tensor to the
# device the model lives on
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(
    example_texts,
    padding=True,
    truncation=True,
    max_length=32,
    return_tensors='pt',
)
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

# Forward pass without gradient tracking
with torch.no_grad():
    logits = fine_tuned_model(**inputs).logits

# Class probabilities and the most likely class for every sentence
probs = logits.softmax(dim=-1)
predicted_labels = probs.argmax(dim=1).cpu().numpy()

# Report one block of lines per sentence
for text, predicted, class_probs in zip(example_texts, predicted_labels, probs):
    print(f"Text: {text}")
    print(f"Predicted Label: {predicted}")
    print(f"Probabilities: {class_probs}")
    print()

Text: Interested in activities at Seoul Stadium on the next Friday at 7 pm. What options are there?
Predicted Label: 1
Probabilities: tensor([0.1870, 0.5213, 0.0397, 0.2519], device='cuda:0')

Text: Checking out sports options at 7 pm next Friday in Seoul Stadium. Any recommendations?
Predicted Label: 3
Probabilities: tensor([0.0450, 0.1023, 0.0845, 0.7681], device='cuda:0')

Text: Exploring possibilities for play at Seoul Stadium on the upcoming Friday at 7 pm. What's available?
Predicted Label: 3
Probabilities: tensor([0.0560, 0.2525, 0.0513, 0.6401], device='cuda:0')

Text: Curious about things to do at Seoul Stadium on the next Friday at 7 pm. Suggestions?
Predicted Label: 1
Probabilities: tensor([0.1847, 0.5262, 0.0391, 0.2499], device='cuda:0')

Text: Planning for activities next Friday at 7 pm in Seoul Stadium. Any sports to consider?
Predicted Label: 0
Probabilities: tensor([0.5714, 0.2644, 0.0302, 0.1340], device='cuda:0')

Text: Planning to engage in tennis next Friday at 7 p