# Applying Bert-base-uncased model

# Goal : Test the Bert-based-uncased model
### Step 1: Intent prediction: house, apartment, rooms
### Step 2: Slot filling:  only slot filling and modifed with extract slot
### Step 3: Combine with the user intent prediction and slot filling using extract slot
### Conclusion:
#### The prediction performance is currently suboptimal. This is primarily due to the limited size of the training dataset and the high similarity between intent labels, which makes it difficult for the model to distinguish between them accurately. If we plan to deploy this model, further refinement is necessary. In particular, we should consider applying a fine-tuning approach with a more diverse and balanced dataset to improve both intent classification and slot extraction.



In [2]:
pip install transformers datasets torch scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [None]:
# ✅ Step 1. Import necessary packages
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score



In [145]:


# ✅ Step 2. Define  texts and labels
texts = [
    "Looking for 2 bedroom apartment in Berlin, price less than 1000 Euro",
    "Need a house near school and bus stops, price less than 500 Euro",
    "Looking for shared room in city center"
    " Need a house with garten, price less than 2000 Euro"
]
intent_labels = ["apartment", "house", "shared room"]

# Convert labels to numeric IDs
intent_label2id = {label: i for i, label in enumerate(intent_labels)}
intent_id2label = {i: label for label, i in intent_label2id.items()}

# Sample label IDs for demonstration
labels = [0, 1, 2]

# ✅ Step 3. Load tokenizer and model
intent_model_name = "bert-base-uncased"
intent_tokenizer = AutoTokenizer.from_pretrained(intent_model_name)

model_intent = AutoModelForSequenceClassification.from_pretrained(
    intent_model_name,
    num_labels=len(intent_labels),
    id2label=intent_id2label,
    label2id=intent_label2id
)

# ✅ Step 4. Tokenize the dataset
dataset = Dataset.from_dict({
    "text": texts,
    "label": labels
})

def tokenize_function(example):
    return intent_tokenizer(example["text"], padding="max_length", truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# ✅ Step 5. Training setup (CPU friendly)
training_args = TrainingArguments(
    output_dir="./intent_model",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    use_cpu= True  # ✅ force CPU mode
)

trainer = Trainer(
    model=model_intent,
    args=training_args,
    train_dataset=tokenized_dataset
)

# ✅ Step 6. Fine-tune (small example)
trainer.train()

# ✅ Step 7. Test prediction
test_text = "Looking for a room near Ubahn"
inputs = intent_tokenizer(test_text, return_tensors="pt")
with torch.no_grad():
    outputs = model_intent(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
print(f"Predicted intent: {intent_id2label[predicted_label]}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3/3 [00:00<00:00, 47.57 examples/s]


Step,Training Loss


Predicted intent: shared room


In [132]:
# test the text
test_text=" I need a shared room with kitchen and nearby transport station"
inputs = intent_tokenizer(test_text, return_tensors="pt")
with torch.no_grad():
    outputs = model_intent(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
print(f"Predicted intent: {intent_id2label[predicted_label]}")


Predicted intent: apartment


# the prediction is not well

In [66]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification


In [None]:

# Define slot labels
slot_labels = ["O", "B-location", "I-location", "B-price", "I-price", "B-feature", "I-feature"]
id2label = {i: label for i, label in enumerate(slot_labels)}
label2id = {label: i for i, label in enumerate(slot_labels)}

# Load tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_slot = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(slot_labels))

# Example text for tokenization
texts = [
    "Looking for a 2 bedroom apartment in Berlin, price less than 1000 Euro",
    "Need a house near school and bus stops, price less than 500 Euro",
]

labels = [
    ["O", "O", "O", "B-size", "I-size", "O", "O", "B-location", "O", "B-price", "I-price", "I-price", "I-price", "I-price"],
    ["O", "O", "O", "O", "O", "O", "B-education", "O", "B-transport", "I-transport", "O", "B-price", "I-price", "I-price", "I-price"]
]

# Convert dataset
dataset = Dataset.from_dict({"tokens": texts, "labels": labels})

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,
        padding=True,
        is_split_into_words=False,
    )
    tokenized_inputs["labels"] = []
    for label in examples["labels"]:
        tokenized_inputs["labels"].append([label2id.get(l, 0) for l in label])
    return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)

# Training setup
training_args = TrainingArguments(
    output_dir="./slot_filling_model",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    do_train=True,
    no_cuda=True
)

trainer = Trainer(
    model=model_slot,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()


In [32]:
# Save model and tokenizer
model_slot.save_pretrained("./slot_filling_model")
tokenizer.save_pretrained("./slot_filling_model")

('./slot_filling_model/tokenizer_config.json',
 './slot_filling_model/special_tokens_map.json',
 './slot_filling_model/vocab.txt',
 './slot_filling_model/added_tokens.json',
 './slot_filling_model/tokenizer.json')

In [33]:
# test with the following text
#load the model and tokenizer
model= AutoModelForSequenceClassification.from_pretrained("./slot_filling_model")
tokenizer= AutoTokenizer.from_pretrained("./slot_filling_model")
def predict_slots_from_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

    tokens = tokenizer.tokenize(text)
    word_ids = inputs.word_ids()

    previous_word_idx = None
    final_tokens = []
    final_labels = []

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None or word_idx == previous_word_idx:
            continue
        token = tokens[idx]
        label_id = predictions[0][idx].item()
        label = id2label[label_id]
        final_tokens.append(token)
        final_labels.append(label)
        previous_word_idx = word_idx

    for token, label in zip(final_tokens, final_labels):
        print(f"{token:15} → {label}")





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./slot_filling_model and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [149]:


# Define expanded slot labels
slot_labels = [
    "O",
    "B-size", "I-size",
    "B-price", "I-price",
    "B-transport", "I-transport",
    "B-parks", "I-parks",
    "B-location", "I-location",
    "B-feature", "I-feature"
]
label2id = {label: i for i, label in enumerate(slot_labels)}
id2label = {i: label for i, label in enumerate(slot_labels)}

# Sample training data
texts = [
    ["I", "would", "like", "to", "rent", "a", "flat", ",", "100m2", ",", "price", "less", "than", "1000", "Euro", ",", "nearby", "bus", "stations", "and", "parks"],
    ["Looking", "for", "a", "house", "near", "public", "transport", "and", "green", "areas", ",", "price", "below", "800", "Euro", ",", "size", "around", "90m2"],
    ["Need", "an", "apartment", "with", "size", "of", "120", "square", "meters", "and", "price", "under", "950", "Euro", ",", "close", "to", "parks"],
    ["Searching", "for", "a", "studio", "near", "metro", "and", "bus", "lines", ",", "price", "not", "more", "than", "1100", "Euro", ",", "size", "about", "85m2"],
    ["Want", "a", "place", "with", "100m2", "space", ",", "price", "around", "1000", "Euro", ",", "must", "be", "near", "parks", "and", "transport"],
    ["Looking", "to", "rent", "a", "90m2", "flat", "close", "to", "green", "areas", "and", "public", "transportation", ",", "budget", "is", "below", "950", "Euro"],
    ["Interested", "in", "a", "house", "with", "size", "near", "110m2", ",", "price", "limit", "is", "1000", "Euro", ",", "prefer", "location", "near", "bus", "routes"],
    ["Seeking", "a", "rental", "property", "of", "95m2", ",", "price", "maximum", "900", "Euro", ",", "should", "be", "close", "to", "parks", "and", "public", "transport"],
    ["Need", "a", "flat", "with", "size", "around", "105m2", ",", "price", "not", "exceeding", "980", "Euro", ",", "near", "green", "spaces", "and", "bus", "access"],
    ["Looking", "for", "a", "90m2", "apartment", ",", "price", "under", "1000", "Euro", ",", "must", "be", "near", "parks", "and", "transport", "options"]
]
labels = [
    ["O", "O", "O", "O", "O", "O", "O", "O", "B-size", "O", "B-price", "I-price", "I-price", "I-price", "I-price", "O", "B-transport", "I-transport", "O", "B-parks", "I-parks"],
    ["O", "O", "O", "O", "B-transport", "I-transport", "I-transport", "O", "B-parks", "I-parks", "O", "B-price", "I-price", "I-price", "I-price", "O", "B-size", "I-size"],
    ["O", "O", "O", "O", "B-size", "I-size", "I-size", "I-size", "I-size", "O", "B-price", "I-price", "I-price", "I-price", "O", "O", "O", "B-parks"],
    ["O", "O", "O", "O", "B-transport", "I-transport", "O", "I-transport", "I-transport", "O", "B-price", "I-price", "I-price", "I-price", "I-price", "O", "B-size", "I-size", "I-size"],
    ["O", "O", "O", "O", "B-size", "I-size", "O", "B-price", "I-price", "I-price", "I-price", "O", "O", "O", "B-parks", "O", "B-transport"],
    ["O", "O", "O", "O", "B-size", "I-size", "O", "O", "B-parks", "I-parks", "O", "B-transport", "I-transport", "O", "B-price", "O", "I-price", "I-price", "I-price"],
    ["O", "O", "O", "O", "O", "B-size", "I-size", "I-size", "O", "B-price", "I-price", "O", "I-price", "I-price", "O", "O", "O", "B-transport", "I-transport"],
    ["O", "O", "O", "O", "B-size", "I-size", "O", "B-price", "I-price", "I-price", "O", "O", "O", "O", "B-parks", "O", "B-transport", "I-transport"],
    ["O", "O", "O", "O", "B-size", "I-size", "O", "B-price", "I-price", "I-price", "I-price", "O", "B-parks", "I-parks", "O", "B-transport", "I-transport"],
    ["O", "O", "O", "B-size", "I-size", "O", "B-price", "I-price", "I-price", "I-price", "O", "O", "O", "B-parks", "O", "B-transport", "I-transport", "O"]
]



# Create dataset
dataset = Dataset.from_dict({"tokens": texts, "labels": labels})

# Load tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
data_collator = DataCollatorForTokenClassification(tokenizer)
model_slot = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(slot_labels), id2label=id2label, label2id=label2id)

# Tokenization and label alignment
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
    )

    word_ids = tokenized_inputs.encodings[0].word_ids
    label_ids = []
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            if word_idx < len(example["labels"]):
               label_ids.append(label2id[example["labels"][word_idx]])
            else:label_ids.append(-100)

        else:
            label_ids.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./slot_filling_model",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    do_train=True,
    no_cuda=True
)

# Trainer setup
trainer = Trainer(
    model=model_slot,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator

)

# Train the model
trainer.train()

print(tokenized_dataset[0]["tokens"])
print(tokenized_dataset[0]["labels"])

# Save model and tokenizer
model_slot.save_pretrained("./slot_filling_model")
tokenizer.save_pretrained("./slot_filling_model")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 10/10 [00:00<00:00, 229.12 examples/s]


Step,Training Loss


['I', 'would', 'like', 'to', 'rent', 'a', 'flat', ',', '100m2', ',', 'price', 'less', 'than', '1000', 'Euro', ',', 'nearby', 'bus', 'stations', 'and', 'parks']
[-100, 0, 0, 0, 0, 0, 0, 0, 0, 1, -100, -100, 0, 3, 4, 4, 4, 4, 0, 5, 6, 0, 7, 8, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -1

('./slot_filling_model/tokenizer_config.json',
 './slot_filling_model/special_tokens_map.json',
 './slot_filling_model/vocab.txt',
 './slot_filling_model/added_tokens.json',
 './slot_filling_model/tokenizer.json')

In [64]:
import re

In [None]:
text = "I would like to rent a flat, 100m2, price less than 1000 Euro, and nearby bus stations and parks"
result = extract_slots(text)
print(result)


{'size': None, 'price': None, 'transport': 0.9, 'parks': 0.9}


In [77]:
import pandas as pd

In [82]:
df=pd.read_csv("requests.csv")
df.head()
text=df["request"].tolist()
result= [extract_slots(t) for t in text]
print(result)

[{'size': None, 'price': None, 'transport': 0.0, 'parks': 0.0}, {'size': None, 'price': None, 'transport': 0.0, 'parks': 0.0}, {'size': None, 'price': None, 'transport': 0.9, 'parks': 0.0}, {'size': None, 'price': None, 'transport': 0.0, 'parks': 0.0}, {'size': None, 'price': None, 'transport': 0.0, 'parks': 0.0}, {'size': None, 'price': None, 'transport': 0.9, 'parks': 0.0}]


# Tune the model and add more features

In [150]:
# Define expanded slot labels
slot_labels = [
    "O",
    "B-size", "I-size",
    "B-price", "I-price",
    "B-transport", "I-transport",
    "B-parks", "I-parks",
    "B-location", "I-location",
    "B-feature", "I-feature"
]
label2id = {label: i for i, label in enumerate(slot_labels)}
id2label = {i: label for i, label in enumerate(slot_labels)}

# Sample training data
texts = [
    ["I", "would", "like", "to", "rent", "a", "flat", ",", "100m2", ",", "price", "less", "than", "1000", "Euro", ",", "nearby", "bus", "stations", "and", "parks"],
    ["Looking", "for", "a", "house", "near", "public", "transport", "and", "green", "areas", ",", "price", "below", "800", "Euro", ",", "size", "around", "90m2"],
    ["Need", "an", "apartment", "with", "size", "of", "120", "square", "meters", "and", "price", "under", "950", "Euro", ",", "close", "to", "parks"],
    ["Searching", "for", "a", "studio", "near", "metro", "and", "bus", "lines", ",", "price", "not", "more", "than", "1100", "Euro", ",", "size", "about", "85m2"],
    ["Want", "a", "place", "with", "100m2", "space", ",", "price", "around", "1000", "Euro", ",", "must", "be", "near", "parks", "and", "transport"],
    ["Looking", "to", "rent", "a", "90m2", "flat", "close", "to", "green", "areas", "and", "public", "transportation", ",", "budget", "is", "below", "950", "Euro"],
    ["Interested", "in", "a", "house", "with", "size", "near", "110m2", ",", "price", "limit", "is", "1000", "Euro", ",", "prefer", "location", "near", "bus", "routes"],
    ["Seeking", "a", "rental", "property", "of", "95m2", ",", "price", "maximum", "900", "Euro", ",", "should", "be", "close", "to", "parks", "and", "public", "transport"],
    ["Need", "a", "flat", "with", "size", "around", "105m2", ",", "price", "not", "exceeding", "980", "Euro", ",", "near", "green", "spaces", "and", "bus", "access"],
    ["Looking", "for", "a", "90m2", "apartment", ",", "price", "under", "1000", "Euro", ",", "must", "be", "near", "parks", "and", "transport", "options"]
]
labels = [
    ["O", "O", "O", "O", "O", "O", "O", "O", "B-size", "O", "B-price", "I-price", "I-price", "I-price", "I-price", "O", "B-transport", "I-transport", "O", "B-parks", "I-parks"],
    ["O", "O", "O", "O", "B-transport", "I-transport", "I-transport", "O", "B-parks", "I-parks", "O", "B-price", "I-price", "I-price", "I-price", "O", "B-size", "I-size"],
    ["O", "O", "O", "O", "B-size", "I-size", "I-size", "I-size", "I-size", "O", "B-price", "I-price", "I-price", "I-price", "O", "O", "O", "B-parks"],
    ["O", "O", "O", "O", "B-transport", "I-transport", "O", "I-transport", "I-transport", "O", "B-price", "I-price", "I-price", "I-price", "I-price", "O", "B-size", "I-size", "I-size"],
    ["O", "O", "O", "O", "B-size", "I-size", "O", "B-price", "I-price", "I-price", "I-price", "O", "O", "O", "B-parks", "O", "B-transport"],
    ["O", "O", "O", "O", "B-size", "I-size", "O", "O", "B-parks", "I-parks", "O", "B-transport", "I-transport", "O", "B-price", "O", "I-price", "I-price", "I-price"],
    ["O", "O", "O", "O", "O", "B-size", "I-size", "I-size", "O", "B-price", "I-price", "O", "I-price", "I-price", "O", "O", "O", "B-transport", "I-transport"],
    ["O", "O", "O", "O", "B-size", "I-size", "O", "B-price", "I-price", "I-price", "O", "O", "O", "O", "B-parks", "O", "B-transport", "I-transport"],
    ["O", "O", "O", "O", "B-size", "I-size", "O", "B-price", "I-price", "I-price", "I-price", "O", "B-parks", "I-parks", "O", "B-transport", "I-transport"],
    ["O", "O", "O", "B-size", "I-size", "O", "B-price", "I-price", "I-price", "I-price", "O", "O", "O", "B-parks", "O", "B-transport", "I-transport", "O"]
]



# Create dataset
dataset = Dataset.from_dict({"tokens": texts, "labels": labels})

# Load tokenizer and model
slot_model_name = "bert-base-uncased"
slot_tokenizer = AutoTokenizer.from_pretrained(slot_model_name, use_fast=True)
data_collator = DataCollatorForTokenClassification(slot_tokenizer)
model_slot = AutoModelForTokenClassification.from_pretrained(slot_model_name, num_labels=len(slot_labels), id2label=id2label, label2id=label2id)

def tokenize_and_align_labels(example):
    tokenized_inputs = slot_tokenizer(
        example["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding="max_length",
    )

    labels = []
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(label2id[example["labels"][word_idx]])
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [112]:
# Load model and tokenizer
model_slot = AutoModelForTokenClassification.from_pretrained("./slot_filling_model")
slot_tokenizer = AutoTokenizer.from_pretrained("./slot_filling_model")

# the model is still missed to get some keywords, therefore we combined with the rule-based NLP keyword matching ("bus", "park") and regex as backup when the model misses.

In [151]:
def extract_slots(text):
    words = text.split()
    inputs = slot_tokenizer(words, is_split_into_words=True, return_tensors="pt", truncation=True, padding="max_length")

    with torch.no_grad():
        outputs = model_slot(**inputs)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2)[0]
    word_ids = inputs.encodings[0].word_ids

    slots = {
        "size": None,
        "price": None,
        "transport": 0.0,
        "parks": 0.0,
        "university":0.0,
        "kindergarten":0.0
    }

    current_entity = None
    current_value = ""

    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        label_id = predictions[idx].item()
        label = id2label[label_id]
        word = words[word_idx]

        if label.startswith("B-"):
            if current_entity and current_value:
                if current_entity in ["size", "price"]:
                    digits = re.findall(r"\d+", current_value)
                    if digits:
                        slots[current_entity] = int(digits[0])
            current_entity = label[2:]
            current_value = word
            if current_entity in ["transport", "parks"]:
                slots[current_entity] = 0.9
        elif label.startswith("I-") and current_entity:
            current_value += " " + word
        else:
            current_entity = None
            current_value = ""

    # Final entity after loop
    if current_entity and current_value:
        if current_entity in ["size", "price"]:
            digits = re.findall(r"\d+", current_value)
            if digits:
                slots[current_entity] = int(digits[0])

    # Keyword fallback
    if 'bus' in text.lower() or "transport" in text.lower():
        slots["transport"] = 0.9
    if "park" in text.lower():
        slots["parks"] = 0.9
    if slots["size"] is None:
        match = re.search(r"\b(\d+)\s?(m2|square meters)\b", text.lower())
        if match:
            slots["size"] = int(match.group(1))

    if slots["price"] is None:
        match = re.search(r"\b(\d+)\s?(euro|€|dollars|Euro)\b", text.lower())
        if match:
            slots["price"] = int(match.group(1))


    return slots


In [152]:
text = "I would like to rent a flat, 100m2, price less than 1000 Euro, and nearby bus stations and parks"
result = extract_slots(text)
print(result)

{'size': 100, 'price': 1000, 'transport': 0.9, 'parks': 0.9, 'university': 0.0, 'kindergarten': 0.0}


In [153]:
text='I would like a apartment nearby the Mitte of Berlin, rent price less than 2000 dollars'
result = extract_slots(text)
print(result)

{'size': None, 'price': 2000, 'transport': 0.9, 'parks': 0.0, 'university': 0.0, 'kindergarten': 0.0}


In [154]:
df=pd.read_csv("requests.csv")
df.head()
text=df["request"].tolist()
result= [extract_slots(t) for t in text]
print(text, result)

['I would like a apartment nearby the Mitte of Berlin, rent price less than 2000', 'I would like a small room but can have a pet , price is less than $200', 'I like a 2 rooms and neaby MRT, bus stations. The price is less than $2000', 'I would like to 3 bedrooms, nearby the Mitte District, and rent price less than 2000', 'I would like to have 100m2 and price less than 1000 Euro', 'I would like an apartment 100m2 , price less than 1000 Euro, nearby kindergarten and transport stops'] [{'size': None, 'price': None, 'transport': 0.9, 'parks': 0.0, 'university': 0.0, 'kindergarten': 0.0}, {'size': None, 'price': None, 'transport': 0.9, 'parks': 0.0, 'university': 0.0, 'kindergarten': 0.0}, {'size': None, 'price': None, 'transport': 0.9, 'parks': 0.9, 'university': 0.0, 'kindergarten': 0.0}, {'size': None, 'price': None, 'transport': 0.9, 'parks': 0.0, 'university': 0.0, 'kindergarten': 0.0}, {'size': 100, 'price': 1000, 'transport': 0.9, 'parks': 0.0, 'university': 0.0, 'kindergarten': 0.0}

In [106]:
import torch

# Integrate with the inetent prediction and slot filling

In [155]:
# Combine intent and slot filliing using existing extract_slots()
def predict_intent_and_slots(text):
    # --- Intent Prediction ---
    inputs_intent = intent_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs_intent = model_intent(**inputs_intent)
        logits = outputs_intent.logits
        if logits.dim() == 3:  # safety fallback
            logits = logits.mean(dim=1)
        predicted_intent_id = torch.argmax(logits, dim=1).item()
        predicted_intent = model_intent.config.id2label[predicted_intent_id]
    
    # --- Slot Extraction (reuse your existing function) ---
    slots = extract_slots(text)
    
    return predicted_intent, slots


In [156]:
text = "Looking for a 2 bedroom apartment near bus stops and parks, price less than 1000 Euro"
intent, slots = predict_intent_and_slots(text)

print("Predicted Intent:", intent)
print("Extracted Slots:", slots)



Predicted Intent: shared room
Extracted Slots: {'size': None, 'price': 1000, 'transport': 0.9, 'parks': 0.9, 'university': 0.0, 'kindergarten': 0.0}


In [157]:
text="I would like to 3 bedrooms, about 100m2 nearby the Mitte District, and rent price less than 2000euro"
intent, slots = predict_intent_and_slots(text)

print("Predicted Intent:", intent)
print("Extracted Slots:", slots)


Predicted Intent: shared room
Extracted Slots: {'size': 100, 'price': 2000, 'transport': 0.9, 'parks': 0.0, 'university': 0.0, 'kindergarten': 0.0}


In [141]:
#Combine  the intent and slot filling
def predict_intent_and_slots(text):
    # --- Intent Prediction ---
    inputs_intent = intent_tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs_intent = model_intent(**inputs_intent)
        logits = outputs_intent.logits
        if logits.dim() == 3:  # safety fallback
            logits = logits.mean(dim=1)
        predicted_intent_id = torch.argmax(logits, dim=1).item()
        predicted_intent = model_intent.config.id2label[predicted_intent_id]
    
    # --- Slot Filling ---
    tokenized_inputs = slot_tokenizer(
        text.split(),
        is_split_into_words=True,
        return_tensors="pt",
        padding=True,
        truncation=True
    )
    with torch.no_grad():
        outputs_slot = model_slot(**tokenized_inputs)
        predictions = torch.argmax(outputs_slot.logits, dim=2).squeeze().tolist()
    
    # Align tokens with labels
    word_ids = tokenized_inputs.encodings[0].word_ids
    slot_predictions = []
    for idx, word_id in enumerate(word_ids):
        if word_id is None:
            continue
        label_id = predictions[idx]
        slot_predictions.append((text.split()[word_id], model_slot.config.id2label[label_id]))
    
    return predicted_intent, slot_predictions


In [158]:
text = "I would like an apartment 100m2 , price less than 1000 Euro, nearby kindergarten and transport stops"

intent, slots = predict_intent_and_slots(text)
print("Predicted Intent:", intent)
print("Slot Predictions:", slots)



Predicted Intent: shared room
Slot Predictions: {'size': 100, 'price': 1000, 'transport': 0.9, 'parks': 0.0, 'university': 0.0, 'kindergarten': 0.0}


In [159]:
text = "I would like a house with garten , size larger than 100m2 , price less than 2000 Euro, nearby University and transport stops"

intent, slots = predict_intent_and_slots(text)
print("Predicted Intent:", intent)
print("Slot Predictions:", slots)

Predicted Intent: shared room
Slot Predictions: {'size': 100, 'price': 2000, 'transport': 0.9, 'parks': 0.0, 'university': 0.0, 'kindergarten': 0.0}


Conclusion:
the prediction is not very well, training the model is too limited. 
Intent label are similar.
If we will run this model, we still need to do some effort to modify it, for example: fine tuning approach
