In [None]:
import random
from faker import Faker
import json
import inflect

# Initialize Faker and inflect
fake = Faker()
inflect_engine = inflect.engine()

# Define patterns for ROOM TYPE, GUEST TYPE, and greetings
room_types = ["Deluxe Suite", "Family Suite", "Standard Room", "Single Room", "Double Room", "Presidential Suite"]
guest_types = ["adults", "children", "infants"]
greetings = ["Greetings,", "Good morning,", "Good evening,", "Hi there,", "Hello,", "Dear Sir/Madam,", "To whom it may concern,","Hi,"]
closing_remarks = ["Thank you,", "Best regards,", "Sincerely,", "Looking forward to your response,", "Thanks in advance,", "Warm regards,"]

# Helper function to convert numbers to words or digits
def get_number_variation(num):
    return inflect_engine.number_to_words(num)

# Generate annotated emails for NER
def generate_ner_training_data(num_examples=100):
    training_data = []
    for _ in range(num_examples):
        greeting = random.choice(greetings)
        closing = random.choice(closing_remarks)
        guest_name = fake.name()
        room_type = random.choice(room_types)
        num_guests = random.randint(1, 20)
        num_guests_text = get_number_variation(num_guests)
        guest_type = random.choice(guest_types)
        checkin_date = fake.date_between(start_date="today", end_date="+30d").strftime("%Y-%m-%d")
        checkout_date = fake.date_between(start_date="+31d", end_date="+60d").strftime("%Y-%m-%d")
        num_of_days = random.randint(1, 30)
        num_days_text = get_number_variation(num_of_days)
        num_of_rooms = random.randint(1, 5)
        num_of_rooms_text = get_number_variation(num_of_days)

        # Generate a random template
        templates_with_entities = [
            (f"{greeting} I would like to book a {room_type} for {num_guests_text} {guest_type}. Check-in date: {checkin_date}. Check-out date: {checkout_date}. {closing} {guest_name}", 
             [("GUEST", guest_name), ("ROOM TYPE", room_type), ("NUMBER OF GUESTS", num_guests_text), ("GUEST TYPE", guest_type), ("CHECKIN DATE", checkin_date), ("CHECKOUT DATE", checkout_date)]),
            
            (f"{greeting} Could you please reserve a {room_type} for {num_guests_text} {guest_type}? Arrival date: {checkin_date}. Departure date: {checkout_date}. {closing} {guest_name}",
             [("GUEST", guest_name), ("ROOM TYPE", room_type), ("NUMBER OF GUESTS", num_guests_text), ("GUEST TYPE", guest_type), ("CHECKIN DATE", checkin_date), ("CHECKOUT DATE", checkout_date)]),
            
            (f"{greeting} I need accommodation in a {room_type} for {num_guests_text} {guest_type}, starting from {checkin_date} until {checkout_date}. {closing} {guest_name}",
             [("GUEST", guest_name), ("ROOM TYPE", room_type), ("NUMBER OF GUESTS", num_guests_text), ("GUEST TYPE", guest_type), ("CHECKIN DATE", checkin_date), ("CHECKOUT DATE", checkout_date)]),
            
            (f"{greeting} I would like to ask availability of a {num_of_rooms_text} room for {num_days_text} days checking on {checkin_date}. {closing} {guest_name}",
             [("GUEST", guest_name), ("DAYS", num_days_text), ("NUMBER OF ROOMS", num_of_rooms_text),("CHECKIN DATE", checkin_date)]),

            (f"{greeting} Could you please book a {room_type} for {num_guests_text} {guest_type}? We are planning to arrive on {checkin_date} and leave on {checkout_date}. {closing} {guest_name}",
             [("GUEST", guest_name),("ROOM TYPE", room_type), ("NUMBER OF GUESTS", num_guests_text),("GUEST TYPE", guest_type), ("CHECKIN DATE", checkin_date), ("CHECKOUT DATE", checkout_date)]),

            (f"{greeting} This is a request for booking a {room_type} for {num_guests_text} {guest_type}. Arrival date: {checkin_date}. Departure date: {checkout_date}. {closing} {guest_name}",
             [("GUEST", guest_name),("ROOM TYPE", room_type), ("NUMBER OF GUESTS", num_guests_text),("GUEST TYPE", guest_type),("CHECKIN DATE", checkin_date),  ("CHECKOUT DATE", checkout_date)]),

            (f"{greeting} I am interested in reserving a {room_type} for {num_guests_text} {guest_type}. The check-in is {checkin_date}, and the check-out is {checkout_date}. {closing} {guest_name}",
             [("GUEST", guest_name),("ROOM TYPE", room_type), ("NUMBER OF GUESTS", num_guests_text),("GUEST TYPE", guest_type),("CHECKIN DATE", checkin_date),  ("CHECKOUT DATE", checkout_date)]),

            (f"{greeting} I would like to inquire about the availability of a {room_type} for {num_guests_text} {guest_type} between {checkin_date} and {checkout_date}. {closing} {guest_name}",
             [("GUEST", guest_name),("ROOM TYPE", room_type), ("NUMBER OF GUESTS", num_guests_text),("GUEST TYPE", guest_type),("CHECKIN DATE", checkin_date),  ("CHECKOUT DATE", checkout_date)]),

            (f"{greeting} I would like to ask availability of {num_of_rooms_text} room for {num_of_days} days checking on {checkin_date}.{closing} {guest_name}",
             [("GUEST", guest_name),("ROOM TYPE", room_type), ("NUMBER OF ROOMS", num_of_rooms_text), ("NUMBER OF DAYS", num_of_days),("CHECKIN DATE", checkin_date),  ("CHECKOUT DATE", checkout_date)]),
             
             
        ]

        selected_template, applicable_entities = random.choice(templates_with_entities)
        text = selected_template

        # Ensure alignment for applicable entities
        entities = []
        for label, value in applicable_entities:
            start = text.find(str(value))
            if start == -1:
                print(f"DEBUG: Generated text: {text}")
                print(f"DEBUG: Missing value: {value}")
                raise ValueError(f"Value '{value}' not found in text.")
            end = start + len(str(value))
            entities.append((start, end, label))

        # Add text and entities to training data
        training_data.append((text, {"entities": entities}))
    return training_data

# Save the generated NER training data
def save_ner_training_data(training_data, file_path):
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(training_data, f, ensure_ascii=False, indent=4)

 
# Generate and save NER training data
if __name__ == "__main__":
    num_examples =1500  # Adjust the number of training examples
    ner_training_data = generate_ner_training_data(num_examples)
    output_file_path = "generated_training_data.json"
    save_ner_training_data(ner_training_data, output_file_path)
    print(f"Generated {num_examples} NER training examples and saved to {output_file_path}.")


DEBUG: Generated text: Greetings, I would like to ask availability of six room for 6 days checking on 2024-12-28.Sincerely, Cole Horton


In [35]:
import spacy
from spacy.training.example import Example
from spacy.tokens import DocBin
import json


def load_training_data(file_path):
    """Load training data from a JSON file."""
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)
    

# Step 2: Initialize or Load the Model
def create_blank_model(train_data):
    """Create a blank SpaCy model and add labels from the training data."""
    nlp = spacy.blank("en")  # Create a blank English model
    ner = nlp.add_pipe("ner")  # Add the NER pipeline
    # Add labels to the NER model
    for _, annotations in train_data:
        for ent in annotations["entities"]:
            ner.add_label(ent[2])
    return nlp

# Step 3: Train the Model
def train_model(nlp, train_data, n_iter=20):
    """Train the NER model."""
    optimizer = nlp.begin_training()
    for i in range(n_iter):
        losses = {}
        for text, annotations in train_data:
            example = Example.from_dict(nlp.make_doc(text), annotations)
            nlp.update([example], drop=0.5, losses=losses)
        print(f"Iteration {i+1}, Losses: {losses}")
    return nlp

# Step 4: Save the Model
def save_model(nlp, output_dir):
    """Save the trained model to disk."""
    nlp.to_disk(output_dir)
    print(f"Model saved to {output_dir}")

# Main Program
if __name__ == "__main__":
    # Path to training data
    training_data_file = "C:/Users/georg/Python Projects/HotelBooking/generated_training_data.json"
    
    # Load the training data
    TRAIN_DATA = load_training_data(training_data_file)

    # Create a blank model
    nlp = create_blank_model(TRAIN_DATA)

    # Train the model
    trained_nlp = train_model(nlp, TRAIN_DATA)

    # Save the model
    output_directory = "trained_ner_model"
    save_model(trained_nlp, output_directory)


Iteration 1, Losses: {'ner': np.float32(4344.3677)}
Iteration 2, Losses: {'ner': np.float32(1118.2798)}
Iteration 3, Losses: {'ner': np.float32(748.3312)}
Iteration 4, Losses: {'ner': np.float32(121.77543)}
Iteration 5, Losses: {'ner': np.float32(99.41066)}
Iteration 6, Losses: {'ner': np.float32(143.08932)}
Iteration 7, Losses: {'ner': np.float32(121.57827)}
Iteration 8, Losses: {'ner': np.float32(189.28296)}
Iteration 9, Losses: {'ner': np.float32(116.94065)}
Iteration 10, Losses: {'ner': np.float32(86.26816)}
Iteration 11, Losses: {'ner': np.float32(66.02918)}
Iteration 12, Losses: {'ner': np.float32(69.21724)}
Iteration 13, Losses: {'ner': np.float32(38.656033)}
Iteration 14, Losses: {'ner': np.float32(87.121704)}
Iteration 15, Losses: {'ner': np.float32(20.147615)}
Iteration 16, Losses: {'ner': np.float32(43.62064)}
Iteration 17, Losses: {'ner': np.float32(68.74155)}
Iteration 18, Losses: {'ner': np.float32(42.08443)}
Iteration 19, Losses: {'ner': np.float32(57.61383)}
Iteration 2

In [None]:
Hello, I would like to inquire about the availability of a Family Suite for two infants between 2024-12-12 and 2025-01-23. Thank you, Aimee Henderson
Hi there, I would like to inquire about the availability of a Presidential Suite for two adults between 2024-12-17 and 2025-01-24. Best regards, Jose Riley
    To whom it may concern, I would like to inquire about the availability of a Presidential Suite for four adults between 2024-12-28 and 2025-01-06. Warm regards, Jason Roberts
# Example text for testing
test_text = """
Hi, I would like to ask avaialbility of a room for 5 days checking on 20-09-2025.
"""

# Process the text with the model
doc = nlp(test_text)

# Print the entities
print("Entities found:")
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}, Start: {ent.start_char}, End: {ent.end_char}")
    

In [36]:
import spacy
import pandas as pd
import json

# Function to load JSON file with emails
def load_emails(file_path):
    """Load email texts from a JSON file."""
    with open(file_path, "r", encoding="utf-8") as file:
        emails = json.load(file)
    return emails

# Function to extract entities using a SpaCy model
def extract_entities_from_emails(nlp, emails):
    """Extract entities from emails and organize them into a structured format."""
    results = []
    for email in emails:
        doc = nlp(email)
        extracted_entities = {
            "Email": email,
            "GUEST": [],
            "ROOM TYPE": [],
            "NUMBER OF GUESTS": [],
            "GUEST TYPE": [],
            "CHECKIN DATE": [],
            "CHECKOUT DATE": [],
        }
        for ent in doc.ents:
            if ent.label_ in extracted_entities:
                extracted_entities[ent.label_].append(ent.text)
        # Flatten lists where applicable
        for key in extracted_entities:
            if key != "Email":
                extracted_entities[key] = ", ".join(extracted_entities[key])
        results.append(extracted_entities)
    return results

# Function to create a DataFrame
def create_dataframe(entities):
    """Convert extracted entities into a Pandas DataFrame."""
    return pd.DataFrame(entities)

# Main program
if __name__ == "__main__":
    # Path to the JSON file containing emails
    email_file_path = "test_data.json"  # Replace with your JSON file path
    
    # Path to the trained SpaCy model
    trained_model_path = "trained_ner_model"  # Replace with your trained model directory
    
    # Load emails
    emails = load_emails(email_file_path)
    
    # Load the trained SpaCy model
    nlp = spacy.load(trained_model_path)
    
    # Extract entities from emails
    entities = extract_entities_from_emails(nlp, emails)
    
    # Create a DataFrame
    df = create_dataframe(entities)


In [37]:
df

Unnamed: 0,Email,GUEST,ROOM TYPE,NUMBER OF GUESTS,GUEST TYPE,CHECKIN DATE,CHECKOUT DATE
0,"Greetings, I would like to inquire about booki...",April Simmons,Family Suite,three,children,2024-12-12,2025-01-17
1,"Good morning, This is a request for booking a ...",Betty Shaw,Deluxe Suite,5,adults,2024-12-30,2025-02-01
2,"Dear Sir/Madam, Can I please reserve a Family ...",James Smith,Family Suite,4,adults,2024-12-13,2025-01-23
3,"Good morning, Could you please book a Double R...",Dr. Anthony Reyes,Double Room,5,children,2024-12-06,2025-01-31
4,"Hello, Can I please reserve a Family Suite for...",David Garcia,Family Suite,3,adults,2024-12-09,2025-01-19
...,...,...,...,...,...,...,...
1495,"Hi there, I would like to book a Family Suite ...",Alan Downs,Family Suite,one,infants,2024-12-16,2025-02-02
1496,"To whom it may concern, I need accommodation i...",Zachary Ward,Single Room,four,infants,2024-12-05,2025-01-07
1497,"Good morning, I would like to book a Standard ...",Aaron Hawkins,Standard Room,six,children,2024-12-16,2025-01-19
1498,"Greetings, I would like to inquire about the a...",Penny Mora,Family Suite,six,adults,,"2024-12-25, 2025-01-27"
