In [2]:
!pip install --upgrade google-cloud-aiplatform
!pip install --upgrade google-cloud-storage
!pip install --upgrade gcsfs



In [19]:
# Variables
PROJECT_ID = "vtxdemos" # @param {type:"string"}
VERTEX_LOCATION = "us-central1" # @param {type:"string"}
OUTPUT_IMAGE_PATH = "vtxdemos-abnb-images" # @param {type:"string"}

BQ_DATASET_ID = 'abnb_metadata' # @param {type:"string"}

# Replace with your table ID
BQ_TABLE_ID = 'sytheticdb' # @param {type:"string"}

NUMBER_OF_IMAGES_TO_GENERATE = 20 # @param {type:"integer"}

# Imports

In [20]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, FinishReason
from vertexai.preview.vision_models import ImageGenerationModel
import json
import math
import matplotlib.pyplot as plt
import subprocess
import time
import random
import uuid
import datetime
import base64
import io
from google.cloud import bigquery
import pandas as pd
from PIL import Image
import vertexai.preview.generative_models as generative_models

# Flow
1. Construct prompts using random
2. Generate interior and exterior images
2. Store the images in cloud storage
3. Construct BQ schema and save all images

# Variables

In [21]:
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95
}


# Define lists of amenities and nearby neighborhoods
amenities_list = [
    "Bathtub", "Hair dryer", "Cleaning products", "Shampoo", "Conditioner", "Body soap", "Hot water", "Shower gel",
    "Free washer - In unit", "Free dryer - In unit", "Towels", "Bed sheets", "Soap", "Toilet paper", "Hangers", "Bed linens",
    "Extra pillows and blankets", "Iron", "Clothing storage: closet and dresser", "Ethernet connection", "50 inch HDTV with Apple TV",
    "DVD player", "Record player", "Sound system with Bluetooth and aux", "Books and reading material", "Pack 'n play/Travel crib",
    "books and toys", "Board games", "Portable air conditioning", "Indoor fireplace", "Heating",
    "Smoke alarm", "Carbon monoxide alarm", "Fire extinguisher", "First aid kit", "Wifi", "Dedicated workspace (in a room with a door)",
    "Space where guests can cook their own meals", "Refrigerator", "Microwave", "Cooking basics (pots and pans, oil, salt and pepper)",
    "Dishes and silverware (bowls, chopsticks, plates, cups, etc.)", "Freezer", "Dishwasher", "Gas stove", "Oven", "Hot water kettle",
    "Coffee maker (Keurig and pour-over)", "Wine glasses", "Toaster", "Baking sheet", "Trash compactor", "Barbecue utensils (grill, charcoal, bamboo skewers/iron skewers, etc.)",
    "Dining table", "Coffee", "Private entrance", "Separate street or building entrance", "Private patio or balcony",
    "Private backyard (not fully fenced)", "Open space on the property (usually covered in grass)", "Fire pit", "Outdoor furniture",
    "Hammock", "Outdoor dining area", "BBQ grill", "Free driveway parking on premises (4 spaces)", "Long term stays allowed (28 days or more)",
    "Self check-in (keypad)", "1 queen bed", "2 single beds"
]

nearby_neighborhood_list = [
    "Walking trails", "Lakes", "Shopping places", "Downtown", "Countryside", "Winery", "Hills"
]


table_exists = False  # Global flag to track table existence
dataset_exists = False  # Global flag to track bigquery dataset existence

# Util Functions

In [22]:
# An axuillary function to display images in grid
def display_images_in_grid(images):
    """Displays the provided images in a grid format. 4 images per row.

    Args:
        images: A list of PIL Image objects representing the images to display.
    """

    # Determine the number of rows and columns for the grid layout.
    nrows = math.ceil(len(images) / 4)  # Display at most 4 images per row
    ncols = min(len(images) + 1, 4)  # Adjust columns based on the number of images

    # Create a figure and axes for the grid layout.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 6))

    for i, ax in enumerate(axes.flat):
        if i < len(images):
            # Display the image in the current axis.
            ax.imshow(images[i]._pil_image)

            # Adjust the axis aspect ratio to maintain image proportions.
            ax.set_aspect("equal")

            # Disable axis ticks for a cleaner appearance.
            ax.set_xticks([])
            ax.set_yticks([])
        else:
            # Hide empty subplots to avoid displaying blank axes.
            ax.axis("off")

    # Adjust the layout to minimize whitespace between subplots.
    plt.tight_layout()

    # Display the figure with the arranged images.
    plt.show()

def parse_prompt_json(json_data):
    """
    Parses a JSON string containing prompt and negative_prompt.
    
    Args:
      json_data: A JSON string representing the prompt data.
    
    Returns:
      A dictionary containing the prompt and negative_prompt, or None if parsing fails.
    """
    try:
        data = json.loads(json_data)
        return {
            "prompt": data["prompt"],
            "negative_prompt": data["negative_prompt"]
        }
    except (json.JSONDecodeError, KeyError):
        return None

# Generative Functions

In [23]:
def generatecatchytitle(title):
    text1 = f"""Write a catchy Airbnb title of 10 words or less based on title: \',{title},\''.
    Remember: Output only the title, nothing else. Use plain text with no formatting (no bold, italics, lists, etc.)"""

    generation_config = {
        "max_output_tokens": 2048,
        "temperature": 1,
        "top_p": 1,
    }

    safety_settings = {
        generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
        generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    }

    vertexai.init(project=PROJECT_ID, location=VERTEX_LOCATION)
    model = GenerativeModel(
      "gemini-1.0-pro-002",
    )
    responses = model.generate_content(
        [text1],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=True,
    )

    responsetext=""
    for response in responses:
        responsetext+=response.text

    return responsetext

def generateImageUsingImage2(imagenprompt, negative_prompt):
    vertexai.init(project=PROJECT_ID, location=VERTEX_LOCATION)
    imagen_model = ImageGenerationModel.from_pretrained("imagen-3.0-fast-generate-preview-0611")
    response = imagen_model.generate_images(
        prompt=imagenprompt,
        negative_prompt=negative_prompt,
        number_of_images=5,
        add_watermark=True,
        aspect_ratio="16:9",
        language="en",
        guidance_scale=7.5,
        output_gcs_uri=f"gs://{OUTPUT_IMAGE_PATH}/airbnbimages/",
        #safety_filter_level="block_some",
    )
    return response

def generate_listing_data(image_prompt):
    """
    Generates listing data based on the image prompt.

    Args:
        image_prompt: A dictionary containing information about the image, including property type, style, location, etc.

    Returns:
        A dictionary containing listing data.
    """

    listing_data = {
        "listing_id": str(uuid.uuid4()),
        "title": generatecatchytitle(f"{image_prompt['property_type']} in {image_prompt['location']} ({image_prompt['style']})"),
        "location": image_prompt['location'],
        "guests": random.randint(2, 8),
        "bedrooms": random.randint(1, 4),
        "beds": random.randint(1, 6),
        "baths": random.randint(1, 3),
        "rating": round(random.uniform(4.5, 5), 1),
        "reviews": random.randint(5, 50),
        "host_name": random.choice(["John Smith", "Jane Doe", "Emily Brown", "David Wilson"]),
        "host_type": random.choice(["Superhost", "Experienced Host", "New Host"]),
        "price_per_night": random.randint(50, 500),
        "check_in_date": datetime.date.today() + datetime.timedelta(days=random.randint(1, 180)),
        "check_out_date": datetime.date.today() + datetime.timedelta(days=random.randint(1, 180)),
        "amenities": random.sample(amenities_list, random.randint(5, 10)),
        "description": f"This {image_prompt['property_type']} offers a {image_prompt['style']} experience in {image_prompt['location']}. Enjoy {image_prompt['amenity_description']} and more.",
        "is_rare_find": random.choice([True, False]),
        "nearby_neighbourhood": random.sample(nearby_neighborhood_list, random.randint(2, 3))
    }

    return listing_data

# BQ Function

In [24]:
def load_listing_data_to_bigquery(client, listing_data, project_id, dataset_id, table_id):
    """Loads listing data (with image URLs) into a BigQuery table,
    handling missing columns gracefully."""

    client = bigquery.Client(project=project_id)
    table_id = f"{project_id}.{dataset_id}.{table_id}"

    # Get table schema from BigQuery
    table = client.get_table(table_id)
    bq_schema = table.schema

    # Convert listing_data to DataFrame before filtering
    df = pd.DataFrame([listing_data])

    df['guests'] = pd.to_numeric(df['guests'])

    # Convert date columns to datetime objects
    df['check_in_date'] = pd.to_datetime(df['check_in_date'])
    df['check_out_date'] = pd.to_datetime(df['check_out_date'])

    # Explicitly convert datetime to timestamp
    df['check_in_date'] = pd.to_datetime(df['check_in_date']).dt.to_pydatetime()
    df['check_out_date'] = pd.to_datetime(df['check_out_date']).dt.to_pydatetime()


    # Filter out columns not present in the DataFrame
    columns_to_load = [field.name for field in bq_schema if field.name in df.columns]
    df_filtered = df[columns_to_load]

    # Load the filtered DataFrame into BigQuery
    job_config = bigquery.LoadJobConfig(schema=bq_schema)
    job = client.load_table_from_dataframe(df_filtered, table_id, job_config=job_config)
    job.result()

    if job.errors:
        raise Exception(f"BigQuery load job failed: {job.errors}")
    else:
        print(f"Loaded {job.output_rows} rows to {dataset_id}.{table_id}")


def create_bigquery_model(client, project_id, dataset_id, table_id, listing_data):
    """Creates a BigQuery table model based on listing data keys."""
    global table_exists, dataset_exists

    if not dataset_exists:
        dataset_ref = f"{project_id}.{dataset_id}"
        dataset = bigquery.Dataset(dataset_ref)
        dataset.location = "US"
        try:
            dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.
            print(f"Created dataset {project_id}.{dataset_id}")
            dataset_exists = True
        except Exception as e:
            if e.code == 409:  # TableAlreadyExists error
                print(f"Dataset {dataset_id} already exists.")
                dataset_exists = True  # Set flag to True
            else:
                print(f"An error occurred: {e}")
                raise

    if not table_exists:
        # Construct the full table ID
        table_ref = f"{project_id}.{dataset_id}.{table_id}"
        
        # Define BigQuery schema dynamically based on listing_data_keys
        schema = []
        for key in list(listing_data.keys()):
            field_type = bigquery.SqlTypeNames.STRING
            if key == 'amenities' or key == 'nearby_neighbourhood':
                mode = 'REPEATED'
            else:
                mode = 'NULLABLE'
                
        for key, value in listing_data.items():
            mode= 'NULLABLE'
            if isinstance(value, str):
                field_type = bigquery.SqlTypeNames.STRING
            elif isinstance(value, int):
                field_type = bigquery.SqlTypeNames.INTEGER
            elif isinstance(value, float):
                field_type = bigquery.SqlTypeNames.FLOAT
            elif isinstance(value, bool):
                field_type = bigquery.SqlTypeNames.BOOLEAN
            elif isinstance(value, datetime.datetime):  # Handle datetime objects as TIMESTAMP
                field_type = bigquery.SqlTypeNames.TIMESTAMP
            elif isinstance(value, datetime.date):  # Handle date objects as DATE
                field_type = bigquery.SqlTypeNames.DATE
            elif isinstance(value, list):  # Handle lists as ARRAY
                field_type = bigquery.SqlTypeNames.STRING
                mode= 'REPEATED'
            elif isinstance(value, dict):  # Handle dicts as STRUCT
                field_type = bigquery.SqlTypeNames.STRING
                mode= 'REPEATED'
                
            schema.append(bigquery.SchemaField(key, field_type, mode=mode))
        
        print("Setting up table.....")
        
        # Create the table
        table = bigquery.Table(table_ref, schema=schema)

        try:
            client.create_table(table) # Make an API request.
            print(f"Created table {table_ref}")
            table_exists = True
        except Exception as e:
            if e.code == 409:  # TableAlreadyExists error
                print(f"Table {table_ref} already exists.")
                table_exists = True  # Set flag to True
            else:
                print(f"An error occurred: {e}")
                raise

# Prompt construction

In [25]:
def generate_image_prompts(n):
    """Generates a list of image prompts based on various property types, styles, locations, and amenities.

     Args:
       n: The number of image prompts to generate.

     Returns:
       A list of dictionaries, each containing an image prompt.
    """

    property_types = [
        "Beach House", "Condo", "Forest Retreat", "Cabin", "Treehouse", "Lakefront", "Farmhouse"
    ]

    styles = [
        "Coastal", "Nautical", "Minimalist", "Rustic", "Bohemian", "Luxury", "Contemporary", "Industrial", "Loft", "Cozy", "Scandinavian", "Adventure", "Eco-friendly", "Art Deco", "Vintage", "Farmhouse Chic"
    ]

    locations = [
        "Malibu, California", "Charleston, South Carolina", "Miami Beach, Florida", "Chicago, Illinois", "Yellowstone National Park, Wyoming",
        "Asheville, North Carolina", "New Orleans, Louisiana", "Austin, Texas", "New York City"
    ]

    amenity_descriptions = [
        "a private balcony overlooking the Pacific Ocean, direct beach access, and a fire pit for evening gatherings.",
        "a screened-in porch with a hammock, a spacious kitchen perfect for entertaining, and a backyard with a fire pit.",
        "a private balcony with a stunning ocean view, a resort-style pool and spa, and a state-of-the-art fitness center.",
        "high ceilings with exposed brick, floor-to-ceiling windows, and a private balcony overlooking the city.",
        "a wood-burning fireplace, large windows for nature views, and a private deck with a hot tub.",
        "a canopy view of the surrounding forest, a hammock for relaxing, and a private outdoor shower.",
        "a rooftop bar with live music, a courtyard with lush greenery, and a spa offering unique treatments.",
        "a large porch with rocking chairs, a garden for growing fresh produce, and a fire pit for stargazing.",
        "a stunning view of the city skyline and a rooftop terrace with a pool",
    ]

    prompts = []
    for _ in range(n):
        prompt = {
          "property_type": random.choice(property_types),
          "style": random.choice(styles),
          "location": random.choice(locations),
          "amenity_description": random.choice(amenity_descriptions)
      }
        prompts.append(prompt)

    return prompts

In [26]:
def generate_image_prompts_str(listing_data, prompt):
    """Generates image prompts (aerial and interior) based on listing data."""

    # Convert the nearby_neighbourhood list to a comma-separated string
    nearby_neighbourhood_str = ", ".join(listing_data["nearby_neighbourhood"])

    # Construct the prompts using listing_data
    image_prompt_aerial = (
        f"This is a {prompt['style']} {prompt['property_type']} located in {listing_data['location']}, "
        f"close to {nearby_neighbourhood_str}. Please generate a realistic, high-quality, 4K aerial photo of this "
        f"{listing_data['bedrooms']}-bedroom, {listing_data['baths']}-bathroom property, capturing its surroundings "
        f"and architectural details."
    )

    amenities_highlights = ", ".join(listing_data["amenities"][:3])  # Mention a few key amenities

    image_prompt_interior = (
        f"This is the interior of a {prompt['style']} {prompt['property_type']} with {listing_data['bedrooms']} bedrooms and "
        f"{listing_data['baths']} bathrooms. The space includes amenities like {amenities_highlights} and more. "
        f"Please generate realistic, high-quality, 4K images showcasing different rooms, decor, and the mentioned amenities."
    )

    return image_prompt_aerial, image_prompt_interior

# Main Function

In [17]:
vertexai.init(project=PROJECT_ID, location="us-central1")
model = GenerativeModel("gemini-1.5-flash-001",)

def llm(context: str):
    prompt = f"""Your task is to create a very small/brief description for aibnb (no more than 14 words) from the following context:

    {context}"""

    response = model.generate_content(
        [prompt],
    )

    try: return response.text

    except:
        return "Cozy apartment"

In [18]:
image_prompts = generate_image_prompts(NUMBER_OF_IMAGES_TO_GENERATE)
client = bigquery.Client()

for prompt in image_prompts:
    print(f"Property Type: {prompt['property_type']}")
    print(f"Style: {prompt['style']}")
    print(f"Location: {prompt['location']}")
    print(f"Amenity Description: {prompt['amenity_description']}\n")
    listing_data = generate_listing_data(prompt)  # Assuming this generates your initial data
    print(listing_data)

    image_prompt_aerial, image_prompt_interior = generate_image_prompts_str(listing_data, prompt)

    try:
        imageresponse = generateImageUsingImage2(image_prompt_aerial, "cartoon, painting, bad image")
        if imageresponse is not None and imageresponse.images:
            i = 0
            for image in imageresponse.images:
                image_url_key = f"Img_exterior_url_{i}"
                listing_data[image_url_key] = str(image._gcs_uri)
                i += 1

    except Exception as e:
        print(f"Error generating or processing exterior images: {e}")
        print("Error: No exterior images were generated. Continuing to next...")


    # Interior Image Generation (similar logic)
    print(image_prompt_interior)
    try:
        imageresponse = generateImageUsingImage2(image_prompt_interior, "cartoon, painting, bad image")
        if imageresponse is not None and imageresponse.images:
            #display_images_in_grid(imageresponse.images) #TODO: uncomment to see images
            i = 0
            for image in imageresponse.images:
                image_url_key = f"Img_interior_url_{i}"
                listing_data[image_url_key] = str(image._gcs_uri)
                i += 1

    except Exception as e:
        print(f"Error generating or processing interior images: {e}")
        print("Error: No images were generated. Continuing to next...")


    listing_data["b_desc"] = llm(f"Context: str({listing_data})")

    # Creates table only once if it doesn't exist.
    create_bigquery_model(client, PROJECT_ID, BQ_DATASET_ID, BQ_TABLE_ID, listing_data)
    try:
        load_listing_data_to_bigquery(client, listing_data, PROJECT_ID, BQ_DATASET_ID, BQ_TABLE_ID)
    except Exception as e:
        print(f"Error loading data to BQ. Possible reason is image satefy filters- {e}")

Property Type: Farmhouse
Style: Nautical
Location: Miami Beach, Florida
Amenity Description: high ceilings with exposed brick, floor-to-ceiling windows, and a private balcony overlooking the city.



I0000 00:00:1721428588.333762  183900 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


{'listing_id': '2289d4f7-5ed6-4c37-9c13-0e8347d1206a', 'title': 'Nautical Oasis in Miami Beach Farmhouse 🌴⚓️\n', 'location': 'Miami Beach, Florida', 'guests': 3, 'bedrooms': 1, 'beds': 6, 'baths': 3, 'rating': 4.8, 'reviews': 12, 'host_name': 'John Smith', 'host_type': 'Superhost', 'price_per_night': 471, 'check_in_date': datetime.date(2024, 8, 20), 'check_out_date': datetime.date(2024, 8, 26), 'amenities': ['Clothing storage: closet and dresser', 'Private backyard (not fully fenced)', 'Shampoo', 'Body soap', 'Dishes and silverware (bowls, chopsticks, plates, cups, etc.)', 'Hammock', 'Record player', 'Outdoor dining area', 'Cooking basics (pots and pans, oil, salt and pepper)'], 'description': 'This Farmhouse offers a Nautical experience in Miami Beach, Florida. Enjoy high ceilings with exposed brick, floor-to-ceiling windows, and a private balcony overlooking the city. and more.', 'is_rare_find': True, 'nearby_neighbourhood': ['Winery', 'Lakes', 'Shopping places']}


I0000 00:00:1721428589.238524  183900 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1721428589.612548  183900 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1721428589.613112  183900 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported


This is the interior of a Nautical Farmhouse with 1 bedrooms and 3 bathrooms. The space includes amenities like Clothing storage: closet and dresser, Private backyard (not fully fenced), Shampoo and more. Please generate realistic, high-quality, 4K images showcasing different rooms, decor, and the mentioned amenities.


I0000 00:00:1721428596.465710  183900 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1721428596.830312  183900 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1721428596.830688  183900 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1721428602.384785  183900 check_gcp_environment_no_op.cc:29] ALTS: Platforms other than Linux and Windows are not supported
I0000 00:00:1721428603.189509  183900 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
  df['check_in_date'] = pd.to_datetime(df['check_in_date']).dt.to_pydatetime()
  df['check_out_date'] = pd.to_datetime(df['check_out_date']).dt.to_pydatetime()


Loaded 1 rows to abnb_metadata.vtxdemos.abnb_metadata.sytheticdb


In [44]:
listing_data.keys()

dict_keys(['listing_id', 'title', 'location', 'guests', 'bedrooms', 'beds', 'baths', 'rating', 'reviews', 'host_name', 'host_type', 'price_per_night', 'check_in_date', 'check_out_date', 'amenities', 'description', 'is_rare_find', 'nearby_neighbourhood', 'Img_exterior_url_0', 'Img_exterior_url_1', 'Img_exterior_url_2', 'Img_exterior_url_3', 'Img_exterior_url_4', 'Img_interior_url_0', 'Img_interior_url_1', 'Img_interior_url_2', 'Img_interior_url_3', 'Img_interior_url_4', 'b_desc'])

## Multimodal Embeddings (img_text_indexes)

---
* Using text or image as input/output for similarities
* Using vertexai.vision_models

In [23]:
import base64
import requests
import vertexai
from io import BytesIO
import numpy as np
import pandas as pd
from PIL import Image as Img
from google.cloud import storage, aiplatform
import vertexai.generative_models as generative_models
from vertexai.vision_models import MultiModalEmbeddingModel, Image
from vertexai.generative_models import GenerativeModel, Part, FinishReason

## Init

In [24]:
storage_client = storage.Client()
bq_client = bigquery.Client()
mm = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")

## Creating Embeddings
---

In [25]:
sql = f"""
SELECT * FROM {PROJECT_ID}.{BQ_DATASET_ID}.{BQ_TABLE_ID}
"""
df = bq_client.query_and_wait(sql).to_dataframe()

In [26]:
df.head()

Unnamed: 0,listing_id,title,location,guests,bedrooms,beds,baths,rating,reviews,host_name,...,Img_exterior_url_0,Img_exterior_url_1,Img_exterior_url_2,Img_exterior_url_3,Img_exterior_url_4,Img_interior_url_0,Img_interior_url_1,Img_interior_url_2,Img_interior_url_3,Img_interior_url_4
0,87f54048-8272-49a8-a37d-742cc8eadf78,✅ **Bohemian Treetop Hideaway in Chicago** \n,"Chicago, Illinois",3,3,3,2,4.6,27,David Wilson,...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...
1,29bb9fc2-5538-4490-a25b-2357e27428a3,"Vintage Beach House in Charleston, SC\n","Charleston, South Carolina",4,4,3,1,4.7,42,David Wilson,...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...
2,d36dae74-95f5-48a6-b11b-741511b5074b,New Orleans Bohemian Cabin Retreat 🌴,"New Orleans, Louisiana",6,3,3,1,4.5,33,David Wilson,...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...
3,ba2c5e26-6350-4185-92e9-f70fa217893d,Art Deco Farmhouse Retreat - Asheville NC\n,"Asheville, North Carolina",8,3,5,2,4.5,34,Jane Doe,...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...,gs://vtxdemos-abnb-images/airbnbimages/1720530...
4,def1fdbf-5760-4cbe-984a-b93e1ecf8db1,Asheville Lakefront Oasis with Modern Style,"Asheville, North Carolina",5,3,1,1,4.6,42,Jane Doe,...,gs://vtxdemos-abnb-images/airbnbimages/1720529...,gs://vtxdemos-abnb-images/airbnbimages/1720529...,gs://vtxdemos-abnb-images/airbnbimages/1720529...,gs://vtxdemos-abnb-images/airbnbimages/1720529...,gs://vtxdemos-abnb-images/airbnbimages/1720529...,gs://vtxdemos-abnb-images/airbnbimages/1720529...,gs://vtxdemos-abnb-images/airbnbimages/1720529...,gs://vtxdemos-abnb-images/airbnbimages/1720529...,gs://vtxdemos-abnb-images/airbnbimages/1720529...,gs://vtxdemos-abnb-images/airbnbimages/1720529...


In [28]:
df["id"] = df.index

In [None]:
# Store the Entire DataFrame into GCS
## The dataframe will be used later in the middleware
df.to_csv(f"gs://{OUTPUT_IMAGE_PATH}/dataset/data.csv" , index=False)

In [29]:
id_list = []
embeddings = []

for index, row in df.iterrows():
    for im in ["Img_interior_url_", "Img_exterior_url_"]:
        for i in range(5):
            _img = row[f"{im}{i}"].replace("https://storage.googleapis.com/", "gs://")
            print(f"Processing {_img}")
            e = mm.get_embeddings(
                image=Image(
                    gcs_uri=_img
                ),
            ).image_embedding
            embeddings.append(e)
            id_list.append(row["id"])
emb_df = pd.DataFrame({"id": id_list, "embedding": embeddings}) 

Processing gs://vtxdemos-abnb-images/airbnbimages/1720530257628/sample_0.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530257628/sample_1.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530257628/sample_2.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530257628/sample_3.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530257628/sample_4.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530252640/sample_0.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530252640/sample_1.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530252640/sample_2.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530252640/sample_3.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530252640/sample_4.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530215026/sample_0.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530215026/sample_1.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720530215026/sample_2.png

## Vector Search (Managed Service)

In [52]:
def preprocess(df, f_name):
    df.to_pickle("text_data.pkl")
    data = df.to_json(orient='records', lines=True)
    
    with open('text_data.json', 'w') as f:
        f.write(data)
        
    storage_client.bucket(OUTPUT_IMAGE_PATH).blob(f_name).upload_from_filename("text_data.json")
    return data

In [53]:
data_1 = preprocess(emb_df, "abnb_text/text_data.json") 

In [None]:
abnb_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name = f"vs-abnb-index-text-1",
    contents_delta_uri = f"gs://{OUTPUT_IMAGE_PATH}/abnb_text",
    dimensions = len(emb_df["embedding"].iloc[0]),
    approximate_neighbors_count = 15,
)

In [None]:
 abnb_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name = f"vs-abnb-index-endpoint-text-1",
    public_endpoint_enabled = True
)

In [None]:
 abnb_index_endpoint.deploy_index(
    index = abnb_index, deployed_index_id = "vs_abnb_deployed_text_1"
)

## Multimodal Embeddings (combined_indexes)

---
* Using text AND image as input/output for similarities.

In [58]:
model = GenerativeModel("gemini-1.5-flash-001",)
mm = MultiModalEmbeddingModel.from_pretrained("multimodalembedding")

In [59]:
 generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

safety_settings = {
    generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
    generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_MEDIUM_AND_ABOVE,
}

weight_image = 0.6
weight_text = 0.4

## Concatanate

In [62]:
id_list = []
context_list = []
embeddings = []

def llm(context: str):
    prompt = f"""
    Your task is to create a good description from the following context:
    
    <rules>
    1. The output will be used to generate multimodal embeddings (MultiModalEmbeddingModel) from vertexai.vision_models, description should fit on it.
    2. The end goal is to combine image and text embeddings, description should be according that.
    3. The maximum number of characters is 1024 so keep it simply but do not forget relevant information remember point number 1 and 2.

    </rules>
    
    <context>
    {context}
    
    </context>
    
    Output description:
    """
    
    response = model.generate_content(
      [prompt],
      generation_config=generation_config,
      safety_settings=safety_settings,
    )
    
    try:
        return response.text
    except:
        print("Error!")
        print(response)
        return ""
    
def l2_normalize(vector):
    """Normalizes a vector to unit length using L2 normalization."""
    l2_norm = np.linalg.norm(vector)
    if l2_norm == 0:
        return vector  # Avoid division by zero
    return vector / l2_norm
        
    
for index, row in df.iterrows():
    context = f'''
    The title of the airbnb listing is {row["title"]}, the location is {row["location"]}, the number of permissible guests are {row["guests"]},
    the number of bedrooms are: {row["bedrooms"]}, it has the following ratings: {row["rating"]}, and count with {row["reviews"]} reviews,
    the hostname is {row["host_name"]}, and the type of host is {row["host_type"]}, price per night is {row["price_per_night"]},
    and if the following value is 1 it means that is a rare find {row["is_rare_find"]}, amenities are: {row["amenities"]}, and the closest neighbourhoods are:
    {row["nearby_neighbourhood"]}.
    '''
    context = llm(context)
    print(context)
    if context == "":
        pass
    else:
        for im in ["Img_interior_url_", "Img_exterior_url_"]:
            for i in range(5):
                _img = row[f"{im}{i}"].replace("https://storage.googleapis.com/", "gs://")
                print(f"Processing {_img}")
                e = mm.get_embeddings(
                    image=Image(
                        gcs_uri=_img
                    ),
                    contextual_text=context,
                )
                
                normalized_text_embedding = l2_normalize(e.text_embedding)
                normalized_image_embedding = l2_normalize(e.image_embedding)
                we_ave =  (weight_image * normalized_image_embedding) + (weight_text * normalized_text_embedding)

                embeddings.append(we_ave)
                id_list.append(row["id"])
                context_list.append(context)

This is a secluded, 4-bedroom retreat near the Austin Coast, perfect for 3 guests. Enjoy amenities like an indoor fireplace, a Keurig coffee maker, and a 50-inch HDTV. This rare find, hosted by David Wilson (New Host), offers a unique experience for $332 per night. Located near Lakes, Downtown, and Winery neighborhoods, this listing boasts a 4.5-star rating with 5 reviews. 

Processing gs://vtxdemos-abnb-images/airbnbimages/1720469250662/sample_0.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720469250662/sample_1.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720469250662/sample_2.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720469250662/sample_3.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720469250662/sample_4.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720469246611/sample_0.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720469246611/sample_1.png
Processing gs://vtxdemos-abnb-images/airbnbimages/1720469246611/sample_2.png
Proces

In [63]:
emb_df = pd.DataFrame({"id": id_list, "context": context_list, "embedding": embeddings}) 

## Vector Search (Managed Service)

In [64]:
data_1 = preprocess(emb_df, "abnb_image_text/image_text_data.json") 

In [None]:
 abnb_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name = f"vs-abnb-index-image-text-1",
    contents_delta_uri = "gs://vtxdemos-vsearch-datasets/abnb_image_text",
    dimensions = len(emb_df["embedding"].iloc[0]),
    approximate_neighbors_count = 15,
)

In [None]:
abnb_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name = f"vs-abnb-index-endpoint-image-text-1",
    public_endpoint_enabled = True
)

In [None]:
abnb_index_endpoint.deploy_index(
    index = abnb_index, deployed_index_id = "vs_abnb_deployed_image_text_1"
)