# NYC Airbnb Listings Data Analysis and Embeddings

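This notebook:
1. Downloads NYC Airbnb listings data from Inside Airbnb
2. Cleans the data (parses the price column, removes duplicates, selects useful features, drops rows with missing or invalid values)
3. Explores the cleaned data (price, room type, neighborhood group, and property type distributions)
4. Generates embeddings for each listing's text features using sentence transformers
5. Verifies the embeddings and saves the cleaned, embedded dataset to a Parquet file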

## Setup and Imports

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import urllib.request
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration: never truncate DataFrame columns,
# and apply seaborn's default theme to every plot below.
pd.options.display.max_columns = None
sns.set_theme()

## Step 1: Download NYC Airbnb Data

In [None]:
def download_nyc_data(output_dir="data", url=None, force=False):
    """
    Download NYC Airbnb listings data from Inside Airbnb

    A non-empty local copy is reused, so re-running the notebook does not
    fetch the archive again. The transfer goes to a temporary ``.part`` file
    that is renamed on success, so an interrupted download never leaves a
    truncated ``listings.csv.gz`` behind.

    Args:
        output_dir: Directory that receives ``listings.csv.gz``; created if missing.
        url: Optional override for the archive URL. Defaults to the
            2024-12-04 NYC snapshot.
        force: When True, download even if a local copy already exists.

    Returns:
        Path to the local ``listings.csv.gz`` file.
    """
    os.makedirs(output_dir, exist_ok=True)

    # URL for NYC listings data
    if url is None:
        url = "http://data.insideairbnb.com/united-states/ny/new-york-city/2024-12-04/data/listings.csv.gz"
    output_path = os.path.join(output_dir, "listings.csv.gz")

    # Reuse an existing, non-empty download unless the caller forces a refresh
    if not force and os.path.isfile(output_path) and os.path.getsize(output_path) > 0:
        print(f"Using cached data at {output_path}")
        return output_path

    print(f"Downloading NYC listings data from {url}...")
    partial_path = output_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # Rename only after the transfer completed
        os.replace(partial_path, output_path)
    finally:
        # Still present only if the download or rename failed
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded data to {output_path}")

    return output_path

# Fetch the listings archive into the "data" directory and keep its path
data_path = download_nyc_data(output_dir="data")

## Step 2: Load and Explore the Data

In [None]:
# Read the gzip-compressed CSV straight into a DataFrame
df = pd.read_csv(data_path, compression='gzip')
print(f"Loaded {df.shape[0]} listings with {df.shape[1]} columns")

# Preview the first rows (rendered as the cell output)
df.head()

In [None]:
# Check data types and missing values
# df.info() is called for the report it prints itself, so it is not wrapped in print()
print("Data Info:")
df.info()

## Step 3: Clean the Data

In [None]:
print("=== Cleaning Data ===")
print(f"Original shape: {df.shape}")

# 1. Clean the price column
if 'price' in df.columns:
    print("\nBefore cleaning price column:")
    print(df['price'].head())

    # Cast to str before using `.str`: after the first run the column is
    # numeric, and `.str.replace` on a numeric column raises AttributeError.
    # The cast keeps this cell safe to re-run; missing values become the text
    # "nan", which `to_numeric` turns back into NaN.
    price_text = df['price'].astype(str).str.replace(r'[$,]', '', regex=True)
    # Anything that still is not a number becomes NaN instead of raising
    df['price'] = pd.to_numeric(price_text, errors='coerce')

    print("\nAfter cleaning price column:")
    print(df['price'].head())
    print(f"Cleaned price column - removed $ and commas")

In [None]:
# 2. Drop rows that repeat a listing id (the first occurrence is kept)
original_count = df.shape[0]
df = df.drop_duplicates(subset='id')
duplicates_removed = original_count - df.shape[0]
print(f"Removed {duplicates_removed} duplicate listings")

In [None]:
# 3. Keep only the columns used by the analysis and the embeddings
useful_columns = [
    # identity and free text
    'id', 'name', 'description', 'neighborhood_overview',
    # host
    'host_id', 'host_name',
    # location
    'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
    'latitude', 'longitude',
    # property details
    'property_type', 'room_type', 'accommodates',
    'bathrooms_text', 'bedrooms', 'beds', 'amenities',
    # pricing and stay limits
    'price', 'minimum_nights', 'maximum_nights',
    # reviews
    'number_of_reviews',
    'review_scores_rating', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_checkin',
    'review_scores_communication', 'review_scores_location',
    'review_scores_value',
    # booking
    'instant_bookable', 'reviews_per_month',
]

# Columns absent from this snapshot are skipped rather than raising KeyError
existing_columns = [name for name in useful_columns if name in df.columns]
df = df.loc[:, existing_columns]
print(f"Selected {len(existing_columns)} useful columns")

In [None]:
# 4. Drop rows that lack a name or a price
df = df.dropna(subset=['name', 'price'])
print("Removed listings with missing name or price")

# 5. Keep strictly positive prices only
df = df.loc[df['price'].gt(0)]
print("Removed listings with invalid prices (<= 0)")

# 6. Cap prices at $10,000 to drop extreme outliers
df = df.loc[df['price'].le(10000)]
print("Removed listings with extreme prices (> $10,000)")

print(f"\nFinal shape after cleaning: {df.shape}")

## Step 4: Exploratory Data Analysis

In [None]:
# Descriptive statistics (count, mean, quartiles, ...) of the cleaned price
print("Price Statistics:")
print(df.loc[:, 'price'].describe())

In [None]:
# Price histograms: full range on the left, listings at or below $500 on the right
fig, (ax_full, ax_zoom) = plt.subplots(1, 2, figsize=(12, 5))

ax_full.hist(df['price'], bins=50, edgecolor='black')
ax_full.set_xlabel('Price')
ax_full.set_ylabel('Frequency')
ax_full.set_title('Price Distribution')

ax_zoom.hist(df.loc[df['price'] <= 500, 'price'], bins=50, edgecolor='black')
ax_zoom.set_xlabel('Price')
ax_zoom.set_ylabel('Frequency')
ax_zoom.set_title('Price Distribution (Price <= $500)')

plt.tight_layout()
plt.show()

In [None]:
# Listing counts per room type, as a table and as a bar chart
if 'room_type' in df.columns:
    room_type_counts = df['room_type'].value_counts()

    print("\nRoom Type Distribution:")
    print(room_type_counts)

    fig, ax = plt.subplots(figsize=(10, 6))
    room_type_counts.plot(kind='bar', ax=ax)
    ax.set_xlabel('Room Type')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Room Types')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Listing counts per neighbourhood group, as a table and as a bar chart
if 'neighbourhood_group_cleansed' in df.columns:
    group_counts = df['neighbourhood_group_cleansed'].value_counts()

    print("\nNeighborhood Group Distribution:")
    print(group_counts)

    fig, ax = plt.subplots(figsize=(10, 6))
    group_counts.plot(kind='bar', ax=ax)
    ax.set_xlabel('Neighborhood Group')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Listings by Neighborhood Group')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# The ten most common property types, as a table and as a bar chart
if 'property_type' in df.columns:
    top_property_types = df['property_type'].value_counts().head(10)

    print("\nTop 10 Property Types:")
    print(top_property_types)

    fig, ax = plt.subplots(figsize=(12, 6))
    top_property_types.plot(kind='bar', ax=ax)
    ax.set_xlabel('Property Type')
    ax.set_ylabel('Count')
    ax.set_title('Top 10 Property Types')
    # Right-align the rotated labels so they end under their own bar
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## Step 5: Generate Embeddings

We'll use the `sentence-transformers` library with the `nomic-ai/nomic-embed-text-v1.5` model, as learned in lab3.

In [None]:
def create_text_for_embedding(row):
    """
    Build the single string that gets embedded for one listing.

    Each field that is present and not null contributes a "Label: value"
    fragment; fragments are joined with single spaces in a fixed order
    (name, description, neighborhood overview, property type, room type,
    amenities, location).

    Args:
        row: Mapping-like record (e.g. a DataFrame row) supporting ``.get``
            and item access by column name.

    Returns:
        The concatenated text, or an empty string when no field is available.
    """
    # (label in the output text, source column) in output order
    labelled_fields = (
        ("Name", 'name'),
        ("Description", 'description'),
        ("Neighborhood", 'neighborhood_overview'),
        ("Property Type", 'property_type'),
        ("Room Type", 'room_type'),
        ("Amenities", 'amenities'),
        ("Location", 'neighbourhood_cleansed'),
    )

    # Missing columns (.get -> None) and NaN values are both skipped
    return " ".join(
        f"{label}: {row[column]}"
        for label, column in labelled_fields
        if pd.notna(row.get(column))
    )

# Show the embedding text of the first listing, cut to 500 characters
print("Example text representation:")
print(f"{create_text_for_embedding(df.iloc[0])[:500]}...")

In [None]:
# Load the sentence transformer model
# NOTE(review): trust_remote_code=True presumably lets custom code shipped with
# the model repository run locally — confirm the source is trusted (and consider
# pinning a revision) before running this in a shared environment.
print("Loading model: nomic-ai/nomic-embed-text-v1.5")
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
print("Model loaded successfully!")

In [None]:
# Build one text string per listing, in row order
print("Creating text representations...")
texts = list(df.apply(create_text_for_embedding, axis=1))

# Prepend the "search_document:" task prefix to every text (per the model's recommendation)
texts_prefixed = [f"search_document: {text}" for text in texts]

print(f"Created {len(texts_prefixed)} text representations")

In [None]:
# Encode every prefixed text into a vector.
# Note: this can take several minutes depending on the number of listings.
print(f"Generating embeddings for {len(texts_prefixed)} listings...")
embeddings = model.encode(texts_prefixed, batch_size=32, normalize_embeddings=True, show_progress_bar=True)

print(f"Generated embeddings with shape: {embeddings.shape}")

In [None]:
# Store one embedding vector per row in a new 'embedding' column
df['embedding'] = [vector for vector in embeddings]

print("Added embeddings to dataframe")
print(f"Each listing now has a {df['embedding'].iloc[0].shape[0]}-dimensional embedding")

## Step 6: Verify Embeddings

Let's verify that the embeddings are normalized and test similarity between listings.

In [None]:
# Check if embeddings are normalized
# Only the first 10 norms are printed, but the verdict below is computed over
# every embedding so that it is not inferred from a 10-row sample.
norms = np.linalg.norm(embeddings[:10], axis=1)
print(f"Norms of first 10 embeddings: {norms}")
print(f"Are embeddings normalized? {np.allclose(np.linalg.norm(embeddings, axis=1), 1.0, atol=1e-3)}")

In [None]:
# Example: Find similar listings using embeddings
def find_similar_listings(query_idx, top_k=5, listings=None):
    """
    Find the most similar listings to a given listing

    Prints the query listing followed by its nearest neighbours, ranked by the
    dot product of their embeddings with the query embedding (equal to cosine
    similarity when the embeddings are unit-normalized).

    The query row is excluded by position rather than by assuming it ranks
    first: listings with identical embeddings tie with the query, so dropping
    "rank 0" could drop a real match and list the query as its own neighbour.

    Args:
        query_idx: Positional (``iloc``) index of the query listing; negative
            values count from the end.
        top_k: Maximum number of neighbours to print.
        listings: DataFrame with ``embedding``, ``name`` and ``price`` columns
            (``room_type`` optional). Defaults to the notebook-level ``df``.

    Raises:
        IndexError: If ``query_idx`` is outside the range of ``listings``.
    """
    if listings is None:
        listings = df

    n_listings = len(listings)
    if not -n_listings <= query_idx < n_listings:
        raise IndexError(f"query_idx {query_idx} is out of range for {n_listings} listings")
    # Normalise negative positions so the self-exclusion mask below matches
    query_pos = query_idx % n_listings

    # Convert embeddings to matrix
    embedding_matrix = np.stack(listings['embedding'].to_numpy())
    query_embedding = embedding_matrix[query_pos]

    # Calculate similarity scores (dot product since embeddings are normalized)
    scores = embedding_matrix @ query_embedding

    # Rank best-first (stable, so ties keep row order), then drop the query itself
    ranked = np.argsort(-scores, kind='stable')
    top_k_indices = ranked[ranked != query_pos][:max(top_k, 0)]

    query_row = listings.iloc[query_pos]
    print(f"\nQuery Listing (Index {query_idx}):")
    print(f"Name: {query_row['name']}")
    print(f"Room Type: {query_row.get('room_type', 'N/A')}")
    print(f"Price: ${query_row['price']}")

    print(f"\nTop {top_k} Most Similar Listings:")
    for i, idx in enumerate(top_k_indices, 1):
        match = listings.iloc[idx]
        print(f"\n{i}. (Similarity: {scores[idx]:.4f})")
        print(f"   Name: {match['name']}")
        print(f"   Room Type: {match.get('room_type', 'N/A')}")
        print(f"   Price: ${match['price']}")

# Try it on the first listing (positional index 0)
find_similar_listings(query_idx=0)

## Step 7: Save the Results

In [None]:
# Persist the cleaned listings together with their embeddings as Parquet
output_path = "data/nyc_listings_cleaned_embedded.parquet"
# Make sure the target directory exists before writing
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_parquet(output_path, index=False)

print(f"Data saved to {output_path}")
print(f"\nFinal dataset contains {df.shape[0]} listings")
print(f"Each listing has a {df['embedding'].iloc[0].shape[0]}-dimensional embedding")

## Summary

In this notebook, we:
1. ✅ Downloaded NYC Airbnb listings data from Inside Airbnb
2. ✅ Cleaned the dataset:
   - Fixed the price column (removed $ and commas, converted to numeric)
   - Removed duplicate listings
   - Selected useful room features
   - Removed listings with invalid or missing data
   - Filtered out extreme price outliers
3. ✅ Generated embeddings using the `nomic-ai/nomic-embed-text-v1.5` model from sentence-transformers
4. ✅ Saved the cleaned and embedded data for future use

The embeddings can now be used for:
- Semantic search of listings
- Finding similar properties
- Clustering listings by features
- Building recommendation systems