# Airbnb Hotel Booking Analysis

**Author:** Generated by ChatGPT

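This notebook loads the provided Excel dataset (`Airbnb_Open_Data.xlsx`), performs light cleaning, and recreates the visualizations (Price Distribution, Room Type Counts, Location vs Price), plus an extra Average Price by Neighbourhood chart. To reproduce the results, make sure the path in the load cell points to your copy of the dataset, then restart the kernel and execute the cells in order.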

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Notebook-wide pandas display options: show up to 50 columns and use a
# display width of 180 characters.
display_options = {
    'display.max_columns': 50,
    'display.width': 180,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)


In [None]:
# Load the dataset.
# The first entry is the original absolute location; it only exists in the
# environment where the notebook was generated, so the working directory is
# tried as a fallback. Put your own path at the front of the list if the file
# lives somewhere else.
candidate_paths = [
    '/mnt/data/1730285881-Airbnb_Open_Data.xlsx',
    '1730285881-Airbnb_Open_Data.xlsx',
    'Airbnb_Open_Data.xlsx',
]

df = None
for file_path in candidate_paths:
    try:
        df = pd.read_excel(file_path)
    except FileNotFoundError:
        continue  # not here; try the next candidate location
    break  # file_path now holds the location that actually loaded

if df is None:
    # Fail with an actionable message instead of a bare error about one path.
    raise FileNotFoundError(
        'Could not find the Airbnb dataset. Tried: ' + ', '.join(candidate_paths)
    )

# Quick look
print('Dataset loaded from:', file_path)
print('\nShape:', df.shape)
df.head()

In [None]:
# Basic cleaning and preparation: standardize column names.
# Each label is trimmed, lowercased, and has its spaces replaced with
# underscores. Labels go through str() first because header cells read from
# Excel can be non-string (e.g. numbers), and .strip() would fail on those.
df.columns = [str(c).strip().lower().replace(' ', '_') for c in df.columns]

# Columns whose name mentions "price" are the candidates for the price field.
price_cols = [c for c in df.columns if 'price' in c]
price_cols, df.columns.tolist()[:20]  # show candidate price columns and first columns

In [None]:
def _first_match(columns, exact_names, substrings):
    """Pick the column that best represents a concept, or None if nothing fits.

    Exact names are checked first (in the order given) so that a column which
    merely contains a substring -- e.g. 'lat' inside 'calculated_...' -- is not
    chosen when a properly named column exists. If no exact name is present,
    the first column containing any of the substrings is returned.
    """
    for name in exact_names:
        if name in columns:
            return name
    for col in columns:
        if any(sub in str(col) for sub in substrings):
            return col
    return None


# Choose the most plausible price column.
# Candidates are recomputed from the current frame (rather than reusing a list
# built in an earlier cell) so this cell can be re-run after the rename below.
price_col = _first_match(list(df.columns), ['price'], ['price'])

if price_col is not None:
    print('Using price column:', price_col)
    if price_col != 'price':
        df = df.rename(columns={price_col: 'price'})

    raw_price = df['price']
    numeric_price = pd.to_numeric(raw_price, errors='coerce')

    # Values that were present but failed to parse may be currency-formatted
    # text such as "$1,234 ". Strip currency symbols, thousands separators and
    # whitespace from just those values and try again, instead of silently
    # turning them into NaN.
    needs_cleaning = numeric_price.isna() & raw_price.notna()
    if needs_cleaning.any():
        cleaned = (
            raw_price[needs_cleaning]
            .astype(str)
            .str.replace(r'[$€£,\s]', '', regex=True)
        )
        numeric_price = numeric_price.astype('float64')
        numeric_price.loc[needs_cleaning] = pd.to_numeric(cleaned, errors='coerce')

    df['price'] = numeric_price
else:
    # If no price column found, create a dummy price for demonstration (this should not happen for your file).
    # The values are synthetic placeholders; a fixed seed keeps re-runs reproducible.
    print('No price column detected — creating demo price column.')
    df['price'] = np.random.default_rng(0).integers(50, 500, size=len(df))

# Normalize the room type column name, unless a 'room_type' column already exists
# (renaming another column onto an existing name would create duplicates).
if 'room_type' not in df.columns:
    room_candidates = [c for c in df.columns if 'room' in str(c) and 'type' in str(c)]
    if room_candidates:
        df = df.rename(columns={room_candidates[0]: 'room_type'})
    elif 'room' in df.columns:
        df = df.rename(columns={'room': 'room_type'})

# Latitude / longitude columns: prefer exact, well-known names; fall back to
# the first substring hit only when none of them is present.
lat_col = _first_match(list(df.columns), ['latitude', 'lat'], ['lat'])
if lat_col is not None and lat_col != 'latitude':
    df = df.rename(columns={lat_col: 'latitude'})

lon_col = _first_match(list(df.columns), ['longitude', 'long', 'lon', 'lng'], ['lon', 'lng'])
if lon_col is not None and lon_col != 'longitude':
    df = df.rename(columns={lon_col: 'longitude'})

# Drop fully duplicated rows
df = df.drop_duplicates()

# Basic info. Only preview the standardized columns that actually exist, so a
# dataset without (say) coordinates does not raise a KeyError here.
print('\nAfter cleaning: shape =', df.shape)
preview_cols = [c for c in ['price', 'room_type', 'latitude', 'longitude'] if c in df.columns]
df[preview_cols].head()

## Price Distribution
Histogram of listing prices.

In [None]:
# Histogram of the non-missing listing prices (40 bins), drawn through the
# explicit Figure/Axes interface.
price_values = df['price'].dropna()

fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(price_values, bins=40, edgecolor='black', alpha=0.7)
ax.set_title('Price Distribution')
ax.set_xlabel('Price')
ax.set_ylabel('Count')
ax.grid(True, linestyle=':', linewidth=0.5, alpha=0.5)
plt.show()

## Room Type Counts
Bar chart of room types (counts).

In [None]:
# Bar chart of how many listings fall under each room type.
# Missing room types are counted under the label 'Unknown'.
if 'room_type' not in df.columns:
    print('No room_type column found in the dataset.')
else:
    room_counts = df['room_type'].fillna('Unknown').value_counts()

    fig, ax = plt.subplots(figsize=(8, 5))
    room_counts.plot.bar(ax=ax)
    ax.set_title('Room Type Counts')
    ax.set_xlabel('Room Type')
    ax.set_ylabel('Count')
    ax.grid(axis='y', linestyle=':', linewidth=0.5, alpha=0.5)
    plt.show()

## Location vs Price
Scatter plot of listings by latitude/longitude. Marker size scaled by price.

In [None]:
# Location scatter (size ~ price)
# Longitude is plotted on the x-axis and latitude on the y-axis so the point
# cloud has the usual map orientation (north up, east to the right); putting
# latitude on x would draw the map transposed.
if 'latitude' in df.columns and 'longitude' in df.columns:
    # Missing prices are filled with the median price; marker area is
    # price / 20, clamped to the range [5, 300] so extreme prices stay readable.
    marker_sizes = (df['price'].fillna(df['price'].median()) / 20).clip(lower=5, upper=300)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df['longitude'], df['latitude'], s=marker_sizes, alpha=0.5)
    ax.set_title('Location vs Price (marker size ~ price)')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.grid(True, linestyle=':', linewidth=0.5, alpha=0.5)
    plt.show()
else:
    print('Latitude/Longitude columns not found; cannot create location scatter.')

## Extra: Average Price by Neighbourhood (if available)
A quick aggregation to show average price by neighbourhood or area column.

In [None]:
# Average price by neighbourhood or city (if such a column exists).
# The first column whose name contains 'neigh', 'city', 'district' or 'area'
# is used as the grouping key.
neigh_cols = [
    c for c in df.columns
    if any(token in str(c) for token in ('neigh', 'city', 'district', 'area'))
]
if neigh_cols:
    nb = neigh_cols[0]
    # Human-readable label: 'neighbourhood_group' -> 'Neighbourhood Group'
    nb_label = str(nb).replace('_', ' ').title()

    # Mean price per group, groups without any price dropped, 15 highest kept.
    avg_price = (
        df.groupby(nb)['price']
        .mean()
        .dropna()
        .sort_values(ascending=False)
        .head(15)
    )

    if avg_price.empty:
        # Nothing to draw (e.g. every price is missing); report it instead of
        # handing an empty Series to the plotter.
        print(f'No prices available to average by {nb_label}.')
    else:
        fig, ax = plt.subplots(figsize=(10, 5))
        avg_price.plot.bar(ax=ax)
        ax.set_title(f'Average Price by {nb_label} (top 15)')
        ax.set_xlabel(nb_label)
        ax.set_ylabel('Average Price')
        ax.grid(axis='y', linestyle=':', linewidth=0.5, alpha=0.5)
        plt.show()
else:
    print('No obvious neighbourhood/city/area column found for aggregation.')

----

*Notebook saved as `Airbnb_Hotel_Booking_Analysis.ipynb`. Run the cells in order to reproduce the analysis and plots.*