In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3 as sql

print("All imports successful")

All imports successful


In [2]:
df = pd.read_csv('../../../austin-airbnb-market-analysis/data/raw/listings.csv')
df.shape

(15187, 79)

In [3]:
pd.set_option('display.max_rows',None)
#df.dtypes

In [4]:
df['price'] = df['price'].str.replace(r'[$,]', '', regex=True).astype(float)
df.dtypes

id                                                int64
listing_url                                      object
scrape_id                                         int64
last_scraped                                     object
source                                           object
name                                             object
description                                      object
neighborhood_overview                            object
picture_url                                      object
host_id                                           int64
host_url                                         object
host_name                                        object
host_since                                       object
host_location                                    object
host_about                                       object
host_response_time                               object
host_response_rate                               object
host_acceptance_rate                            

In [5]:
print("Price stats:")
print(df['price'].describe())
print("\nSample prices:")
print(df['price'].head(10))

Price stats:
count    10708.000000
mean       386.470583
std       2620.198322
min          9.000000
25%         89.000000
50%        138.000000
75%        235.000000
max      50000.000000
Name: price, dtype: float64

Sample prices:
0    101.0
1     45.0
2      NaN
3    155.0
4     43.0
5    145.0
6     58.0
7     95.0
8    514.0
9     72.0
Name: price, dtype: float64


In [6]:
df.head(1).T

Unnamed: 0,0
id,5456
listing_url,https://www.airbnb.com/rooms/5456
scrape_id,20250613040113
last_scraped,2025-06-13
source,city scrape
name,"Walk to 6th, Rainey St and Convention Ctr"
description,Great central location for walking to Convent...
neighborhood_overview,My neighborhood is ideally located if you want...
picture_url,https://a0.muscache.com/pictures/14084884/b5a3...
host_id,8028


# SQL Drills

In [7]:
conn = sql.connect(':memory:')
df.to_sql('austin_housing', conn, index=False)

15187

Query 1: Property Type Performance
Business question: Which 5 property types generate the highest average revenue? Show property type, count of listings, average revenue, average price. Sort by revenue descending.
Your task:

Group by property_type
Calculate: count, mean of estimated_revenue_l365d, mean of price
Sort by average revenue (highest first)
Get top 5

In [8]:
t1q = """ SELECT property_type,
    COUNT(*),
    AVG(estimated_revenue_l365d) as avg_annual_revenue,
    AVG(price) as avg_price
    FROM austin_housing
    GROUP BY property_type
    ORDER BY avg_annual_revenue DESC
"""

t1r = pd.read_sql(t1q, conn)
t1r.head(5)

Unnamed: 0,property_type,COUNT(*),avg_annual_revenue,avg_price
0,Room in hotel,418,55650.227273,6710.944444
1,Entire villa,55,45421.302326,786.232558
2,Dome,1,34680.0,136.0
3,Treehouse,10,26814.75,371.875
4,Entire home,5939,21013.479842,355.329755


Query 2:
"Do Superhosts have higher occupancy than non-Superhosts?"

Column: host_is_superhost (values are 't' or 'f')
Compare average occupancy between the two groups
Use CASE WHEN to label them 'Superhost' and 'Regular Host'

In [9]:
t2q = """ SELECT 
    CASE 
        WHEN host_is_superhost == 't' THEN 'Superhost'
        ELSE 'Regular Host'
    END as rating_group,
    COUNT(*) as listing_count,
    AVG(estimated_occupancy_l365d) as avg_occupancy
FROM austin_housing
WHERE review_scores_rating IS NOT NULL
GROUP BY rating_group
"""

t2r = pd.read_sql(t2q, conn)
t2r.head()

Unnamed: 0,rating_group,listing_count,avg_occupancy
0,Regular Host,6871,44.837724
1,Superhost,5405,117.591119


Query 3: Bedroom Count Analysis
Business question: For properties with 2+ bedrooms, which neighborhoods have the most listings AND average price above $150? Show neighborhood (use neighbourhood_cleansed), count of listings, average price. Sort by count descending.
Your task:

Filter: bedrooms >= 2
Group by neighborhood
Calculate: count, average price
HAVING clause: Only show groups where avg(price) > 150
Sort by count descending
Top 10

In [10]:
t3q = """ SELECT neighbourhood_cleansed,
    COUNT(*) as listing_count,
    AVG(price) as avg_price
    FROM austin_housing
    WHERE bedrooms >= 2.0
    GROUP BY neighbourhood_cleansed
    HAVING avg_price > 150
    ORDER BY listing_count DESC
"""

t3r = pd.read_sql(t3q, conn)
t3r.head(10)

Unnamed: 0,neighbourhood_cleansed,listing_count,avg_price
0,78704,1273,346.632236
1,78702,1057,342.073265
2,78745,512,220.069136
3,78741,459,238.115942
4,78701,413,478.028455
5,78723,315,224.369369
6,78703,307,499.96347
7,78734,281,427.376569
8,78721,275,244.716895
9,78744,246,248.507937


**Query 4: Multi-Condition Filtering (Final SQL Drill)**
*Business question: Find entire home/apt listings that meet ALL these criteria:*
- Have 50+ reviews
- Rating >= 4.8
- Price between $100-$500

**Show: listing_url, price, review_scores_rating, number_of_reviews. Limit to 10 results.**
*Your task:*
- WHERE with multiple conditions (use AND)
- No GROUP BY needed (just filtering individual rows)
- Select specific columns
- LIMIT 10

In [11]:
t4q = """ SELECT listing_url,
    price,
    review_scores_rating as rating,
    number_of_reviews
    FROM austin_housing
    WHERE rating >= 4.8 AND number_of_reviews >= 50 AND price >= 100 AND price <= 500
"""

t4r = pd.read_sql(t4q, conn)
t4r.head(10)

Unnamed: 0,listing_url,price,rating,number_of_reviews
0,https://www.airbnb.com/rooms/5456,101.0,4.85,711
1,https://www.airbnb.com/rooms/6448,155.0,4.97,338
2,https://www.airbnb.com/rooms/69303,130.0,4.95,261
3,https://www.airbnb.com/rooms/70812,165.0,4.89,188
4,https://www.airbnb.com/rooms/73289,105.0,4.89,63
5,https://www.airbnb.com/rooms/76501,106.0,4.96,244
6,https://www.airbnb.com/rooms/79603,134.0,4.82,147
7,https://www.airbnb.com/rooms/89475,365.0,4.86,126
8,https://www.airbnb.com/rooms/104386,150.0,4.97,139
9,https://www.airbnb.com/rooms/128326,236.0,4.95,93


# Hypothesis Testing Practice

In [12]:
from scipy import stats

In [14]:
# Basic pattern - comparing two groups

# Step 1: Create two groups (extract the data)
instant_yes = df[df['instant_bookable'] == 't']['price']
instant_no = df[df['instant_bookable'] == 'f']['price']

# Step 2: Check what you're comparing (always do this first)
print(f"Instant Book Yes: n={len(instant_yes)}, mean={instant_yes.mean():.2f}")
print(f"Instant Book No: n={len(instant_no)}, mean={instant_no.mean():.2f}")

# Step 3: Run t-test
statistic, p_value = stats.ttest_ind(instant_yes, instant_no, nan_policy='omit')

# Step 4: Interpret
print(f"\nT-statistic: {statistic:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("Statistically significant difference (p < 0.05)")
else:
    print("Not statistically significant (p >= 0.05)")

Instant Book Yes: n=5272, mean=642.69
Instant Book No: n=9915, mean=216.27

T-statistic: 8.2733
P-value: 0.0000
Statistically significant difference (p < 0.05)


In [15]:
# Check median instead of mean (resistant to outliers)
print(f"Instant Book YES: median=${instant_yes.median():.2f}")
print(f"Instant Book NO: median=${instant_no.median():.2f}")

Instant Book YES: median=$144.50
Instant Book NO: median=$134.00


**Scenario 2: Your Turn**
*Question: Do 3-bedroom properties earn significantly more than 2-bedroom properties?*
- Column to compare: estimated_revenue_l365d
**Use the Scenario 1 template:**
- Filter for bedrooms == 2.0
- Filter for bedrooms == 3.0
- Extract revenue column
- Check means/counts
- Run t-test
- Interpret

In [16]:
three_beds = df[df['bedrooms'] == 3]['estimated_revenue_l365d']
two_beds = df[df['bedrooms'] == 2]['estimated_revenue_l365d']

print(f"Three Bed Listings: n={len(three_beds)}, mean={three_beds.mean():.2f}")
print(f"Two Bed Listings: n={len(two_beds)}, mean={two_beds.mean():.2f}")

statistic, p_value = stats.ttest_ind(three_beds, two_beds, nan_policy = 'omit')

print(f"\nT-statistic: {statistic:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("Statistically significant difference (p < 0.05)")
else:
    print("Not statistically significant (p >= 0.05)")

Three Bed Listings: n=2566, mean=17193.32
Two Bed Listings: n=3435, mean=13167.80

T-statistic: 6.9948
P-value: 0.0000
Statistically significant difference (p < 0.05)


In [17]:
print(f"Three Bed Listings: median=${three_beds.median():.2f}")
print(f"Two Bed Listings: median=${two_beds.median():.2f}")

Three Bed Listings: median=$11157.00
Two Bed Listings: median=$8100.00


In [22]:
eado = df[df['host_neighbourhood'] == 'East Downtown']['estimated_revenue_l365d']
allandale = df[df['host_neighbourhood'] == 'Allandale']['estimated_revenue_l365d']

print(f"East Downtown Listings: n={len(eado)}, avg_revenue = {eado.mean():.2f}")
print(f"Allandale Listings: n={len(allandale)}, avg_revenue = {allandale.mean():.2f}")

print(f"East Downtown: median = ${eado.median():.2f}")
print(f"Allandale: median = ${allandale.median():.2f}")

statistic, p_value = stats.ttest_ind(eado, allandale, nan_policy='omit')

print(f"\nT-statistic: {statistic:.4f}")
print(f"P-Value: {p_value:.4f}")

if p_value < 0.05:
    print("Statistically significant difference (p < 0.05)")
else:
    print("Not Statistically Significant (p > 0.05)")

East Downtown Listings: n=494, avg_revenue = 25356.50
Allandale Listings: n=34, avg_revenue = 18228.70
East Downtown: median = $18013.50
Allandale: median = $10620.00

T-statistic: 1.3560
P-Value: 0.1759
Not Statistically Significant (p > 0.05)
