In [1]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt

In [2]:
# Create SQLite database

df = pd.read_csv('../../austin-airbnb-market-analysis/data/raw/listings.csv')

conn = sqlite3.connect('austin_airbnb.db')
df.to_sql('listings', conn, if_exists='replace', index=False)

# Translation: "Create database, push DataFrame to it as 'listings' table"

15187

In [3]:
# Query 1: Neighborhoods by listing count

query1 = """
SELECT host_neighbourhood, COUNT(*) as listing_count
FROM listings
GROUP BY host_neighbourhood
ORDER BY listing_count DESC
LIMIT 10
"""

results1 = pd.read_sql_query(query1, conn)
print(results1)

        host_neighbourhood  listing_count
0                     None           1629
1             South Austin            587
2            East Downtown            494
3          Downtown Austin            410
4                   Zilker            407
5           Central Austin            314
6              South Lamar            309
7  East Riverside - Oltorf            297
8                    Holly            294
9            Bouldin Creek            249


In [4]:
# Query 2: Neighborhoods by avg price (with minimum listing filter)

query2 = """
SELECT host_neighbourhood,
COUNT(*) as listing_count,
AVG(price) as avg_price
FROM listings
GROUP BY host_neighbourhood
HAVING COUNT(*) >= 5
ORDER BY avg_price DESC
"""

results2 = pd.read_sql_query(query2, conn)
print(results2)

                       host_neighbourhood  listing_count  avg_price
0                                  Zilker            407        0.0
1                            Yacht Harbor             32        0.0
2                                  Wooten             68        0.0
3                              Windy Cove              5        0.0
4                            Windsor Park            173        0.0
..                                    ...            ...        ...
245                             Allandale             34        0.0
246                                  None           1629        0.0
247  Scottsdale Villa Mirage Resort Condo              8        NaN
248             Historic District - North              8        NaN
249                             Frydendal              5        NaN

[250 rows x 3 columns]


In [5]:
# Task 1: Check what columns you actually have
print(df.columns.tolist())

# Task 2: See a few rows to understand the data
print(df.head(3))

['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name', 'description', 'neighborhood_overview', 'picture_url', 'host_id', 'host_url', 'host_name', 'host_since', 'host_location', 'host_about', 'host_response_time', 'host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude', 'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms', 'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights', 'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'calendar_updated', 'has_availability', 'availability_30', 'availability_60', 'availability_90', 'availabil