In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# Reached only if every import statement above completed without raising.
print("All imports successful")

All imports successful


In [2]:
# Draw 15 distinct integers from 1..29 (range's upper bound is exclusive).
# A dedicated, seeded generator is used so that Restart & Run All reproduces
# the same sample and the global `random` state is left untouched.
# `sample` already returns a list, so no extra list() wrapper is needed.
SAMPLE_SEED = 42
first_list = random.Random(SAMPLE_SEED).sample(range(1, 30), k=15)
print(first_list)

[24, 4, 14, 10, 17, 1, 18, 15, 27, 3, 2, 5, 6, 21, 29]


In [3]:
def _label(value):
    """Return 'High, <value>' when value exceeds 10, otherwise 'Low, <value>'."""
    if value > 10:
        return f"High, {value}"
    return f"Low, {value}"


# Tag every sampled number as High/Low, keeping the original order.
new_list = list(map(_label, first_list))
print(new_list)

['High, 24', 'Low, 4', 'High, 14', 'Low, 10', 'High, 17', 'Low, 1', 'High, 18', 'High, 15', 'High, 27', 'Low, 3', 'Low, 2', 'Low, 5', 'Low, 6', 'High, 21', 'High, 29']


In [4]:
# Load the raw listings CSV into the working DataFrame.
LISTINGS_PATH = '../../data/raw/listings.csv'
df = pd.read_csv(LISTINGS_PATH)

In [5]:
# Show every row when a Series/DataFrame is displayed (no truncation).
pd.set_option('display.max_rows', None)

# Convert the raw price text to float by stripping every character that is
# neither a digit nor a decimal point.
# The dtype guard keeps this cell re-runnable: once `price` is numeric the
# `.str` accessor would raise AttributeError, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(df['price']):
    df['price'] = (
        df['price']
        .str.replace(r'[^\d.]', '', regex=True)
        .astype(float)
    )

In [6]:
# Print a summary of the frame: index range, columns, non-null counts, dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15187 entries, 0 to 15186
Data columns (total 79 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            15187 non-null  int64  
 1   listing_url                                   15187 non-null  object 
 2   scrape_id                                     15187 non-null  int64  
 3   last_scraped                                  15187 non-null  object 
 4   source                                        15187 non-null  object 
 5   name                                          15187 non-null  object 
 6   description                                   14840 non-null  object 
 7   neighborhood_overview                         7914 non-null   object 
 8   picture_url                                   15186 non-null  object 
 9   host_id                                       15187 non-null 

In [7]:
# Trim price outliers: keep only listings whose price lies between the 5th
# and 95th percentiles (both bounds inclusive).
#
# A missing price never satisfies the range test, so rows with NaN prices are
# removed by this filter as well. The mean-imputation that previously followed
# the filter could therefore never fill anything and has been removed; the
# "Missing values" line below confirms that no NaN prices remain.
#
# NOTE: the percentiles are computed from the current `df`, so running this
# cell a second time trims the already-trimmed data again.
lower = df['price'].quantile(0.05)
upper = df['price'].quantile(0.95)

# `.copy()` yields an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[df['price'].between(lower, upper)].copy()

print(f"Rows after trimming: {len(df)}")
print(f"Price range: ${df['price'].min():.2f} - ${df['price'].max():.2f}")
print(f"Missing values: {df['price'].isna().sum()}")

Rows after trimming: 9665
Price range: $45.00 - $708.00
Missing values: 0


In [8]:
# Report the price bounds and row count of the frame as it currently stands
price_col = df['price']
print(f"Current min: ${price_col.min():.2f}")
print(f"Current max: ${price_col.max():.2f}")
print(f"Rows: {df.shape[0]}")

Current min: $45.00
Current max: $708.00
Rows: 9665


In [9]:
# (rows, columns) of the current frame
df.shape

(9665, 79)

In [10]:
# Count listings per property type; value_counts orders the result from the
# most to the least frequent type and leaves out missing values by default.
counts = df['property_type'].value_counts()
print(counts)

property_type
Entire home                           4143
Entire rental unit                    1920
Entire condo                           868
Private room in home                   677
Entire guesthouse                      569
Entire townhouse                       224
Entire guest suite                     203
Room in hotel                          163
Tiny home                              143
Entire bungalow                        135
Private room in rental unit             81
Camper/RV                               79
Entire cottage                          48
Entire loft                             45
Entire serviced apartment               40
Entire cabin                            37
Entire villa                            28
Private room in resort                  27
Entire resort                           25
Room in boutique hotel                  21
Entire vacation home                    20
Private room in townhouse               19
Private room in condo                   

In [11]:
# Sum of the per-type counts, used as the denominator for the percentages.
total = counts.sum()
print(total)

9665


In [12]:
# Share of each property type as a percentage of all counted listings.
type_percs = counts.div(total).mul(100)
print(type_percs)

property_type
Entire home                           42.866011
Entire rental unit                    19.865494
Entire condo                           8.980859
Private room in home                   7.004656
Entire guesthouse                      5.887222
Entire townhouse                       2.317641
Entire guest suite                     2.100362
Room in hotel                          1.686498
Tiny home                              1.479565
Entire bungalow                        1.396793
Private room in rental unit            0.838076
Camper/RV                              0.817382
Entire cottage                         0.496637
Entire loft                            0.465598
Entire serviced apartment              0.413864
Entire cabin                           0.382825
Entire villa                           0.289705
Private room in resort                 0.279359
Entire resort                          0.258665
Room in boutique hotel                 0.217279
Entire vacation home      

In [13]:
# Print the five most frequent property types with their listing count and
# percentage share of the total.
top_five = counts.head(5)
for prop_type, count in top_five.items():
    share = count / total
    percentage = share * 100
    print(f"{prop_type}: {count} listings ({percentage:.2f}%)")

Entire home: 4143 listings (42.87%)
Entire rental unit: 1920 listings (19.87%)
Entire condo: 868 listings (8.98%)
Private room in home: 677 listings (7.00%)
Entire guesthouse: 569 listings (5.89%)
