In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# ---------------------------
# 1. Load data
# ---------------------------
# NOTE(review): this is an absolute, machine-specific path. It is kept as-is so
# the notebook still runs where it was written; consider moving it to a
# configurable data directory before sharing the notebook.
DATA_PATH = Path(r"C:\Users\elean\OneDrive - University of Virginia\Documents\DS 3001\Final\AB_NYC_2019.csv\AB_NYC_2019.csv")

# Keep the untouched load under its own name so the cleaning steps below can be
# re-run (or the raw data inspected) without reading the CSV again.
raw_df = pd.read_csv(DATA_PATH)

print("Initial shape:", raw_df.shape)
print("Columns:", raw_df.columns.tolist())

# ---------------------------
# 2. Basic checks
# ---------------------------
print("\nMissing values per column:")
print(raw_df.isna().sum())

print("\nBasic numeric stats:")
print(raw_df.describe())

# ---------------------------
# 3. Remove duplicates
# ---------------------------
# First drop fully identical rows, then rows that repeat a listing id
# (keeping the first occurrence). The explicit .copy() makes `df` an
# independent frame, so the column assignments below never write into a
# view of `raw_df`.
df = (
    raw_df
    .drop_duplicates()
    .drop_duplicates(subset="id", keep="first")
    .copy()
)
print("\nAfter removing duplicates:", df.shape)

# ---------------------------
# 4. Fix data types
# ---------------------------
# errors="coerce" turns unparseable or missing dates into NaT instead of raising.
df["last_review"] = pd.to_datetime(df["last_review"], errors="coerce")

# ---------------------------
# 5. Handle missing values
# ---------------------------
# Drop rows with missing essential text info. Copy again because the
# filtered result is assigned to below; assigning into a filtered frame
# without a copy can raise SettingWithCopyWarning on pandas versions that
# do not use copy-on-write.
df = df.dropna(subset=["name", "host_name"]).copy()

# If number_of_reviews is 0, reviews_per_month should be 0
has_no_reviews = df["number_of_reviews"] == 0
df.loc[has_no_reviews, "reviews_per_month"] = 0

# Fill any remaining missing reviews_per_month with the median.
# The median is taken after the zero-fill above, so those zeros count toward it.
reviews_median = df["reviews_per_month"].median()
df["reviews_per_month"] = df["reviews_per_month"].fillna(reviews_median)

print("\nRemaining NaNs after cleaning:")
print(df.isna().sum())

# ---------------------------
# 6. Remove unreasonable values
# 7. Filter geographic outliers (NYC bounds)
# ---------------------------
# Thresholds are named here so they are easy to find and tune.
PRICE_MAX = 1000          # keep 0 < price <= PRICE_MAX
MIN_NIGHTS_MAX = 365      # keep minimum_nights <= MIN_NIGHTS_MAX
lat_min, lat_max = 40.49, 40.92
lon_min, lon_max = -74.27, -73.68

# All row filters are combined into one boolean mask; this keeps exactly the
# rows that pass every individual condition. Series.between is inclusive on
# both ends by default.
is_plausible = (
    df["price"].gt(0)
    & df["price"].le(PRICE_MAX)
    & df["minimum_nights"].le(MIN_NIGHTS_MAX)
    & df["latitude"].between(lat_min, lat_max)
    & df["longitude"].between(lon_min, lon_max)
)
df = df.loc[is_plausible].copy()

# ---------------------------
# 8. Clean text columns
# ---------------------------
text_cols = ["name", "host_name", "neighbourhood_group", "neighbourhood", "room_type"]
title_case_cols = ["neighbourhood_group", "neighbourhood", "room_type"]

# Strip surrounding whitespace from every text column.
# NOTE: astype(str) would turn a missing value into the literal string "nan";
# rows with missing name/host_name were already dropped in step 5.
for col in text_cols:
    df[col] = df[col].astype(str).str.strip()

# Normalise capitalisation of the categorical text columns only
# (listing names and host names keep their original casing).
for col in title_case_cols:
    df[col] = df[col].str.title()

# ---------------------------
# 9. Final check + save
# ---------------------------
print("\nFinal shape:", df.shape)
print("Preview of cleaned data:")
print(df.head())

# Save cleaned file next to the input file (avoid __file__ in notebooks)
OUTPUT_PATH = DATA_PATH.parent / "AB_NYC_2019_cleaned.csv"
df.to_csv(OUTPUT_PATH, index=False)
print(f"\nSaved cleaned file to: {OUTPUT_PATH}")


Initial shape: (48895, 16)
Columns: ['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']

Missing values per column:
id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

Basic numeric stats:
      

In [3]:
# Summary statistics for the key numeric columns of the cleaned frame `df`
# produced by the previous cell.
summary_cols = [
    "price",
    "minimum_nights",
    "availability_365",
    "number_of_reviews",
    "reviews_per_month",
]
numeric_summary = df.loc[:, summary_cols].describe()

print("\nNumeric stats AFTER cleaning:")
print(numeric_summary)



Numeric stats AFTER cleaning:
              price  minimum_nights  availability_365  number_of_reviews  \
count  48595.000000    48595.000000      48595.000000       48595.000000   
mean     141.329231        6.785575        112.416586          23.358247   
std      116.761806       16.128145        131.395684          44.635326   
min       10.000000        1.000000          0.000000           0.000000   
25%       69.000000        1.000000          0.000000           1.000000   
50%      105.000000        3.000000         44.000000           5.000000   
75%      175.000000        5.000000        226.000000          24.000000   
max     1000.000000      365.000000        365.000000         629.000000   

       reviews_per_month  
count       48595.000000  
mean            1.095029  
std             1.599530  
min             0.000000  
25%             0.040000  
50%             0.380000  
75%             1.595000  
max            58.500000  
