# NYC Rental Price Prediction Demo

This notebook demonstrates the NYC rental price prediction system, from data preprocessing to model training and prediction.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: apply the 'seaborn-v0_8-whitegrid' matplotlib
# style first, then set the default figure size and base font size in one
# update. Individual cells may still pass their own figsize.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 14,
})

## 1. Load and Explore Data

First, let's load the sample data and explore it.

In [None]:
# Location of the sample listings CSV. The path is relative, so it is
# resolved against the kernel's current working directory.
sample_data_path = "../data/raw/sample_listings.csv"

# Read the listings into the DataFrame that the following cells explore.
df = pd.read_csv(sample_data_path)

# Report (rows, columns), then leave the first rows as the cell's output.
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
df.head()

In [None]:
# Check data types and missing values: df.info() prints a per-column summary
# (dtype and non-null count), so compare the non-null counts against the row
# count reported above to spot columns with missing values.
df.info()

In [None]:
# Summary statistics of the loaded listings, computed by DataFrame.describe()
# with its default arguments; the resulting table is the cell's output.
df.describe()

## 2. Data Visualization

Let's visualize some key aspects of the data.

In [None]:
# Distribution of the `price` column: histogram with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='price', kde=True, ax=ax)
ax.set_title('Distribution of Rental Prices')
ax.set_xlabel('Price ($)')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# One box per neighborhood showing the spread of `price` within it.
fig, ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=df, x='neighborhood', y='price', ax=ax)
ax.set_title('Rental Prices by Neighborhood')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('Price ($)')
# Neighborhood names are long; rotate them so they do not overlap.
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# One box per bedroom count showing the spread of `price` for that size.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='bedrooms', y='price', ax=ax)
ax.set_title('Rental Prices by Number of Bedrooms')
ax.set_xlabel('Bedrooms')
ax.set_ylabel('Price ($)')
plt.show()

In [None]:
# Scatter of `price` against `sqft`, with points colored by bedroom count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='sqft', y='price', hue='bedrooms', ax=ax)
ax.set_title('Rental Prices vs. Square Footage')
ax.set_xlabel('Square Footage')
ax.set_ylabel('Price ($)')
plt.show()

In [None]:
# Columns included in the correlation heatmap: the target (`price`), the
# size-related columns, and the amenity/flag columns.
numeric_cols = [
    'price',
    'bedrooms',
    'bathrooms',
    'sqft',
    'has_doorman',
    'has_elevator',
    'has_dishwasher',
    'has_washer_dryer',
    'is_furnished',
    'has_balcony',
    'has_parking',
    'is_no_fee',
]

# Pairwise correlations between the selected columns (DataFrame.corr defaults).
corr = df[numeric_cols].corr()

# Annotated heatmap; each cell shows the coefficient to two decimals.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

## 3. Data Preprocessing

Now, let's preprocess the data using our pipeline.

In [None]:
from src.nyc_rental_price.data.preprocessing import clean_data, generate_features, split_data
from src.nyc_rental_price.features import FeaturePipeline

# Step 1: pass the raw listings through the project's clean_data helper.
cleaned_df = clean_data(df)
print(f"Cleaned data shape: {cleaned_df.shape}")

# Step 2: build the feature table. The same `pipeline` object is reused later
# in the notebook to transform the hand-written sample properties.
pipeline = FeaturePipeline()
features_df = generate_features(cleaned_df, pipeline)
print(f"Features data shape: {features_df.shape}")

# Step 3: preview at most the first 20 feature column names.
leading_columns = ', '.join(features_df.columns[:20])
print(f"\nFeature columns:\n{leading_columns}...")

In [None]:
# Partition the feature table into train / validation / test parts using the
# project's split_data helper (test_size=0.2, val_size=0.1).
splits = split_data(features_df, test_size=0.2, val_size=0.1)
X_train, X_val, X_test, y_train, y_val, y_test = splits

# Report the shape of each feature matrix.
for split_label, split_features in (
    ("Training", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    print(f"{split_label} set: {split_features.shape}")

## 4. Model Training

Let's train a gradient boosting model and a neural network, combine them into a weighted ensemble, and compare their performance.

In [None]:
from src.nyc_rental_price.models.model import GradientBoostingModel, NeuralNetworkModel, ModelEnsemble

# Ensure the output directory used by the models below exists; exist_ok=True
# makes this cell safe to re-run.
models_output_dir = "../models"
os.makedirs(models_output_dir, exist_ok=True)

In [None]:
# Hyperparameters and output location for the gradient boosting demo model.
gb_params = {
    "model_dir": "../models",
    "model_name": "gb_demo_model",
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 5,
    "random_state": 42,
}

# Build the model and fit it on the training split.
gb_model = GradientBoostingModel(**gb_params)
gb_model.fit(X_train, y_train)

# Score on the validation split and print every returned metric to 4 decimals.
gb_metrics = gb_model.evaluate(X_val, y_val)
print("Gradient Boosting metrics on validation set:")
for metric_name, metric_value in gb_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Hyperparameters and output location for the neural network demo model.
nn_params = {
    "model_dir": "../models",
    "model_name": "nn_demo_model",
    "hidden_layers": [64, 32],
    "dropout_rate": 0.2,
    "learning_rate": 0.001,
    "epochs": 50,
    "batch_size": 8,  # Small batch size for our small dataset
    "random_state": 42,
}

# Build the model and fit it on the training split; the validation split is
# handed to fit() as validation_data.
nn_model = NeuralNetworkModel(**nn_params)
nn_model.fit(X_train, y_train, validation_data=(X_val, y_val))

# Score on the validation split and print every returned metric to 4 decimals.
nn_metrics = nn_model.evaluate(X_val, y_val)
print("Neural Network metrics on validation set:")
for metric_name, metric_value in nn_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Combine the two already-fitted models into a weighted ensemble.
# NOTE(review): the weights are presumably matched to `models` by position
# (0.7 -> gradient boosting, 0.3 -> neural network) and are hard-coded in
# favor of the model expected to perform better — confirm against the
# validation metrics printed above and against ModelEnsemble's contract.
ensemble_members = [gb_model, nn_model]
ensemble_weights = [0.7, 0.3]
ensemble_model = ModelEnsemble(
    models=ensemble_members,
    weights=ensemble_weights,
    model_dir="../models",
    model_name="ensemble_demo_model",
    random_state=42,
)

# Score on the validation split and print every returned metric to 4 decimals.
ensemble_metrics = ensemble_model.evaluate(X_val, y_val)
print("Ensemble metrics on validation set:")
for metric_name, metric_value in ensemble_metrics.items():
    print(f"  {metric_name}: {metric_value:.4f}")

In [None]:
# Display name -> fitted model. Later cells reuse this mapping for
# predictions and for picking the model to save.
models = {
    "Gradient Boosting": gb_model,
    "Neural Network": nn_model,
    "Ensemble": ensemble_model,
}

# Evaluate each model on the held-out test split, keeping the metric dicts
# keyed by display name and printing them as we go.
test_metrics = {}
for name, model in models.items():
    test_metrics[name] = model.evaluate(X_test, y_test)
    print(f"{name} metrics on test set:")
    for metric_name, metric_value in test_metrics[name].items():
        print(f"  {metric_name}: {metric_value:.4f}")
    print()

In [None]:
# Visualize model performance comparison.
# Build a table with one row per model and one column per metric from the
# test-set metric dicts (keys 'mae', 'rmse', 'r2').
metrics_df = pd.DataFrame({
    model_name: {
        'MAE': metrics['mae'],
        'RMSE': metrics['rmse'],
        'R²': metrics['r2'],
    }
    for model_name, metrics in test_metrics.items()
}).T

# Plot MAE and RMSE side by side for each model.
# The axes are created explicitly and handed to pandas via `ax=`:
# DataFrame.plot() opens its own figure when no axes are given, so a
# preceding plt.figure(figsize=...) would be left empty and its figsize
# would not apply to the chart.
fig, ax = plt.subplots(figsize=(12, 6))
metrics_df[['MAE', 'RMSE']].plot(kind='bar', rot=0, ax=ax)
ax.set_title('Model Error Comparison')
ax.set_ylabel('Error ($)')
plt.show()

# Plot R² for each model, using the same explicit-axes pattern.
fig, ax = plt.subplots(figsize=(12, 6))
metrics_df['R²'].plot(kind='bar', rot=0, ax=ax)
ax.set_title('Model R² Comparison')
ax.set_ylabel('R²')
plt.show()

## 5. Feature Importance

Let's examine which features are most important for predicting rental prices.

In [None]:
# Plot feature importances from the gradient boosting model, if it exposes
# them. When the attribute is missing this cell produces no output.
if hasattr(gb_model, 'feature_importances_'):
    feature_importance = gb_model.feature_importances_

    # NOTE(review): .head(20) is the "top 20" only if feature_importances_ is
    # already sorted in descending order, and .values/.index below assume a
    # pandas Series keyed by feature name — confirm in GradientBoostingModel.
    top_features = feature_importance.head(20)

    # Horizontal bars: importance on the x-axis, feature name on the y-axis.
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.barplot(x=top_features.values, y=top_features.index, ax=ax)
    ax.set_title('Top 20 Feature Importances')
    ax.set_xlabel('Importance')
    fig.tight_layout()
    plt.show()

## 6. Model Predictions

Let's make predictions on some sample properties.

In [None]:
# Four hand-written example listings, defined column by column. Position i in
# every list belongs to the same property:
#   0: 1BR in East Village, 1: 2BR in Williamsburg,
#   2: 3BR/2BA on the Upper West Side, 3: studio (0 bedrooms) in Astoria.
# Amenity columns are 0/1 flags.
sample_properties = pd.DataFrame({
    'bedrooms':         [1, 2, 3, 0],
    'bathrooms':        [1, 1, 2, 1],
    'sqft':             [600, 850, 1200, 450],
    'neighborhood':     ['east village', 'williamsburg', 'upper west side', 'astoria'],
    'has_doorman':      [0, 0, 1, 0],
    'has_elevator':     [0, 1, 1, 0],
    'has_dishwasher':   [1, 1, 1, 0],
    'has_washer_dryer': [0, 0, 1, 0],
    'is_furnished':     [0, 0, 0, 0],
    'has_balcony':      [0, 0, 1, 0],
    'has_parking':      [0, 0, 0, 0],
    'is_no_fee':        [1, 0, 0, 1],
    'description': [
        'Cozy 1BR apartment in East Village with dishwasher.',
        'Spacious 2BR in Williamsburg with elevator and dishwasher.',
        'Luxury 3BR/2BA on Upper West Side with doorman, elevator, washer/dryer, and balcony.',
        'Affordable studio in Astoria. Great location near subway.',
    ],
})

# Show the table as the cell's output.
sample_properties

In [None]:
# Run the sample listings through the feature pipeline used earlier.
# NOTE(review): training features came from generate_features(cleaned_df,
# pipeline); this assumes pipeline.transform() yields the same columns —
# confirm against FeaturePipeline.
processed_properties = pipeline.transform(sample_properties)

# Display name -> predictions from that model for the sample listings.
predictions = {
    name: model.predict(processed_properties)
    for name, model in models.items()
}

# Start the results table with the human-readable property attributes.
results = pd.DataFrame({
    'Bedrooms': sample_properties['bedrooms'],
    'Bathrooms': sample_properties['bathrooms'],
    'Square Feet': sample_properties['sqft'],
    'Neighborhood': sample_properties['neighborhood'],
})

# Append one dollar-formatted prediction column per model.
# NOTE(review): assumes each predict() result iterates as scalar prices
# (i.e. is 1-D) — confirm for the neural network model.
for model_label, model_preds in predictions.items():
    formatted_prices = [f"${price:.2f}" for price in model_preds]
    results[f'{model_label} Prediction'] = formatted_prices

# Show the table as the cell's output.
results

In [None]:
# Visualize predictions: one group of bars per sample property, one bar per
# model.
pred_df = pd.DataFrame(predictions)

# Label each row as "<bedrooms>BR <neighborhood>" so the x-axis is readable.
pred_df.index = [
    f"{b}BR {n}" for b, n in zip(
        sample_properties['bedrooms'],
        sample_properties['neighborhood']
    )
]

# Create the axes explicitly and pass them to pandas: DataFrame.plot() opens
# its own figure when no `ax` is given, so a preceding plt.figure(figsize=...)
# would be left empty and its figsize would not apply to the chart.
fig, ax = plt.subplots(figsize=(12, 8))
pred_df.plot(kind='bar', rot=45, ax=ax)
ax.set_title('Predicted Rental Prices by Model')
ax.set_ylabel('Price ($)')
ax.grid(axis='y')
fig.tight_layout()
plt.show()

## 7. Save the Best Model

Let's save the best-performing model (the one with the lowest test-set RMSE) for deployment.

In [None]:
# Determine the best model based on test RMSE (lowest wins).
# NOTE(review): selecting on the test split makes the reported test metrics
# optimistic for the chosen model; consider selecting on the validation
# metrics and keeping the test split for the final report.
best_model_name = min(test_metrics, key=lambda x: test_metrics[x]['rmse'])
best_model = models[best_model_name]

print(f"Best model: {best_model_name}")
print(f"RMSE: {test_metrics[best_model_name]['rmse']:.2f}")
print(f"MAE: {test_metrics[best_model_name]['mae']:.2f}")
print(f"R²: {test_metrics[best_model_name]['r2']:.4f}")

# Save the best model under the name "best_model".
# `best_model` is the same object referenced by `models` (and by gb_model /
# nn_model / ensemble_model), so rename it only for the duration of the save
# and restore the original name afterwards, even if saving fails. This keeps
# the cell re-runnable and stops a later save of that model from silently
# overwriting "best_model".
original_model_name = best_model.model_name
best_model.model_name = "best_model"
try:
    best_model.save_model()
finally:
    best_model.model_name = original_model_name
print("\nBest model saved as 'best_model'")

## 8. Next Steps

Here are some potential next steps for improving the model:

1. **Collect more data**: The sample dataset is small. Collecting more listings would improve model performance.

2. **Add more features**: Consider adding features like:
   - Distance to subway stations
   - School district ratings
   - Crime statistics
   - Walkability scores

3. **Hyperparameter tuning**: Use Bayesian optimization to find the optimal hyperparameters for each model.

4. **Deploy the API**: Start the FastAPI server to serve predictions:
   ```bash
   python -m src.nyc_rental_price.api.main
   ```

5. **Monitor performance**: Implement logging and monitoring to track model performance over time.