# CIND830 Assignment 2: Apartment Rental Data Analysis
## Complete Working Version

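This notebook runs the complete analysis end to end: it loads and cleans the apartment dataset (with encoding issues handled during loading), then demonstrates the search and sorting algorithms, the class hierarchy, and the visualizations, importing the project modules from the `src` directory.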

## Setup and Imports

In [None]:
import os
import sys
from pathlib import Path

# Make the project's src/ directory importable.
# NOTE: this assumes the notebook's working directory is a direct child of
# the project root (for example <root>/notebooks), so the root is cwd's parent.
project_root = Path.cwd().parent
src_path = project_root / 'src'

# Guard the insert so that re-running this cell does not stack duplicate
# entries at the front of sys.path.
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))

# Echo the resolved locations so a wrong working directory is obvious.
print(f"Project root: {project_root}")
print(f"Source path: {src_path}")
print(f"Source path exists: {src_path.exists()}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List

# Import our custom modules
from models.apartment import Apartment
from data.dataset_manager import DatasetManager
from data.price_analysis import PriceAnalysis
from data.location_analysis import LocationAnalysis
from algorithms.search import SearchAlgorithms
from algorithms.sorting import SortingAlgorithms
from visualization.plots import ApartmentVisualizer

# Notebook-wide plot defaults: matplotlib's 'default' style, a 12x6 default
# figure size, and seaborn's "husl" colour palette.
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (12, 6)})
sns.set_palette("husl")

# Reaching this line means every import in this cell succeeded.
print("✅ All modules imported successfully!")

## 1. Data Structures: Loading and Cleaning Data

### Check for Dataset

In [None]:
# Locate the CSV in the project root and fail fast, with download
# instructions, when it is missing.
data_path = project_root / 'apartments_for_rent_classified_100K.csv'
print(f"Looking for dataset at: {data_path}")

if not data_path.exists():
    # Emit the whole help text in one call, then abort the cell.
    print("\n".join((
        "❌ Dataset not found!",
        "📥 Please run the download script first:",
        "   python scripts/download_data.py",
        "",
        "Or download manually from UCI ML Repository:",
        "https://doi.org/10.24432/C5X623",
    )))
    raise FileNotFoundError("Dataset file not found. Please download it first.")

# Only reached when the file exists: report its size in MiB.
print("✅ Dataset found!")
file_size = data_path.stat().st_size / 1024 ** 2
print(f"📊 File size: {file_size:.1f} MB")

### Load the Dataset

In [None]:
# Build the manager around the CSV located by the previous cell.
dataset_manager = DatasetManager(str(data_path))

# NOTE(review): encoding fallbacks are presumably handled inside load_data();
# that logic lives in DatasetManager and is not visible from this notebook.
raw_data = dataset_manager.load_data()

# One print call; sep="\n" reproduces the output of three separate prints.
print(
    f"\nRaw data shape: {raw_data.shape}",
    "\nFirst 5 rows:",
    raw_data.head(),
    sep="\n",
)

### Inspect Dataset Structure

In [None]:
# Let the manager report on the loaded data (output format is defined by
# DatasetManager.get_data_info, not by this cell).
dataset_manager.get_data_info()

# Preview only those key columns that are actually present in the raw data,
# so a missing column does not raise a KeyError.
print("\nSample data (first 3 rows, key columns):")
key_columns = ['price', 'bedrooms', 'bathrooms', 'square_feet', 'cityname', 'state']
available_columns = [name for name in key_columns if name in raw_data.columns]
preview = raw_data[available_columns].head(3)
print(preview)

### Clean the Data

In [None]:
# Run the manager's cleaning step and keep the result for inspection.
cleaned_data = dataset_manager.clean_data()

# Report the post-cleaning shape, then the per-column dtypes.
print(
    "\nData cleaning completed!",
    f"Cleaned data shape: {cleaned_data.shape}",
    sep="\n",
)
print("\nData types after cleaning:", cleaned_data.dtypes, sep="\n")

### Create Apartment Objects

In [None]:
# Convert the cleaned data into apartment objects; later cells reuse
# `apartments` for searching, sorting and plotting.
apartments = dataset_manager.create_apartments()

print(f"Created {len(apartments)} apartment objects")

# Guard the example: indexing [0] on an empty result would raise IndexError
# and hide the more useful "nothing was created" diagnosis.
if len(apartments) > 0:
    print("\nExample apartment:")
    print(apartments[0].get_summary())
else:
    print("\nNo apartment objects were created; check the loading and cleaning steps.")

### Descriptive Statistics

In [None]:
# Fetch per-feature statistics; the loops below treat the result as a
# mapping of feature name -> mapping of statistic name -> value.
stats = dataset_manager.get_descriptive_statistics()

print("Descriptive Statistics:")
for feature, stat_dict in stats.items():
    print(f"\n{feature.upper()}:")
    for stat_name, value in stat_dict.items():
        # Missing statistics are shown literally as "None"; everything else
        # is formatted with thousands separators and two decimals.
        rendered = "None" if value is None else f"{value:,.2f}"
        print(f"  {stat_name}: {rendered}")

## 2. Algorithms: Search and Sort

### Search Algorithms Demo

In [None]:
# --- Search by exact price ---------------------------------------------
target_price = 1500.0
print(f"Searching for apartments with price ${target_price:,.0f}...")

apartments_with_target_price = SearchAlgorithms.search_by_price(apartments, target_price)
print(f"Found {len(apartments_with_target_price)} apartments with price ${target_price:,.0f}")

# --- Search by city ----------------------------------------------------
target_city = "Denver"
denver_apartments = SearchAlgorithms.search_by_city(apartments, target_city)
print(f"Found {len(denver_apartments)} apartments in {target_city}")

if denver_apartments:
    print("\nFirst Denver apartment:")
    print(denver_apartments[0].get_summary())
elif len(apartments) > 0:
    # No Denver listings: fall back to the first non-empty city name found
    # among the first 100 apartments so the demo still shows a hit.
    test_city = next(
        (apt.cityname for apt in apartments[:100] if apt.cityname),
        None,
    )
    if test_city:
        test_apartments = SearchAlgorithms.search_by_city(apartments, test_city)
        print(f"\nFound {len(test_apartments)} apartments in {test_city} (sample city)")

### Binary Search vs Linear Search Performance

In [None]:
# Time both search strategies for the price chosen in the previous cell.
search_results = SearchAlgorithms.time_search_comparison(apartments, target_price)

print("Search Performance Comparison:")

# Linear search timings.
print("\nLinear Search:")
linear = search_results['linear_search']
print(f"  Time: {linear['time']:.6f} seconds")
print(f"  Results found: {linear['results_count']}")

# Binary search timings; the sort cost is reported separately from the
# lookup itself, along with their total.
print("\nBinary Search:")
binary = search_results['binary_search']
print(f"  Sort time: {binary['sort_time']:.6f} seconds")
print(f"  Search time: {binary['search_time']:.6f} seconds")
print(f"  Total time: {binary['total_time']:.6f} seconds")
print(f"  Result found: {binary['result'] is not None}")

### Custom Sorting Algorithms

In [None]:
# Demonstrate the custom sorts on a small slice so the run stays quick.
sample_apartments = apartments[:100]

# Report the real sample size: the slice yields fewer than 100 items when
# the dataset itself is smaller.
print(f"Testing sorting algorithms on sample of {len(sample_apartments)} apartments...")

# Sort by price using bubble sort; apartments without a price are dropped
# before reading the extremes.
bubble_sorted = SortingAlgorithms.sort_by_price(sample_apartments, "bubble")
valid_prices = [apt for apt in bubble_sorted if apt.price is not None]
if valid_prices:
    print(f"\nBubble sort completed. Lowest price: ${valid_prices[0].price:,.2f}")
    print(f"Highest price: ${valid_prices[-1].price:,.2f}")

# Sort by square feet using insertion sort, with the same None filtering.
insertion_sorted = SortingAlgorithms.sort_by_square_feet(sample_apartments, "insertion")
valid_sqft = [apt for apt in insertion_sorted if apt.square_feet is not None]
if valid_sqft:
    print("\nInsertion sort by square feet completed.")
    print(f"Smallest: {valid_sqft[0].square_feet} sq ft")
    print(f"Largest: {valid_sqft[-1].square_feet} sq ft")

### Sorting Performance Comparison

In [None]:
# Single source of truth for the benchmark size, so the header below cannot
# drift from the value passed to the comparison.
sort_sample_size = 500

# Benchmark the sorts on price; apartments without a price get +inf as
# their key so they have a comparable value.
performance_results = SortingAlgorithms.compare_sorting_performance(
    apartments,
    lambda apt: apt.price if apt.price is not None else float('inf'),
    sample_size=sort_sample_size,
)

print(f"Sorting Performance Comparison ({sort_sample_size} apartments):")

# (display label, key in performance_results) for each timed algorithm.
timed_algorithms = (
    ("Bubble Sort", "bubble_sort"),
    ("Insertion Sort", "insertion_sort"),
    ("Built-in Sort", "builtin_sort"),
)
for label, result_key in timed_algorithms:
    timing = performance_results[result_key]
    print(f"\n{label}:")
    print(f"  Total time: {timing['time']:.6f} seconds")
    print(f"  Time per item: {timing['time_per_item']:.8f} seconds")

print("\nRelative Performance (vs fastest):")
for alg, ratio in performance_results['relative_performance'].items():
    print(f"  {alg}: {ratio:.1f}x")

## 3. Object-Oriented Programming: Inheritance and Polymorphism

### Price Analysis (Inheritance Demo)

In [None]:
# Build a PriceAnalysis (described in this notebook as a DatasetManager
# subclass) and run the same load -> clean -> create pipeline on it.
price_analyzer = PriceAnalysis(str(data_path))
for pipeline_step in (
    price_analyzer.load_data,
    price_analyzer.clean_data,
    price_analyzer.create_apartments,
):
    pipeline_step()

# Polymorphism demo: get_summary() is the subclass's own version.
print("PriceAnalysis Summary (Polymorphism Demo):")
price_summary = price_analyzer.get_summary()
print(price_summary)

In [None]:
# Overall price statistics, each printed as a dollar amount.
price_stats = price_analyzer.compute_price_statistics()
print("\nDetailed Price Statistics:")
for label, amount in price_stats.items():
    print(f"{label}: ${amount:,.2f}")

# Price at selected percentile levels.
percentile_levels = [10, 25, 50, 75, 90, 95, 99]
percentiles = price_analyzer.get_price_percentiles(percentile_levels)
print("\nPrice Percentiles:")
for level, amount in percentiles.items():
    print(f"{level}th percentile: ${amount:,.2f}")

In [None]:
# Price summary grouped by bedroom count, listed in ascending bedroom order.
bedroom_stats = price_analyzer.get_price_by_bedrooms()
print("\nPrice Analysis by Number of Bedrooms:")
for bedrooms in sorted(bedroom_stats.keys()):
    stats = bedroom_stats[bedrooms]
    print(f"\n{bedrooms} Bedroom(s):")
    print(f"  Count: {stats['count']}")
    print(f"  Mean: ${stats['mean']:,.2f}")
    print(f"  Median: ${stats['median']:,.2f}")
    low, high = stats['min'], stats['max']
    print(f"  Range: ${low:,.2f} - ${high:,.2f}")

### Location Analysis (Inheritance Demo)

In [None]:
# Build a LocationAnalysis (described in this notebook as a DatasetManager
# subclass) and run the same load -> clean -> create pipeline on it.
location_analyzer = LocationAnalysis(str(data_path))
for pipeline_step in (
    location_analyzer.load_data,
    location_analyzer.clean_data,
    location_analyzer.create_apartments,
):
    pipeline_step()

# Polymorphism demo: get_summary() is the subclass's own version.
print("LocationAnalysis Summary (Polymorphism Demo):")
location_summary = location_analyzer.get_summary()
print(location_summary)

In [None]:
# Per-city statistics; each entry is read below via its 'count', 'state'
# and 'avg_price' keys.
city_stats = location_analyzer.get_city_statistics()

# Rank cities by listing count (descending) and keep the ten largest.
ranked_cities = sorted(
    city_stats.items(),
    key=lambda item: item[1]['count'],
    reverse=True,
)
sorted_cities = ranked_cities[:10]

print("Top 10 Cities by Number of Listings:")
for i, (city, stats) in enumerate(sorted_cities, start=1):
    avg_price = stats['avg_price']
    # A falsy average (None or 0) is displayed as "N/A".
    if avg_price:
        price_str = f"${avg_price:,.2f}"
    else:
        price_str = "N/A"
    print(f"{i:2d}. {city}, {stats['state']}: {stats['count']} listings, Avg Price: {price_str}")

In [None]:
# Proximity search demo around New York City (approximate coordinates).
nyc_lat, nyc_lon = 40.7128, -74.0060

# Keep the radius in one place so the reported distance always matches the
# value actually passed to the filter.
radius_km = 50
nearby_apartments = location_analyzer.filter_by_proximity(nyc_lat, nyc_lon, radius_km)

print(f"Found {len(nearby_apartments)} apartments within {radius_km}km of NYC coordinates")
if nearby_apartments:
    print("\nSample nearby apartment:")
    print(nearby_apartments[0].get_summary())

## 4. Visualization Tasks

### Initialize Visualizer

In [None]:
# Create the shared ApartmentVisualizer instance; every plotting cell below
# calls its plot_* methods with the `apartments` list.
visualizer = ApartmentVisualizer()
print("Visualizer initialized successfully!")

### 1. Price Distribution Histogram

In [None]:
# Draw the price histogram with 50 bins.
fig = visualizer.plot_price_histogram(apartments, bins=50)
plt.show()

# Summarise the same prices numerically, ignoring apartments with no price.
prices = [apt.price for apt in apartments if apt.price is not None]
print("\nPrice Distribution Insights:")
print(f"Total apartments with price data: {len(prices):,}")

# numpy's mean/median/std emit RuntimeWarnings and return nan on an empty
# sequence, so skip the statistics when there is nothing to summarise.
if prices:
    print(f"Mean price: ${np.mean(prices):,.2f}")
    print(f"Median price: ${np.median(prices):,.2f}")
    # np.std defaults to the population standard deviation (ddof=0).
    print(f"Standard deviation: ${np.std(prices):,.2f}")
else:
    print("No price data available; summary statistics skipped.")

### 2. Square Feet vs Price Scatter Plot

In [None]:
# Scatter plot of square feet against price, followed by their correlation.
# A ValueError anywhere in this cell is reported instead of aborting the run.
try:
    fig = visualizer.plot_price_vs_sqft_scatter(apartments)
    plt.show()

    # Keep only apartments that have both measurements.
    valid_data = []
    for apt in apartments:
        if apt.square_feet is None or apt.price is None:
            continue
        valid_data.append((apt.square_feet, apt.price))

    if valid_data:
        # Split the pairs into two parallel sequences for np.corrcoef; the
        # off-diagonal entry [0, 1] is the coefficient between the two.
        sqft, prices = zip(*valid_data)
        correlation = np.corrcoef(sqft, prices)[0, 1]
        print(f"\nCorrelation between square feet and price: {correlation:.3f}")
        print(f"Sample size: {len(valid_data):,} apartments")
except ValueError as e:
    print(f"Could not create scatter plot: {e}")
    print("This may indicate insufficient data with both price and square feet information.")

### 3. Average Price by Number of Bedrooms Bar Chart

In [None]:
# Bar chart of price by bedroom count.
fig = visualizer.plot_price_by_bedrooms_bar(apartments)
plt.show()

# Print a per-bedroom summary (from the price analyzer) in ascending
# bedroom order.
bedroom_stats = price_analyzer.get_price_by_bedrooms()
print("\nBedroom Price Analysis:")
for bedrooms, stats in sorted(bedroom_stats.items()):
    unit_count = stats['count']
    mean_price = stats['mean']
    print(f"{bedrooms} BR: {unit_count} units, Avg: ${mean_price:,.2f}")

### 4. Correlation Matrix Heatmap

In [None]:
# Heatmap of correlations produced by the visualizer.
fig = visualizer.plot_correlation_heatmap(apartments)
plt.show()

# Short reading guide for the heatmap, emitted as one block of text.
insight_lines = (
    "\nCorrelation Analysis Insights:",
    "- Strong positive correlation indicates variables increase together",
    "- Strong negative correlation indicates variables move in opposite directions",
    "- Values close to 0 indicate weak linear relationship",
)
print("\n".join(insight_lines))

### 5. Comprehensive Dashboard

In [None]:
# Build the visualizer's combined dashboard figure from the full apartment
# list and display it.
fig = visualizer.create_comprehensive_dashboard(apartments)
plt.show()

# Confirmation message once the figure has been shown.
print("\nComprehensive dashboard created with all major visualizations!")

## Summary and Conclusions

This analysis has successfully demonstrated all required assignment components:

### 1. Data Structures
- ✅ Loaded and inspected the apartment dataset with encoding handling
- ✅ Cleaned data by handling missing values and type conversions
- ✅ Computed descriptive statistics for numerical features
- ✅ Created structured apartment objects from raw data

### 2. Algorithms
- ✅ Implemented linear search for finding apartments by price and city
- ✅ Implemented binary search with performance comparison
- ✅ Created custom bubble sort and insertion sort algorithms
- ✅ Compared custom algorithm performance vs built-in sorting

### 3. Object-Oriented Programming
- ✅ Defined Apartment class with proper encapsulation
- ✅ Created DatasetManager class for data operations
- ✅ Demonstrated inheritance with PriceAnalysis and LocationAnalysis subclasses
- ✅ Showed polymorphism through overridden get_summary() methods

### 4. Visualization
- ✅ Created histogram showing price distribution with outlier identification
- ✅ Generated scatter plot revealing relationships between variables
- ✅ Built bar chart comparing average prices by bedroom count
- ✅ Produced correlation heatmap for numerical feature relationships
- ✅ All plots include proper titles, axis labels, and legends

### Technical Improvements Made:
1. **Encoding Handling**: Robust CSV loading with multiple encoding attempts
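2. **Import Flexibility**: Adds the project's `src` directory to `sys.path` so the modules import without installation (assumes the notebook is run from a direct subdirectory of the project root)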
3. **Error Handling**: Graceful handling of missing data scenarios
4. **Performance**: Efficient algorithms with timing comparisons

The modular code structure ensures maintainability and extensibility for future analysis.