In [1]:
# Install packages if needed (run this once)
using Pkg

# Load required packages
using LinearAlgebra
using Random
using Printf
using DataFrames
using CSV
using Statistics
using StatsBase
using HypothesisTests
using Plots
using StatsPlots

# Set plotting backend
gr()
println("All packages loaded successfully!")

All packages loaded successfully!


In [2]:
function load_data()
    """
    Load apartment data from the repository.
    """
    println("Loading apartment data from repository...")
    
    # Load the real apartments.csv file from the input folder
    data_path = "../input/apartments.csv"  # Relative path from scripts/ to input
    df = CSV.read(data_path, DataFrame)
    
    @printf("Loaded data with %d observations and %d variables\n", nrow(df), ncol(df))
    @printf("\nDataset shape: (%d, %d)\n", nrow(df), ncol(df))
    @printf("\nColumn names: %s\n", join(names(df), ", "))
    
    return df
end

# Load the data
df = load_data()

Loading apartment data from repository...
Loaded data with 110191 observations and 21 variables

Dataset shape: (110191, 21)

Column names: id, price, month, area, type, rooms, centredistance, schooldistance, clinicdistance, postofficedistance, kindergartendistance, restaurantdistance, collegedistance, pharmacydistance, ownership, buildingmaterial, hasparkingspace, hasbalcony, haselevator, hassecurity, hasstorageroom


Row,id,price,month,area,type,rooms,centredistance,schooldistance,clinicdistance,postofficedistance,kindergartendistance,restaurantdistance,collegedistance,pharmacydistance,ownership,buildingmaterial,hasparkingspace,hasbalcony,haselevator,hassecurity,hasstorageroom
Unnamed: 0_level_1,String,Int64,Int64,Int64,String31?,Int64,Float64,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,String15,String15?,String3,String3,String3?,String3,String3
1,a01d82c9529f98a54d64b9e061c9a73b,1199999,1,105,apartmentBuilding,4,5.06,1.08,0.949,0.623,0.791,1.054,3.062,0.335,condominium,brick,no,yes,yes,no,no
2,8373aa373dbc3fe7ca3b7434166b8766,650000,1,73,tenement,3,3.24,0.275,0.672,0.367,0.246,0.3,1.857,0.28,condominium,brick,no,no,no,no,no
3,7d0c31d5409caab173571cce3dcdf702,590000,1,69,blockOfFlats,3,3.94,0.139,1.336,0.926,0.93,0.071,0.786,0.304,condominium,brick,no,yes,no,no,yes
4,3eaa36a59b9354206703b5f6b2f2ff1d,584999,1,42,blockOfFlats,2,5.19,0.209,1.533,0.201,0.319,0.157,2.722,0.257,condominium,missing,no,yes,no,no,no
5,027b30cebbc49faf3094421b741ddd56,363000,1,46,blockOfFlats,2,1.65,0.25,0.624,0.51,0.177,0.232,0.986,0.273,condominium,concreteSlab,no,no,no,no,yes
6,27437e173a8c37d5002c0bf69c848b7c,717000,1,81,tenement,3,1.76,0.083,0.257,0.294,0.151,0.102,0.256,0.301,condominium,brick,no,no,no,no,yes
7,39404087c054348c27522c74ca21a973,689000,1,70,blockOfFlats,3,3.35,0.269,1.364,0.247,0.122,0.388,1.589,0.194,cooperative,concreteSlab,no,yes,yes,no,yes
8,7a3b00507c086fcf83562f22bb5d01fc,790000,1,67,apartmentBuilding,3,1.61,0.501,0.973,0.713,0.067,0.052,0.988,0.215,condominium,brick,yes,yes,yes,no,no
9,24af7cc54099d7930535543827604a9d,560000,1,69,blockOfFlats,3,2.29,0.371,0.592,0.213,0.222,0.24,0.583,0.721,condominium,brick,no,yes,no,no,yes
10,7ebe2d0eeb6231486f90c5835c695cd7,590000,1,56,blockOfFlats,2,4.27,0.509,0.081,0.126,0.381,0.227,1.685,0.186,condominium,brick,no,no,no,no,yes


In [3]:
function clean_data(df)
    """
    Perform data cleaning as specified in Part 3a.
    
    Tasks:
    1. Create area2 variable (square of area)
    2. Convert binary variables to dummy variables (yes/no -> 1/0)
    3. Create last digit dummy variables for area (end_0 to end_9)
    """
    println("\n=== DATA CLEANING (Part 3a) ===\n")
    
    df_clean = copy(df)
    
    # 1. Create area2 variable (0.25 points)
    df_clean.area2 = df_clean.area .^ 2
    println("✓ Created area2 variable (square of area)")
    
    # 2. Convert binary variables to dummy variables (0.75 points)
    # First, let's identify the binary variables in our dataset
    binary_vars = Symbol[]
    for col in names(df_clean)
        if startswith(col, "has") && eltype(df_clean[!, col]) <: AbstractString
            push!(binary_vars, Symbol(col))
        end
    end
    
    @printf("\nIdentified binary variables: %s\n", join(string.(binary_vars), ", "))
    
    for var in binary_vars
        # Convert 'yes'/'no' to 1/0
        df_clean[!, var] = Int.(df_clean[!, var] .== "yes")
    end
    
    @printf("✓ Converted %d binary variables to dummy variables (1=yes, 0=no)\n", length(binary_vars))
    
    # 3. Create last digit dummy variables (1 point)
    area_last_digit = Int.(floor.(df_clean.area)) .% 10
    
    for digit in 0:9
        col_name = Symbol("end_$(digit)")
        df_clean[!, col_name] = Int.(area_last_digit .== digit)
    end
    
    println("✓ Created last digit dummy variables (end_0 through end_9)")
    
    # Display summary of cleaning
    @printf("\nCleaning Summary:\n")
    @printf("- Original variables: %d\n", ncol(df))
    @printf("- Variables after cleaning: %d\n", ncol(df_clean))
    new_vars = ["area2"; ["end_$i" for i in 0:9]]
    @printf("- New variables created: %s\n", join(new_vars, ", "))
    
    # Show distribution of area last digits
    println("\nArea last digit distribution:")
    for digit in 0:9
        count = sum(area_last_digit .== digit)
        pct = count / length(df_clean.area) * 100
        @printf("  end_%d: %4d (%5.1f%%)\n", digit, count, pct)
    end
    
    return df_clean
end

# Perform data cleaning
df_clean = clean_data(df);


=== DATA CLEANING (Part 3a) ===

✓ Created area2 variable (square of area)

Identified binary variables: hasparkingspace, hasbalcony, hassecurity, hasstorageroom
✓ Converted 4 binary variables to dummy variables (1=yes, 0=no)
✓ Created last digit dummy variables (end_0 through end_9)

Cleaning Summary:
- Original variables: 21
- Variables after cleaning: 32
- New variables created: area2, end_0, end_1, end_2, end_3, end_4, end_5, end_6, end_7, end_8, end_9

Area last digit distribution:
  end_0: 12651 ( 11.5%)
  end_1: 8810 (  8.0%)
  end_2: 10861 (  9.9%)
  end_3: 10057 (  9.1%)
  end_4: 11088 ( 10.1%)
  end_5: 10828 (  9.8%)
  end_6: 11835 ( 10.7%)
  end_7: 11788 ( 10.7%)
  end_8: 13182 ( 12.0%)
  end_9: 9091 (  8.3%)


In [14]:
function create_design_matrix(df, features)
    """
    Create design matrix from DataFrame and feature list.
    Handle missing values by replacing them with 0 or appropriate defaults.
    """
    
    # Start with numeric features that exist directly in the dataframe
    numeric_features = filter(f -> Symbol(f) in propertynames(df), features)
    if !isempty(numeric_features)
        X_numeric_raw = Matrix(df[!, Symbol.(numeric_features)])
        # Replace missing values with 0 (or you could use mean imputation)
        X_numeric = replace(X_numeric_raw, missing => 0.0)
        # Convert to Float64 matrix
        X_numeric = convert(Matrix{Float64}, X_numeric)
    else
        X_numeric = zeros(Float64, nrow(df), 0)
    end
    
    # Handle categorical dummy variables
    categorical_features = filter(f -> !(Symbol(f) in propertynames(df)), features)
    
    if !isempty(categorical_features)
        X_categorical = zeros(Float64, nrow(df), length(categorical_features))
        
        for (i, feature) in enumerate(categorical_features)
            if startswith(feature, "month_")
                month_val = parse(Int, replace(feature, "month_" => ""))
                # Handle missing values by treating them as false (not equal to the target value)
                X_categorical[:, i] = Float64.((df.month .== month_val) .& .!ismissing.(df.month))
            elseif startswith(feature, "type_")
                type_val = replace(feature, "type_" => "")
                # Handle missing values by treating them as false
                X_categorical[:, i] = Float64.((df.type .== type_val) .& .!ismissing.(df.type))
            elseif startswith(feature, "rooms_")
                rooms_val = parse(Int, replace(feature, "rooms_" => ""))
                # Handle missing values by treating them as false
                X_categorical[:, i] = Float64.((df.rooms .== rooms_val) .& .!ismissing.(df.rooms))
            elseif startswith(feature, "ownership_")
                ownership_val = replace(feature, "ownership_" => "")
                # Handle missing values by treating them as false
                X_categorical[:, i] = Float64.((df.ownership .== ownership_val) .& .!ismissing.(df.ownership))
            elseif startswith(feature, "buildingmaterial_")
                material_val = replace(feature, "buildingmaterial_" => "")
                if :buildingmaterial in propertynames(df)
                    # Handle missing values by treating them as false
                    X_categorical[:, i] = Float64.((df.buildingmaterial .== material_val) .& .!ismissing.(df.buildingmaterial))
                end
            end
        end
        
        # Combine numeric and categorical features
        X = hcat(X_numeric, X_categorical)
    else
        X = X_numeric
    end
    
    return X
end

create_design_matrix (generic function with 1 method)

In [15]:
function linear_model_estimation(df)
    """
    Perform linear model estimation as specified in Part 3b.
    
    Tasks:
    1. Regress price against specified covariates
    2. Perform the same regression using partialling-out method
    3. Verify coefficients match
    """
    println("\n=== LINEAR MODEL ESTIMATION (Part 3b) ===\n")
    
    # First, remove rows with missing prices
    valid_rows = .!ismissing.(df.price)
    df_valid = df[valid_rows, :]
    @printf("Removed %d rows with missing prices. Working with %d observations.\n", 
            sum(.!valid_rows), nrow(df_valid))
    
    # Prepare the feature list
    features = String[]
    
    # Area's last digit dummies (omit 9 to have a base category)
    digit_features = ["end_$i" for i in 0:8]  # end_0 through end_8
    append!(features, digit_features)
    
    # Area and area squared
    append!(features, ["area", "area2"])
    
    # Distance variables (adjust names to match actual dataset)
    distance_features = String[]
    for col in names(df_valid)
        if occursin("distance", lowercase(col))
            push!(distance_features, col)
        end
    end
    append!(features, distance_features)
    
    # Binary features (those we converted)
    binary_features = String[]
    for col in names(df_valid)
        if startswith(col, "has") && eltype(df_valid[!, col]) <: Number
            push!(binary_features, col)
        end
    end
    append!(features, binary_features)
    
    # Categorical variables (create dummy variables, drop first category)
    categorical_vars = String[]
    for col in ["month", "type", "rooms", "ownership", "buildingmaterial"]
        if Symbol(col) in propertynames(df_valid)
            push!(categorical_vars, col)
        end
    end
    
    @printf("Available columns: %s\n", join(names(df_valid), ", "))
    @printf("Distance features found: %s\n", join(distance_features, ", "))
    @printf("Binary features found: %s\n", join(binary_features, ", "))
    @printf("Categorical variables to encode: %s\n", join(categorical_vars, ", "))
    
    # Add categorical dummy variables to features list
    for var in categorical_vars
        if Symbol(var) in propertynames(df_valid)
            unique_vals = unique(skipmissing(df_valid[!, var]))
            # Drop first category to avoid multicollinearity
            for val in unique_vals[2:end]
                push!(features, "$(var)_$(val)")
            end
        end
    end
    
    # Remove any features that don't exist in the dataset
    existing_features = String[]
    for feature in features
        if Symbol(feature) in propertynames(df_valid) || occursin("_", feature)
            push!(existing_features, feature)
        end
    end
    
    features = existing_features
    
    # Create design matrix
    X = create_design_matrix(df_valid, features)
    y = convert(Vector{Float64}, replace(df_valid.price, missing => 0.0))
    
    @printf("\nFeature matrix shape: (%d, %d)\n", size(X)...)
    @printf("Target variable shape: (%d,)\n", length(y))
    @printf("Total features: %d\n", length(features))
    
    return X, y, features
end

# Prepare the data for modeling
X, y, features = linear_model_estimation(df_clean);


=== LINEAR MODEL ESTIMATION (Part 3b) ===

Removed 0 rows with missing prices. Working with 110191 observations.
Available columns: id, price, month, area, type, rooms, centredistance, schooldistance, clinicdistance, postofficedistance, kindergartendistance, restaurantdistance, collegedistance, pharmacydistance, ownership, buildingmaterial, hasparkingspace, hasbalcony, haselevator, hassecurity, hasstorageroom, area2, end_0, end_1, end_2, end_3, end_4, end_5, end_6, end_7, end_8, end_9
Removed 0 rows with missing prices. Working with 110191 observations.
Available columns: id, price, month, area, type, rooms, centredistance, schooldistance, clinicdistance, postofficedistance, kindergartendistance, restaurantdistance, collegedistance, pharmacydistance, ownership, buildingmaterial, hasparkingspace, hasbalcony, haselevator, hassecurity, hasstorageroom, area2, end_0, end_1, end_2, end_3, end_4, end_5, end_6, end_7, end_8, end_9
Distance features found: centredistance, schooldistance, clini

### Method 1: Standard Linear Regression

In [16]:
# Method 1: Standard linear regression (with intercept)
println("\n1. Standard Linear Regression:")
X_with_intercept = hcat(ones(size(X, 1)), X)
beta_full = (X_with_intercept' * X_with_intercept) \ (X_with_intercept' * y)

y_pred = X_with_intercept * beta_full
r2 = 1 - sum((y .- y_pred).^2) / sum((y .- mean(y)).^2)

@printf("R-squared: %.4f\n", r2)
@printf("Intercept: %.2f\n", beta_full[1])

# Focus on end_0 coefficient
end_0_coef = nothing
if "end_0" in features
    end_0_idx = findfirst(x -> x == "end_0", features)
    end_0_coef = beta_full[end_0_idx + 1]  # +1 because of intercept
    @printf("Coefficient for end_0: %.2f\n", end_0_coef)
else
    println("Warning: end_0 feature not found in features list")
end

# Create results DataFrame
feature_names = ["intercept"; features]
results_df = DataFrame(
    feature = feature_names,
    coefficient = beta_full
)

println("\nTop 10 coefficients by magnitude:")
if nrow(results_df) > 1
    top_coeffs = results_df[2:end, :]  # Exclude intercept
    top_coeffs.abs_coeff = abs.(top_coeffs.coefficient)
    sort!(top_coeffs, :abs_coeff, rev=true)
    
    for i in 1:min(10, nrow(top_coeffs))
        @printf("  %-20s: %10.2f\n", top_coeffs.feature[i], top_coeffs.coefficient[i])
    end
end


1. Standard Linear Regression:
R-squared: 0.5305
Intercept: 184180.03
Coefficient for end_0: 13266.13

Top 10 coefficients by magnitude:
  restaurantdistance  : -237743.22
  pharmacydistance    :  110961.22
  type_blockOfFlats   : -106835.85
  ownership_udziaÃÂ   :  -95055.09
  type_tenement       :  -85358.04
  rooms_5             :  -84689.41
  buildingmaterial_concreteSlab:  -79255.93
  hassecurity         :   73948.49
  clinicdistance      :  -67079.67
  hasstorageroom      :  -66269.82
Coefficient for end_0: 13266.13

Top 10 coefficients by magnitude:
  restaurantdistance  : -237743.22
  pharmacydistance    :  110961.22
  type_blockOfFlats   : -106835.85
  ownership_udziaÃÂ   :  -95055.09
  type_tenement       :  -85358.04
  rooms_5             :  -84689.41
  buildingmaterial_concreteSlab:  -79255.93
  hassecurity         :   73948.49
  clinicdistance      :  -67079.67
  hasstorageroom      :  -66269.82


### Method 2: Partialling-out (FWL) Method

Now let's implement the Frisch-Waugh-Lovell theorem to estimate the coefficient for `end_0` using the partialling-out method.

In [17]:
# Method 2: Partialling-out (FWL) method for end_0
end_0_coef_fwl = nothing

if "end_0" in features && end_0_coef !== nothing
    println("\n2. Partialling-out Method (focusing on end_0):")
    
    # Separate X into X1 (end_0) and X2 (all other variables)
    end_0_idx = findfirst(x -> x == "end_0", features)
    X1 = X[:, end_0_idx:end_0_idx]  # Variable of interest
    other_indices = setdiff(1:size(X, 2), end_0_idx)
    X2 = X[:, other_indices]  # Control variables
    
    # Add intercept to X2
    X2_with_intercept = hcat(ones(size(X2, 1)), X2)
    
    # Step 1: Regress y on X2 and get residuals
    beta_y_on_x2 = (X2_with_intercept' * X2_with_intercept) \ (X2_with_intercept' * y)
    y_residuals = y .- X2_with_intercept * beta_y_on_x2
    
    # Step 2: Regress X1 on X2 and get residuals
    beta_x1_on_x2 = (X2_with_intercept' * X2_with_intercept) \ (X2_with_intercept' * X1)
    x1_residuals = X1 .- X2_with_intercept * beta_x1_on_x2
    
    # Step 3: Regress residuals (no intercept needed since residuals are mean zero)
    end_0_coef_fwl = (x1_residuals' * x1_residuals) \ (x1_residuals' * y_residuals)
    end_0_coef_fwl = end_0_coef_fwl[1]  # Extract scalar
    
    @printf("Coefficient for end_0 (FWL method): %.2f\n", end_0_coef_fwl)
    @printf("Coefficient for end_0 (standard method): %.2f\n", end_0_coef)
    @printf("Difference: %.6f\n", abs(end_0_coef - end_0_coef_fwl))
    @printf("Methods match (within 1e-6): %s\n", abs(end_0_coef - end_0_coef_fwl) < 1e-6)
    
    # Store results for later use
    model_results = Dict(
        "features" => features,
        "results_df" => results_df,
        "end_0_coef_standard" => end_0_coef,
        "end_0_coef_fwl" => end_0_coef_fwl,
        "X" => X,
        "y" => y,
        "X_with_intercept" => X_with_intercept,
        "beta_full" => beta_full
    )
else
    println("\nSkipping FWL method as end_0 feature is not available")
    model_results = Dict(
        "features" => features,
        "results_df" => results_df,
        "X" => X,
        "y" => y,
        "X_with_intercept" => X_with_intercept,
        "beta_full" => beta_full
    )
end

model_results


2. Partialling-out Method (focusing on end_0):
Coefficient for end_0 (FWL method): 13266.13
Coefficient for end_0 (standard method): 13266.13
Difference: 0.000000
Methods match (within 1e-6): true
Difference: 0.000000
Methods match (within 1e-6): true


Dict{String, Any} with 8 entries:
  "features"            => ["end_0", "end_1", "end_2", "end_3", "end_4", "end_5…
  "end_0_coef_fwl"      => 13266.1
  "end_0_coef_standard" => 13266.1
  "X"                   => [0.0 0.0 … 0.0 0.0; 0.0 0.0 … 0.0 0.0; … ; 1.0 0.0 ……
  "beta_full"           => [184180.0, 13266.1, -18691.3, 2250.67, 14007.7, -684…
  "X_with_intercept"    => [1.0 0.0 … 0.0 0.0; 1.0 0.0 … 0.0 0.0; … ; 1.0 1.0 ……
  "results_df"          => [1m39×2 DataFrame[0m[0m…
  "y"                   => [1.2e6, 650000.0, 590000.0, 584999.0, 363000.0, 7170…

## Part 3c: Price Premium Analysis (3 points)

Now we'll analyze whether apartments with areas ending in "0" command a price premium. We'll:
1. Train a model excluding apartments with area ending in 0
2. Use this model to predict prices for all apartments
3. Compare actual vs predicted prices for apartments ending in 0

In [20]:
function price_premium_analysis(df, model_results)
    """
    Analyze price premium for apartments with area ending in 0.
    Part 3c: Price premium for area that ends in 0-digit (3 points)
    """
    println("\n=== PRICE PREMIUM ANALYSIS (Part 3c) ===\n")
    
    features = model_results["features"]
    X = model_results["X"]
    y = model_results["y"]
    
    # Check if we have end_0 variable
    if !(:end_0 in propertynames(df))
        println("Warning: end_0 variable not found. Cannot perform premium analysis.")
        return nothing
    end
    
    # Step 1: Train model excluding apartments with area ending in 0 (1.25 points)
    println("1. Training model excluding apartments with area ending in 0:")
    
    # Filter out apartments with area ending in 0
    mask_not_end_0 = df.end_0 .== 0
    X_train = X[mask_not_end_0, :]
    y_train = y[mask_not_end_0]
    
    @printf("   Training sample size: %d (excluded %d apartments ending in 0)\n", 
            sum(mask_not_end_0), sum(.!mask_not_end_0))
    
    # Remove the end_0 feature from the training data since all values would be 0
    end_0_idx = findfirst(x -> x == "end_0", features)
    if end_0_idx !== nothing
        # Remove end_0 column from training data
        other_indices = setdiff(1:size(X_train, 2), end_0_idx)
        X_train_reduced = X_train[:, other_indices]
        features_reduced = features[other_indices]
    else
        X_train_reduced = X_train
        features_reduced = features
    end
    
    # Train the model (with intercept)
    X_train_with_intercept = hcat(ones(size(X_train_reduced, 1)), X_train_reduced)
    
    # Initialize beta_no_end_0
    beta_no_end_0 = nothing
    
    # Use regularization if matrix is close to singular
    try
        beta_no_end_0 = (X_train_with_intercept' * X_train_with_intercept) \ (X_train_with_intercept' * y_train)
    catch e
        println("   Warning: Matrix is singular, adding small regularization...")
        λ = 1e-6  # Small regularization parameter
        XtX = X_train_with_intercept' * X_train_with_intercept
        n_features = size(XtX, 1)
        XtX += λ * Matrix{Float64}(I, n_features, n_features)  # Add ridge regularization
        beta_no_end_0 = XtX \ (X_train_with_intercept' * y_train)
    end
    
    if beta_no_end_0 === nothing
        println("Error: Could not solve for coefficients")
        return nothing
    end
    
    y_pred_train = X_train_with_intercept * beta_no_end_0
    r2_train = 1 - sum((y_train .- y_pred_train).^2) / sum((y_train .- mean(y_train)).^2)
    @printf("   R-squared on training data: %.4f\n", r2_train)
    
    # Step 2: Predict prices for entire sample (1.25 points)
    println("\n2. Predicting prices for entire sample:")
    
    # For prediction, we need to handle the end_0 feature carefully
    # Since our model was trained without end_0, we need to exclude it from prediction too
    if end_0_idx !== nothing
        X_pred_reduced = X[:, other_indices]
    else
        X_pred_reduced = X
    end
    
    X_pred_with_intercept = hcat(ones(size(X_pred_reduced, 1)), X_pred_reduced)
    
    # Predict using the model trained without end_0 apartments
    y_pred_full = X_pred_with_intercept * beta_no_end_0
    
    @printf("   Predictions generated for %d apartments\n", length(y_pred_full))
    
    # Step 3: Compare averages for apartments ending in 0 (0.5 points)
    println("\n3. Comparing actual vs predicted prices for apartments with area ending in 0:")
    
    # Get apartments with area ending in 0
    mask_end_0 = df.end_0 .== 1
    
    actual_prices_end_0 = y[mask_end_0]
    predicted_prices_end_0 = y_pred_full[mask_end_0]
    
    # Calculate averages
    avg_actual = mean(actual_prices_end_0)
    avg_predicted = mean(predicted_prices_end_0)
    premium = avg_actual - avg_predicted
    premium_pct = (premium / avg_predicted) * 100
    
    @printf("   Number of apartments with area ending in 0: %d\n", sum(mask_end_0))
    @printf("   Average actual price: %.2f PLN\n", avg_actual)
    @printf("   Average predicted price: %.2f PLN\n", avg_predicted)
    @printf("   Price premium: %.2f PLN (%+.2f%%)\n", premium, premium_pct)
    
    # Additional analysis
    @printf("\n   Additional Statistics:\n")
    @printf("   Median actual price: %.2f PLN\n", median(actual_prices_end_0))
    @printf("   Median predicted price: %.2f PLN\n", median(predicted_prices_end_0))
    @printf("   Standard deviation of premium: %.2f PLN\n", std(actual_prices_end_0 .- predicted_prices_end_0))
    
    return Dict(
        "avg_actual" => avg_actual,
        "avg_predicted" => avg_predicted,
        "premium" => premium,
        "premium_pct" => premium_pct,
        "n_end_0" => sum(mask_end_0),
        "actual_prices_end_0" => actual_prices_end_0,
        "predicted_prices_end_0" => predicted_prices_end_0
    )
end

# Perform premium analysis
premium_results = price_premium_analysis(df_clean, model_results);


=== PRICE PREMIUM ANALYSIS (Part 3c) ===

1. Training model excluding apartments with area ending in 0:
   Training sample size: 97540 (excluded 12651 apartments ending in 0)
   R-squared on training data: 0.5309

2. Predicting prices for entire sample:
   Predictions generated for 110191 apartments

3. Comparing actual vs predicted prices for apartments with area ending in 0:
   Number of apartments with area ending in 0: 12651
   Average actual price: 873616.30 PLN
   Average predicted price: 860404.16 PLN
   Price premium: 13212.14 PLN (+1.54%)

   Additional Statistics:
1. Training model excluding apartments with area ending in 0:
   Training sample size: 97540 (excluded 12651 apartments ending in 0)
   R-squared on training data: 0.5309

2. Predicting prices for entire sample:
   Predictions generated for 110191 apartments

3. Comparing actual vs predicted prices for apartments with area ending in 0:
   Number of apartments with area ending in 0: 12651
   Average actual price: 87

### Statistical Significance Test

In [21]:
if premium_results !== nothing
    # Determine if apartments ending in 0 are overpriced
    premium = premium_results["premium"]
    premium_pct = premium_results["premium_pct"]
    
    @printf("\n   Conclusion:\n")
    if premium > 0
        @printf("   ✓ Apartments with area ending in 0 appear to be sold at a PREMIUM\n")
        @printf("     of %.2f PLN (%+.2f%%) above what their features suggest.\n", premium, premium_pct)
        @printf("     This could indicate that buyers perceive 'round' areas as more desirable\n")
        @printf("     or that sellers use psychological pricing strategies.\n")
    else
        @printf("   ✗ Apartments with area ending in 0 appear to be sold at a DISCOUNT\n")
        @printf("     of %.2f PLN (%.2f%%) below what their features suggest.\n", abs(premium), abs(premium_pct))
    end
    
    # Statistical significance test
    actual_prices_end_0 = premium_results["actual_prices_end_0"]
    predicted_prices_end_0 = premium_results["predicted_prices_end_0"]
    
    differences = actual_prices_end_0 .- predicted_prices_end_0
    t_test_result = OneSampleTTest(differences, 0.0)
    t_stat = t_test_result.t
    p_value = pvalue(t_test_result)
    
    @printf("\n   Statistical Test (t-test):\n")
    @printf("   Null hypothesis: Mean price difference = 0\n")
    @printf("   t-statistic: %.3f\n", t_stat)
    @printf("   p-value: %.6f\n", p_value)
    
    if p_value < 0.05
        @printf("   ✓ The price difference is statistically significant at 5%% level.\n")
    else
        @printf("   ✗ The price difference is not statistically significant at 5%% level.\n")
    end
    
    # Add to results
    premium_results["t_stat"] = t_stat
    premium_results["p_value"] = p_value
end


   Conclusion:
   ✓ Apartments with area ending in 0 appear to be sold at a PREMIUM
     of 13212.14 PLN (+1.54%) above what their features suggest.
     This could indicate that buyers perceive 'round' areas as more desirable
     or that sellers use psychological pricing strategies.

   Statistical Test (t-test):
   Null hypothesis: Mean price difference = 0
   t-statistic: 4.661
   p-value: 0.000003
   ✓ The price difference is statistically significant at 5% level.
   Null hypothesis: Mean price difference = 0
   t-statistic: 4.661
   p-value: 0.000003
   ✓ The price difference is statistically significant at 5% level.


3.1740305434732614e-6

In [22]:
function save_results(df_clean, model_results, premium_results)
    """
    Save all results to files.
    """
    println("\n=== SAVING RESULTS ===\n")
    
    # Create output directory if it doesn't exist
    output_dir = "/Users/gabrielsaco/Documents/GitHub/High_Dimensional_Linear_Models/Julia/output"
    mkpath(output_dir)
    
    # Save cleaned data
    CSV.write(joinpath(output_dir, "apartments_cleaned.csv"), df_clean)
    println("✓ Cleaned data saved to apartments_cleaned.csv")
    
    # Save regression results
    CSV.write(joinpath(output_dir, "regression_results.csv"), model_results["results_df"])
    println("✓ Regression results saved to regression_results.csv")
    
    # Save premium analysis results
    if premium_results !== nothing
        premium_summary = DataFrame(
            metric = ["n_apartments_end_0", "avg_actual_price", "avg_predicted_price", 
                     "premium_amount", "premium_percentage", "t_statistic", "p_value"],
            value = [premium_results["n_end_0"], premium_results["avg_actual"], 
                    premium_results["avg_predicted"], premium_results["premium"],
                    premium_results["premium_pct"], 
                    get(premium_results, "t_stat", NaN), 
                    get(premium_results, "p_value", NaN)]
        )
        
        CSV.write(joinpath(output_dir, "premium_analysis.csv"), premium_summary)
        println("✓ Premium analysis results saved to premium_analysis.csv")
    end
    
    @printf("\nAll results saved to: %s\n", output_dir)
end

# Save all results
save_results(df_clean, model_results, premium_results);


=== SAVING RESULTS ===

✓ Cleaned data saved to apartments_cleaned.csv
✓ Regression results saved to regression_results.csv
✓ Premium analysis results saved to premium_analysis.csv

All results saved to: /Users/gabrielsaco/Documents/GitHub/High_Dimensional_Linear_Models/Julia/output
✓ Cleaned data saved to apartments_cleaned.csv
✓ Regression results saved to regression_results.csv
✓ Premium analysis results saved to premium_analysis.csv

All results saved to: /Users/gabrielsaco/Documents/GitHub/High_Dimensional_Linear_Models/Julia/output
