# Exploratory Data Analysis

In [44]:
# Import Libraries:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import numpy as np
import scipy.stats as stats

import warnings
%matplotlib notebook

In [45]:
# Load the pre-cleaned housing dataset from the working directory.
data = pd.read_csv(filepath_or_buffer='clean_data.csv')

In [46]:
# Preview the first five rows to confirm the columns loaded as expected.
data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,zipcode,lat,long,sqft_living15,sqft_lot15,log_price,log_sqft_lot,log_sqft_lot15,log_sqft_living,house_age
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,98178,47.5112,-122.257,1340,5650,12.309982,8.639411,8.639411,7.07327,59
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,98125,47.721,-122.319,1690,7639,13.195614,8.887653,8.941022,7.851661,63
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,...,98028,47.7379,-122.233,2720,8062,12.100712,9.21034,8.994917,6.646391,82
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,...,98136,47.5208,-122.393,1360,5000,13.311329,8.517193,8.517193,7.5807,49
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,...,98074,47.6168,-122.045,1800,7503,13.142166,8.997147,8.923058,7.426549,28


## EDA Summary
### 1. Summary Statistics

In [47]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): in the recorded run `house_age` has a minimum of -1 and
# `bedrooms`/`bathrooms` have minimums of 0 -- confirm these are intended.
data.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,zipcode,lat,long,sqft_living15,sqft_lot15,log_price,log_sqft_lot,log_sqft_lot15,log_sqft_living,house_age
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,...,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580302000.0,540088.1,3.369454,2.114757,2079.899736,15106.97,1.494309,0.007542,0.234303,3.40943,...,98077.939805,47.560053,-122.213896,1986.552492,12768.455652,13.047817,8.989956,8.960984,7.550335,43.317818
std,2876566000.0,367127.2,0.907964,0.770163,918.440897,41420.51,0.539989,0.086517,0.766318,0.650743,...,53.505026,0.138564,0.140828,685.391304,27304.179631,0.526685,0.902425,0.813048,0.424807,29.375493
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,...,98001.0,47.1559,-122.519,399.0,651.0,11.225243,6.253829,6.47851,5.669881,-1.0
25%,2123049000.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,...,98033.0,47.471,-122.328,1490.0,5100.0,12.682152,8.525161,8.536996,7.26333,18.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,...,98065.0,47.5718,-122.23,1840.0,7620.0,13.017003,8.938269,8.938532,7.554859,40.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,...,98118.0,47.678,-122.125,2360.0,10083.0,13.377006,9.276877,9.218606,7.843849,63.0
max,9900000000.0,7700000.0,11.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,...,98199.0,47.7776,-121.315,6210.0,871200.0,15.856731,14.317109,13.677627,9.513404,115.0


### 2. Univariate Analysis of the Target Variable 'Price'

In [48]:
# pyplot (plt) and seaborn (sns) are imported once in the notebook's first
# cell, so they are not re-imported here.
sns.set_style("whitegrid")

# Histogram of sale prices with a KDE overlay, drawn on an explicit Axes so
# the cell does not depend on pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['price'], kde=True, bins=30, ax=ax)
ax.set_title('Figure 2.1: Distribution of House Prices')
ax.set_xlabel('Price (USD)')
ax.set_ylabel('Frequency')
ax.grid(True)

# Save a high-resolution copy before displaying the figure.
fig.savefig('distribution_house_prices.png', dpi=300)
plt.show()

<IPython.core.display.Javascript object>

### 3. Bivariate Analysis: Price vs. Key Features

In [49]:
# One panel per predictor: (column in `data`, human-readable axis label).
panels = [
    ('sqft_living', 'Square Feet of Living Space'),
    ('bedrooms', 'Number of Bedrooms'),
    ('bathrooms', 'Number of Bathrooms'),
]

fig, axes = plt.subplots(1, len(panels), figsize=(18, 6))

# Draw each price-vs-feature scatter on its own axis, left to right.
for ax, (column, label) in zip(axes, panels):
    sns.scatterplot(ax=ax, data=data, x=column, y='price')
    ax.set_title(f'House Price vs. {label}')
    ax.set_xlabel(label)
    ax.set_ylabel('Price (USD)')

fig.suptitle('Figure 2.2: House Price vs. Property Features', fontsize=16)
fig.tight_layout()

# Save a high-resolution copy before displaying the figure.
fig.savefig('house_price_vs_property_features.png', dpi=300)

plt.show()

<IPython.core.display.Javascript object>

### 4. Correlation Matrix

In [50]:
# Correlate numeric columns only: the frame also holds the string-typed
# `date` column, which makes a bare DataFrame.corr() raise on pandas >= 2.0
# (older versions silently dropped it). Selecting by dtype works on both.
numeric_data = data.select_dtypes(include='number')
corr_matrix = numeric_data.corr()

# Annotated heatmap of the pairwise Pearson correlations.
fig, ax = plt.subplots(figsize=(14, 12))
heatmap = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=.5,
    annot_kws={"size": 8},
    square=True,
    ax=ax,
)
ax.set_title('Figure 2.3: Correlation Matrix of House Features', fontsize=16)

# Save a high-resolution copy before displaying the figure.
fig.savefig('correlation_matrix.png', dpi=300)
plt.show()

<IPython.core.display.Javascript object>

### 5. Statistical Analysis (Pearson Correlation Tests)

In [51]:
from scipy.stats import pearsonr

def perform_pearson_test(x, y, feature_name, alpha=0.05, target_name='price'):
    """Print the Pearson correlation between two variables and its significance.

    Parameters
    ----------
    x, y : array-like
        The two samples passed to ``scipy.stats.pearsonr``.
    feature_name : str
        Label for ``y`` used in the printed report.
    alpha : float, optional
        Significance threshold; the result is reported as significant when the
        p-value is strictly below it. Defaults to 0.05.
    target_name : str, optional
        Label for ``x`` used in the printed report. Defaults to ``'price'``.

    Returns
    -------
    None
        The report is written to stdout.

    Notes
    -----
    A p-value too small to represent as a float is printed as ``0.000e+00``.
    """
    correlation, p_value = pearsonr(x, y)
    print(f"Correlation between '{target_name}' and '{feature_name}': {correlation:.3f}")
    # Both outcomes share one message template; only the verdict text differs.
    verdict = "Statistically significant" if p_value < alpha else "Not statistically significant"
    print(f"Result: {verdict} (p-value = {p_value:.3e}).")

# Test price against each key feature, in the same order as the scatter panels.
for feature in ('sqft_living', 'bedrooms', 'bathrooms'):
    perform_pearson_test(data['price'], data[feature], feature)


Correlation between 'price' and 'sqft_living': 0.702
Result: Statistically significant (p-value = 0.000e+00).
Correlation between 'price' and 'bedrooms': 0.315
Result: Statistically significant (p-value = 0.000e+00).
Correlation between 'price' and 'bathrooms': 0.525
Result: Statistically significant (p-value = 0.000e+00).


### 6. Creating a Table for Statistical Test Results

In [52]:
# Build the summary table from the data itself rather than hand-copying the
# printed coefficients, so it cannot drift from the tests if the data changes.
# `stats` is scipy.stats, imported in the notebook's first cell.
feature_labels = [
    ('sqft_living', 'Square Feet of Living Space'),
    ('bedrooms', 'Number of Bedrooms'),
    ('bathrooms', 'Number of Bathrooms'),
]

summary_rows = []
for column, label in feature_labels:
    r_value, p_value = stats.pearsonr(data['price'], data[column])
    summary_rows.append({
        'Feature': label,
        'Correlation Coefficient': round(float(r_value), 3),
        # Report very small p-values as a bound instead of a long exponent.
        'P-Value': '<0.001' if p_value < 0.001 else f'{p_value:.3f}',
        # Same 0.05 threshold as the Pearson tests printed earlier.
        'Result': ('Statistically Significant' if p_value < 0.05
                   else 'Not Statistically Significant'),
    })

# Chained set_index returns a new frame, avoiding in-place mutation.
test_results = pd.DataFrame(summary_rows).set_index('Feature')

In [53]:
# Display the summary table of Pearson test results, indexed by feature.
test_results 

Unnamed: 0_level_0,Correlation Coefficient,P-Value,Result
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Square Feet of Living Space,0.702,<0.001,Statistically Significant
Number of Bedrooms,0.315,<0.001,Statistically Significant
Number of Bathrooms,0.525,<0.001,Statistically Significant
