Contains --> 'longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity'

## 1. Data Preprocessing

### 1.1 Import Libraries


In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import normaltest, shapiro, anderson
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")


Libraries imported successfully!


### 1.2 Load and Parse Data


In [12]:
# Load the housing dataset
df = pd.read_csv('housing.csv')

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")


Dataset loaded successfully!
Dataset shape: (20640, 10)
Columns: ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity']


In [13]:
# Show all possible values for ocean_proximity
print("Ocean Proximity - All Possible Values:")
print("="*50)
print("Unique values in ocean_proximity:")
for i, value in enumerate(df['ocean_proximity'].unique(), 1):
    count = (df['ocean_proximity'] == value).sum()
    percentage = (count / len(df)) * 100
    print(f"{i}. '{value}': {count:,} records ({percentage:.2f}%)")

print(f"\nTotal categories: {df['ocean_proximity'].nunique()}")
print(f"Data type: {df['ocean_proximity'].dtype}")


Ocean Proximity - All Possible Values:
Unique values in ocean_proximity:
1. 'NEAR BAY': 2,290 records (11.09%)
2. '<1H OCEAN': 9,136 records (44.26%)
3. 'INLAND': 6,551 records (31.74%)
4. 'NEAR OCEAN': 2,658 records (12.88%)
5. 'ISLAND': 5 records (0.02%)

Total categories: 5
Data type: object


In [14]:
# Display first few rows and basic info
print("First 5 rows:")
print(df.head())
print("\n" + "="*50 + "\n")
print("Dataset Info:")
print(df.info())
print("\n" + "="*50 + "\n")
print("Basic Statistics:")
print(df.describe())


First 5 rows:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIn

### 1.3 Data Cleaning and Organization


In [15]:
# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())
print(f"\nTotal missing values: {df.isnull().sum().sum()}")

# Handle missing values in total_bedrooms (the only column with missing values)
if df['total_bedrooms'].isnull().sum() > 0:
    # Fill missing values with median
    df['total_bedrooms'].fillna(df['total_bedrooms'].median(), inplace=True)
    print(f"\nFilled {df['total_bedrooms'].isnull().sum()} missing values in total_bedrooms")

# Convert ocean_proximity to categorical
df['ocean_proximity'] = df['ocean_proximity'].astype('category')

print("\nData cleaning completed!")
print(f"Final dataset shape: {df.shape}")
print(f"Data types:\n{df.dtypes}")


Missing values per column:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

Total missing values: 207

Filled 0 missing values in total_bedrooms

Data cleaning completed!
Final dataset shape: (20640, 10)
Data types:
longitude              float64
latitude               float64
housing_median_age     float64
total_rooms            float64
total_bedrooms         float64
population             float64
households             float64
median_income          float64
median_house_value     float64
ocean_proximity       category
dtype: object


## 2. Basic Data Exploration and Summary Statistics


In [16]:
# Basic summary statistics
print("="*60)
print("DATASET SUMMARY")
print("="*60)

print(f"Dataset Dimensions: {df.shape[0]} rows × {df.shape[1]} columns")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n" + "="*40)
print("NUMERICAL FEATURES SUMMARY")
print("="*40)
numerical_cols = df.select_dtypes(include=[np.number]).columns
print(df[numerical_cols].describe())

print("\n" + "="*40)
print("CATEGORICAL FEATURES SUMMARY")
print("="*40)
categorical_cols = df.select_dtypes(include=['category', 'object']).columns
for col in categorical_cols:
    print(f"\n{col}:")
    print(f"  Unique values: {df[col].nunique()}")
    print(f"  Value counts:")
    print(df[col].value_counts().head(10))


DATASET SUMMARY
Dataset Dimensions: 20640 rows × 10 columns
Memory Usage: 1.44 MB

NUMERICAL FEATURES SUMMARY
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \
count    20640.000000  20640.000000  20640.000000   20640.000000   
mean       536.838857   1425.476744    499.539680       3.870671   
std        419.391878   1132.462122    382.329