# Initial Data Analysis (IDA)

Sections: Data loading, Data types, Stratified split, Save splits.

In [8]:
# IDA 📊 Initial Data Analysis
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Data locations, expressed relative to the directory the notebook runs from.
RAW_DIR, TRAIN_DIR, TEST_DIR = map(
    Path, ("../data/raw", "../data/train", "../data/test")
)

# Read the full, unsplit housing table.
raw_csv_path = RAW_DIR / "housing.csv"
raw_data = pd.read_csv(raw_csv_path)

# Report the dimensions, then show the first rows as the cell's output.
print("raw_data shape:", raw_data.shape)
raw_data.head()


raw_data shape: (20640, 10)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


## Data type analysis

In [3]:
# Data type analysis for training features.
# Load the features explicitly so this cell does not depend on a
# `train_features` variable left over from an earlier kernel session.
# NOTE(review): assumes train_features.csv already exists under TRAIN_DIR
# (i.e. the split/save steps were run at least once) — confirm the location.
train_features = pd.read_csv(TRAIN_DIR / "train_features.csv")

# .info() prints dtypes and non-null counts; the transposed describe() is the
# cell's displayed result, with one row per column.
train_features.info()
train_features.describe(include='all').T


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16512 entries, 0 to 16511
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16344 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   ocean_proximity     16512 non-null  object 
dtypes: float64(8), object(1)
memory usage: 1.1+ MB


Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
longitude,16512.0,,,,-119.573125,2.000624,-124.35,-121.8,-118.51,-118.01,-114.49
latitude,16512.0,,,,35.637746,2.133294,32.55,33.93,34.26,37.72,41.95
housing_median_age,16512.0,,,,28.577156,12.585738,1.0,18.0,29.0,37.0,52.0
total_rooms,16512.0,,,,2639.402798,2185.287466,2.0,1447.0,2125.0,3154.0,39320.0
total_bedrooms,16344.0,,,,538.949094,423.862079,1.0,296.0,434.0,645.0,6210.0
population,16512.0,,,,1425.513929,1094.795467,3.0,787.0,1167.0,1726.0,16305.0
households,16512.0,,,,499.990189,382.865787,1.0,279.0,408.0,603.0,5358.0
median_income,16512.0,,,,3.870428,1.891936,0.4999,2.5625,3.5385,4.75,15.0001
ocean_proximity,16512.0,5.0,<1H OCEAN,7274.0,,,,,,,


## Stratified train/test splitting

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

# Start from a fresh read of the raw dataset, via the shared RAW_DIR constant
# so the path is defined in exactly one place.
housing = pd.read_csv(RAW_DIR / "housing.csv")

# Bucket median_income into five bands; the column is a helper that exists
# only so the split can be stratified on it.
INCOME_BINS = [0., 1.5, 3.0, 4.5, 6., np.inf]
INCOME_LABELS = [1, 2, 3, 4, 5]
housing['income_cat'] = pd.cut(
    housing['median_income'],
    bins=INCOME_BINS,
    labels=INCOME_LABELS,
)

# 80/20 split that preserves the income-band proportions in both parts;
# the fixed random_state makes the split reproducible.
strat_train, strat_test = train_test_split(
    housing,
    test_size=0.2,
    stratify=housing['income_cat'],
    random_state=42,
)

# Remove the helper column by building new frames instead of mutating the
# split outputs in place, so re-running the cell is idempotent.
train_set = strat_train.drop(columns=['income_cat'])
test_set = strat_test.drop(columns=['income_cat'])

# Sanity checks: every row landed in exactly one split and the helper is gone.
assert len(train_set) + len(test_set) == len(housing)
assert 'income_cat' not in train_set.columns
assert 'income_cat' not in test_set.columns

train_set.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
13096,-122.42,37.8,52.0,3321.0,1115.0,1576.0,1034.0,2.0987,458300.0,NEAR BAY
14973,-118.38,34.14,40.0,1965.0,354.0,666.0,357.0,6.0876,483800.0,<1H OCEAN
3785,-121.98,38.36,33.0,1083.0,217.0,562.0,203.0,2.433,101700.0,INLAND
14689,-117.11,33.75,17.0,4174.0,851.0,1845.0,780.0,2.2618,96100.0,INLAND
20507,-118.15,33.77,36.0,4366.0,1211.0,1912.0,1172.0,3.5292,361800.0,NEAR OCEAN


## Final: Save splits to data/train and data/test

In [6]:
from pathlib import Path
# Write the splits through the shared TRAIN_DIR / TEST_DIR constants so the
# files land in the same data tree the raw CSV is read from (../data), rather
# than in a data/ folder under the current working directory.
TARGET_COL = 'median_house_value'

for out_dir, prefix, split in (
    (TRAIN_DIR, 'train', train_set),
    (TEST_DIR, 'test', test_set),
):
    out_dir.mkdir(parents=True, exist_ok=True)
    # Features: every column except the target. Labels: the target alone.
    # index=False keeps the original row index out of the CSV files.
    split.drop(columns=TARGET_COL).to_csv(out_dir / f'{prefix}_features.csv', index=False)
    split[TARGET_COL].to_csv(out_dir / f'{prefix}_labels.csv', index=False)

print('Saved train/test CSVs.')

Saved train/test CSVs.
