# Housing Regression Example
## EDA Notebook

In [2]:
import os
import pandas as pd
import seaborn as sns

In [3]:
# Check working directory: the relative '..'-based paths defined below are
# resolved against it, so the notebook must be run from its own folder.
# NOTE(review): the saved output is an absolute, machine-specific path;
# consider clearing it before sharing the notebook.
os.getcwd()

'/Users/dsl/Documents/GitHub/PythonBasics/RegressionExamples/housing/notebooks'

In [4]:
# Locations of the data and image folders, one level above the working directory
data_path, img_path = (
    os.path.join(os.pardir, folder) for folder in ('data', 'imgs')
)

In [5]:
# Create the data and image folders up front (a no-op when they already exist)
for _folder in (data_path, img_path):
    os.makedirs(_folder, exist_ok=True)

In [6]:
# Build the full path to the X-train CSV inside the data folder
X_train_file_name = 'X_train.csv'
X_train_data_path = os.path.join(data_path, X_train_file_name)

In [7]:
# Build the full path to the y-train CSV inside the data folder
y_train_file_name = 'y_train.csv'
y_train_data_path = os.path.join(data_path, y_train_file_name)

In [8]:
# Load the X and y training sets from their CSV files (X first, then y)
X_train, y_train = (
    pd.read_csv(csv_path) for csv_path in (X_train_data_path, y_train_data_path)
)

In [9]:
# View top rows of X train df (head() with no argument shows the first 5 rows)
# to eyeball the feature columns and their value ranges
X_train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,0.9809,19.0,3.187726,1.129964,726.0,2.620939,33.98,-118.28
1,4.2232,33.0,6.189696,1.086651,1015.0,2.377049,37.46,-122.23
2,3.5488,42.0,4.821577,1.095436,1044.0,4.33195,33.79,-118.26
3,1.6469,24.0,4.274194,1.048387,1686.0,4.532258,35.87,-119.26
4,3.9909,14.0,4.608303,1.08935,2738.0,2.471119,37.54,-121.96


In [10]:
# View top rows of y train df (first 5 rows); the recorded output shows a
# single `target` column
y_train.head()

Unnamed: 0,target
0,1.214
1,3.637
2,2.056
3,0.476
4,2.36


In [11]:
# Combine X and y dfs column-wise into a single frame for EDA.
# pd.concat(axis=1) aligns rows on the index and silently fills NaN wherever
# the two indexes differ, so verify that both frames line up row-for-row first.
assert X_train.index.equals(y_train.index), (
    "X_train and y_train have different indexes; "
    "a column-wise concat would misalign rows and introduce NaN values"
)
df = pd.concat([X_train, y_train], axis=1)

In [12]:
# Check combined data: the preview (first 5 rows) should show the X columns
# followed by the `target` column from y
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,0.9809,19.0,3.187726,1.129964,726.0,2.620939,33.98,-118.28,1.214
1,4.2232,33.0,6.189696,1.086651,1015.0,2.377049,37.46,-122.23,3.637
2,3.5488,42.0,4.821577,1.095436,1044.0,4.33195,33.79,-118.26,2.056
3,1.6469,24.0,4.274194,1.048387,1686.0,4.532258,35.87,-119.26,0.476
4,3.9909,14.0,4.608303,1.08935,2738.0,2.471119,37.54,-121.96,2.36


In [13]:
# Check shape of combined data as a (rows, columns) tuple; the recorded run
# shows 13828 rows and 9 columns
df.shape

(13828, 9)

In [14]:
# Built-in pandas function: prints the index range, per-column non-null counts,
# dtypes and memory usage, and returns None (the summary is printed output
# rather than the cell's value)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13828 entries, 0 to 13827
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      13828 non-null  float64
 1   HouseAge    13828 non-null  float64
 2   AveRooms    13828 non-null  float64
 3   AveBedrms   13828 non-null  float64
 4   Population  13828 non-null  float64
 5   AveOccup    13828 non-null  float64
 6   Latitude    13828 non-null  float64
 7   Longitude   13828 non-null  float64
 8   target      13828 non-null  float64
dtypes: float64(9)
memory usage: 972.4 KB


In [15]:
# Built-in pandas function, returns a df of per-column summary statistics
# (count, mean, std, min, quartiles, max). The result is kept in df_desc so it
# can be displayed here and written to CSV by a later cell.
df_desc = df.describe()
df_desc

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
count,13828.0,13828.0,13828.0,13828.0,13828.0,13828.0,13828.0,13828.0,13828.0
mean,3.876745,28.559662,5.436556,1.097533,1430.865924,3.12866,35.651238,-119.585098,2.066636
std,1.903102,12.600767,2.449446,0.445688,1146.236335,12.64613,2.134064,2.005127,1.153743
min,0.4999,1.0,0.888889,0.333333,3.0,0.692308,32.55,-124.35,0.14999
25%,2.568575,18.0,4.459802,1.006623,793.0,2.432189,33.94,-121.81,1.194
50%,3.53875,29.0,5.232422,1.049552,1170.5,2.819702,34.27,-118.51,1.792
75%,4.7566,37.0,6.058566,1.100283,1729.0,3.282093,37.72,-118.01,2.64025
max,15.0001,52.0,141.909091,25.636364,35682.0,1243.333333,41.95,-114.31,5.00001


In [37]:
# Save the describe() summary table as 'df_desc.csv' in the data folder
# NOTE(review): this cell's execution count (37) is out of sequence with its
# neighbours (15 and 16); re-run with Restart & Run All before sharing.
desc_path = os.path.join(data_path, 'df_desc.csv')
df_desc.to_csv(desc_path)

In [16]:
from skimpy import skim_get_figure

In [17]:
# Set save path and generate skim report for the combined frame.
# skim_get_figure is given save_path, so the report is presumably written to
# 'skim_summary.svg' under the image folder rather than only displayed inline
# (confirm against the skimpy documentation).
skim_img_path = os.path.join(img_path, 'skim_summary.svg')
skim_get_figure(df, save_path=skim_img_path)