# 4. DATA PREPARATION

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [2]:
%store -r train_set
%store -r test_set

y_train = train_set['median_house_value']
X_train = train_set.drop('median_house_value', axis=1)

y_test = test_set['median_house_value']
X_test = test_set.drop('median_house_value', axis=1)

### 4.1 ENCODING

In [3]:
def ocean_priximity_oh_encoder(data):
    encoder = LabelBinarizer()
    ocean_proximity_oh = encoder.fit_transform(data['ocean_proximity'])
    ocean_proximity_oh_df = pd.DataFrame(ocean_proximity_oh, columns=data['ocean_proximity'].unique(), index=data.index)
    data = pd.concat([data, ocean_proximity_oh_df], axis = 1)
    data = data.drop(['ocean_proximity'], axis=1)
    return data
    

In [4]:
X_train = ocean_priximity_oh_encoder(X_train)
X_test = ocean_priximity_oh_encoder(X_test)

In [5]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,NEAR OCEAN,<1H OCEAN,INLAND,NEAR BAY,ISLAND
18543,-122.03,36.97,36.0,337.0,69.0,223.0,68.0,3.2404,0,0,0,0,1
15147,-116.94,32.87,24.0,2824.0,441.0,1480.0,471.0,5.2614,1,0,0,0,0
9966,-122.4,38.46,33.0,2542.0,466.0,1099.0,420.0,4.635,0,1,0,0,0
16781,-122.48,37.67,31.0,2609.0,433.0,1746.0,464.0,5.1054,0,0,0,0,1
10892,-117.87,33.72,39.0,3167.0,669.0,2789.0,619.0,3.5902,1,0,0,0,0


### 4.2 CLEANING THE DATA

In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 18543 to 13497
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16360 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   NEAR OCEAN          16512 non-null  int32  
 9   <1H OCEAN           16512 non-null  int32  
 10  INLAND              16512 non-null  int32  
 11  NEAR BAY            16512 non-null  int32  
 12  ISLAND              16512 non-null  int32  
dtypes: float64(8), int32(5)
memory usage: 1.4 MB


In [7]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4128 entries, 7914 to 2431
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           4128 non-null   float64
 1   latitude            4128 non-null   float64
 2   housing_median_age  4128 non-null   float64
 3   total_rooms         4128 non-null   float64
 4   total_bedrooms      4073 non-null   float64
 5   population          4128 non-null   float64
 6   households          4128 non-null   float64
 7   median_income       4128 non-null   float64
 8   <1H OCEAN           4128 non-null   int32  
 9   INLAND              4128 non-null   int32  
 10  NEAR OCEAN          4128 non-null   int32  
 11  NEAR BAY            4128 non-null   int32  
 12  ISLAND              4128 non-null   int32  
dtypes: float64(8), int32(5)
memory usage: 370.9 KB


In [8]:
def total_bedrooms_nan_transform(train_data, test_data):
    imputer = SimpleImputer(strategy = 'median')
    
    imputer.fit(train_data)
    
    train_data_transformed = imputer.transform(train_data)
    test_data_transformed = imputer.transform(test_data)
    
    train_data_transformed = pd.DataFrame(train_data_transformed, columns = train_data.columns)
    test_data_transformed = pd.DataFrame(test_data_transformed, columns = test_data.columns)
    
    return train_data_transformed, test_data_transformed

In [9]:
X_train, X_test = total_bedrooms_nan_transform(X_train, X_test)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4128 entries, 0 to 4127
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           4128 non-null   float64
 1   latitude            4128 non-null   float64
 2   housing_median_age  4128 non-null   float64
 3   total_rooms         4128 non-null   float64
 4   total_bedrooms      4128 non-null   float64
 5   population          4128 non-null   float64
 6   households          4128 non-null   float64
 7   median_income       4128 non-null   float64
 8   <1H OCEAN           4128 non-null   float64
 9   INLAND              4128 non-null   float64
 10  NEAR OCEAN          4128 non-null   float64
 11  NEAR BAY            4128 non-null   float64
 12  ISLAND              4128 non-null   float64
dtypes: float64(13)
memory usage: 419.3 KB


### 4.3 ATTRIBUTE COMPOSITION

In [10]:
def compose_bedrooms_per_room(data):
    data['bedrooms_per_room'] = data['total_bedrooms'] / data['total_rooms']
    return data

def compose_rooms_per_house(data):
    data['rooms_per_house'] = data['total_rooms'] / data['households']
    return data

In [11]:
X_train = compose_bedrooms_per_room(X_train)
X_train = compose_rooms_per_house(X_train)
X_test = compose_bedrooms_per_room(X_test)
X_test = compose_rooms_per_house(X_test)

### 4.4 FEATURE SCALING

In [12]:
def feature_scaling_standard_scaler(train, test):
    scaler = StandardScaler()
    scaler.fit(train)
    train = pd.DataFrame(scaler.transform(train), columns = train.columns)
    test = pd.DataFrame(scaler.transform(test), columns = test.columns)
    return train, test

In [13]:
X_train, X_test = feature_scaling_standard_scaler(X_train, X_test)

In [14]:
%store X_train X_test y_train y_test

Stored 'X_train' (DataFrame)
Stored 'X_test' (DataFrame)
Stored 'y_train' (Series)
Stored 'y_test' (Series)
