In [19]:
import numpy as np
import pandas as pd
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score as r2, mean_absolute_error as mae, mean_squared_error as mse
from sklearn.model_selection import KFold, GridSearchCV

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [20]:
# Input: CSV that this notebook loads with pd.read_csv(..., sep=';').
PREPARED_DATASET_PATH = './housing_prepared.csv'
# Output: CSV that this notebook writes with df.to_csv after scaling the features.
TRAIN_DATASET_PATH = '../housing_train.csv'

In [21]:
# Load the prepared dataset; the file is read as semicolon-delimited.
df = pd.read_csv(filepath_or_buffer=PREPARED_DATASET_PATH, sep=';')

# Preview the first four rows.
df.head(n=4)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,housing_median_age_nan,ocean_proximity_nan,longitude_outlier,latitude_outlier,population_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,0,0.365909,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,0,0.338217,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,0,0.338105,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,0,0.437991,0,0,0,1,0


In [22]:
# Name of the column to predict; every other column is treated as a feature.
target_name = 'median_house_value'

# All column names except the target. list.remove raises ValueError if the
# target column is missing from df.
feature_names = list(df.columns)
feature_names.remove(target_name)

In [23]:
# Feature columns stored as float64 or int64; these are the ones that get
# standardized below.
# NOTE(review): the int64 filter also matches the 0/1 indicator and dummy
# columns listed in the output — confirm that scaling them is intended.
feature_names_for_stand = list(
    df.loc[:, feature_names]
      .select_dtypes(include=['float64', 'int64'])
      .columns
)
feature_names_for_stand

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'housing_median_age_nan',
 'ocean_proximity_nan',
 'longitude_outlier',
 'latitude_outlier',
 'population_per_room',
 '<1H OCEAN',
 'INLAND',
 'ISLAND',
 'NEAR BAY',
 'NEAR OCEAN']

In [24]:
# Fit a StandardScaler on the selected feature columns and transform those same
# columns; stand_features holds the scaled values in the column order of
# feature_names_for_stand.
# NOTE(review): the scaler is fit on every row of df, before any train/test
# split visible in this notebook — confirm this is intended, and that the
# fitted scaler is saved somewhere if new data must be scaled the same way.
scaler = StandardScaler()
stand_features = scaler.fit(df[feature_names_for_stand]).transform(df[feature_names_for_stand])

In [25]:
# Write the standardized values back over the original feature columns.
# The replacement frame reuses df's own index: assigning a DataFrame to columns
# aligns rows by index label, so a freshly built default 0..n-1 index only
# lines up while df still carries an untouched RangeIndex. If df were ever
# filtered or re-indexed before this cell, the unmatched rows would silently
# become NaN.
df[feature_names_for_stand] = pd.DataFrame(
    stand_features,
    index=df.index,
    columns=feature_names_for_stand,
)

In [26]:
# Preview the first five rows after the feature columns were overwritten with
# their standardized values.
df.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,housing_median_age_nan,ocean_proximity_nan,longitude_outlier,latitude_outlier,population_per_room,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-1.330277,1.036473,1.025853,-1.137723,-1.348849,-0.982678,-0.97864,2.959952,452600.0,-0.189052,-0.034234,-0.021407,-0.01009,-0.560393,-0.881005,-0.704248,-0.015954,2.901549,-0.376027
1,-1.325291,1.027171,-0.603438,3.450773,2.196127,0.855759,1.661028,2.944799,358500.0,-0.189052,-0.034234,-0.021407,-0.01009,-0.618133,-0.881005,-0.704248,-0.015954,2.901549,-0.376027
2,-1.335263,1.022519,1.921963,-0.704623,-1.127515,-0.828811,-0.845613,2.280068,352100.0,-0.189052,-0.034234,-0.021407,-0.01009,-0.618366,-0.881005,-0.704248,-0.015954,2.901549,-0.376027
3,-1.340249,1.022519,1.921963,-0.847022,-0.964235,-0.773985,-0.736062,1.25222,341300.0,-0.189052,-0.034234,-0.021407,-0.01009,-0.410101,-0.881005,-0.704248,-0.015954,2.901549,-0.376027
4,-1.340249,1.022519,1.921963,-0.586572,-0.800956,-0.767795,-0.631727,0.108107,342200.0,-0.189052,-0.034234,-0.021407,-0.01009,-0.599267,-0.881005,-0.704248,-0.015954,2.901549,-0.376027


In [27]:
# Save the scaled dataset without the row index.
# No separator is passed, so the output uses to_csv's default comma delimiter,
# whereas the input file was read with sep=';'.
df.to_csv(
    path_or_buf=TRAIN_DATASET_PATH,
    encoding='utf-8',
    index=False,
)