## 3.1 Normalizing features

In [1]:
import pickle
import pandas as pd
import numpy as np

In [2]:
def get_mean(x):
    """Return the arithmetic mean of ``x``.

    The built-in ``sum`` and ``len`` are used, so ``x`` is reduced along its
    first axis: a flat sequence yields a single number, while a 2-D NumPy
    array yields one mean per column.
    """
    total = sum(x)
    count = len(x)
    return total / count

In [3]:
def get_std(x, mean):
    """Return the population standard deviation of ``x`` around ``mean``.

    The squared deviations are summed along the first axis and divided by
    ``len(x)`` (not ``len(x) - 1``), so this is the population form.

    Parameters:
        x: array-like of numbers; lists and tuples are accepted as well as
            NumPy arrays. A 2-D input is reduced column by column.
        mean: the value(s) to centre on, broadcastable against ``x``
            (typically the result of ``get_mean(x)``).

    Returns:
        A scalar for 1-D input, or one value per column for 2-D input.
    """
    # Coerce first: a plain list does not support ``list - number``.
    values = np.asarray(x)
    deviation = values - mean
    return (sum(deviation * deviation) / len(values)) ** 0.5

In [4]:
def get_mean_and_std(x):
    """Return ``(mean, std)`` for ``x``.

    The mean is computed once with ``get_mean`` and then passed to
    ``get_std`` so the deviation is taken around that same value.
    """
    center = get_mean(x)
    return center, get_std(x, center)

In [5]:
def normalization(x, mean, std):
    """Standardize ``x``: subtract ``mean``, then divide by ``std``."""
    centered = x - mean
    return centered / std

In [6]:
def proc_data(data_path,
              normalized_path="normalized.txt",
              stats_path="mean_std.pk"):
    """
    Standardize the housing data and persist the result and its statistics.

    The CSV at ``data_path`` is read without a header row. Every column is
    standardized with its own mean and standard deviation, and the result is
    written as header-less CSV to ``normalized_path``. The statistics of the
    first three columns are pickled to ``stats_path`` as a dict keyed by
    'area', 'n_bedroom' and 'price' (column 0, 1 and 2 respectively), each
    holding a ``{'mean': ..., 'std': ...}`` dict.

    Parameters:
        data_path: path of the raw CSV file.
        normalized_path: where to write the standardized data
            (defaults to 'normalized.txt').
        stats_path: where to pickle the normalization statistics
            (defaults to 'mean_std.pk').

    Returns:
        None. The function is used for its file side effects.

    Raises:
        IndexError: if the data has fewer than three columns.
    """
    # standardize the housing data, save the normalized data
    data = pd.read_csv(data_path, header=None).values
    data_mean, data_std = get_mean_and_std(data)
    data_normalized = normalization(data, data_mean, data_std)
    pd.DataFrame(data_normalized).to_csv(normalized_path,
                                         header=False, index=False)

    # store the values used for normalization; built before the file is
    # opened so a short input fails without leaving a truncated pickle behind
    column_names = ('area', 'n_bedroom', 'price')
    stats = {
        name: {'mean': data_mean[i], 'std': data_std[i]}
        for i, name in enumerate(column_names)
    }

    # the context manager closes the file even if pickling raises
    with open(stats_path, "wb") as f:
        pickle.dump(stats, f)

In [7]:
# Run the pipeline on the raw housing file; this writes 'normalized.txt'
# and 'mean_std.pk' into the working directory.
proc_data(data_path='housing.txt')

In [8]:
# Read back the statistics pickled by proc_data to check what was stored.
with open('mean_std.pk','rb') as read_file:
    # NOTE: despite its name, `df` holds whatever object was pickled (the
    # nested dict built in proc_data), not a pandas DataFrame.
    df = pickle.load(read_file)

In [9]:
# Bare expression as the last statement of the cell: displays the loaded
# normalization statistics.
df

{'area': {'mean': 2000.6808510638298, 'std': 786.2026187430467},
 'n_bedroom': {'mean': 3.1702127659574466, 'std': 0.7528428090618782},
 'price': {'mean': 340412.6595744681, 'std': 123702.53600614739}}