dataset:
http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors

In [2]:
class Housing:
    """k-nearest-neighbour lookup over the rows of ``housing.csv``.

    A query is a list holding one value per entry of ``SAMPLE_COLUMNS``.
    Entries set to ``None`` are left out of the comparison.  For every query
    the remaining columns are one-hot encoded (``ocean_proximity``), min-max
    scaled, and searched with ``NearestNeighbors``.
    """

    # Order in which ``findKneighbors`` expects the values of ``sample``.
    SAMPLE_COLUMNS = ('longitude', 'latitude', 'housing_median_age', 'total_rooms',
                      'total_bedrooms', 'population', 'households', 'median_income',
                      'median_house_value', 'ocean_proximity')

    def __init__(self):
        """Read ``housing.csv`` (relative to the working directory) into ``self.housing``."""
        self.housing = pd.read_csv('housing.csv', sep=',', decimal='.', encoding='utf-8')

    @staticmethod
    def _to_dense(matrix):
        """Return ``matrix`` as a dense ndarray; the encoder output may be sparse."""
        return matrix.toarray() if hasattr(matrix, 'toarray') else np.asarray(matrix)

    def _one_hot_frame(self, frame, fit):
        """One-hot encode ``frame.ocean_proximity`` into ``Ocean_<i>`` columns.

        The result carries ``frame.index`` so that concatenating it with
        ``frame`` lines rows up by label instead of by a fresh 0..n-1 index.
        """
        labels = frame['ocean_proximity'].values.reshape(-1, 1)
        encoded = self.onehot.fit_transform(labels) if fit else self.onehot.transform(labels)
        encoded = self._to_dense(encoded)
        names = ['Ocean_' + str(i) for i in range(encoded.shape[1])]
        return pd.DataFrame(encoded, columns=names, index=frame.index)

    def encode_fit_transform(self, database):
        """Fit the encoder and scaler on ``database`` and return the encoded frame.

        Steps: one-hot encode ``ocean_proximity`` (when the column is present),
        drop every row containing a NaN, then min-max scale all columns.

        Sets ``self.onehot`` (``None`` when there is no ``ocean_proximity``
        column), ``self.columns`` and ``self.scaler``.

        Returns:
            DataFrame of scaled features.  It keeps the index labels of the
            rows of ``database`` that survived the NaN drop.
        """
        if 'ocean_proximity' in database.columns:
            # The default encoder output is sparse and is densified in
            # ``_one_hot_frame``; this avoids the ``sparse`` keyword, which
            # newer scikit-learn releases no longer accept.
            self.onehot = OneHotEncoder(categories='auto')
            encoded_database = pd.concat(
                [database.drop(columns='ocean_proximity'),
                 self._one_hot_frame(database, fit=True)],
                axis=1)
        else:
            self.onehot = None
            encoded_database = database.copy()

        # Drop rows with any missing value.
        encoded_database = encoded_database.dropna(axis=0, how='any')

        # Column order used for fitting; ``transform`` reproduces it.
        self.columns = encoded_database.columns.tolist()

        # Normalise every column to [0, 1].
        self.scaler = MinMaxScaler()
        scaled = self.scaler.fit_transform(encoded_database[self.columns])
        return pd.DataFrame(scaled, columns=self.columns, index=encoded_database.index)

    def findKneighbors_fit(self, database):
        """Fit a ball-tree ``NearestNeighbors`` index on ``database`` (``self.nNeighbors``)."""
        self.nNeighbors = NearestNeighbors(n_neighbors=2, algorithm='ball_tree').fit(database)

    def transform(self, sample_df):
        """Encode and scale ``sample_df`` with the already fitted encoder and scaler.

        ``encode_fit_transform`` must have been called first.

        Args:
            sample_df: DataFrame with the same raw columns that were passed
                to ``encode_fit_transform``.

        Returns:
            Float DataFrame with the columns of ``self.columns``, in that order.
        """
        if self.onehot is not None:
            encoded_sample = pd.concat(
                [sample_df.drop(columns='ocean_proximity'),
                 self._one_hot_frame(sample_df, fit=False)],
                axis=1)
        else:
            encoded_sample = sample_df

        # Selecting ``self.columns`` fixes the feature order to the fitted one.
        scaled = np.asarray(self.scaler.transform(encoded_sample[self.columns]), dtype=float)
        return pd.DataFrame(scaled, columns=self.columns, index=encoded_sample.index)

    def findKneighbors(self, sample, n_kneighbors=5):
        """Return the ``n_kneighbors`` rows of ``self.housing`` closest to ``sample``.

        Args:
            sample: sequence with one value per entry of ``SAMPLE_COLUMNS``;
                ``None`` marks a column to ignore.  The sequence is not modified.
            n_kneighbors: number of neighbours to return.

        Returns:
            Rows of the original (unscaled) ``self.housing`` frame, nearest first.

        Raises:
            ValueError: if ``sample`` has the wrong length or only ``None`` values.
        """
        if len(sample) != len(self.SAMPLE_COLUMNS):
            raise ValueError('sample must contain %d values, got %d'
                             % (len(self.SAMPLE_COLUMNS), len(sample)))

        kept_columns = [name for name, value in zip(self.SAMPLE_COLUMNS, sample)
                        if value is not None]
        kept_values = [value for value in sample if value is not None]
        if not kept_columns:
            raise ValueError('sample must contain at least one non-None value')

        # One-row query frame and the matching column subset of the data.
        self.sample_df = pd.DataFrame([kept_values], columns=kept_columns)
        self.housing_ = self.housing[kept_columns].copy(deep=True)

        self.housing_encoded = self.encode_fit_transform(self.housing_)
        self.findKneighbors_fit(self.housing_encoded)

        self.encoded_sample = self.transform(self.sample_df)
        _, ind = self.nNeighbors.kneighbors(self.encoded_sample, n_kneighbors)

        # ``ind`` holds positions within ``housing_encoded``, which may have
        # lost rows to the NaN drop; translate them back to index labels
        # before looking the rows up in the original frame.
        neighbour_labels = self.housing_encoded.index[ind[0]]
        return self.housing.loc[neighbour_labels]

In [3]:
# Build the lookup object; the constructor reads housing.csv from the working directory.
house1 = Housing()

In [27]:
# Query on longitude, latitude and ocean proximity only; the None entries are left out of the search.
query_sample = [-125, 39, None, None, None, None, None, None, None, 'NEAR BAY']
returned = house1.findKneighbors(query_sample, n_kneighbors=5)
returned

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
9277,-122.58,38.12,13.0,5027.0,871.0,1912.0,770.0,4.9286,309500.0,NEAR BAY
9282,-122.58,38.1,22.0,11872.0,2300.0,5600.0,2200.0,4.6463,276300.0,NEAR BAY
9274,-122.57,38.11,24.0,2863.0,734.0,1583.0,682.0,3.1981,215300.0,NEAR BAY
9275,-122.57,38.11,32.0,3521.0,748.0,1706.0,723.0,3.4705,228600.0,NEAR BAY
9283,-122.58,38.08,27.0,10839.0,1637.0,4406.0,1623.0,5.615,285600.0,NEAR BAY


In [66]:
# Load the raw CSV into a standalone frame for the column experiments below.
housing = pd.read_csv(
    'housing.csv',
    sep=',',
    decimal='.',
    encoding='utf-8',
)

In [67]:
# Work on an independent copy so `housing` stays untouched (DataFrame.copy is deep by default).
new_housing = housing.copy()

In [89]:
# Divide the room and bedroom totals by households times a population factor
# (population / 2.5 for rooms, population / 4 for bedrooms).
household_count = new_housing['households']
population_count = new_housing['population']
new_housing['new_total_rooms'] = new_housing['total_rooms'] / (household_count * (population_count / 2.5))
new_housing['new_total_bedrooms'] = new_housing['total_bedrooms'] / (household_count * (population_count / 4))

In [90]:
# Round both derived ratios up to the next whole number.
for ratio_column in ('new_total_rooms', 'new_total_bedrooms'):
    new_housing[ratio_column] = np.ceil(new_housing[ratio_column])

In [91]:
# Show the single row with the largest derived bedroom value.
new_housing.nlargest(n=1, columns='new_total_bedrooms')

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,new_total_rooms,new_total_bedrooms
1914,-120.1,38.91,33.0,1561.0,282.0,30.0,11.0,1.875,500001.0,INLAND,12.0,4.0


In [101]:
# Derived value: twice the median house value, minus 15000.
new_housing['new_median_house_value'] = 2 * new_housing['median_house_value'] - 15000

In [105]:
# Show the five rows with the smallest derived house value.
new_housing.nsmallest(n=5, columns='new_median_house_value')

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,new_total_rooms,new_total_bedrooms,new_median_house_value
2521,-122.74,39.71,16.0,255.0,73.0,85.0,38.0,1.6607,14999.0,INLAND,1.0,1.0,14998.0
2799,-117.02,36.4,19.0,619.0,239.0,490.0,164.0,2.1,14999.0,INLAND,1.0,1.0,14998.0
9188,-117.86,34.24,52.0,803.0,267.0,628.0,225.0,4.1932,14999.0,INLAND,1.0,1.0,14998.0
19802,-123.17,40.31,36.0,98.0,28.0,18.0,8.0,0.536,14999.0,INLAND,2.0,1.0,14998.0
5887,-118.33,34.15,39.0,493.0,168.0,259.0,138.0,2.3667,17500.0,<1H OCEAN,1.0,1.0,20000.0
