In [1]:
#building a custom transformer to calculate distance between a given coordinate and california

In [2]:
#importing data
from sklearn.datasets import fetch_california_housing


#instantiating
data = fetch_california_housing()

#initializing
X = data['data']
y = data['target']

In [23]:
#importing the estimators
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

#building a class

class GetDistanceFrom(BaseEstimator, TransformerMixin):
    
    def __init__(self, coordinates):
        self.coordinates = coordinates 
        
    def fit(self,X,y=None):
        return self
    
    def transform(self, X):
        latitude  = X[:,0]
        longitude = X[:,1]
        
        distance = np.sqrt((latitude - self.coordinates[0])**2 + (longitude - self.coordinates[1])**2)
        distance = distance.reshape(-1,1)     #reshaping into 2D array
        return distance

In [24]:
print(data['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block
        - HouseAge      median house age in block
        - AveRooms      average number of rooms
        - AveBedrms     average number of bedrooms
        - Population    block population
        - AveOccup      average house occupancy
        - Latitude      house block latitude
        - Longitude     house block longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
http://lib.stat.cmu.edu/datasets/

The target variable is the median house value for California districts.

This dataset was derived from the 1990 U.S. census, using one row per census
block group. A block group is the smallest geographical unit for which the U.S.
Census Bur

In [25]:
#the latitude and longitude are the two features of interest and distance is to be calculated using these features 
#using Euclidean distanc formula.


In [26]:
#interested data

X[:,-2:]

array([[  37.88, -122.23],
       [  37.86, -122.22],
       [  37.85, -122.24],
       ...,
       [  39.43, -121.22],
       [  39.43, -121.32],
       [  39.37, -121.24]])

In [29]:
#calculating distance 
#LosAngeles coordinates are 34N, 118W

LA_coordinates = [34,-118]

dist_from_LA = GetDistanceFrom(LA_coordinates)

distance = dist_from_LA.fit_transform(X[:,-2:])


In [28]:
print(distance)
print(distance.shape)

[[5.73997387]
 [5.71909084]
 [5.72713716]
 ...
 [6.31294701]
 [6.36453455]
 [6.27172225]]
(20640, 1)


In [30]:
#since only the last two columns are required;hence there arises a need to automate or code for dropping the 
#possibility of other columns.

class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, indexed_cols):
        self.indexed_cols = indexed_cols
        
    def fit(self, X,y):
        return self
    
    def transform(self,X):
        return X[:, self.indexed_cols]

In [32]:
SF_coordinates = [37,-122]
dist_from_SF = GetDistanceFrom(SF_coordinates)
dist_from_SF.fit_transform(X[:,-2:])

array([[0.90956033],
       [0.88769364],
       [0.8832327 ],
       ...,
       [2.55211677],
       [2.52335095],
       [2.48887525]])

In [33]:
#automation is necessary in this step since not always the clumns can be chosen and resuts be obtained.
#hence utilising FeatureUnion and then encapsulating with the help of Pipeline.

In [35]:
#importing necessary libraries
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.linear_model import LinearRegression

In [37]:
coordinates_LA = [34,-118]
coordinates_SF = [37,-122]

distance_LA = GetDistanceFrom(coordinates_LA)
distance_SF = GetDistanceFrom(coordinates_SF)

drop = DropColumns([0,1,2,3,4,5])

#making a union

union = FeatureUnion([('drop', drop), ('LA', distance_LA), ('SF', distance_SF)])

#making a pipeline
pipe = Pipeline([('union',union), ('regressor',LinearRegression())])

#fitting data
pipe.fit(X,y)


Pipeline(memory=None,
         steps=[('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('drop',
                                                 DropColumns(indexed_cols=[0, 1,
                                                                           2, 3,
                                                                           4,
                                                                           5])),
                                                ('LA',
                                                 GetDistanceFrom(coordinates=[34,
                                                                              -118])),
                                                ('SF',
                                                 GetDistanceFrom(coordinates=[37,
                                                                              -122]))],
                              transformer_weights=None, verbose=False)),
         

In [41]:
a = pipe.predict(X)

In [39]:
#obtaining scores
pipe.score(X,y)

0.5492141613373734