In [1]:
import mcnultymod
import pandas as pd
import numpy as np

from geopy.distance import vincenty

In [2]:
# Load the closed-auction records used by every cell below.
# NOTE(review): unpickling executes arbitrary code -- only load this file if it comes from a trusted source.
auctions = pd.read_pickle('../data/auctionsclosed.pkl')

Two of the feature transformations need to be "fit" before they can be applied: weighting by proximity and binning neighborhoods. Both depend on relationships within the training set, so each is fit on the training set first and then used to transform both the training and test sets.

## Neighborhood Weights

In [None]:
class BinNeighborhoods(object):
    """
    Bins neighborhoods according to density of investors, with bin bounds provided
    during fit.
    ---
    Attributes set by fit():
    bin_bounds_: lower bounds for each bin, in the order given to fit() (list)
    nhoods_: one row per neighborhood with columns <nh_col>, 'n_count', 
        'inv_ratio' and 'n_bin' (df)
    fit_: True once fit() has completed (bool)
    """
    
    def __init__(self, nh_col='neighborhood'):
        self.nh_col_ = nh_col
        self.bin_bounds_ = None
        self.fit_ = False


    def _nhood_investor_ratios(self, fit_df):
        """
        Calculates the ratio of investors to all buyers per neighborhood, appends as
        a new 'inv_ratio' column to the neighborhoods dataframe (self.nhoods_).
        ---
        IN
        fit_df: main auction dataframe; must contain the neighborhood column and a
            'purchasertype' column (df)
        OUT
        None -- self.nhoods_ is updated in place
        """

        if not self.fit_:
            print("ERROR: You haven't fit the object, do that first!")
            return None
        
        # The mean of a boolean mask per group is the investor share of that
        # neighborhood; one groupby replaces a full-table scan per neighborhood.
        is_investor = fit_df['purchasertype'] == 'Investor'
        ratio_by_nhood = is_investor.groupby(fit_df[self.nh_col_]).mean()
        
        self.nhoods_['inv_ratio'] = (self.nhoods_[self.nh_col_]
                                     .map(ratio_by_nhood)
                                     .astype(float))

        
    def _bin_neighborhoods(self):
        """
        Assigns each neighborhood a bin number based on ratio of investors, stored
        in a new 'n_bin' column of self.nhoods_.
        ---
        A neighborhood gets the position (0-based) of the last bound in
        self.bin_bounds_ that its ratio strictly exceeds; neighborhoods that exceed
        no bound keep the default bin 0.
        ---
        OUT
        None -- self.nhoods_ is updated in place
        """
        
        if not self.fit_:
            print("ERROR: You haven't fit the object, do that first!")
            return None
        
        self.nhoods_['n_bin'] = 0

        # Later bounds overwrite earlier ones, so with ascending bounds each
        # neighborhood ends up in the highest bin whose lower bound it exceeds.
        for n, val in enumerate(self.bin_bounds_):
            self.nhoods_.loc[self.nhoods_.inv_ratio > val, 'n_bin'] = n
        
        
    def fit(self, fit_df, lower_bin_bounds=[0,0.1,0.25,0.33,0.45,0.75]):
        """
        Builds the neighborhood table (count, investor ratio, bin) from fit_df.
        Can only be run once per object; a second call prints an error and
        changes nothing.
        ---
        IN
        fit_df: main auction dataframe, containing the neighborhood column and a
            'purchasertype' column (df)
        lower_bin_bounds: lower bound of investor ratio for each bin, normally in
            ascending order (list of floats)
        OUT
        None
        """
        
        if self.fit_:
            print("ERROR: Object already fit!")
            return None
        
        counts = fit_df[self.nh_col_].value_counts()
        
        # The private helpers check fit_, so it is set before calling them and
        # rolled back if anything fails, leaving the object re-fittable.
        self.fit_ = True
        # Copy so that editing bin_bounds_ later cannot alter the shared default list.
        self.bin_bounds_ = list(lower_bin_bounds)
        
        try:
            # Built explicitly instead of value_counts().reset_index().rename(...):
            # the column names produced by reset_index() changed in pandas 2.0.
            self.nhoods_ = pd.DataFrame(
                {self.nh_col_: counts.index, 'n_count': counts.values},
                columns=[self.nh_col_, 'n_count']
            )
            self._nhood_investor_ratios(fit_df)
            self._bin_neighborhoods()
        except Exception:
            self.fit_ = False
            self.bin_bounds_ = None
            raise

        
    def merge_bins(self, df, how_merge='left'):
        """
        Adds the fitted 'n_bin' column to a dataframe by merging on the
        neighborhood column.
        ---
        IN
        df: dataframe containing the neighborhood column (df)
        how_merge: merge type passed to pd.merge; with 'left', rows whose 
            neighborhood was not seen during fit get a missing n_bin (str)
        OUT
        df: new dataframe with the n_bin column added, or None if the object has
            not been fit (df)
        """
        
        if not self.fit_:
            print("You haven't fit the object, do that first!")
            return None            
        
        bins = self.nhoods_[[self.nh_col_, 'n_bin']]
        return pd.merge(df, bins, how=how_merge, on=self.nh_col_)

In [None]:
# Create an unfitted binner using the default neighborhood column ('neighborhood').
try_the_bin = BinNeighborhoods()

In [None]:
# Fit on the full auctions frame with the default lower bin bounds.
try_the_bin.fit(auctions)

In [None]:
# Inspect the lower bin bounds stored by fit().
try_the_bin.bin_bounds_

In [None]:
# Merge the fitted n_bin column onto the auctions frame (left merge on the neighborhood column).
auctions_new = try_the_bin.merge_bins(auctions)

In [None]:
# Fitting an already-fitted object should only print "ERROR: Object already fit!" and change nothing.
try_the_bin.fit(auctions)

## Proximity Weights/Counts

In [None]:
class WeightByProximity(object):
    """
    Includes functions to assign a weight to a particular location based on its
    proximity to a list of other locations based on latitude/longitude. Can be 
    set to return a binary weight if any other locations fall within a certain
    threshold, or the count of other locations that fall within a certain
    threshold. The fit() method must be run before using the prox_weight()
    method.
    ---
    Attributes set by fit():
    match_col_: name of the column used to skip a location's own record (str)
    lat_filter_, lng_filter_: per-mile lat/lng deltas for the coarse filter (float)
    match_df_: copy of the match, latitude and longitude columns of the fit
        dataframe (df)
    fit_: True once fit() has completed (bool)
    """
    
    def __init__(self, lat_col='latitude', lng_col='longitude'):
        self.lat_col_ = lat_col
        self.lng_col_ = lng_col
        self.fit_ = False

        
    def fit(self, fit_df, match_column, lat_filter=0.015, lng_filter=0.02):
        """
        Loads table containing latitude/longitude information to be used to
        provide proximity weights for other table(s). Can only be run once per
        object; a second call prints an error and changes nothing.
        ---
        IN
        fit_df: dataframe containing latitude and longitude columns to be used 
            in calculation of weights (df)
        match_column: name of a column appearing in both dfs; rows whose value 
            in this column equals that of the location being weighted are 
            skipped. None (no filtering) is not supported yet. (str)
        lat_filter: latitude delta that should be close to 1 mi, or slightly 
            larger -- used to create a sub-table for precise calculations.
            Preset works for a latitude of ~43. (float)
        lng_filter: longitude delta close to 1 mi, or slightly larger (float)
        OUT
        None
        RAISES
        KeyError: if match_column or the configured latitude/longitude columns
            are missing from fit_df
        """

        if self.fit_:
            print("ERROR: Object already fit!")
            return None
        
        # Use the column names given to __init__ (not hard-coded ones) so that
        # prox_weight() finds them. Explicit selection fails fast on a missing
        # column, whereas DataFrame.filter() would silently drop it.
        keep_cols = [match_column, self.lat_col_, self.lng_col_]
        match_df = fit_df[keep_cols].copy()
        
        self.match_col_ = match_column
        self.lat_filter_ = lat_filter
        self.lng_filter_ = lng_filter
        self.match_df_ = match_df
        # Only flag as fit once everything above has succeeded.
        self.fit_ = True
        
        
    def prox_weight(self, current, threshold, mode='binary', coarse_filter=True):
        """
        Calculates proximity weight of a location.
        ---
        IN
        current: row of dataframe for which to calculate proximity weight
        threshold: length of radius (in miles) for comparison (float)
        mode: 'binary' or 'cumulative' -- if 'binary', will return 0 if no location
            matches found within threshold, and 1 if at least one. If 'cumulative', 
            will return the number of matches found within threshold. (str)
        coarse_filter: cuts down on computation time by filtering lat/lng values
            outside of a certain perimeter from the current lat/lng. STILL UNSTABLE.
            (bool)
        OUT
        weight: proximity weight for location, or None if the object has not
            been fit (int)
        RAISES
        ValueError: if mode is neither 'binary' nor 'cumulative'
        """
        
        if not self.fit_:
            print("ERROR: You haven't fit the object, do that first.")
            return None
        
        # An unknown mode used to fall through every branch and return 0, which
        # is indistinguishable from "nothing nearby".
        if mode not in ('binary', 'cumulative'):
            raise ValueError(
                "mode must be 'binary' or 'cumulative', got {!r}".format(mode))
        
        lat_in = current[self.lat_col_]
        lng_in = current[self.lng_col_]
        candidates = self.match_df_
        
        if coarse_filter:
            # Bounding box around the current location, scaled from the
            # per-mile deltas; strict inequalities on all four sides.
            lat_lim = threshold * self.lat_filter_
            lng_lim = threshold * self.lng_filter_
            lats = candidates[self.lat_col_]
            lngs = candidates[self.lng_col_]
            candidates = candidates[
                (lats > (lat_in - lat_lim)) &
                (lats < (lat_in + lat_lim)) &
                (lngs > (lng_in - lng_lim)) &
                (lngs < (lng_in + lng_lim))
            ]

        # Skip records sharing the current row's match value (i.e. itself).
        candidates = candidates[
            candidates[self.match_col_] != current[self.match_col_]]

        weight = 0
        for lat, lng in zip(candidates[self.lat_col_], 
                            candidates[self.lng_col_]):
            # NOTE(review): vincenty was removed in geopy 2.0; 
            # geopy.distance.geodesic is the replacement if the notebook's 
            # import is updated.
            dist = vincenty((lat, lng), (lat_in, lng_in)).miles
            
            if dist < threshold:
                if mode == 'binary':
                    # One hit is enough, stop computing distances.
                    return 1
                weight += 1

        return weight

In [None]:
# Create an unfitted weighter using the default 'latitude'/'longitude' column names.
prox_test = WeightByProximity()

In [3]:
# Small 100-row sample to keep the pairwise distance calculations fast.
# random_state is fixed so the sample, and every weight derived from it,
# is reproducible across kernel restarts.
auc_sub = (auctions
    .filter(['address', 'parcelid', 'neighborhood', 'latitude', 'longitude'])
    .sample(100, random_state=42))

In [None]:
# Fit on the sample; parcelid is the column used to skip a row's own record when weighting.
prox_test.fit(auc_sub, match_column='parcelid')

In [None]:
# %%timeit
t = 0.5  # search radius in miles
# Binary proximity weight for every sampled row, with the coarse lat/lng pre-filter on.
auc_sub['bin_weight_t'] = auc_sub.apply(
    prox_test.prox_weight, axis=1, args=(t,), coarse_filter=True)

In [None]:
# %%timeit
# Same weights with the coarse pre-filter off, i.e. every fitted row is distance-checked.
auc_sub['bin_weight_f'] = auc_sub.apply(
    prox_test.prox_weight, axis=1, args=(t,), coarse_filter=False)

In [None]:
# Distribution of the weights computed with the coarse filter on.
auc_sub.bin_weight_t.value_counts()

In [None]:
# Distribution of the weights computed with the coarse filter off, for comparison with the filtered run.
auc_sub.bin_weight_f.value_counts()

## Testing the Module

In [4]:
# Preview the sampled rows before any weights are added.
auc_sub.head()

Unnamed: 0,address,parcelid,neighborhood,latitude,longitude
1081,2451 Parker,17009082.0,Islandview,42.361455,-83.001227
241,18304 Grayfield,22120385.0,Seven Mile-Rouge,42.423264,-83.269373
5,17242 Beland,17015242.0,Mount Olivet,42.42192,-83.008682
434,17125 Murray Hill,22060379.0,College Park,42.416325,-83.208277
859,6902 Winthrop,22051514.0,Warren Ave Community,42.342924,-83.19709


In [5]:
# Use the WeightByProximity class exposed by the mcnultymod module rather than the one defined in this notebook.
pweighter = mcnultymod.WeightByProximity()

In [6]:
# Fit the module's weighter on the same sample, matching on parcelid.
pweighter.fit(auc_sub, match_column='parcelid')

In [7]:
# Proximity weight per row from the module's weighter, with 0.2 passed as the threshold.
auc_sub['bin_weight'] = auc_sub.apply(
    pweighter.prox_weight, axis=1, args=(0.2,))

In [8]:
# Preview the sample again, now including the bin_weight column.
auc_sub.head()

Unnamed: 0,address,parcelid,neighborhood,latitude,longitude,bin_weight
1081,2451 Parker,17009082.0,Islandview,42.361455,-83.001227,0
241,18304 Grayfield,22120385.0,Seven Mile-Rouge,42.423264,-83.269373,0
5,17242 Beland,17015242.0,Mount Olivet,42.42192,-83.008682,0
434,17125 Murray Hill,22060379.0,College Park,42.416325,-83.208277,0
859,6902 Winthrop,22051514.0,Warren Ave Community,42.342924,-83.19709,1
