In [1]:
# KNN implementation
# Author: Felipe Bras, https://github.com/felipebras/
# Date: 14-Sep-2022

In [2]:
from collections import Counter
from typing import TypeVar
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
def calc_euclid_dist(a: TypeVar('T'), b:TypeVar('T')) -> float:
    """Calculates euclidean distance between vectors a and b.
    """
    
    assert len(a) == len(b), "Vectors must have the same number of dimensions."
    
    _a, _b = np.asarray(a), np.asarray(b)
    
    return np.sqrt(np.sum((_a - _b)**2))

In [4]:
class Knn:
    """"K-nearest neighbours class implementation. """
    
    def __init__(self, k=3):
        assert k % 2 != 0, "Hyperparameter k should be odd."
        self.k = k
        
    def fit(self, X: TypeVar('T'), y: TypeVar('T')):
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)
        
        assert self.y_train.ndim == 1, "Labels should be a 1-dimensional array."
        assert self.X_train.shape[0] == self.y_train.shape[0], "Check shape of training data."
    
    def predict(self, x: TypeVar('T')):
        x_pred = np.asarray(x)
        assert x_pred.ndim == 1, "Check only one point at a time."
        assert x_pred.shape[0] == self.X_train.shape[1], "Incorrect dimension of the point to be classified."
        
        label_indexes = self._order_distances(x_pred)
        order = Counter([self.y_train[_t] for _t in label_indexes])
        
        return order.most_common(1)[0][0]
    
    def _order_distances(self, x_pred: TypeVar('T')) -> list[int]:
        distances = [calc_euclid_dist(x_pred, _x_train) for _x_train in self.X_train]
        dist_indexes = np.argsort(distances)
        
        return list(dist_indexes)[:self.k]
        

In [5]:
nyc = pd.read_csv("nyc-rolling-sales.csv")
nyc

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,...,5,0,5,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00
1,5,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,...,28,3,31,4616,18690,1900,2,C7,-,2016-12-14 00:00:00
2,6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,...,16,1,17,2212,7803,1900,2,C7,-,2016-12-09 00:00:00
3,7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,...,10,0,10,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00
4,8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,...,6,0,6,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84543,8409,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7349,34,,B9,37 QUAIL LANE,...,2,0,2,2400,2575,1998,1,B9,450000,2016-11-28 00:00:00
84544,8410,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7349,78,,B9,32 PHEASANT LANE,...,2,0,2,2498,2377,1998,1,B9,550000,2017-04-21 00:00:00
84545,8411,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7351,60,,B2,49 PITNEY AVENUE,...,2,0,2,4000,1496,1925,1,B2,460000,2017-07-05 00:00:00
84546,8412,5,WOODROW,22 STORE BUILDINGS,4,7100,28,,K6,2730 ARTHUR KILL ROAD,...,0,7,7,208033,64117,2001,4,K6,11693337,2016-12-21 00:00:00


In [6]:
nyc.sample(3, random_state=314)

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
63990,14592,4,JACKSON HEIGHTS,10 COOPS - ELEVATOR APARTMENTS,2,1245,30,,D4,"72-17 34TH AVENUE, 2E",...,0,0,0,-,-,1937,2,D4,440000,2016-12-13 00:00:00
49604,206,4,ARVERNE,31 COMMERCIAL VACANT LAND,4,16005,37,,V1,62-27 DECOSTA AVENUE,...,0,0,0,-,-,0,4,V1,460356,2017-02-14 00:00:00
38969,13618,3,FLATBUSH-NORTH,07 RENTALS - WALKUP APARTMENTS,2A,4605,63,,C3,109 EAST 51ST STREET,...,4,0,4,2700,3608,1930,2,C3,575000,2016-09-26 00:00:00


In [7]:
nyc.columns

Index(['Unnamed: 0', 'BOROUGH', 'NEIGHBORHOOD', 'BUILDING CLASS CATEGORY',
       'TAX CLASS AT PRESENT', 'BLOCK', 'LOT', 'EASE-MENT',
       'BUILDING CLASS AT PRESENT', 'ADDRESS', 'APARTMENT NUMBER', 'ZIP CODE',
       'RESIDENTIAL UNITS', 'COMMERCIAL UNITS', 'TOTAL UNITS',
       'LAND SQUARE FEET', 'GROSS SQUARE FEET', 'YEAR BUILT',
       'TAX CLASS AT TIME OF SALE', 'BUILDING CLASS AT TIME OF SALE',
       'SALE PRICE', 'SALE DATE'],
      dtype='object')

In [8]:
nyc['BUILDING CLASS CATEGORY'].unique()

array(['07 RENTALS - WALKUP APARTMENTS             ',
       '08 RENTALS - ELEVATOR APARTMENTS           ',
       '09 COOPS - WALKUP APARTMENTS               ',
       '10 COOPS - ELEVATOR APARTMENTS             ',
       '11A CONDO-RENTALS                           ',
       '12 CONDOS - WALKUP APARTMENTS              ',
       '13 CONDOS - ELEVATOR APARTMENTS            ',
       '14 RENTALS - 4-10 UNIT                     ',
       '15 CONDOS - 2-10 UNIT RESIDENTIAL          ',
       '16 CONDOS - 2-10 UNIT WITH COMMERCIAL UNIT ',
       '17 CONDO COOPS                             ',
       '22 STORE BUILDINGS                         ',
       '37 RELIGIOUS FACILITIES                    ',
       '42 CONDO CULTURAL/MEDICAL/EDUCATIONAL/ETC  ',
       '46 CONDO STORE BUILDINGS                   ',
       '47 CONDO NON-BUSINESS STORAGE              ',
       '01 ONE FAMILY DWELLINGS                    ',
       '02 TWO FAMILY DWELLINGS                    ',
       '03 THREE FAMILY DWE

In [9]:
(nyc[
    (nyc['BUILDING CLASS CATEGORY'].isin(['07 RENTALS - WALKUP APARTMENTS             ', '08 RENTALS - ELEVATOR APARTMENTS           '])
    &
    (nyc['SALE PRICE'] != 0))
]
)

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,...,5,0,5,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00
1,5,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,...,28,3,31,4616,18690,1900,2,C7,-,2016-12-14 00:00:00
2,6,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,...,16,1,17,2212,7803,1900,2,C7,-,2016-12-09 00:00:00
3,7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,...,10,0,10,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00
4,8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,...,6,0,6,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83861,7727,5,WEST NEW BRIGHTON,07 RENTALS - WALKUP APARTMENTS,2A,208,20,,C3,47 CAROLINE STREET,...,4,0,4,7520,3040,1931,2,C3,512000,2017-03-09 00:00:00
83862,7728,5,WEST NEW BRIGHTON,07 RENTALS - WALKUP APARTMENTS,2A,219,15,,C3,618 DELAFIELD AVENUE,...,4,0,4,9042,6190,1899,2,C3,900000,2017-06-19 00:00:00
83863,7729,5,WEST NEW BRIGHTON,07 RENTALS - WALKUP APARTMENTS,2A,226,46,,C3,11 DISOSWAY PLACE,...,4,0,4,13540,5300,1933,2,C3,-,2016-10-03 00:00:00
83864,7730,5,WEST NEW BRIGHTON,07 RENTALS - WALKUP APARTMENTS,2A,235,11,,C3,187 DUBOIS AVENUE,...,4,0,4,5000,1444,1917,2,C3,-,2016-10-26 00:00:00


In [10]:
nyc['SALE PRICE 2'] = nyc['SALE PRICE'].apply(lambda t: float(t) if t != ' -  ' else -10000)

In [11]:
nyc['GROSS SQUARE FEET 2'] = nyc['GROSS SQUARE FEET'].apply(lambda t: float(t) if t != ' -  ' else -10000)

In [12]:
nyc['USD per SQFT'] = nyc['SALE PRICE 2'] / nyc['GROSS SQUARE FEET 2']

In [13]:
nyc[nyc['USD per SQFT'] > 1.0]

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,SALE PRICE 2,GROSS SQUARE FEET 2,USD per SQFT
0,4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,...,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00,6625000.0,6440.0,1028.726708
3,7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,...,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00,3936272.0,6794.0,579.374742
4,8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,...,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00,8000000.0,4615.0,1733.477790
6,10,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,406,32,,C4,210 AVENUE B,...,1750,4226,1920,2,C4,3192840,2016-09-23 00:00:00,3192840.0,4226.0,755.522953
9,13,1,ALPHABET CITY,08 RENTALS - ELEVATOR APARTMENTS,2,387,153,,D9,629 EAST 5TH STREET,...,4489,18523,1920,2,D9,16232000,2016-11-07 00:00:00,16232000.0,18523.0,876.315932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84543,8409,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7349,34,,B9,37 QUAIL LANE,...,2400,2575,1998,1,B9,450000,2016-11-28 00:00:00,450000.0,2575.0,174.757282
84544,8410,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7349,78,,B9,32 PHEASANT LANE,...,2498,2377,1998,1,B9,550000,2017-04-21 00:00:00,550000.0,2377.0,231.384098
84545,8411,5,WOODROW,02 TWO FAMILY DWELLINGS,1,7351,60,,B2,49 PITNEY AVENUE,...,4000,1496,1925,1,B2,460000,2017-07-05 00:00:00,460000.0,1496.0,307.486631
84546,8412,5,WOODROW,22 STORE BUILDINGS,4,7100,28,,K6,2730 ARTHUR KILL ROAD,...,208033,64117,2001,4,K6,11693337,2016-12-21 00:00:00,11693337.0,64117.0,182.374986


In [14]:
nyc['YEAR BUILT'].unique()

array([1900, 1913, 1920, 1910, 2009, 1925, 1902, 1928, 1930, 1935, 1937,
       1915, 1950, 1929, 1901, 1940, 2005,    0, 1989, 2014, 2008, 1965,
       2013, 2003, 2006, 2007, 1951, 1899, 1850, 1905, 1864, 1917, 1911,
       1983, 1926, 1963, 1960, 1889, 1898, 1939, 1938, 1927, 1909, 1958,
       1904, 1907, 1987, 1931, 1984, 1948, 2004, 1918, 1875, 2012, 1973,
       2011, 1922, 2001, 1932, 1980, 1908, 1953, 1906, 2015, 1946, 1921,
       2010, 1954, 1111, 1924, 1990, 1890, 1991, 1988, 1895, 2016, 1957,
       1986, 1966, 1998, 1870, 1923, 1969, 2017, 1968, 1934, 1956, 1982,
       1914, 1903, 1967, 1840, 1912, 1964, 1955, 1961, 1851, 2000, 1959,
       1962, 1945, 1972, 1976, 1916, 1880, 1970, 1846, 1941, 1952, 1896,
       1985, 1981, 1888, 1947, 1975, 1974, 2002, 1994, 1892, 1894, 1891,
       1996, 1997, 1949, 1999, 1800, 1979, 1971, 1977, 1942, 1978, 1826,
       1881, 1919, 1883, 1936, 1993, 1995, 1933, 1992, 1943, 1944, 1847,
       1829, 1844, 1835, 1852, 1856, 1854, 1832, 18

In [15]:
nyc_subset = (
    nyc[(nyc['BUILDING CLASS CATEGORY'].isin(['07 RENTALS - WALKUP APARTMENTS             ', '08 RENTALS - ELEVATOR APARTMENTS           ']))
        &
        (nyc['USD per SQFT'] > 1.0)
        &
        (nyc['YEAR BUILT'] > 0)
    ]
)
nyc_subset

Unnamed: 0.1,Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,...,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE,SALE PRICE 2,GROSS SQUARE FEET 2,USD per SQFT
0,4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,...,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00,6625000.0,6440.0,1028.726708
3,7,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,...,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00,3936272.0,6794.0,579.374742
4,8,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,...,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00,8000000.0,4615.0,1733.477790
6,10,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,406,32,,C4,210 AVENUE B,...,1750,4226,1920,2,C4,3192840,2016-09-23 00:00:00,3192840.0,4226.0,755.522953
9,13,1,ALPHABET CITY,08 RENTALS - ELEVATOR APARTMENTS,2,387,153,,D9,629 EAST 5TH STREET,...,4489,18523,1920,2,D9,16232000,2016-11-07 00:00:00,16232000.0,18523.0,876.315932
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83486,7352,5,TOTTENVILLE,07 RENTALS - WALKUP APARTMENTS,2A,7997,58,,C3,5309 ARTHUR KILL ROAD,...,8060,2385,1915,2,C3,354987,2016-09-02 00:00:00,354987.0,2385.0,148.841509
83859,7725,5,WEST NEW BRIGHTON,07 RENTALS - WALKUP APARTMENTS,2A,134,99,,C3,420 HOYT AVENUE,...,5250,4672,1925,2,C3,50000,2016-09-23 00:00:00,50000.0,4672.0,10.702055
83860,7726,5,WEST NEW BRIGHTON,07 RENTALS - WALKUP APARTMENTS,2A,136,46,,C3,468 HOYT AVENUE,...,5250,3424,1920,2,C3,1150000,2016-09-14 00:00:00,1150000.0,3424.0,335.864486
83861,7727,5,WEST NEW BRIGHTON,07 RENTALS - WALKUP APARTMENTS,2A,208,20,,C3,47 CAROLINE STREET,...,7520,3040,1931,2,C3,512000,2017-03-09 00:00:00,512000.0,3040.0,168.421053


In [23]:
X_train, X_test, y_train, y_test = (
    train_test_split(nyc_subset[['YEAR BUILT', 'USD per SQFT']], 
                     nyc_subset['NEIGHBORHOOD'], 
                     test_size=0.33, random_state=314, shuffle=True))

In [24]:
KNN = Knn()

In [25]:
KNN.fit(X_train, y_train)

In [26]:
KNN.predict(X_test.iloc[100])

'HARLEM-CENTRAL'

In [27]:
y_test.iloc[100]

'UPPER WEST SIDE (79-96)'

In [28]:
predictions = []
for _i in tqdm(range(y_test.shape[0])):
    predictions.append(KNN.predict(X_test.iloc[_i]) == y_test.iloc[_i])

100%|███████████████████████████████████████████████████████████████████████████████| 641/641 [00:06<00:00, 100.03it/s]


In [29]:
print(
    f"Correct predictions: {np.sum(predictions)} \n",
    f"Test sample size: {y_test.shape[0]} \n",
    f"Accuracy: {np.sum(predictions) / y_test.shape[0]:.2f}"
)

Correct predictions: 48 
 Test sample size: 641 
 Accuracy: 0.07


In [None]:
# Yikes!