In [1]:
#add auto reload for src function testing
%load_ext autoreload
%autoreload 2

#let's add the project directory to our module path
import os
import sys

module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
    
#also import all of our modules
import pandas as pd
import numpy as np
from src import data_cleaning

#and here is our data directiory
data_folder = '../../data/'

# Initial Import

In [2]:
rp_sale = pd.read_csv(data_folder+'EXTR_RPSale.csv', dtype={'Major': 'str', 'Minor':'str'})
res_bldg = pd.read_csv(data_folder+'EXTR_ResBldg.csv', dtype={'Major': 'str', 'Minor':'str', 'ZipCode': 'str'})

Some csv's require being imported with latin-1 encoding.

In [3]:
parcel = pd.read_csv(data_folder+'EXTR_Parcel.csv', dtype={'Major': 'str', 'Minor':'str'}, encoding='latin-1')
accessory = pd.read_csv(data_folder+'EXTR_Accessory_V.csv', dtype={'Major': 'str', 'Minor':'str'}, encoding='latin-1')

# Filter by year

In [4]:
rp_sale_2019 = data_cleaning.filter_data_by_year(rp_sale, 2019)
rp_sale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2087944 entries, 0 to 2087943
Data columns (total 24 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ExciseTaxNbr        int64         
 1   Major               object        
 2   Minor               object        
 3   DocumentDate        datetime64[ns]
 4   SalePrice           int64         
 5   RecordingNbr        object        
 6   Volume              object        
 7   Page                object        
 8   PlatNbr             object        
 9   PlatType            object        
 10  PlatLot             object        
 11  PlatBlock           object        
 12  SellerName          object        
 13  BuyerName           object        
 14  PropertyType        int64         
 15  PrincipalUse        int64         
 16  SaleInstrument      int64         
 17  AFForestLand        object        
 18  AFCurrentUseLand    object        
 19  AFNonProfitUse      object        
 20  AF

# Creating PINS

In [5]:
res_bldg = data_cleaning.add_PIN_column(res_bldg)
rp_sale_2019 = data_cleaning.add_PIN_column(rp_sale_2019)
parcel = data_cleaning.add_PIN_column(parcel)
accessory = data_cleaning.add_PIN_column(accessory)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Major'] = df['Major'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Minor'] = df['Minor'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Major'] = df['Major'].apply(lambda elem: elem.rjust(6, '0'))
A value is trying to be set on a copy of a slice from a DataFrame.

# Filtering by number of buildings per parcel

First I grouped the res_bldg data by PIN and then counted the number of buildings.

In [6]:
number_of_buildings = res_bldg.groupby('PIN').BldgNbr.count()
number_of_buildings.sort_values()

PIN
0000400001     1
6127600080     1
6127600075     1
6127600070     1
6127600065     1
              ..
2422079026    16
2722059164    19
0822119001    21
3223059036    28
1522049091    31
Name: BldgNbr, Length: 509573, dtype: int64

### Then I created a dataframe with only the pins that have one building.

In [7]:
one_building_parcels = pd.DataFrame(number_of_buildings[number_of_buildings==1].index) 
one_building_parcels = one_building_parcels.set_index('PIN')
one_building_parcels

0000400001
0001000003
0001000009
0001000035
0001000040
...
9906000060
9906000065
9906000080
9906000090
9906000100


### Then I did an inner join on the sales data to find the overlap between 2019 sales and parcels that have only one building on them.

In [8]:
one_building_sales_2019 = rp_sale_2019.join(one_building_parcels, how='inner', on='PIN')
one_building_sales_2019

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,...,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning,PIN
257,3027422,213043,0120,2019-12-20,560000,20191226000848,,,,,...,6,3,N,N,N,N,1,8,,2130430120
302,3002257,940652,0630,2019-07-22,435000,20190730001339,,,,,...,6,3,N,N,N,N,1,8,,9406520630
465,2993601,140281,0020,2019-06-04,450000,20190614000489,,,,,...,6,3,N,N,N,N,1,8,,1402810020
482,3015516,779790,0030,2019-10-07,0,20191016000009,,,,,...,6,3,N,N,N,N,1,8,,7797900030
594,3015264,124550,0098,2019-09-27,193000,20191015000395,,,,,...,6,15,N,N,N,N,18,8,18 51 52,1245500098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2087770,2992593,405940,0095,2019-05-21,442000,20190607001670,,,,,...,6,3,N,N,N,N,1,8,,4059400095
2087773,3003079,381670,0025,2019-07-22,680000,20190802000628,,,,,...,6,3,N,N,N,N,1,8,,3816700025
2087774,3025068,615020,0505,2019-11-18,745000,20191212000602,,,,,...,6,3,N,N,N,N,1,8,,6150200505
2087900,2997920,302300,0320,2019-05-01,0,20190705000539,,,,,...,6,15,N,N,N,N,1,8,,3023000320


In [9]:
rp_sale_2019

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,...,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning,PIN
72,2999169,919715,0200,2019-07-08,192000,20190712001080,,,,,...,2,3,N,N,N,N,1,3,,9197150200
236,3000673,894444,0200,2019-06-26,185000,20190722001395,,,,,...,2,3,N,N,N,N,1,3,,8944440200
257,3027422,213043,0120,2019-12-20,560000,20191226000848,,,,,...,6,3,N,N,N,N,1,8,,2130430120
302,3002257,940652,0630,2019-07-22,435000,20190730001339,,,,,...,6,3,N,N,N,N,1,8,,9406520630
446,3018109,152504,9008,2019-10-18,7600000,20191030001615,,,,,...,7,3,N,N,N,N,1,2,,1525049008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2087782,2986698,033310,0255,2019-04-30,6500000,20190508000847,,,,,...,6,3,N,N,N,N,1,8,,0333100255
2087856,2980997,334840,1022,2019-04-03,29000,20190405000544,,,,,...,6,18,N,N,N,N,18,7,13 31,3348401022
2087900,2997920,302300,0320,2019-05-01,0,20190705000539,,,,,...,6,15,N,N,N,N,1,8,,3023000320
2087907,3028691,277110,4239,2019-12-19,955000,20200103000685,,,,,...,6,3,N,N,N,N,1,8,,2771104239


### I'm going to get a list of unique pins that are found in both the 2019 sales data, and the list of Parcels that have only one building on them. Then I will create a dataframe that only contains one column containing only PINS

In [11]:
unique_pins = one_building_sales_2019.PIN.unique()

PINS = pd.DataFrame(unique_pins, columns=['PIN']).set_index('PIN')
PINS.to_csv(data_folder+'PINS.csv')
PINS


2130430120
9406520630
1402810020
7797900030
1245500098
...
4059400095
3816700025
6150200505
3023000320
2771104239


## Final filter

In [12]:
rp_sale_2019.join(PINS, how='inner', on='PIN')

Unnamed: 0,ExciseTaxNbr,Major,Minor,DocumentDate,SalePrice,RecordingNbr,Volume,Page,PlatNbr,PlatType,...,PrincipalUse,SaleInstrument,AFForestLand,AFCurrentUseLand,AFNonProfitUse,AFHistoricProperty,SaleReason,PropertyClass,SaleWarning,PIN
257,3027422,213043,0120,2019-12-20,560000,20191226000848,,,,,...,6,3,N,N,N,N,1,8,,2130430120
302,3002257,940652,0630,2019-07-22,435000,20190730001339,,,,,...,6,3,N,N,N,N,1,8,,9406520630
465,2993601,140281,0020,2019-06-04,450000,20190614000489,,,,,...,6,3,N,N,N,N,1,8,,1402810020
482,3015516,779790,0030,2019-10-07,0,20191016000009,,,,,...,6,3,N,N,N,N,1,8,,7797900030
594,3015264,124550,0098,2019-09-27,193000,20191015000395,,,,,...,6,15,N,N,N,N,18,8,18 51 52,1245500098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2087770,2992593,405940,0095,2019-05-21,442000,20190607001670,,,,,...,6,3,N,N,N,N,1,8,,4059400095
2087773,3003079,381670,0025,2019-07-22,680000,20190802000628,,,,,...,6,3,N,N,N,N,1,8,,3816700025
2087774,3025068,615020,0505,2019-11-18,745000,20191212000602,,,,,...,6,3,N,N,N,N,1,8,,6150200505
2087900,2997920,302300,0320,2019-05-01,0,20190705000539,,,,,...,6,15,N,N,N,N,1,8,,3023000320


# Export

In [13]:
res_bldg_2019 = res_bldg.join(PINS, how='inner', on='PIN')
rp_sale_2019 = rp_sale_2019.join(PINS, how='inner', on='PIN')
parcel_2019 = parcel.join(PINS, how='inner', on='PIN')
accessory_2019 = accessory.join(PINS, how='inner', on='PIN')a

In [24]:
res_bldg_2019.to_csv(data_folder+'EXTR_ResBldg_2019.csv')
parcel_2019.to_csv(data_folder+'EXTR_Parcel_2019.csv')
accessory_2019.to_csv(data_folder+'EXTR_Accessory_V_2019.csv')
rp_sale_2019.to_csv(data_folder+'EXTR_RPSale_2019.csv')

# Adding more data

First, make sure you import the original csv with the right arguments, it may need special encoding, and it may need you to specifiy data type.

In [53]:
condo_unit2 = pd.read_csv(data_folder+'EXTR_CondoUnit2.csv', dtype={'Major': 'string', 'Minor':'string'})

Then, filter the file by our PINS

In [54]:
condo_unit2 = data_cleaning.filter_data_by_PIN(condo_unit2)

Check to make sure it looks okay.

In [56]:
condo_unit2.head()

Unnamed: 0,Major,Minor,UnitType,BldgNbr,UnitNbr,PcntOwnership,UnitQuality,UnitLoc,FloorNbr,TopFloor,...,Address,BuildingNumber,Fraction,DirectionPrefix,StreetName,StreetType,DirectionSuffix,UnitDescr,ZipCode,PIN


Export it the data file, with the suffix '_2019'

In [43]:
#skipped because this is a useless example

#PINS = pd.read_csv(data_folder+'PINS.csv',dtype={'PIN':'string'}, index_col=0)