In [1]:
# imports
import pandas as pd

# Condo Conversion Data Cleaning

**Goal**: Create dataset of condo conversions from 2013-2022 for City of Cambridge

**Steps**:
- Download the tabular data (just the .csv files) for each year from 2013-2022 - Cambridge Parcels, from 2013-2022. Include the metadata!
- Iterate over these files to detect condo conversions using this method:
   - If something changed into a condo, it will have multiple records in the parcel data but the multiple records are linked to just one street address. 
   - So, if one year a parcel just has one property in the table and then the next year it has multiple properties in the table (with same address) then we can reasonably assume a condo conversion happened.
- The new dataset should include just condo conversions and have all the same fields but labeled by year. For example, a condo conversion is detected at 4 Canal Pk. It would have all the prior fields but labeled with the prior year (PID_prior, GISID_prior, BldgNum_prior, StateClassCode_prior….etc) and all the new fields after the conversion labeled with the year (PID_post, GISID_post, BldgNum_post, StateClassCode_post….etc) along with a prior_year field and post_year field designated the years that the change took place
- The new dataset should include a new field num_condo_units that counts the number of units in the condo because that info will be interesting to explore
- Once that part is done, join this tabular data to the spatial parcel data for 2024 (join on GISID)

## Testing
Preliminary testing (just on 2022 and 2023 data) to test this

In [42]:
years = [2022, 2023, 2024]

parcel_data = {
    year: pd.read_csv(f"ASSESSING_PropertyDatabase_FY{year}.csv") for year in years
}

In [43]:
num_records_by_address_by_year = {year: pd.DataFrame({f"num_records_{year}": parcel["Address"].value_counts()}) for (year, parcel) in parcel_data.items()}

In [60]:
# filter for all the items that had 1 parcel before but more than 1 after
df = num_records_by_address_by_year[2022].join(num_records_by_address_by_year[2023])
df.head()

Unnamed: 0,num_records_2022,num_records_2023
8-12 Museum Way,437,437.0
2 Earhart St,230,230.0
75-83 Cambridge Pkwy,206,206.0
303 Third St,193,193.0
10 Rogers St,173,173.0


In [65]:
condos_df = df[(df.num_records_2022 == 1) & (df.num_records_2023 > 1)].reset_index().rename(columns={"index": "Address"})
condos_df.head()

Unnamed: 0,Address,num_records_2022,num_records_2023
0,2 Stearns St,1,3.0
1,30 Berkshire St,1,4.0
2,87 Kinnaird St,1,4.0
3,46-50 Jay St,1,2.0
4,234-236 Allston St,1,3.0


In [77]:
# merge data from previous year with current year
merged_data = pd.merge(parcel_data[2022], parcel_data[2023], how="left", on="Address", suffixes=("_prior", "_post"))
merged_data.head()

Unnamed: 0,PID_prior,GISID_prior,BldgNum_prior,Address,Unit_prior,StateClassCode_prior,PropertyClass_prior,Zoning_prior,MapLot_prior,LandArea_prior,...,Systems_Plumbing_post,Condition_YearBuilt_post,Condition_InteriorCondition_post,Condition_OverallCondition_post,Condition_OverallGrade_post,Parking_Open_post,Parking_Covered_post,Parking_Garage_post,UnfinishedBasementGross_post,FinishedBasementGross_post
0,7207,112-10,1,18 Ellsworth Ave,,104,TWO-FAM-RES,C-1,112-10,4297,...,,1903.0,Good,Good,Good Very Good,0.0,0.0,,625.0,624.0
1,7208,112-11,1,20 Ellsworth Ave,,199,CONDO-BLDG,C-1,112-11,3600,...,,0.0,,,,0.0,0.0,,1292.0,0.0
2,7208,112-11,1,20 Ellsworth Ave,,199,CONDO-BLDG,C-1,112-11,3600,...,,1903.0,,Good,Average,0.0,0.0,0.0,0.0,0.0
3,7208,112-11,1,20 Ellsworth Ave,,199,CONDO-BLDG,C-1,112-11,3600,...,,1903.0,,Excellent,Average,0.0,0.0,0.0,0.0,0.0
4,7209,112-11,1,20 Ellsworth Ave,1.0,102,CONDOMINIUM,,112-11-1,0,...,,0.0,,,,0.0,0.0,,1292.0,0.0


In [78]:
merged_data.shape

(717384, 121)

In [112]:
# get conversion data
# TODO: we have multiple records per address because there are small differences in the data (i.e. the unit is different / value is different) 
#       ==> do we want to keep those or remove? 
conversion_data = condos_df.set_index("Address").join(merged_data.set_index("Address"))
conversion_data.head()

Unnamed: 0_level_0,num_records_2022,num_records_2023,PID_prior,GISID_prior,BldgNum_prior,Unit_prior,StateClassCode_prior,PropertyClass_prior,Zoning_prior,MapLot_prior,...,Systems_Plumbing_post,Condition_YearBuilt_post,Condition_InteriorCondition_post,Condition_OverallCondition_post,Condition_OverallGrade_post,Parking_Open_post,Parking_Covered_post,Parking_Garage_post,UnfinishedBasementGross_post,FinishedBasementGross_post
Address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100 Banks St,1,13.0,10217,131-47,1,,112,>8-UNIT-APT,C-1,131-47,...,,0.0,,,,0.0,0.0,,0.0,0.0
100 Banks St,1,13.0,10217,131-47,1,,112,>8-UNIT-APT,C-1,131-47,...,,1902.0,,Good,Good,1.0,0.0,0.0,0.0,0.0
100 Banks St,1,13.0,10217,131-47,1,,112,>8-UNIT-APT,C-1,131-47,...,,10902.0,,Good,Good,1.0,0.0,0.0,0.0,0.0
100 Banks St,1,13.0,10217,131-47,1,,112,>8-UNIT-APT,C-1,131-47,...,,1902.0,,Good,Good,1.0,0.0,0.0,0.0,0.0
100 Banks St,1,13.0,10217,131-47,1,,112,>8-UNIT-APT,C-1,131-47,...,,1902.0,,Good,Good,1.0,0.0,0.0,0.0,0.0


In [113]:
conversion_data = conversion_data.rename(columns={"num_records_2023": "num_condo_units"}).drop(columns=["num_records_2022"])
conversion_data["num_condo_units"] = conversion_data["num_condo_units"].astype(int)
conversion_data["prior_year"] = 2022
conversion_data["post_year"] = 2023
conversion_data.head()

Unnamed: 0_level_0,num_condo_units,PID_prior,GISID_prior,BldgNum_prior,Unit_prior,StateClassCode_prior,PropertyClass_prior,Zoning_prior,MapLot_prior,LandArea_prior,...,Condition_InteriorCondition_post,Condition_OverallCondition_post,Condition_OverallGrade_post,Parking_Open_post,Parking_Covered_post,Parking_Garage_post,UnfinishedBasementGross_post,FinishedBasementGross_post,prior_year,post_year
Address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100 Banks St,13,10217,131-47,1,,112,>8-UNIT-APT,C-1,131-47,13451,...,,,,0.0,0.0,,0.0,0.0,2022,2023
100 Banks St,13,10217,131-47,1,,112,>8-UNIT-APT,C-1,131-47,13451,...,,Good,Good,1.0,0.0,0.0,0.0,0.0,2022,2023
100 Banks St,13,10217,131-47,1,,112,>8-UNIT-APT,C-1,131-47,13451,...,,Good,Good,1.0,0.0,0.0,0.0,0.0,2022,2023
100 Banks St,13,10217,131-47,1,,112,>8-UNIT-APT,C-1,131-47,13451,...,,Good,Good,1.0,0.0,0.0,0.0,0.0,2022,2023
100 Banks St,13,10217,131-47,1,,112,>8-UNIT-APT,C-1,131-47,13451,...,,Good,Good,1.0,0.0,0.0,0.0,0.0,2022,2023
