# 1. Check if all datasets have the same columns

In [1]:
import pandas as pd

In [2]:
base_file_path = "all_data/ASSESSING_PropertyDatabase"

def check_columns_consistency(base_file_path, start_year=2015, end_year=2024):
    columns_dict = {}  # To store columns of each dataset by year
    inconsistencies = []  # To store any inconsistencies found

    # Load each dataset and store its columns
    for year in range(start_year, end_year + 1):
        file_path = f'{base_file_path}_FY{year}.csv'
        df = pd.read_csv(file_path)
        columns_dict[year] = set(df.columns)
#         print(columns_dict)

    # Compare columns of the first dataset with the rest
    base_columns = columns_dict[start_year]
    for year, columns in columns_dict.items():
        if columns != base_columns:
            differences = base_columns.symmetric_difference(columns)
            inconsistencies.append((year, differences))

    # Prepare the summary report
    if not inconsistencies:
        summary = "All datasets from {} to {} have the same columns.".format(start_year, end_year)
    else:
        summary = "Found inconsistencies in dataset columns for the following years:\n"
        for year, diff in inconsistencies:
            summary += f"Year {year}: Different columns - {diff}\n"

    return summary


summary = check_columns_consistency(base_file_path, 2015, 2024)
print(summary)


Found inconsistencies in dataset columns for the following years:
Year 2016: Different columns - {'Owner_City', 'Owner_Address', 'Owner_Name', 'Owner_Address2', 'Owner_Zip', 'Owner_State', 'Owners', 'Owner_CoOwnerName'}
Year 2017: Different columns - {'Owner_City', 'Owner_Address', 'Owner_Name', 'Owner_Address2', 'Owner_Zip', 'Owner_State', 'Owners', 'Owner_CoOwnerName'}
Year 2018: Different columns - {'Owner_City', 'Owner_Address', 'Owner_Name', 'Owner_Address2', 'Owner_Zip', 'Owner_State', 'Owners', 'Owner_CoOwnerName'}
Year 2019: Different columns - {'Owner_City', 'PropertyTaxAmount', 'Owner_Address', 'Owner_Name', 'Owner_Address2', 'Owner_Zip', 'Owner_State', 'Owners', 'Owner_CoOwnerName'}
Year 2020: Different columns - {'Owner_City', 'PropertyTaxAmount', 'Exterior_Occupancy', 'Owner_Address', 'Map/Lot', 'BookPage', 'Book/Page', 'Owner_Name', 'Exterior_occupancy', 'Owner_Address2', 'Owner_Zip', 'MapLot', 'Owner_State', 'Owners', 'Owner_CoOwnerName'}
Year 2021: Different columns - {

In [3]:
## load each datasets into a dataframe, data_yyyy

def load_datasets():
    for year in range(2015, 2025):  # From 2015 to 2023 inclusive
        file_name = f"all_data/ASSESSING_PropertyDatabase_FY{year}.csv"  # Adjust the path as needed
        variable_name = f"data_{year}"
        # Dynamically construct the command to read the CSV and assign it to a variable
        command = f"{variable_name} = pd.read_csv('{file_name}')"
        exec(command, globals())

# Call the function to load the datasets
load_datasets()

# After calling load_datasets(), variables data_2015, data_2016, ..., data_2024 will be available

In [4]:
data_2015["GISID"]

0             ="7-18"
1             ="7-19"
2             ="7-29"
3             ="7-31"
4             ="7-34"
             ...     
28798       ="1A-179"
28799       ="93-137"
28800       ="23-157"
28801       ="75-171"
28802    ="267.1-277"
Name: GISID, Length: 28803, dtype: object

### Clean GISID entry of data_2015

In [5]:
# Function to clean the GISID column
def clean_gisid(value):
    # Remove unwanted characters
    cleaned_value = value.replace('="', '').replace('"', '')
    # Extract numbers and reformat
    numbers = cleaned_value.split('-')
    if len(numbers) == 2:
        try:
            # Convert to float and format back to string to normalize the number format
            num1 = int(numbers[0])
            num2 = int(numbers[1])
            return f"{num1}-{num2}"
        except ValueError:
            # If conversion fails, return the original cleaned string
            return cleaned_value
    else:
        # Return the original cleaned string if it doesn't contain exactly two numbers
        return cleaned_value

# Apply the cleaning function to the GISID column
data_2015['GISID'] = data_2015['GISID'].apply(clean_gisid)

data_2015["GISID"]


0             7-18
1             7-19
2             7-29
3             7-31
4             7-34
           ...    
28798       1A-179
28799       93-137
28800       23-157
28801       75-171
28802    267.1-277
Name: GISID, Length: 28803, dtype: object

## Resolving column inconsistencies across datasets

For example: 

Dataset 2015 - 2019 has column name "Exterior_occupancy"

Dataset 2020 - 2024 has column name "Exterior_Occupancy"

In [6]:
# Sets of different columns for each year
columns_2016 = {'Owner_Zip', 'Owner_Address2', 'Owner_Name', 'Owner_CoOwnerName', 'Owners', 'Owner_Address', 'Owner_State', 'Owner_City'}
columns_2017 = {'Owner_Zip', 'Owner_Address2', 'Owner_Name', 'Owner_CoOwnerName', 'Owners', 'Owner_Address', 'Owner_State', 'Owner_City'}
columns_2018 = {'Owner_Zip', 'Owner_Address2', 'Owner_Name', 'Owner_CoOwnerName', 'Owners', 'Owner_Address', 'Owner_State', 'Owner_City'}
columns_2019 = {'Owner_Zip', 'Owner_Address2', 'PropertyTaxAmount', 'Owner_Name', 'Owner_CoOwnerName', 'Owners', 'Owner_Address', 'Owner_State', 'Owner_City'}
columns_2020 = {'Book/Page', 'Exterior_Occupancy', 'BookPage', 'Owner_Zip', 'Owner_Address2', 'PropertyTaxAmount', 'Exterior_occupancy', 'Owner_Name', 'Owner_CoOwnerName', 'Owners', 'Owner_Address', 'Map/Lot', 'Owner_State', 'MapLot', 'Owner_City'}
columns_2021 = {'Book/Page', 'Exterior_Occupancy', 'BookPage', 'Owner_Zip', 'Owner_Address2', 'Exterior_occupancy', 'Owner_Name', 'Owner_CoOwnerName', 'Owners', 'Owner_Address', 'Map/Lot', 'Owner_State', 'MapLot', 'Owner_City'}
columns_2022 = {'Book/Page', 'Exterior_Occupancy', 'BookPage', 'Owner_Zip', 'Owner_Address2', 'PropertyTaxAmount', 'Exterior_occupancy', 'Owner_Name', 'Owner_CoOwnerName', 'Owners', 'Owner_Address', 'Map/Lot', 'Owner_State', 'MapLot', 'Owner_City'}
columns_2023 = {'Book/Page', 'Exterior_Occupancy', 'BookPage', 'Owner_Zip', 'Owner_Address2', 'PropertyTaxAmount', 'Exterior_occupancy', 'Owner_Name', 'Owner_CoOwnerName', 'Owners', 'Owner_Address', 'Map/Lot', 'Owner_State', 'MapLot', 'Owner_City'}
columns_2024 = {'Book/Page', 'Exterior_Occupancy', 'BookPage', 'Owner_Zip', 'Owner_Address2', 'PropertyTaxAmount', 'Exterior_occupancy', 'Owner_Name', 'Owner_CoOwnerName', 'Owners', 'Owner_Address', 'Map/Lot', 'Owner_State', 'MapLot', 'Owner_City'}

# Combine all sets to get a set of unique column names
unique_columns = set.union(columns_2016, columns_2017, columns_2018, columns_2019, columns_2020, columns_2021, columns_2022, columns_2023, columns_2024)
unique_columns


{'Book/Page',
 'BookPage',
 'Exterior_Occupancy',
 'Exterior_occupancy',
 'Map/Lot',
 'MapLot',
 'Owner_Address',
 'Owner_Address2',
 'Owner_City',
 'Owner_CoOwnerName',
 'Owner_Name',
 'Owner_State',
 'Owner_Zip',
 'Owners',
 'PropertyTaxAmount'}

### "Exterior_occupancy -> Exterior_Occupancy"

In [7]:
### We rename 2015-2019 column name to be "Exterior_Occupancy"

for year in range(2015, 2020):  # 2020 is not included, so it goes from 2015 to 2019
    # Construct the variable name
    variable_name = f"data_{year}"
    # Access the DataFrame using globals() and rename the column
    globals()[variable_name].rename(columns={'Exterior_occupancy': 'Exterior_Occupancy'}, inplace=True)

In [8]:
## we remove "Exterior_occupancy"
unique_columns.remove("Exterior_occupancy")
unique_columns.remove('Exterior_Occupancy')
unique_columns

{'Book/Page',
 'BookPage',
 'Map/Lot',
 'MapLot',
 'Owner_Address',
 'Owner_Address2',
 'Owner_City',
 'Owner_CoOwnerName',
 'Owner_Name',
 'Owner_State',
 'Owner_Zip',
 'Owners',
 'PropertyTaxAmount'}

## Checking which dataset has / misses which columns

In [9]:
# List of dataset variables
datasets = [data_2015, data_2016, data_2017, data_2018, data_2019, data_2020, data_2021, data_2022, data_2023, data_2024]

# Initialize a list to store the presence of unique columns in each dataset
column_presence = []

# Iterate over each dataset and check for column presence
for i, dataset in enumerate(datasets, start=2015):
    column_presence.append({column: (column in dataset.columns) for column in unique_columns})

# Convert the list of dictionaries to a DataFrame and then transpose it
presence_df = pd.DataFrame(column_presence, index=[f'data_{year}' for year in range(2015, 2025)]).T.sort_index()
presence_df


Unnamed: 0,data_2015,data_2016,data_2017,data_2018,data_2019,data_2020,data_2021,data_2022,data_2023,data_2024
Book/Page,True,True,True,True,True,False,False,False,False,False
BookPage,False,False,False,False,False,True,True,True,True,True
Map/Lot,True,True,True,True,True,False,False,False,False,False
MapLot,False,False,False,False,False,True,True,True,True,True
Owner_Address,False,True,True,True,True,True,True,True,True,True
Owner_Address2,False,True,True,True,True,True,True,True,True,True
Owner_City,False,True,True,True,True,True,True,True,True,True
Owner_CoOwnerName,False,True,True,True,True,True,True,True,True,True
Owner_Name,False,True,True,True,True,True,True,True,True,True
Owner_State,False,True,True,True,True,True,True,True,True,True


### "Book/Page" -> "BookPage"

In [10]:
### We rename 2015-2019 column name to be "BookPage"

for year in range(2015, 2020):  # 2020 is not included, so it goes from 2015 to 2019
    # Construct the variable name
    variable_name = f"data_{year}"
    # Access the DataFrame using globals() and rename the column
    globals()[variable_name].rename(columns={'Book/Page': 'BookPage'}, inplace=True)

In [11]:
## we remove "BookPage" and "Book/Page"
unique_columns.remove("Book/Page")
unique_columns.remove('BookPage')
unique_columns

{'Map/Lot',
 'MapLot',
 'Owner_Address',
 'Owner_Address2',
 'Owner_City',
 'Owner_CoOwnerName',
 'Owner_Name',
 'Owner_State',
 'Owner_Zip',
 'Owners',
 'PropertyTaxAmount'}

### "Map/Log" -> "MapLot"

In [12]:
### We rename 2015-2019 column name to be "MapLot"

for year in range(2015, 2020):  # 2020 is not included, so it goes from 2015 to 2019
    # Construct the variable name
    variable_name = f"data_{year}"
    # Access the DataFrame using globals() and rename the column
    globals()[variable_name].rename(columns={'Map/Lot': 'MapLot'}, inplace=True)

In [13]:
unique_columns.remove("Map/Lot")
unique_columns.remove('MapLot')
unique_columns

{'Owner_Address',
 'Owner_Address2',
 'Owner_City',
 'Owner_CoOwnerName',
 'Owner_Name',
 'Owner_State',
 'Owner_Zip',
 'Owners',
 'PropertyTaxAmount'}

### "Owner_name" & "Owner_CoOwnerName" -> "Owners"

Looking closer at the data, we notice that 2015 has one column that listed all owners, whereas other years it was listed as "Owner_name" and "Owner_CoOwnerName"

Here we combine "Owner_name" and "Owner_CoOwnerName" for year 2016 - 2024 to "Owners"

In [None]:
data_2015["Owners"]

In [None]:
data_2020["Owner_Name"]

In [14]:
for year in range(2016, 2025):  # Loop from 2016 to 2024
    # Construct the variable name for the dataset
    dataset_var = f'data_{year}'
    
    # Access the DataFrame using globals()
    df = globals().get(dataset_var)
    
    if df is not None:
        # Check if both columns exist in the DataFrame
        if 'Owner_Name' in df.columns and 'Owner_CoOwnerName' in df.columns:
            # Combine the columns into a new column, handling NaN values appropriately
            # we also know for a fact from our table above that owner_name and owner_CoOwnerName always appear together in the datasets
            # so we are not writing any exception handling
            df['Owners'] = df.apply(lambda x: f"{x['Owner_Name']}, {x['Owner_CoOwnerName']}" if pd.notna(x['Owner_CoOwnerName']) else x['Owner_Name'], axis=1)
            # Drop the original columns
            df.drop(columns=['Owner_Name', 'Owner_CoOwnerName'], inplace=True)


In [15]:
## we remove "['Owner_Name', 'Owner_CoOwnerName']"
unique_columns.remove('Owner_Name')
unique_columns.remove('Owner_CoOwnerName')
unique_columns.remove('Owners')

In [16]:
unique_columns

{'Owner_Address',
 'Owner_Address2',
 'Owner_City',
 'Owner_State',
 'Owner_Zip',
 'PropertyTaxAmount'}

### Adding cols to datasets

In [17]:
# List of columns to add
new_columns = ['Owner_Address', 'Owner_Address2', 'Owner_City', 'Owner_State', 'Owner_Zip', 'PropertyTaxAmount']

# Initialize the columns with 'NA'
for column in new_columns:
    data_2015[column] = 'NA'

In [18]:
data_2016["PropertyTaxAmount"] = 'NA'
data_2017["PropertyTaxAmount"] = 'NA'
data_2018["PropertyTaxAmount"] = 'NA'
data_2021["PropertyTaxAmount"] = 'NA'

In [19]:
def check_columns_consistency(data_frames):
    # Get the column names for the first DataFrame
    reference_columns = set(data_frames[0].columns)
    
    # Iterate through the rest of the DataFrames and compare their column names
    for i, df in enumerate(data_frames[1:], start=2016):
        if set(df.columns) != reference_columns:
            print(f"Columns in data_{i} are not consistent with data_2015")
            return False
    
    print("All datasets have the same columns.")
    return True

# Create a list of your DataFrames for 2015 to 2024
data_frames = [globals()[f"data_{year}"] for year in range(2015, 2025)]

# Call the function to check column consistency
check_columns_consistency(data_frames)

All datasets have the same columns.


True

# 2. Detect Condo Conversion

Let's start by counting how many entries are associated with each address in year 2015 (prior) and year 2016 (post)

In [20]:
count_prior = data_2015.groupby('Address').size()

In [21]:
count_post = data_2016.groupby('Address').size()

In [22]:
# we assume that an address has been converted into a condo, 
# if the previous year (2015) only has one address record
# but the post year (2016) has more than one entry associted with the same address
# then we consider it a condo conversion
condo_conversion_addresses = count_post[(count_post > 1) & (count_prior == 1)].index.tolist()
condo_conversion_addresses

['10 Ellery St',
 '12 Avon Pl',
 '12 Rindgefield St',
 '133-135 Fresh Pond Pkwy',
 '139 Charles St',
 '15 Oak St',
 '150 Cambridgepark Dr',
 '155 Webster Ave',
 '159 Fayerweather St',
 '17 Cambridge Ter',
 '17 Copley St',
 '18 White St',
 '18-20 Whittemore Ave',
 '198 Sherman St',
 '20 Vincent St',
 '207 Prospect St',
 '21 Sciarappa St',
 '24-26 Whittemore Ave',
 '2579 Massachusetts Ave',
 '280 Walden St',
 '30-32 Whittemore Ave',
 '32 Church St',
 '346 Putnam Ave',
 '354 Rindge Ave',
 '37-39 Mt Pleasant St',
 '51 Cedar St',
 '55-59 Cushing St',
 '676-694 Huron Ave',
 '7 Temple St',
 '7 Trowbridge Pl',
 '700 Main St',
 '9 Harding St',
 '93 Kirkland St',
 '99 Brookline St']

In [23]:
# we filter the original data_2015 & data_2016 for just the addresses of condo conversions
condo_conversion_data_prior = data_2015[data_2015['Address'].isin(condo_conversion_addresses)]
condo_conversion_data_post = data_2016[data_2016['Address'].isin(condo_conversion_addresses)]

In [24]:
# let's do a test: address 10 Ellery St is detected as a condo_conversion
# there's only one entry (PID) in prior year (2015)
condo_conversion_data_prior[condo_conversion_data_prior["Address"] == "10 Ellery St"]

Unnamed: 0,PID,GISID,BldgNum,Address,Unit,StateClassCode,PropertyClass,Zoning,MapLot,LandArea,...,Parking_Covered,Parking_Garage,UnfinishedBasementGross,FinishedBasementGross,Owner_Address,Owner_Address2,Owner_City,Owner_State,Owner_Zip,PropertyTaxAmount
8212,8053,116-119,1,10 Ellery St,,104,TWO-FAM-RES,"=""C-1""","=""116-119""",6082,...,0.0,,1154,0.0,,,,,,


In [25]:
# whereas in post_year (2016), there are 4 entries (PIDs)
condo_conversion_data_post[condo_conversion_data_post["Address"] == "10 Ellery St"]

Unnamed: 0,PID,GISID,BldgNum,Address,Unit,StateClassCode,PropertyClass,Zoning,MapLot,LandArea,...,Condition_InteriorCondition,Condition_OverallCondition,Condition_OverallGrade,Parking_Open,Parking_Covered,Parking_Garage,UnfinishedBasementGross,FinishedBasementGross,Owners,PropertyTaxAmount
8207,8053,116-119,1,10 Ellery St,,199,CONDO-BLDG,C-1,116-119,6082,...,,,,0.0,0.0,,720,0.0,10 ELLERY STREET LLC,
8208,8053,116-119,2,10 Ellery St,,199,CONDO-BLDG,C-1,116-119,6082,...,,,,0.0,0.0,,0,663.0,10 ELLERY STREET LLC,
28859,192375,116-119,1,10 Ellery St,A,102,CONDOMINIUM,,116-119-A,0,...,,Excellent,Very Good,1.0,0.0,0.0,0,686.0,"THE MARIE WU & RENEE WU LLC,",
28860,192376,116-119,1,10 Ellery St,B,102,CONDOMINIUM,,116-119-B,0,...,,Excellent,Very Good,1.0,0.0,0.0,0,693.0,"10 ELLERY STREET LLC,, C/O SIEGEL, ADAM & REBE...",


In [26]:
# count the number of condos in the post year
condo_conversion_data_post['num_condo_units'] = condo_conversion_data_post.groupby('Address')['Address'].transform('count')
condo_conversion_data_post

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  condo_conversion_data_post['num_condo_units'] = condo_conversion_data_post.groupby('Address')['Address'].transform('count')


Unnamed: 0,PID,GISID,BldgNum,Address,Unit,StateClassCode,PropertyClass,Zoning,MapLot,LandArea,...,Condition_OverallCondition,Condition_OverallGrade,Parking_Open,Parking_Covered,Parking_Garage,UnfinishedBasementGross,FinishedBasementGross,Owners,PropertyTaxAmount,num_condo_units
656,690,17-20,1,139 Charles St,,199,CONDO-BLDG,C-1,17-20,2115,...,,,0.0,0.0,,0,821.0,139 CHARLES STREET LLC,,3
920,958,22-55,1,21 Sciarappa St,,199,CONDO-BLDG,C-1,22-55,1679,...,,,0.0,0.0,,0,1023.0,SCIARAPPA STREET LLC,,3
2177,2167,36-185,1,9 Harding St,,199,CONDO-BLDG,C-1,36-185,2938,...,,,0.0,0.0,,918,0.0,"9 HARDING STREET, LLC",,4
3014,3016,71-56,1,700 Main St,,199,CONDO-BLDG,IB,71-56,138947,...,,,0.0,0.0,,0,0.0,"MIT 650 MAIN STREET, LLC",,3
3015,3016,71-56,2,700 Main St,,199,CONDO-BLDG,IB,71-56,138947,...,,,0.0,0.0,,0,0.0,"MIT 650 MAIN STREET, LLC",,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28859,192375,116-119,1,10 Ellery St,A,102,CONDOMINIUM,,116-119-A,0,...,Excellent,Very Good,1.0,0.0,0.0,0,686.0,"THE MARIE WU & RENEE WU LLC,",,4
28860,192376,116-119,1,10 Ellery St,B,102,CONDOMINIUM,,116-119-B,0,...,Excellent,Very Good,1.0,0.0,0.0,0,693.0,"10 ELLERY STREET LLC,, C/O SIEGEL, ADAM & REBE...",,4
28874,192471,107-133,1,7 Temple St,,957,Charitable Svc,O-3,107-133E,37068,...,Good,GOOD,,,,0,,"CAMBRIDGE YOUNG WOMEN, CHRISTIAN ASSOCIATES",,2
28875,192471,107-133,2,7 Temple St,,957,Charitable Svc,O-3,107-133E,37068,...,Excellent,AVERAGE,,,,0,,"CAMBRIDGE YOUNG WOMEN, CHRISTIAN ASSOCIATES",,2


In [27]:
condo_conversion_data_post[condo_conversion_data_post["Address"] == "10 Ellery St"]["num_condo_units"]

8207     4
8208     4
28859    4
28860    4
Name: num_condo_units, dtype: int64

### Keeping only one entry after calculating num_condo_units

In [28]:
# keeping only the first row of each address
condo_conversion_data_post = condo_conversion_data_post.drop_duplicates(subset='Address', keep='first')
condo_conversion_data_post

Unnamed: 0,PID,GISID,BldgNum,Address,Unit,StateClassCode,PropertyClass,Zoning,MapLot,LandArea,...,Condition_OverallCondition,Condition_OverallGrade,Parking_Open,Parking_Covered,Parking_Garage,UnfinishedBasementGross,FinishedBasementGross,Owners,PropertyTaxAmount,num_condo_units
656,690,17-20,1,139 Charles St,,199,CONDO-BLDG,C-1,17-20,2115,...,,,0.0,0.0,,0,821.0,139 CHARLES STREET LLC,,3
920,958,22-55,1,21 Sciarappa St,,199,CONDO-BLDG,C-1,22-55,1679,...,,,0.0,0.0,,0,1023.0,SCIARAPPA STREET LLC,,3
2177,2167,36-185,1,9 Harding St,,199,CONDO-BLDG,C-1,36-185,2938,...,,,0.0,0.0,,918,0.0,"9 HARDING STREET, LLC",,4
3014,3016,71-56,1,700 Main St,,199,CONDO-BLDG,IB,71-56,138947,...,,,0.0,0.0,,0,0.0,"MIT 650 MAIN STREET, LLC",,3
3502,3498,78-79,1,155 Webster Ave,,1095,MULT-RES-3FAM,C-1,78-79,2689,...,Average,Average,2.0,0.0,,1112,0.0,"PACHECO, ALBERT M. & LEONTINE O. PACHECO, C/O ...",,2
3936,3891,83-25,1,15 Oak St,,199,CONDO-BLDG,C-1,83-25,5744,...,,,0.0,0.0,,2532,0.0,"RESNICK, MARC & R. SCOTT PULVER, TRUSTEE, 15 O...",,2
5093,4996,94-89,1,99 Brookline St,,199,CONDO-BLDG,C,94-89,1866,...,,,0.0,0.0,,926,0.0,"99 BROOKLINE LLC,",,4
7120,6984,109-42,1,207 Prospect St,,199,CONDO-BLDG,C-1,109-42,5000,...,,,0.0,0.0,,592,0.0,CONCEPT PROPERTIES LLC,,4
8207,8053,116-119,1,10 Ellery St,,199,CONDO-BLDG,C-1,116-119,6082,...,,,0.0,0.0,,720,0.0,10 ELLERY STREET LLC,,4
10189,10028,127-152,1,346 Putnam Ave,,199,CONDO-BLDG,C,127-152,2023,...,,,0.0,0.0,,1040,0.0,"346 PUTNAM AVE., LLC",,3


### Add sufix to datasets

In [29]:
column_mapping = {}
for column in condo_conversion_data_prior.columns:
    if column != 'Address':
        column_mapping[column] = column + '_prior'

# Rename the columns using the mapping dictionary
condo_conversion_data_prior.rename(columns=column_mapping, inplace=True)
condo_conversion_data_prior

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  condo_conversion_data_prior.rename(columns=column_mapping, inplace=True)


Unnamed: 0,PID_prior,GISID_prior,BldgNum_prior,Address,Unit_prior,StateClassCode_prior,PropertyClass_prior,Zoning_prior,MapLot_prior,LandArea_prior,...,Parking_Covered_prior,Parking_Garage_prior,UnfinishedBasementGross_prior,FinishedBasementGross_prior,Owner_Address_prior,Owner_Address2_prior,Owner_City_prior,Owner_State_prior,Owner_Zip_prior,PropertyTaxAmount_prior
659,690,17-20,1,139 Charles St,,101,SNGL-FAM-RES,"=""C-1""","=""17-20""",2115,...,0.0,,0,821.0,,,,,,
923,958,22-55,1,21 Sciarappa St,,104,TWO-FAM-RES,"=""C-1""","=""22-55""",1679,...,0.0,,0,1023.0,,,,,,
2179,2167,36-185,1,9 Harding St,,105,THREE-FM-RES,"=""C-1""","=""36-185""",2938,...,0.0,,918,0.0,,,,,,
3021,3016,71-50,1,700 Main St,,406,HIGH-TECH,"=""IB""","=""71-50""",83179,...,,,0,,,,,,,
3507,3498,78-79,1,155 Webster Ave,,105,THREE-FM-RES,"=""C-1""","=""78-79""",2689,...,0.0,,1112,0.0,,,,,,
3940,3891,83-25,1,15 Oak St,,105,THREE-FM-RES,"=""C-1""","=""83-25""",5744,...,0.0,,2532,0.0,,,,,,
5097,4996,94-89,1,99 Brookline St,,105,THREE-FM-RES,"=""C""","=""94-89""",1866,...,0.0,,926,0.0,,,,,,
6845,6714,107-133,1,7 Temple St,,957,Charitable Svc,"=""O-3""","=""107-133""",37068,...,,,0,,,,,,,
7124,6984,109-42,1,207 Prospect St,,31,MULTIUSE-COM,"=""C-1""","=""109-42""",5000,...,,,592,,,,,,,
8212,8053,116-119,1,10 Ellery St,,104,TWO-FAM-RES,"=""C-1""","=""116-119""",6082,...,0.0,,1154,0.0,,,,,,


In [30]:
column_mapping = {}
for column in condo_conversion_data_post.columns:
    if column != 'Address':
        column_mapping[column] = column + '_post'

# Rename the columns using the mapping dictionary
condo_conversion_data_post.rename(columns=column_mapping, inplace=True)
condo_conversion_data_post

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  condo_conversion_data_post.rename(columns=column_mapping, inplace=True)


Unnamed: 0,PID_post,GISID_post,BldgNum_post,Address,Unit_post,StateClassCode_post,PropertyClass_post,Zoning_post,MapLot_post,LandArea_post,...,Condition_OverallCondition_post,Condition_OverallGrade_post,Parking_Open_post,Parking_Covered_post,Parking_Garage_post,UnfinishedBasementGross_post,FinishedBasementGross_post,Owners_post,PropertyTaxAmount_post,num_condo_units_post
656,690,17-20,1,139 Charles St,,199,CONDO-BLDG,C-1,17-20,2115,...,,,0.0,0.0,,0,821.0,139 CHARLES STREET LLC,,3
920,958,22-55,1,21 Sciarappa St,,199,CONDO-BLDG,C-1,22-55,1679,...,,,0.0,0.0,,0,1023.0,SCIARAPPA STREET LLC,,3
2177,2167,36-185,1,9 Harding St,,199,CONDO-BLDG,C-1,36-185,2938,...,,,0.0,0.0,,918,0.0,"9 HARDING STREET, LLC",,4
3014,3016,71-56,1,700 Main St,,199,CONDO-BLDG,IB,71-56,138947,...,,,0.0,0.0,,0,0.0,"MIT 650 MAIN STREET, LLC",,3
3502,3498,78-79,1,155 Webster Ave,,1095,MULT-RES-3FAM,C-1,78-79,2689,...,Average,Average,2.0,0.0,,1112,0.0,"PACHECO, ALBERT M. & LEONTINE O. PACHECO, C/O ...",,2
3936,3891,83-25,1,15 Oak St,,199,CONDO-BLDG,C-1,83-25,5744,...,,,0.0,0.0,,2532,0.0,"RESNICK, MARC & R. SCOTT PULVER, TRUSTEE, 15 O...",,2
5093,4996,94-89,1,99 Brookline St,,199,CONDO-BLDG,C,94-89,1866,...,,,0.0,0.0,,926,0.0,"99 BROOKLINE LLC,",,4
7120,6984,109-42,1,207 Prospect St,,199,CONDO-BLDG,C-1,109-42,5000,...,,,0.0,0.0,,592,0.0,CONCEPT PROPERTIES LLC,,4
8207,8053,116-119,1,10 Ellery St,,199,CONDO-BLDG,C-1,116-119,6082,...,,,0.0,0.0,,720,0.0,10 ELLERY STREET LLC,,4
10189,10028,127-152,1,346 Putnam Ave,,199,CONDO-BLDG,C,127-152,2023,...,,,0.0,0.0,,1040,0.0,"346 PUTNAM AVE., LLC",,3


In [31]:
merged_condo_data = pd.merge(
        condo_conversion_data_prior, 
        condo_conversion_data_post, 
        on="Address",
        how='left'
    )
merged_condo_data

Unnamed: 0,PID_prior,GISID_prior,BldgNum_prior,Address,Unit_prior,StateClassCode_prior,PropertyClass_prior,Zoning_prior,MapLot_prior,LandArea_prior,...,Condition_OverallCondition_post,Condition_OverallGrade_post,Parking_Open_post,Parking_Covered_post,Parking_Garage_post,UnfinishedBasementGross_post,FinishedBasementGross_post,Owners_post,PropertyTaxAmount_post,num_condo_units_post
0,690,17-20,1,139 Charles St,,101,SNGL-FAM-RES,"=""C-1""","=""17-20""",2115,...,,,0.0,0.0,,0,821.0,139 CHARLES STREET LLC,,3
1,958,22-55,1,21 Sciarappa St,,104,TWO-FAM-RES,"=""C-1""","=""22-55""",1679,...,,,0.0,0.0,,0,1023.0,SCIARAPPA STREET LLC,,3
2,2167,36-185,1,9 Harding St,,105,THREE-FM-RES,"=""C-1""","=""36-185""",2938,...,,,0.0,0.0,,918,0.0,"9 HARDING STREET, LLC",,4
3,3016,71-50,1,700 Main St,,406,HIGH-TECH,"=""IB""","=""71-50""",83179,...,,,0.0,0.0,,0,0.0,"MIT 650 MAIN STREET, LLC",,3
4,3498,78-79,1,155 Webster Ave,,105,THREE-FM-RES,"=""C-1""","=""78-79""",2689,...,Average,Average,2.0,0.0,,1112,0.0,"PACHECO, ALBERT M. & LEONTINE O. PACHECO, C/O ...",,2
5,3891,83-25,1,15 Oak St,,105,THREE-FM-RES,"=""C-1""","=""83-25""",5744,...,,,0.0,0.0,,2532,0.0,"RESNICK, MARC & R. SCOTT PULVER, TRUSTEE, 15 O...",,2
6,4996,94-89,1,99 Brookline St,,105,THREE-FM-RES,"=""C""","=""94-89""",1866,...,,,0.0,0.0,,926,0.0,"99 BROOKLINE LLC,",,4
7,6714,107-133,1,7 Temple St,,957,Charitable Svc,"=""O-3""","=""107-133""",37068,...,Good,GOOD,,,,0,,"CAMBRIDGE YOUNG WOMEN, CHRISTIAN ASSOCIATES",,2
8,6984,109-42,1,207 Prospect St,,31,MULTIUSE-COM,"=""C-1""","=""109-42""",5000,...,,,0.0,0.0,,592,0.0,CONCEPT PROPERTIES LLC,,4
9,8053,116-119,1,10 Ellery St,,104,TWO-FAM-RES,"=""C-1""","=""116-119""",6082,...,,,0.0,0.0,,720,0.0,10 ELLERY STREET LLC,,4


## Above is an example, now let's write it as a function

In [32]:
def process_condo_conversion(data_prior, data_post, prior_year, post_year):
    # Grouping the data by address and counting the number of records for each address
    count_prior = data_prior.groupby('Address').size()
    count_post = data_post.groupby('Address').size()

    # Identifying addresses which have 1 record in the prior year and more than 1 in the post year
    condo_conversion_addresses = count_post[(count_post > 1) & (count_prior == 1)].index.tolist()

    # Filtering the datasets to include only these addresses
    condo_conversion_data_prior = data_prior[data_prior['Address'].isin(condo_conversion_addresses)]
    condo_conversion_data_post = data_post[data_post['Address'].isin(condo_conversion_addresses)]

    # Calculate the number of condo conversions based on the post data
    condo_conversion_data_post['num_condo_units'] = condo_conversion_data_post.groupby('Address')['Address'].transform('count')
    
    # Keep only the first entry of all duplicate addresses
    condo_conversion_data_post = condo_conversion_data_post.drop_duplicates(subset='Address', keep='first')

    # Renaming all columns for prior year data except for Address column which we will use to merge
    column_mapping = {}
    for column in condo_conversion_data_prior.columns:
        if column != 'Address':
            column_mapping[column] = column + '_prior'
    condo_conversion_data_prior.rename(columns=column_mapping, inplace=True)
    
    # Renaming all columns for post year data except for Address column which we will use to merge
    column_mapping = {}
    for column in condo_conversion_data_post.columns:
        if column != 'Address':
            column_mapping[column] = column + '_post'

    # Rename the columns using the mapping dictionary
    condo_conversion_data_post.rename(columns=column_mapping, inplace=True)

    # Merging the prior and post year datasets based on address
    merged_condo_data = pd.merge(
        condo_conversion_data_prior, 
        condo_conversion_data_post, 
        on="Address",
        how="left"
    )

    # Adding the prior_year and post_year fields
    merged_condo_data['prior_year'] = prior_year
    merged_condo_data['post_year'] = post_year

    return merged_condo_data


In [33]:
def process_conversions_for_years(start_year, end_year, accumulated_df=None):
    if accumulated_df is None:
        accumulated_df = pd.DataFrame()  # Initialize an empty DataFrame if not provided
    
    for year in range(start_year, end_year):
        prior_year = year
        post_year = year + 1

        # Assuming you have data_2015, data_2016, ..., data_2023 loaded or created
        prior_data = globals()[f"data_{prior_year}"]
        post_data = globals()[f"data_{post_year}"]

        result_df = process_condo_conversion(prior_data, post_data, prior_year, post_year)
        accumulated_df = pd.concat([accumulated_df, result_df], ignore_index=True)

    return accumulated_df

# Initialize an empty DataFrame to accumulate the results
accumulated_results = pd.DataFrame()

# Call the function to process condo conversions for years 2015 to 2023 and accumulate the results
accumulated_results = process_conversions_for_years(2015, 2024, accumulated_results)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  condo_conversion_data_post['num_condo_units'] = condo_conversion_data_post.groupby('Address')['Address'].transform('count')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  condo_conversion_data_prior.rename(columns=column_mapping, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  condo_conversion_data_post['num_condo_units'] = condo_conversion_

In [34]:
accumulated_results

Unnamed: 0,PID_prior,GISID_prior,BldgNum_prior,Address,Unit_prior,StateClassCode_prior,PropertyClass_prior,Zoning_prior,MapLot_prior,LandArea_prior,...,Parking_Open_post,Parking_Covered_post,Parking_Garage_post,UnfinishedBasementGross_post,FinishedBasementGross_post,Owners_post,PropertyTaxAmount_post,num_condo_units_post,prior_year,post_year
0,690,17-20,1,139 Charles St,,101,SNGL-FAM-RES,"=""C-1""","=""17-20""",2115,...,0.0,0.0,,0,821.0,139 CHARLES STREET LLC,,3,2015,2016
1,958,22-55,1,21 Sciarappa St,,104,TWO-FAM-RES,"=""C-1""","=""22-55""",1679,...,0.0,0.0,,0,1023.0,SCIARAPPA STREET LLC,,3,2015,2016
2,2167,36-185,1,9 Harding St,,105,THREE-FM-RES,"=""C-1""","=""36-185""",2938,...,0.0,0.0,,918,0.0,"9 HARDING STREET, LLC",,4,2015,2016
3,3016,71-50,1,700 Main St,,406,HIGH-TECH,"=""IB""","=""71-50""",83179,...,0.0,0.0,,0,0.0,"MIT 650 MAIN STREET, LLC",,3,2015,2016
4,3498,78-79,1,155 Webster Ave,,105,THREE-FM-RES,"=""C-1""","=""78-79""",2689,...,2.0,0.0,,1112,0.0,"PACHECO, ALBERT M. & LEONTINE O. PACHECO, C/O ...",,2,2015,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,22504,267C-70,1,767 Concord Ave,,340,GEN-OFFICE,IB-2,267C-70,49856,...,,,,0,,"WEST CAMBRIDGE SCIENCE PARK, LLC",98564.0,2,2023,2024
281,22633,268A-30,1,278 Rindge Ave,,970,Housing Authority,B,268A-30,4561,...,,,,0,,CAMBRIDGE HOUSING AUTHORITY,-1.0,11,2023,2024
282,22666,268B-45,1,402 Rindge Ave,,114,AFFORDABLE APT,C-2,268B-45,155863,...,0.0,0.0,0.0,0,0.0,"RINDGE TOWER APARTMENT LLC,",-1.0,6,2023,2024
283,3767,81-23,1,333 Webster Ave,,331,AUTO-SUPPLY,BA,81-23,6032,...,0.0,0.0,,0,0.0,CITY OF CAMBRIDGE,-1.0,2,2023,2024


## Connecting GISID data

In [35]:
import geopandas as gpd

In [36]:
geo_df = gpd.read_file("all_data/ASSESSING_ParcelsFY2024.geojson")

In [37]:
geo_df.head()

Unnamed: 0,POLY_TYPE,MAP,UYEAR,LOC_ID,GlobalID,EditDate,ML,SOURCE,created_user,Editor,LOT,created_date,PLAN_ID,last_edited_date,last_edited_user,geometry
0,PRIV_ROW,-,0.0,F_761786_2962533,{1118511F-457E-4275-ACDD-1A0C3D8307F0},,---,ASSESS,,,-,,,2023-09-26,SSWEENEY,"POLYGON ((-71.10911 42.37663, -71.10912 42.376..."
1,,263,0.0,F_754404_2965391,{135EB22F-4680-4A35-8736-A2D1E588AE61},,263-23,ASSESS,,,23,,,2023-09-26,SSWEENEY,"POLYGON ((-71.13621 42.38470, -71.13640 42.384..."
2,,201,0.0,F_758048_2966643,{318E939B-73BB-473A-B3A9-B5E3A6A8AB0B},,201-95,ASSESS,,,95,,,2023-09-26,SSWEENEY,"POLYGON ((-71.12275 42.38791, -71.12283 42.387..."
3,,26,0.0,F_768259_2959811,{728A555A-EACA-48C7-9A75-1E00CD92672D},,26-37,ASSESS,,,37,,,2023-09-26,SSWEENEY,"POLYGON ((-71.08510 42.36914, -71.08511 42.369..."
4,,36,0.0,F_767412_2961233,{7EC94D31-F4E5-4B0C-9AE7-13D946E8A754},,36-203,ASSESS,,,203,,,2023-09-26,SSWEENEY,"POLYGON ((-71.08823 42.37308, -71.08825 42.372..."


In [38]:
geo_df.columns

Index(['POLY_TYPE', 'MAP', 'UYEAR', 'LOC_ID', 'GlobalID', 'EditDate', 'ML',
       'SOURCE', 'created_user', 'Editor', 'LOT', 'created_date', 'PLAN_ID',
       'last_edited_date', 'last_edited_user', 'geometry'],
      dtype='object')

In [39]:
merged_conversion_df = pd.merge(accumulated_results, 
                                geo_df, 
                                how="left", 
                                left_on="GISID_prior", 
                                right_on="ML")
merged_conversion_df

Unnamed: 0,PID_prior,GISID_prior,BldgNum_prior,Address,Unit_prior,StateClassCode_prior,PropertyClass_prior,Zoning_prior,MapLot_prior,LandArea_prior,...,ML,SOURCE,created_user,Editor,LOT,created_date,PLAN_ID,last_edited_date,last_edited_user,geometry
0,690,17-20,1,139 Charles St,,101,SNGL-FAM-RES,"=""C-1""","=""17-20""",2115,...,17-20,ASSESS,,,20,,,2023-09-26,SSWEENEY,"POLYGON ((-71.08203 42.36803, -71.08207 42.367..."
1,958,22-55,1,21 Sciarappa St,,104,TWO-FAM-RES,"=""C-1""","=""22-55""",1679,...,,,,,,,,,,
2,2167,36-185,1,9 Harding St,,105,THREE-FM-RES,"=""C-1""","=""36-185""",2938,...,36-185,ASSESS,,KJL,185,,,2023-09-26,SSWEENEY,"POLYGON ((-71.09062 42.37292, -71.09063 42.372..."
3,3016,71-50,1,700 Main St,,406,HIGH-TECH,"=""IB""","=""71-50""",83179,...,,,,,,,,,,
4,3498,78-79,1,155 Webster Ave,,105,THREE-FM-RES,"=""C-1""","=""78-79""",2689,...,78-79,ASSESS,,,79,,,2023-09-26,SSWEENEY,"POLYGON ((-71.09339 42.37031, -71.09317 42.370..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,22504,267C-70,1,767 Concord Ave,,340,GEN-OFFICE,IB-2,267C-70,49856,...,267C-70,ASSESS,,,70,,,2023-09-26,SSWEENEY,"POLYGON ((-71.15371 42.39312, -71.15369 42.393..."
281,22633,268A-30,1,278 Rindge Ave,,970,Housing Authority,B,268A-30,4561,...,,,,,,,,,,
282,22666,268B-45,1,402 Rindge Ave,,114,AFFORDABLE APT,C-2,268B-45,155863,...,268B-45,ASSESS,,,45,,,2023-09-26,SSWEENEY,"POLYGON ((-71.13956 42.39393, -71.13960 42.393..."
283,3767,81-23,1,333 Webster Ave,,331,AUTO-SUPPLY,BA,81-23,6032,...,,,,,,,,,,


In [40]:
merged_conversion_df['ML'].isna().sum()

23

In [42]:
merged_conversion_df.to_csv("katie_condo_conversions.csv")

# 4. Comparing Results

In [44]:
grace_df = pd.read_csv('grace_condo_conversions.csv')
katie_df = pd.read_csv('katie_condo_conversions.csv')

# Check if they have the same columns
same_columns = grace_df.columns.equals(katie_df.columns)

if not same_columns:
    # List the columns that are different
    different_columns_grace = grace_df.columns.difference(katie_df.columns).tolist()
    different_columns_katie = katie_df.columns.difference(grace_df.columns).tolist()
else:
    different_columns_grace = []
    different_columns_katie = []

same_columns, different_columns_grace, different_columns_katie


(False,
 ['Book/Page',
  'Book/Page_post',
  'Book/Page_prior',
  'BookPage',
  'Exterior_Occupancy',
  'Exterior_occupancy',
  'Exterior_occupancy_post',
  'Exterior_occupancy_prior',
  'Map/Lot',
  'Map/Lot_post',
  'Map/Lot_prior',
  'MapLot',
  'Owner_Address',
  'Owner_Address2',
  'Owner_City',
  'Owner_CoOwnerName',
  'Owner_CoOwnerName_post',
  'Owner_CoOwnerName_prior',
  'Owner_Name',
  'Owner_Name_post',
  'Owner_Name_prior',
  'Owner_State',
  'Owner_Zip',
  'Owners',
  'PropertyTaxAmount',
  'num_condo_units'],
 ['Owners_post', 'Owners_prior', 'num_condo_units_post'])

In [46]:
address_unique_grace = grace_df['Address'].is_unique
address_unique_katie = katie_df['Address'].is_unique

address_unique_grace, address_unique_katie


(True, True)

In [54]:
katie_df["Address"] = katie_df["Address"].str.upper()

In [55]:
# Convert the "Address" columns in both DataFrames to sets of unique values
addresses_in_grace = set(grace_df['Address'])
addresses_in_katie = set(katie_df['Address'])

# Find addresses that are unique to grace_df
unique_to_grace = addresses_in_grace - addresses_in_katie

# Find addresses that are unique to katie_df
unique_to_katie = addresses_in_katie - addresses_in_grace

# Print the results
if unique_to_grace:
    print("Addresses unique to grace_df:")
    for address in unique_to_grace:
        print(address)

if unique_to_katie:
    print("Addresses unique to katie_df:")
    for address in unique_to_katie:
        print(address)

if not unique_to_grace and not unique_to_katie:
    print("There are no different addresses. All addresses in both DataFrames are the same.")


There are no different addresses. All addresses in both DataFrames are the same.
