Import libraries

In [24]:
import pandas as pd
from datetime import datetime
import os

## Step 1: Load data from URL

TODO: The column headers do not start on the same row in every year's file, which is why the loader detects the header row instead of assuming a fixed position. In some years, however, the expected leading headers ('boro', 'block', 'lot', ...) sit on a different row than the rest of the column headers. We need to build guardrails for that case.

In [51]:
def load_excel_by_column_names(url, engine=None, expected_start=("boro", "block", "lot"), max_scan=30):
    """
    Loads an Excel file starting from the first row where column headers match `expected_start`.

    The sheet is first read without a header. Each of the first `max_scan` rows is
    normalised (cast to str, stripped, lower-cased) and its leading cells are compared
    with `expected_start`. On the first match the file is read again with the rows
    above the match skipped, so the matching row becomes the column header.

    Parameters
    ----------
    url
        Location of the workbook; passed unchanged to `pd.read_excel`.
    engine
        Forwarded to `pd.read_excel` for both reads.
    expected_start
        Leading header cells to look for, already stripped and lower-cased.
    max_scan
        Maximum number of rows (from the top of the sheet) to inspect.

    Returns
    -------
    The DataFrame produced by `pd.read_excel(url, skiprows=<matching row>, engine=engine)`.

    Raises
    ------
    ValueError
        If none of the inspected rows starts with `expected_start`.
    """
    temp = pd.read_excel(url, header=None, engine=engine)

    wanted = list(expected_start)
    # Never scan past the end of the sheet: a file shorter than `max_scan` rows must
    # end in the ValueError below, not in an IndexError raised by `.iloc`.
    scan_limit = min(max_scan, len(temp))

    for i in range(scan_limit):
        row = temp.iloc[i].astype(str).str.strip().str.lower().tolist()
        if row[:len(wanted)] == wanted:
            # NOTE(review): the number printed is `i - 1`, while the rows skipped below
            # are `i`; the message is left untouched so recorded outputs stay comparable.
            print(f"✅ Header row found at line {i - 1}")
            return pd.read_excel(url, skiprows=i, engine=engine)

    raise ValueError("❌ Header row not found within scan range.")

<b> Use this (change file path) if it is an xlsx file </b>

In [52]:
# .xlsx workbook: parsed with the openpyxl engine.
df_2025 = load_excel_by_column_names(
    url="https://www.nyc.gov/assets/finance/downloads/pdf/nopv/revised/revised_nopv_04182025.xlsx",
    engine="openpyxl",
)

✅ Header row found at line 10


<b> Use this (change file path) if it is an xls file </b>

In [30]:
# Legacy .xls workbook: parsed with the xlrd engine.
df_2022 = load_excel_by_column_names(
    url="https://www.nyc.gov/assets/finance/downloads/pdf/nopv/revised/revised-nopv-05182022.xls",
    engine="xlrd",
)

✅ Header row found at line 11


<b> Check </b>

In [53]:
# Inspect the column headers exactly as they were read from the workbook.
# (A bare `df_2025` above the print displayed nothing: only a cell's last
# expression is rendered, so it has been dropped.)
print(df_2025.columns)

Index(['BORO', 'BLOCK', 'LOT', 'EASE', 'ADDRESS 1', 'ADDRESS 2', 'ADDRESS 3',
       'CITY, STATE, ZIP', 'COUNTRY', 'TAX CLASS', 'BLDG CLASS',
       'MARKET VALUE', 'ASSESSED VALUE', 'EXEMPTION',
       'TRANSITIONAL ASSESSSED VALUE', 'TRANSITIONAL EXEMPTION',
       'TAXABLE VALUE', 'MARKET VALUE.1', 'TRANSITIONAL ASSESSSED VALUE.1',
       'EXEMPTION.1', 'TRANSITIONAL ASSESSSED VALUE.2',
       'TRANSITIONAL EXEMPTION.1', 'TAXABLE VALUE.1', 'RC1', 'RC2', 'RC3',
       'RC4', 'RC5'],
      dtype='object')


## Step 2: Make clean function

TODO: Some DOF columns are not named correctly. For example, in the 05/2025 file there were two revised transitional assessed value columns.

Open question: can we drop the repeated columns?

In [54]:
# Lookup table from RC code to its human-readable description.
# `clean_nopv` carries an identical local copy, which it applies to the rc1-rc5 columns.
RC_REASON_MAP = {
    # single-letter and two-letter codes up to "E"
    "A": "Alteration", "AP": "Apportionment",
    "B": "Building in Progress Last Year",
    "D": "Demolition",
    "E": "Fully Exempt and now restored to taxable",
    # numbered "E" codes
    "E0": "Sales", "E1": "Economics",
    "E2": "Admin Review (Assessor initiated)",
    "E3": "Sales (Taxpayer initiated)",
    "E4": "Economics (Taxpayer initiated)",
    "E5": "Admin Review (Taxpayer initiated)",
    # remaining codes
    "M": "MV only Change",
    "N": "New Building", "NP": "New Building in Progress",
    "P": "Physical Change",
    "S": "Street Gain or Loss",
    "T": "Transfer to/from REUC",
    "X": "Other Change",
}


In [55]:
def _zero_padded_id(series, width):
    """Render an identifier column as text left-padded with zeros to `width` characters."""
    text = series.astype(str).str.strip()
    # block/lot can arrive as floats (16.0, 3.0); without removing the trailing ".0"
    # the padded value would be "016.0" instead of "00016".
    text = text.str.replace(r"\.0+$", "", regex=True)
    return text.str.zfill(width)


def clean_nopv(df, fiscal_year, publish_date):
    """
    Cleans a Notice of Property Value DataFrame by standardizing structure and adding key fields.

    Steps, in order:
      1. Column names are stripped, lower-cased and spaces become underscores.
      2. Rows with a missing `boro`, `block` or `lot` are dropped.
      3. Known pandas-suffixed duplicate columns are renamed to `*_final_roll`.
      4. A `bbl` string column is built from boro (1 char), block (5) and lot (4).
      5. `fiscal_year`, `date_published` and the year/month/day of `publish_date` are added.
      6. For each of rc1..rc5 that exists, an `<col>_desc` column is added; codes
         that are not in the table become "Unknown".

    Parameters
    ----------
    df
        Raw DataFrame; it is not modified, a cleaned copy is returned.
    fiscal_year
        Value stored in the `fiscal_year` column.
    publish_date
        Object exposing `.year`, `.month` and `.day` (e.g. `datetime`); stored in
        `date_published` and split into the three `published_*` columns.

    Returns
    -------
    A new DataFrame with the normalised columns plus the added fields.

    Raises
    ------
    KeyError
        From `dropna` when `boro`, `block` or `lot` is missing after normalisation.
    """
    # Local copy of the code table so the function is self-contained; it mirrors the
    # notebook-level RC_REASON_MAP.
    RC_REASON_MAP = {
        "A": "Alteration", "AP": "Apportionment", "B": "Building in Progress Last Year",
        "D": "Demolition", "E": "Fully Exempt and now restored to taxable", "E0": "Sales",
        "E1": "Economics", "E2": "Admin Review (Assessor initiated)", "E3": "Sales (Taxpayer initiated)",
        "E4": "Economics (Taxpayer initiated)", "E5": "Admin Review (Taxpayer initiated)",
        "M": "MV only Change", "N": "New Building", "NP": "New Building in Progress",
        "P": "Physical Change", "S": "Street Gain or Loss", "T": "Transfer to/from REUC",
        "X": "Other Change"
    }

    # Work on a copy so the caller's raw frame keeps its original column names.
    df = df.copy()

    # 1. Normalize column names
    df.columns = (
        df.columns.astype(str).str.strip().str.lower().str.replace(" ", "_", regex=False)
    )

    # 2. Drop rows without core identifying info
    #    (.copy() so the column assignments below write to an independent frame
    #    and cannot raise SettingWithCopyWarning)
    df = df.dropna(subset=["boro", "block", "lot"]).copy()

    # 3. Rename final roll value columns if they exist
    #    The triple "s" in "assesssed" matches the header spelling in the source file.
    #    NOTE(review): "exemption.1" and "transitional_assesssed_value.2" are not in
    #    this map and keep their suffixed names — confirm whether that is intended.
    rename_map = {
        "market_value.1": "market_value_final_roll",
        "taxable_value.1": "taxable_value_final_roll",
        "transitional_assesssed_value.1": "transitional_assessed_value_final_roll",
        "transitional_exemption.1": "transitional_exemption_final_roll"
    }
    df = df.rename(columns={k: v for k, v in rename_map.items() if k in df.columns})

    # 4. Create BBL
    df["bbl"] = (
        _zero_padded_id(df["boro"], 1) +
        _zero_padded_id(df["block"], 5) +
        _zero_padded_id(df["lot"], 4)
    )

    # 5. Add publication metadata
    df["fiscal_year"] = fiscal_year
    df["date_published"] = publish_date
    df["published_year"] = publish_date.year
    df["published_month"] = publish_date.month
    df["published_day"] = publish_date.day

    # 6. Translate RC codes to descriptions
    for col in ["rc1", "rc2", "rc3", "rc4", "rc5"]:
        if col in df.columns:
            df[f"{col}_desc"] = df[col].astype(str).str.strip().map(RC_REASON_MAP).fillna("Unknown")

    return df


In [57]:
# `datetime` comes from the import cell at the top of the notebook; it is not re-imported here.

# Reload the raw workbook so cleaning starts from an untouched frame.
df_2025_raw = load_excel_by_column_names(
    "https://www.nyc.gov/assets/finance/downloads/pdf/nopv/revised/revised_nopv_04182025.xlsx",
    engine="openpyxl"
)

# NOTE(review): the file name carries 04182025 while publish_date is 2025-05-14 —
# confirm which date is the intended publication date.
df_2025 = clean_nopv(df_2025_raw, fiscal_year=2025, publish_date=datetime(2025, 5, 14))
df_2025.head()


✅ Header row found at line 10


Unnamed: 0,boro,block,lot,ease,address_1,address_2,address_3,"city,_state,_zip",country,tax_class,...,fiscal_year,date_published,published_year,published_month,published_day,rc1_desc,rc2_desc,rc3_desc,rc4_desc,rc5_desc
0,1,16.0,3.0,,,55 WATER ST,,NEW YORK NY 10041-0004,,4,...,2025,2025-05-14,2025,5,14,Economics,Unknown,Unknown,Unknown,Unknown
1,1,53.0,12.0,,,4601 PARK RD STE 450,,CHARLOTTE NC 28209-3568,,2,...,2025,2025-05-14,2025,5,14,Economics,Unknown,Unknown,Unknown,Unknown
2,1,107.0,134.0,,,28 OLD MILL RD,,MANHASSET NY 11030-3310,,1,...,2025,2025-05-14,2025,5,14,Economics,Unknown,Unknown,Unknown,Unknown
3,1,132.0,1403.0,,,83 MURRAY STREET,APT 2,NEW YORK NY 10007,,2C,...,2025,2025-05-14,2025,5,14,Economics,Unknown,Unknown,Unknown,Unknown
4,1,144.0,1204.0,,,159 DUANE ST PH,,NEW YORK NY 10013-3890,,2C,...,2025,2025-05-14,2025,5,14,Economics,Unknown,Unknown,Unknown,Unknown


In [58]:
# Column names after cleaning: the normalised headers plus the fields added by clean_nopv.
print(df_2025.columns)

Index(['boro', 'block', 'lot', 'ease', 'address_1', 'address_2', 'address_3',
       'city,_state,_zip', 'country', 'tax_class', 'bldg_class',
       'market_value', 'assessed_value', 'exemption',
       'transitional_assesssed_value', 'transitional_exemption',
       'taxable_value', 'market_value_final_roll',
       'transitional_assessed_value_final_roll', 'exemption.1',
       'transitional_assesssed_value.2', 'transitional_exemption_final_roll',
       'taxable_value_final_roll', 'rc1', 'rc2', 'rc3', 'rc4', 'rc5', 'bbl',
       'fiscal_year', 'date_published', 'published_year', 'published_month',
       'published_day', 'rc1_desc', 'rc2_desc', 'rc3_desc', 'rc4_desc',
       'rc5_desc'],
      dtype='object')


## Step 3: Append to repository dataset

In [59]:
# `datetime` comes from the import cell at the top of the notebook; it is not re-imported here.

# Load and clean the raw data
df_2025_raw = load_excel_by_column_names(
    "https://www.nyc.gov/assets/finance/downloads/pdf/nopv/revised/revised_nopv_04182025.xlsx",
    engine="openpyxl"
)

# The cleaned frame is kept under its own name (`df_cleaned`) for the append step.
df_cleaned = clean_nopv(df_2025_raw, fiscal_year=2025, publish_date=datetime(2025, 5, 14))


✅ Header row found at line 10


In [60]:
# Append to master
# NOTE(review): `df_master` is not defined in the cells visible here — it must already
# exist in the kernel for this cell to run; confirm where it is loaded.
# NOTE(review): this rebinds `df_master` to itself plus `df_cleaned`, so running the
# cell twice appends the same rows twice.
df_master = pd.concat([df_master, df_cleaned], ignore_index=True)
...  # bare Ellipsis expression; as the cell's last expression it is what gets displayed


Ellipsis

In [61]:
# Write the combined master table to `master_path` as CSV, leaving the row index out.
# NOTE(review): `master_path` is not defined in the cells visible here — confirm its source.
df_master.to_csv(path_or_buf=master_path, index=False)


In [62]:
# Read the CSV back from disk and show its last five rows.
# NOTE(review): the path is hard-coded here while the write above uses `master_path` —
# confirm both point at the same file.
pd.read_csv(filepath_or_buffer="../data/cleaned/nopv_master.csv").tail(n=5)


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,boro,block,lot,ease,address_1,address_2,address_3,"city,_state,_zip",country,tax_class,...,fiscal_year,date_published,published_year,published_month,published_day,rc1_desc,rc2_desc,rc3_desc,rc4_desc,rc5_desc
31064,5,7913.0,76.0,,,131 CARTERET ST,,STATEN ISLAND NY 10307-1606,,1,...,2025,2025-05-14,2025,5,14,Economics,Unknown,Unknown,Unknown,Unknown
31065,5,7945.0,52.0,,,45 SATTERLEE ST,,STATEN ISLAND NY 10307-1501,,1,...,2025,2025-05-14,2025,5,14,Other Change,Unknown,Unknown,Unknown,Unknown
31066,5,8012.0,32.0,,,11 CRAIG AVE,,STATEN ISLAND NY 10307-1328,,1,...,2025,2025-05-14,2025,5,14,Economics,Unknown,Unknown,Unknown,Unknown
31067,5,8025.0,88.0,,,272 CARTERET ST,,STATEN ISLAND NY 10307-1631,,1,...,2025,2025-05-14,2025,5,14,Economics,Unknown,Unknown,Unknown,Unknown
31068,5,8047.0,73.0,,,198 JOHNSON AVE,,STATEN ISLAND NY 10307-1262,,1,...,2025,2025-05-14,2025,5,14,Building in Progress Last Year,Economics,Unknown,Unknown,Unknown
