In [29]:
import pdfplumber
import pandas as pd
import re

# Known street-type words. parse_line() uses the first matching token to find
# where the street name ends (comparison is done on the upper-cased token).
street_suffixes = ["ST", "STREET", "AVE", "AVENUE", "BLVD", "ROAD", "RD", "PL", "PLACE", "DR", "DRIVE", "CT", "COURT", "WAY", "LN", "LANE", "PKWY", "TERRACE", "TER", "BROADWAY"]

def parse_line(line):
    """Parse one whitespace-separated text line into an 11-field record.

    Token 0 is taken as the ZIP code and token 1 as the building number.
    The street name is every token between the building number and the
    street-suffix token (a token found in ``street_suffixes``, compared
    case-insensitively).  The last seven tokens of the line are read, in
    order, as county, city, three status fields, block and lot; any tokens
    between the suffix and those seven are ignored.

    A suffix word sitting directly after the building number would leave the
    street name empty (e.g. "100 ST MARKS PL" would yield street "" and
    suffix "ST").  Such a token is therefore only accepted immediately when
    it is a complete street name on its own ("BROADWAY"); otherwise the
    search continues and the leading token is used only as a fallback when
    no later suffix leaves room for the seven trailing fields.

    Args:
        line: One line of text.

    Returns:
        ``[zip_code, bldgno1, street_name, suffix, county, city, status1,
        status2, status3, block, lot]`` (all strings), or ``None`` when the
        line has fewer than 10 tokens, contains no suffix token, or has
        fewer than 7 tokens after the suffix.
    """
    tokens = line.strip().split()
    if len(tokens) < 10:
        return None

    zip_code = tokens[0]
    bldgno1 = tokens[1]

    # Suffix words that are valid street names with nothing in front of them.
    standalone_streets = ("BROADWAY",)

    suffix_idx = -1
    leading_idx = -1  # suffix-like token found right after the building number
    for i, token in enumerate(tokens[2:], start=2):
        upper_token = token.upper()
        if upper_token not in street_suffixes:
            continue
        if i == 2 and upper_token not in standalone_streets:
            # Probably the first word of the street name ("ST MARKS",
            # "COURT"); remember it and keep looking for the real suffix.
            leading_idx = i
            continue
        if leading_idx != -1 and len(tokens) - (i + 1) < 7:
            # This candidate would not leave the seven trailing fields, so
            # it cannot be the suffix; fall back to the leading token.
            break
        suffix_idx = i
        break
    if suffix_idx == -1:
        suffix_idx = leading_idx
    if suffix_idx == -1:
        return None

    street_name = " ".join(tokens[2:suffix_idx])
    suffix = tokens[suffix_idx]
    rest = tokens[suffix_idx+1:]

    if len(rest) < 7:
        return None

    # The trailing fields are addressed from the end of the line so that any
    # extra tokens right after the suffix do not shift them.
    block = rest[-2]
    lot = rest[-1]
    county, city, status1, status2, status3 = rest[-7:-2]

    return [zip_code, bldgno1, street_name, suffix, county, city, status1, status2, status3, block, lot]

# Parse the PDF
pdf_path = "2023-DHCR-Bldg-File-Queens.pdf"
# Defined once and shared by to_csv() and the confirmation message so the
# reported file name can never differ from the file actually written.
output_path = "reparsed_queens_stabilized_buildings.csv"
parsed_data = []

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # Some pdfplumber versions return None (others "") for a page with no
        # extractable text; normalise to "" so the split below cannot fail.
        page_text = page.extract_text() or ""
        for line in page_text.split("\n"):
            # Skip lines starting with "ZIP" (presumably the column-header
            # row) and any line carrying a "Source:" note.
            if line.startswith("ZIP") or "Source:" in line:
                continue
            parsed = parse_line(line)
            if parsed:
                parsed_data.append(parsed)

# Create DataFrame
columns = ["ZIP", "BLDGNO1", "STREET1", "STSUFX1", "COUNTY", "CITY", "STATUS1", "STATUS2", "STATUS3", "BLOCK", "LOT"]
df = pd.DataFrame(parsed_data, columns=columns)

# Build address
# parse_line() can return an empty street name (e.g. "30 BROADWAY"), which
# would leave a double space in the joined string; collapse whitespace runs.
df["full_address"] = (
    df["BLDGNO1"].astype(str).str.strip() + " " +
    df["STREET1"].astype(str).str.strip() + " " +
    df["STSUFX1"].astype(str).str.strip()
).str.replace(r"\s+", " ", regex=True).str.strip().str.upper()

# Save result
df.to_csv(output_path, index=False)
print(f"Saved to '{output_path}'")


Saved to 'reparsed_queens_stabilized_buildings.csv'


In [28]:
import pdfplumber
import pandas as pd
import re

# Known street-type words. parse_line() uses the first matching token to find
# where the street name ends (comparison is done on the upper-cased token).
street_suffixes = ["ST", "STREET", "AVE", "AVENUE", "BLVD", "ROAD", "RD", "PL", "PLACE", "DR", "DRIVE", "CT", "COURT", "WAY", "LN", "LANE", "PKWY", "TERRACE", "TER", "BROADWAY"]

def parse_line(line):
    """Parse one whitespace-separated text line into an 11-field record.

    Token 0 is taken as the ZIP code and token 1 as the building number.
    The street name is every token between the building number and the
    street-suffix token (a token found in ``street_suffixes``, compared
    case-insensitively).  The last seven tokens of the line are read, in
    order, as county, city, three status fields, block and lot; any tokens
    between the suffix and those seven are ignored.

    A suffix word sitting directly after the building number would leave the
    street name empty (e.g. "100 ST MARKS PL" would yield street "" and
    suffix "ST").  Such a token is therefore only accepted immediately when
    it is a complete street name on its own ("BROADWAY"); otherwise the
    search continues and the leading token is used only as a fallback when
    no later suffix leaves room for the seven trailing fields.

    Args:
        line: One line of text.

    Returns:
        ``[zip_code, bldgno1, street_name, suffix, county, city, status1,
        status2, status3, block, lot]`` (all strings), or ``None`` when the
        line has fewer than 10 tokens, contains no suffix token, or has
        fewer than 7 tokens after the suffix.
    """
    tokens = line.strip().split()
    if len(tokens) < 10:
        return None

    zip_code = tokens[0]
    bldgno1 = tokens[1]

    # Suffix words that are valid street names with nothing in front of them.
    standalone_streets = ("BROADWAY",)

    suffix_idx = -1
    leading_idx = -1  # suffix-like token found right after the building number
    for i, token in enumerate(tokens[2:], start=2):
        upper_token = token.upper()
        if upper_token not in street_suffixes:
            continue
        if i == 2 and upper_token not in standalone_streets:
            # Probably the first word of the street name ("ST MARKS",
            # "COURT"); remember it and keep looking for the real suffix.
            leading_idx = i
            continue
        if leading_idx != -1 and len(tokens) - (i + 1) < 7:
            # This candidate would not leave the seven trailing fields, so
            # it cannot be the suffix; fall back to the leading token.
            break
        suffix_idx = i
        break
    if suffix_idx == -1:
        suffix_idx = leading_idx
    if suffix_idx == -1:
        return None

    street_name = " ".join(tokens[2:suffix_idx])
    suffix = tokens[suffix_idx]
    rest = tokens[suffix_idx+1:]

    if len(rest) < 7:
        return None

    # The trailing fields are addressed from the end of the line so that any
    # extra tokens right after the suffix do not shift them.
    block = rest[-2]
    lot = rest[-1]
    county, city, status1, status2, status3 = rest[-7:-2]

    return [zip_code, bldgno1, street_name, suffix, county, city, status1, status2, status3, block, lot]

# Parse the PDF
pdf_path = "2023-DHCR-Bldg-File-Staten-Island.pdf"
# Defined once and shared by to_csv() and the confirmation message so the
# reported file name can never differ from the file actually written.
output_path = "reparsed_staten_stabilized_buildings.csv"
parsed_data = []

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # Some pdfplumber versions return None (others "") for a page with no
        # extractable text; normalise to "" so the split below cannot fail.
        page_text = page.extract_text() or ""
        for line in page_text.split("\n"):
            # Skip lines starting with "ZIP" (presumably the column-header
            # row) and any line carrying a "Source:" note.
            if line.startswith("ZIP") or "Source:" in line:
                continue
            parsed = parse_line(line)
            if parsed:
                parsed_data.append(parsed)

# Create DataFrame
columns = ["ZIP", "BLDGNO1", "STREET1", "STSUFX1", "COUNTY", "CITY", "STATUS1", "STATUS2", "STATUS3", "BLOCK", "LOT"]
df = pd.DataFrame(parsed_data, columns=columns)

# Build address
# parse_line() can return an empty street name (e.g. "30 BROADWAY"), which
# would leave a double space in the joined string; collapse whitespace runs.
df["full_address"] = (
    df["BLDGNO1"].astype(str).str.strip() + " " +
    df["STREET1"].astype(str).str.strip() + " " +
    df["STSUFX1"].astype(str).str.strip()
).str.replace(r"\s+", " ", regex=True).str.strip().str.upper()

# Save result
df.to_csv(output_path, index=False)
# The message previously named a different file than the one written and was
# missing its closing quote; it is now derived from output_path.
print(f"Saved to '{output_path}'")


Saved to 'staten_stabilized_buildings.csv


In [27]:
import pandas as pd
import re
from fuzzywuzzy import fuzz, process

# Inputs for the matching step:
#   - combined_output.csv is the file written by the CSV-combining cell, so
#     that cell has to be run before this one on a fresh kernel.
#   - the complaints file must provide an 'Incident Address' column.
buildings_df = pd.read_csv("combined_output.csv")
complaints_df = pd.read_csv("elevator_complaints_matched_to_nycha.csv")

# Step 1: Normalize addresses
def normalize_address(addr):
    """Return an upper-cased, abbreviation-normalised form of an address.

    Whole-word STREET, AVENUE, ROAD and BOULEVARD are shortened to ST, AVE,
    RD and BLVD, runs of whitespace are collapsed to a single space, and
    leading/trailing whitespace is removed.

    Args:
        addr: The raw address. Non-string values are converted with ``str``.

    Returns:
        The normalised address, or ``""`` when ``addr`` is missing
        (``None``, NaN or ``pd.NA``).
    """
    # A missing value must not be stringified: str(float("nan")).upper() is
    # the literal "NAN", and two such strings compare as identical addresses.
    if addr is None or (pd.api.types.is_scalar(addr) and pd.isna(addr)):
        return ""

    addr = str(addr).upper()
    # \b anchors keep the replacement to whole words (e.g. "STREETER" is kept).
    abbreviations = (
        (r'\bSTREET\b', 'ST'),
        (r'\bAVENUE\b', 'AVE'),
        (r'\bROAD\b', 'RD'),
        (r'\bBOULEVARD\b', 'BLVD'),
    )
    for pattern, short_form in abbreviations:
        addr = re.sub(pattern, short_form, addr)
    return re.sub(r'\s+', ' ', addr).strip()

buildings_df['full_address_clean'] = buildings_df['full_address'].apply(normalize_address)
complaints_df['address_clean'] = complaints_df['Incident Address'].apply(normalize_address)

# Step 2: Fuzzy match
# Keep only real, distinct addresses as match candidates:
#   - rows whose source address is missing or normalises to "" are excluded
#     so a placeholder string can never be returned as a "match";
#   - duplicates are dropped (first occurrence kept, order preserved), which
#     leaves the best match unchanged but shortens every fuzzy scan.
has_address = (
    buildings_df['full_address'].notna()
    & buildings_df['full_address_clean'].ne('')
)
address_lookup = (
    buildings_df.loc[has_address, 'full_address_clean']
    .drop_duplicates()
    .tolist()
)

def fuzzy_match_address(addr, choices, threshold=90):
    """Return the best fuzzy match for ``addr`` among ``choices``.

    Candidates are scored with ``fuzz.token_sort_ratio`` via
    ``process.extractOne``.

    Args:
        addr: Address string to look up.
        choices: Collection of candidate address strings.
        threshold: Minimum score (inclusive) for a match to be accepted.

    Returns:
        The best-scoring candidate when its score is at least ``threshold``;
        ``None`` when the score is lower, when ``addr`` is not a string or is
        blank, or when ``choices`` is empty.
    """
    # Nothing meaningful can be matched for a missing/blank query or against
    # an empty candidate list, so skip the scan entirely.
    if not isinstance(addr, str) or not addr.strip() or len(choices) == 0:
        return None

    best = process.extractOne(addr, choices, scorer=fuzz.token_sort_ratio)
    # extractOne returns None when it finds no candidate; unpacking that
    # directly would raise TypeError.
    if best is None:
        return None

    match, score = best[0], best[1]
    return match if score >= threshold else None

# Each fuzzy lookup scans the whole candidate list, so score every distinct
# complaint address once and map the result back onto the rows instead of
# repeating the scan for every complaint.
unique_addresses = complaints_df['address_clean'].dropna().unique()
match_by_address = {
    address: fuzzy_match_address(address, address_lookup)
    for address in unique_addresses
}
complaints_df['matched_address'] = complaints_df['address_clean'].map(match_by_address)

# Step 3: Flag stabilized buildings
# A complaint counts as stabilized when its address found a match above.
complaints_df['is_stabilized'] = complaints_df['matched_address'].notnull()

# Step 4: Save
complaints_df.to_csv("elevator_complaints_with_stabilized_flag_fuzzy.csv", index=False)
print(f"Done. Matched {complaints_df['is_stabilized'].sum()} out of {len(complaints_df)} complaints.")


Done. Matched 4822 out of 8558 complaints.


In [26]:
import pandas as pd
import glob
import os

# Step 1: Define your folder path and file pattern
folder_path = "nyc_rent"  # <-- change this to your folder
file_pattern = os.path.join(folder_path, "*.csv")

# Step 2: Use glob to get all CSV files in the folder
# glob returns paths in arbitrary order; sorting makes the row order of the
# combined file the same on every run and every machine.
csv_files = sorted(glob.glob(file_pattern))

# Fail with a clear message instead of pandas' generic
# "No objects to concatenate" when the folder is wrong or empty.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found matching '{file_pattern}'")

# Step 3: Read and concatenate all CSVs
combined_df = pd.concat([pd.read_csv(f) for f in csv_files], ignore_index=True)

# Step 4: Save to a single CSV
# The output is written to the working directory, not into folder_path, so
# it is not picked up by the glob on a later run.
combined_df.to_csv("combined_output.csv", index=False)

print(f"Combined {len(csv_files)} files into 'combined_output.csv'")


Combined 5 files into 'combined_output.csv'
