In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import spacy
import re
from thefuzz import fuzz, process
from collections import defaultdict

In [None]:
df = pd.read_csv("../data/raw/Building_Permits__Addition_Alteration_20250305.csv")
display(df.head(5))

### Clean Columns

In [None]:
# Normalize column names: trim surrounding whitespace, lowercase, and replace spaces with underscores
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
df.columns

In [None]:
df.dtypes

In [None]:
df["description"]

In [None]:
df["building_construction_type"].unique()

In [None]:
df.isna().sum().sort_values(ascending=False).head(50)

In [None]:
df["energy_compliance_option"].value_counts()

### Fill NaN Values

In [None]:
df["description_of_demolition"] = df["description_of_demolition"].fillna("No Description")
df["description_of_demolition"].value_counts()

In [None]:
df["debris_disposal"] = df["debris_disposal"].fillna("Other")
df["debris_disposal"].value_counts()

In [None]:
df["method_of_removal"] = df["method_of_removal"].fillna("Other")
df["method_of_removal"].value_counts()

In [None]:
df["number_of_units"] = df["number_of_units"].fillna(0)
df["number_of_units"].describe()

In [None]:
df[df["number_of_units"].isna() == True]

In [None]:
df["current_property_use"] = df["current_property_use"].fillna("Not Specified")
df["current_property_use"].value_counts()

In [None]:
df["building_use"].value_counts()

In [None]:
df["size_of_new_addition"] = df["size_of_new_addition"].fillna(0)
df["size_of_new_addition"].describe()

In [None]:
df["firm_name"] = df["firm_name"].fillna("Other")
df["firm_name"].value_counts()

In [None]:
df["isd_description"] = df["isd_description"].fillna("Not Specified")
df["isd_description"].value_counts()

In [None]:
df.isna().sum().sort_values(ascending=True).head(50)

In [None]:
df.shape

### Drop selected Columns

In [None]:
# Columns removed up front because they are not used in the rest of this notebook
columns_to_drop = [
    "address_for_mapping",
    "submit_date",
    "change_in_units",
    "change_in_property_use",
    "maplot_number",
    "id_field",
    "viewpoint_id",
]
df = df.drop(columns=columns_to_drop)

### Drop all Columns where NaN values are above the given threshold

In [None]:
# Count NaN values in each column
nan_counts = df.isna().sum()

# Column names ordered from fewest to most NaN values
sorted_columns = nan_counts.sort_values().index.tolist()

# Report the NaN counts and the resulting column order
print("\nNaN counts per column:")
print(nan_counts)
print("\nColumns sorted by NaN counts (ascending):")
print(sorted_columns)

# DataFrame with its columns ordered by NaN count (kept for inspection)
sorted_df = df[sorted_columns]

# Keep only the columns that have fewer than `threshold` NaN values
threshold = 5000
filtered_columns = nan_counts[nan_counts < threshold].index.tolist()
# .copy() makes df an independent frame, so the column assignments in later
# cells do not write into a slice (SettingWithCopyWarning)
df = df[filtered_columns].copy()
print("\nShape of DataFrame with columns having fewer than", threshold, "NaNs:")
print(df.shape)

### Cast date column as datetime

In [None]:
# Parse the issue date once; unparseable values become NaT because of errors='coerce'
issue_dates = pd.to_datetime(df['issue_date'], errors='coerce')
df['issue_date'] = issue_dates

# Derive the calendar year and month from the parsed dates
df['issue_year'] = issue_dates.dt.year
df['issue_month'] = issue_dates.dt.month

earliest_issue = df['issue_date'].min()
latest_issue = df['issue_date'].max()
print("\nDate range of permits:")
print(f"From {earliest_issue} to {latest_issue}")

In [None]:
def get_season(month):
    """
    Map a calendar month number to a season name.

    Args:
        month: Month number 1-12 (int or float). May be NaN/None when the
            issue date could not be parsed.

    Returns:
        str or None: 'Winter' (12, 1, 2), 'Spring' (3-5), 'Summer' (6-8) or
        'Autumn' (9-11); None when the month is missing or outside 1-12.
    """
    # A missing month must stay missing instead of silently becoming 'Autumn'
    if pd.isna(month):
        return None
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Autumn'
    # Anything else is not a valid month number
    return None
        
df['season'] = df['issue_month'].apply(get_season)

### Handle missing True/False and fill NaN values

In [None]:
df["condo_association"] = df["condo_association"].fillna(False)
# df["condo_association"] = df["condo_association"].apply(lambda x: int(x))
df["condo_association"].value_counts()


In [None]:
df["bza_case"] = df["bza_case"].fillna(False)
# df["bza_case"] = df["bza_case"].apply(lambda x: int(x))
df["bza_case"].value_counts()

In [None]:
df["planning_board_special_permit"] = df["planning_board_special_permit"].fillna(False)
# df["planning_board_special_permit"] = df["planning_board_special_permit"].apply(lambda x: int(x))
df["planning_board_special_permit"].value_counts()


In [None]:
# Every value other than the literal "No" is flagged as True.
# NOTE(review): missing values (NaN) also compare unequal to "No" and therefore become True,
# unlike the sibling columns above where NaN is filled with False - confirm this is intended.
df["bicycle_parking_change"] = df["bicycle_parking_change"] != "No"
df["bicycle_parking_change"].value_counts()

In [None]:
# df["status"] = df["status"].apply(lambda x: int(0) if x == "Active" else int(1))
df["status"].value_counts()

### Drop remaining rows with NaN values

In [None]:
df.isna().sum().sort_values(ascending=False).head(25)

In [None]:
df = df.dropna(how="any")

In [None]:
df.isna().sum().sort_values(ascending=False).head(20)

### Create calculated columns

#### Total Cost

In [None]:
# Cost components that make up the recomputed total, added left to right
cost_components = [
    "building_cost",
    "electrical_cost",
    'plumbing_cost',
    'gas_cost',
    'hvac_cost',
    'fire_prevention_cost',
]
df["calc_total_cost"] = sum(
    (df[component] for component in cost_components[1:]),
    df[cost_components[0]],
)
# Show the reported total next to the recomputed one
df[["total_cost", "calc_total_cost"]]

In [None]:
# Drop original total_cost
df.drop("total_cost", axis=1, inplace=True)

### Dropping Outliers

#### Total Cost

In [None]:
def plot_outliers(df, column):
    """
    Show a box plot and a histogram (with KDE) of one column side by side.

    Args:
        df (pandas.DataFrame): Frame that holds the data to plot
        column (str): Name of the column to visualize

    Returns:
        None: the figure is rendered with plt.show()
    """
    values = df[column]

    # One row, two panels: box plot on the left, histogram on the right
    figure, axes = plt.subplots(1, 2, figsize=(20, 8))
    box_axis, hist_axis = axes

    sns.boxplot(x=values, color="lightblue", ax=box_axis)
    sns.histplot(values, kde=True, bins=30, color="salmon", ax=hist_axis)

    plt.tight_layout()
    plt.show()

# Subsets capped below a fixed limit so extreme values do not flatten the plots
df_test_total_cost = df.loc[df["calc_total_cost"] < 10000000]
df_test_number_of_units = df.loc[df["number_of_units"] < 10]

# Column name -> subset to plot for that column
testing_dict = dict(
    calc_total_cost=df_test_total_cost,
    number_of_units=df_test_number_of_units,
)

for column_name, subset in testing_dict.items():
    plot_outliers(subset, column_name)

In [None]:
plot_outliers(df[df["electrical_cost"] > 1000000], "electrical_cost")

In [None]:
plot_outliers(df[df["fire_prevention_cost"] < 250000], "fire_prevention_cost")

In [None]:
df = df[df["fire_prevention_cost"] < 250000]

In [None]:
plot_outliers(df[df["hvac_cost"] < 250000], "hvac_cost")

#### Discretizing Features to include Outliers

##### Calculated Total Cost

In [None]:
# Discretize 'calc_total_cost' into 4 categories (low / medium / high / very high).
# The last bin is open-ended (np.inf): using the column maximum as the top edge
# breaks pd.cut when that maximum is at or below 2,000,000, because the bin
# edges would no longer be strictly increasing.
bins = [0, 20000, 200000, 2000000, np.inf]
labels = ["low", "medium", "high", "very high"]
df['total_cost_bins'] = pd.cut(df['calc_total_cost'], bins=bins, labels=labels, include_lowest=True)

df['total_cost_bins'].value_counts()

In [None]:
df = df[df["calc_total_cost"] < 10000000]
df = df[df["number_of_units"] < 200]
df

##### Drop One/Two Family dwellings that report more than 2 units

In [None]:
# A one/two-family dwelling cannot have more than two units; such rows are inconsistent
inconsistent_units = (df["number_of_units"] > 2.0) & (df["building_use"] == "One or Two Family Dwelling")
df = df.drop(index=df.index[inconsistent_units])


In [None]:
df[(df["number_of_units"] > 2.0) & (df["building_use"] == "One or Two Family Dwelling")]

##### Deleting edge cases

In [None]:
df[df["record_number"] == 2323]

In [None]:
# Record numbers of the hand-picked edge cases to remove
edge_case_records = [3877, 8409, 2304, 2323, 106]
is_edge_case = df["record_number"].isin(edge_case_records)
df = df.drop(index=df.index[is_edge_case])

### Clean Firm Name Typos

In [None]:
display(list(df["firm_name"].unique()))
df["firm_name"].nunique()

In [None]:
from firm_name_cleaner import clean_firm_names

In [None]:
df = clean_firm_names(df)


In [None]:
display(list(df["standardized_firm_name"].unique()))
df["standardized_firm_name"].nunique()

In [None]:
display(df[['original_firm_name', 'firm_name', 'standardized_firm_name']].head(50))

### Convert Description into a soup of words

In [None]:
from tqdm.auto import tqdm

nlp = spacy.load('en_core_web_sm')

def clean_text(text):
    """
    Clean and preprocess text before keyword extraction

    Steps: fix a known typo, replace parentheses with spaces, collapse all
    whitespace runs to single spaces, then join a number with a following
    single letter ("5 x " -> "5x ").

    Args:
        text (str): Input text (NaN/None/empty is accepted)

    Returns:
        str: Cleaned text, or '' for missing/empty input
    """
    # Handle NaN values
    if pd.isna(text) or text == '':
        return ''

    text = str(text)

    # Fix common typos
    text = text.replace("wityh", "with")

    # Replace parentheses with spaces so the words around them stay separated
    text = re.sub(r'[\(\)]', ' ', text)

    # Fix spacing AFTER the parentheses are gone: the replacement above inserts
    # extra spaces that would otherwise survive as double or edge spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Clean up numbers followed by single characters
    text = re.sub(r'(\d+)\s+([a-zA-Z])\s', r'\1\2 ', text)

    return text

def extract_keywords_with_context(text):
    """
    Build the list of keyword phrases for one description.

    The text is cleaned, parsed with spaCy, run through every extractor in a
    fixed order, and the combined candidates are filtered at the end.

    Args:
        text (str): Input text

    Returns:
        list: Filtered keyword phrases ([] when the cleaned text is empty)
    """
    cleaned = clean_text(text)
    if not cleaned:
        return []

    doc = nlp(cleaned)

    # Extractors run in this order; their results are concatenated
    extractors = (
        extract_construction_verb_phrases,        # install wall, replace floor
        extract_material_component_combinations,  # metal stud, wood floor
        extract_measurements_with_context,        # 5/8 fire, 3 kw
        extract_noun_phrases,                     # kitchen floor, hood vent
        extract_compound_terms,                   # noun+noun combinations
    )

    candidates = []
    for extractor in extractors:
        candidates.extend(extractor(doc))

    return filter_and_clean_keywords(candidates)

def extract_construction_verb_phrases(doc):
    """
    Collect "<verb lemma> <object phrase>" strings for construction verbs.

    For every token whose lowercased lemma is a construction verb, each
    NOUN/PROPN child becomes an object phrase made of the child's qualifying
    modifiers (ADJ/NOUN/PROPN or 'compound' dependency) followed by the child
    itself, all lowercased.

    Args:
        doc: Iterable of spaCy-like tokens

    Returns:
        list: Unique action phrases in order of first appearance
    """
    verbs = ('install', 'replace', 'repair', 'remove', 'build', 'construct')
    noun_tags = ('NOUN', 'PROPN')
    modifier_tags = ('ADJ', 'NOUN', 'PROPN')

    phrases = []
    for verb in doc:
        action = verb.lemma_.lower()
        if action not in verbs:
            continue

        for noun in verb.children:
            if noun.pos_ not in noun_tags:
                continue

            # Modifiers first, then the noun itself
            words = [
                modifier.text.lower()
                for modifier in noun.children
                if modifier.pos_ in modifier_tags or modifier.dep_ == 'compound'
            ]
            words.append(noun.text.lower())
            object_phrase = ' '.join(words)

            action_phrase = f"{action} {object_phrase}"
            if action_phrase not in phrases:
                phrases.append(action_phrase)

    return phrases

def extract_material_component_combinations(doc):
    """
    Pair each material word with component words found in the next three tokens.

    Matching is done on lowercased lemmas.

    Args:
        doc: Sequence of spaCy-like tokens

    Returns:
        list: Unique "<material> <component>" strings in order of first appearance
    """
    materials = ('metal', 'wood', 'vinyl', 'plastic', 'steel', 'aluminum', 'fire')
    components = ('stud', 'wall', 'floor', 'ceiling', 'door', 'window', 'vent', 'hood', 'cabinet', 'heat')

    lemmas = [token.lemma_.lower() for token in doc]

    pairs = []
    for position, lemma in enumerate(lemmas):
        if lemma not in materials:
            continue
        # Look-ahead window: up to three tokens after the material
        for following in lemmas[position + 1:position + 4]:
            if following not in components:
                continue
            pair = f"{lemma} {following}"
            if pair not in pairs:
                pairs.append(pair)

    return pairs

def extract_measurements_with_context(doc):
    """
    Pair number-like tokens with the token that follows them.

    A token counts as a measurement when its text starts with a fraction,
    a decimal or an integer. The pair is skipped when the following token
    is a stopword or at most one character long.

    Args:
        doc: Sequence of spaCy-like tokens

    Returns:
        list: Unique "<number> <next token lowercased>" strings in order of first appearance
    """
    number_re = re.compile(r'\d+/\d+|\d+\.\d+|\d+')

    found = []
    # The last token has no successor, so stop one position early
    for position in range(len(doc) - 1):
        current = doc[position]
        following = doc[position + 1]

        if not number_re.match(current.text):
            continue
        if following.is_stop or len(following.text) <= 1:
            continue

        pair = f"{current.text} {following.text.lower()}"
        if pair not in found:
            found.append(pair)

    return found

def extract_noun_phrases(doc):
    """
    Turn short noun chunks (2-3 tokens) into lowercased lemma phrases.

    Punctuation, stopwords and one-character tokens are left out of the
    phrase; chunks made only of stopwords are ignored.

    Args:
        doc: Object exposing `noun_chunks` (spaCy Doc or compatible)

    Returns:
        list: Unique phrases longer than two characters, in order of first appearance
    """
    phrases = []

    for chunk in doc.noun_chunks:
        if not (2 <= len(chunk) <= 3):
            continue
        if all(token.is_stop for token in chunk):
            continue

        kept_lemmas = [
            token.lemma_.lower()
            for token in chunk
            if not (token.is_punct or len(token.text) <= 1 or token.is_stop)
        ]
        phrase = ' '.join(kept_lemmas)

        if len(phrase) > 2 and phrase not in phrases:
            phrases.append(phrase)

    return phrases

def extract_compound_terms(doc):
    """
    Collect adjacent noun pairs as "<lemma> <lemma>" compounds.

    Both tokens must be NOUN or PROPN and neither may be a stopword.

    Args:
        doc: Sequence of spaCy-like tokens

    Returns:
        list: Unique lowercased compounds in order of first appearance
    """
    noun_tags = ('NOUN', 'PROPN')
    compounds = []

    for position in range(len(doc) - 1):
        left, right = doc[position], doc[position + 1]

        if left.pos_ not in noun_tags or right.pos_ not in noun_tags:
            continue
        if left.is_stop or right.is_stop:
            continue

        term = f"{left.lemma_.lower()} {right.lemma_.lower()}"
        if term not in compounds:
            compounds.append(term)

    return compounds

def filter_and_clean_keywords(keywords):
    """
    Final filtering and cleaning of keywords

    A keyword is rejected when it is empty, is a single word of at most two
    characters, or contains a stand-alone letter or a parenthesis. Exact
    duplicates are collapsed to their first occurrence. Finally, a keyword is
    dropped when it is contained in a longer keyword that itself passed the
    filters.

    Args:
        keywords (list): Candidate keyword strings

    Returns:
        list: Unique, filtered keywords in order of first appearance
    """
    valid_keywords = []
    seen = set()

    for keyword in keywords:
        # Skip empty strings and exact duplicates (different extractors can
        # produce the same phrase, which would inflate keyword frequencies)
        if not keyword.strip() or keyword in seen:
            continue

        # Skip single words that are too short
        words = keyword.split()
        if len(words) == 1 and len(keyword) <= 2:
            continue

        # Skip phrases with single characters or parentheses
        # (this also covers "<number> <single letter>" phrases)
        if re.search(r'\b[a-zA-Z]\b|\(|\)', keyword):
            continue

        seen.add(keyword)
        valid_keywords.append(keyword)

    # Drop keywords contained in a longer keyword. Only keywords that passed
    # the filters count here, so a rejected phrase can no longer suppress a
    # valid shorter one.
    return [
        keyword
        for keyword in valid_keywords
        if not any(keyword != other and keyword in other for other in valid_keywords)
    ]

def process_dataframe(df, description_col='description', id_col=None):
    """
    Process a dataframe to extract keywords from a description column

    Args:
        df (pandas.DataFrame): Input dataframe (not modified; a copy is returned)
        description_col (str): Name of the description column
        id_col (str, optional): Name of the ID column to copy into
            'record_number'. If None, an existing 'record_number' column is
            kept as is; the dataframe index is used only when no such column
            exists.

    Returns:
        pandas.DataFrame: Copy of the input with 'record_number' and 'keywords' columns

    Raises:
        ValueError: If `description_col` or `id_col` is not a column of `df`
    """
    # Check if the description column exists
    if description_col not in df.columns:
        raise ValueError(f"Column '{description_col}' not found in dataframe")

    # Make a copy to avoid modifying the original
    result_df = df.copy()

    if id_col is None:
        # Never overwrite an existing 'record_number': it holds the original
        # IDs that the keyword table is linked by. Fall back to the index only
        # when the column is absent.
        if 'record_number' not in result_df.columns:
            result_df['record_number'] = result_df.index
    else:
        if id_col not in df.columns:
            raise ValueError(f"ID column '{id_col}' not found in dataframe")
        result_df['record_number'] = result_df[id_col]

    # Extract keywords for each row with progress bar
    print(f"Extracting keywords from '{description_col}' column...")
    descriptions = result_df[description_col].tolist()
    result_df['keywords'] = [extract_keywords_with_context(text) for text in tqdm(descriptions)]

    print("Keyword extraction complete. Added 'keywords' column.")
    return result_df

In [None]:
# Pass the ID column explicitly so 'record_number' keeps the original record
# numbers instead of being replaced by the dataframe index
df = process_dataframe(df, description_col='description', id_col='record_number')
df

In [None]:
def create_keyword_dataframe(df_with_keywords):
    """
    Create a new dataframe where each keyword is in a separate row
    with the record_number from original data

    Args:
        df_with_keywords (pandas.DataFrame): Frame with a 'record_number'
            column and a 'keywords' column holding a list of strings per row

    Returns:
        pandas.DataFrame: One row per (record_number, keyword) pair. The
        columns 'record_number' and 'keyword' are always present, even when
        no row has any keyword.
    """
    # Pair each record number with its keyword list; iterating the two
    # columns directly avoids building a Series per row as iterrows() does
    keyword_records = [
        {'record_number': record_number, 'keyword': keyword}
        for record_number, row_keywords in zip(
            df_with_keywords['record_number'], df_with_keywords['keywords']
        )
        for keyword in row_keywords
    ]

    # Explicit columns keep the schema stable for empty results, so
    # downstream lookups such as keyword_df['keyword'] do not fail
    return pd.DataFrame(keyword_records, columns=['record_number', 'keyword'])

In [None]:
keyword_df = create_keyword_dataframe(df)

In [None]:
# Display the keyword dataframe
print("\nKeyword dataframe sample:")
display(keyword_df.head(50))

In [None]:
np.array(df[df["record_number"] == 11875][["description", "keywords"]])

In [None]:
display(keyword_df.tail(50))

In [None]:
keyword_df["keyword"].value_counts().head(50)

In [None]:
# Count how often each keyword occurs across all records
keyword_counts = keyword_df['keyword'].value_counts().reset_index()
keyword_counts.columns = ['keyword', 'frequency']

# Attach the overall frequency to every keyword instance
keyword_df_with_freq = keyword_df.merge(keyword_counts, how='left', on='keyword')

# Export one row per keyword instance
keyword_df_with_freq.to_csv("../data/clean/keyword_data.csv", index=False)
exported_rows = len(keyword_df_with_freq)
print(f"Keyword dataframe exported with {exported_rows} rows.")

# Export the unique keywords with their frequencies (summary file for the word cloud)
unique_keywords = keyword_counts.copy()
unique_keywords.to_csv("../data/clean/unique_keywords.csv", index=False)

### Equalize Current Property Use and Building Use

In [None]:
# Property uses excluded from the analysis
excluded_property_uses = ['Vacant Lot', 'Accessory']
df = df[~df["current_property_use"].isin(excluded_property_uses)]

In [None]:
print(df["current_property_use"].unique())
print(df["building_use"].unique())

In [None]:
# Shared vocabulary for 'current_property_use' and 'building_use'.
# NOTE(review): "Not Specified" is folded into "Commercial/Mixed" - confirm this is intended.
building_use_dict = {
    "Not Specified": "Commercial/Mixed",
    "Commercial / Mixed Use": "Commercial/Mixed",

    "Multi-Family (3 units or greater)": "Multi-Family",
    "Multi Family (3 or more dwelling units)": "Multi-Family",

    'Two-Family': 'One/Two-Family',
    'One-Family': 'One/Two-Family',
    'One or Two Family Dwelling': 'One/Two-Family',

    'Townhouse': 'Townhouse',
}

# Apply the same mapping to both columns in a single replace call
df = df.replace({
    "current_property_use": building_use_dict,
    "building_use": building_use_dict,
})

df[["current_property_use", "building_use"]]

### Last Checks

In [None]:
df.columns

In [None]:
columns = df.columns

# Map each column that has exactly two distinct non-null values to those values
binary_columns = {}
for column in columns:
    try:
        unique_values = df[column].dropna().unique()
    except TypeError as e:
        # Handle unhashable types (like lists or dicts in cells)
        print(f"Skipping column '{column}' - contains unhashable type: {e}")
        continue

    # Only consider a column binary if it has exactly 2 unique values
    if len(unique_values) != 2:
        continue

    binary_columns[column] = list(unique_values)
    print(f"Binary column found: {column}, Values: {unique_values}, Type: {df[column].dtype}")

### Cast string booleans as booleans

In [None]:
# Columns in which the strings "True"/"False" are converted to Python booleans
boolean_columns = [
    "change_in_exterior", 
    "discharge_to_sewer_or_storm_water_system", 
    "new_or_replaced_storm_sewer", 
    "construction_dewatering", 
    "public_right-of-way", 
    "basement_plumbing_fixture", 
    "change_in_at_least_half_of_total_area",
]

# Only the exact strings "True" and "False" are replaced; every other value is left as is.
# NOTE(review): replace() does not cast the column dtype by itself - check df.dtypes
# afterwards and confirm these columns really end up as bool.
df[boolean_columns] = df[boolean_columns].replace({"True": True, "False": False})

In [None]:
df.dtypes

In [None]:
df

In [None]:
# reset_index returns a new frame; assign it back so the index is really
# renumbered after all the row filtering above
df = df.reset_index(drop=True)
df

In [None]:
df[df["record_number"] == 3877]

### Saving clean DataFrame as csv

In [None]:
df.to_csv("../data/clean/building_permits_addition_alteration_clean.csv", index=False)