# Natural Language Processing: Dwelling Permit Data

## Preliminaries

In [8]:
# Imports
# Standard library
import itertools
import json
import os
import re
import string
import time

# Data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook_connected' # For plotly graphs to render in this environment

# Geospatial
import geopandas as gpd
from shapely.geometry import Point, Polygon

# Text processing and modelling
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Set directory

# Folder holding the permit CSV files. The default is the original local
# folder; set the PERMITS_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
PATH = os.environ.get(
    "PERMITS_DATA_DIR",
    "C:/Users/emshe/Desktop/BRAINSTATION/CAPSTONE/GIT_REPO/DATA/PERMITS",
)

In [3]:
# Define function to examine dataframes

def examine_df(name, df, n_rows=5):
    """
    Print a quick overview of a dataframe.

    Shows the record count, the column index, the ``df.info()`` summary and
    the first ``n_rows`` records.

    Parameters
    ----------
    name : str
        Label used for the dataframe in the printed messages.
    df : pandas.DataFrame
        Dataframe to summarise.
    n_rows : int, default 5
        Number of leading records to display.

    Returns
    -------
    None
    """

    print(f"\n\nNumber of records in the {name} is: {len(df)}\n")
    print(f"The columns in the {name} are: {df.columns}\n")
    print(f"\n Other info about {name}:")
    # DataFrame.info() prints its summary itself and returns None, so it must
    # not be wrapped in display() -- that only adds a stray "None" to the output.
    df.info()
    print(f"\n\nSample of records in the {name}:")
    display(df.head(n_rows))

In [6]:
# Load permits data

# Build the CSV location first, then read it and summarise the result
permits_csv = f"{PATH}/issued_building_permits_filter_dwelling_purposes_preprocessed.csv"
permits_df = pd.read_csv(permits_csv)

examine_df(name='permits dataframe', df=permits_df)



Number of records in the permits dataframe is: 25445

The columns in the permits dataframe are: Index(['issue_date', 'project_description', 'geom', 'project_value', 'nbhd',
       'zone', 'duplex_w_secondary_suite', 'laneway_house', 'duplex',
       'multiple_conversion_dwelling', 'dwelling_unit', 'multiple_dwelling',
       'single_detached_house', 'single_detached_house_w_sec_suite',
       'type_of_work_demolition_deconstruction', 'type_of_work_new_building',
       'permit_category_new_build_low_density_housing',
       'permit_category_new_build_standalone_laneway',
       'permit_category_renovation_residential_lower_complexity'],
      dtype='object')


 Other info about permits dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25445 entries, 0 to 25444
Data columns (total 19 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   issue_

None



Sample of records in the permits dataframe:


Unnamed: 0,issue_date,project_description,geom,project_value,nbhd,zone,duplex_w_secondary_suite,laneway_house,duplex,multiple_conversion_dwelling,dwelling_unit,multiple_dwelling,single_detached_house,single_detached_house_w_sec_suite,type_of_work_demolition_deconstruction,type_of_work_new_building,permit_category_new_build_low_density_housing,permit_category_new_build_standalone_laneway,permit_category_renovation_residential_lower_complexity
0,2017-04-12,Low Density Housing - New Building - To constr...,POINT (-123.0438851 49.2545202),250115.294118,Renfrew,Mount Pleasant/Renfrew Heights,0,0,0,0,0,0,0,1,0,1,1,0,0
1,2022-11-14,Low Density Housing - Demolition / Deconstruct...,POINT (-123.0677731 49.2249324),15625.427204,Fraser View/Killarny,Southeast Vancouver,0,0,0,0,0,0,1,0,1,0,0,0,0
2,2017-08-23,Field Review - Demolition / Deconstruction - T...,POINT (-123.1211653 49.2588709),75669.289412,South Granville,South Granville/Oak,0,0,0,0,0,1,0,0,1,0,0,0,0
3,2017-09-28,Low Density Housing - New Building - To constr...,POINT (-123.0846679 49.2365573),183866.117647,Sunset,Southeast Vancouver,0,1,0,0,0,0,0,0,0,1,0,1,0
4,2018-08-03,Low Density Housing - Demolition / Deconstruct...,POINT (-123.0894871 49.239411),17490.436113,Sunset,Southeast Vancouver,0,0,0,0,0,0,0,1,1,0,0,0,0


## Text column EDA

In [7]:
# Examine sample descriptions

# Ensure pandas doesn't truncate text (this is a session-wide display option)
pd.set_option('display.max_colwidth', None)

# Sample and print 5 full project descriptions; the fixed random_state keeps
# the same sample on every run so the printed examples are reproducible
print("Sample descriptions:\n")
sample = permits_df['project_description'].sample(5, random_state=42)
for i, description in enumerate(sample, 1):
    print(f"Project description {i}:\n{description}\n")

Sample descriptions:

Project description 1:
Field Review - Addition / Alteration - Interior alterations to provide improvements including the removal of load bearing partition walls on the lower and upper floor at this existing One-Family Dwelling building.

Scope of work includes the removal of an unauthorized rear awning cover over the rear deck as indicated on the bubbled drawings.

OK for field review and district building inspector to determine if a Sprinkler Permit would be required as per Howie Chow, July 30, 2020.

Structural & Geotechnical (Permanent) - Schedule B submitted by N.K. Varshney, P. Eng, 604-251-6320

Project description 2:
Low Density Housing - Addition / Alteration - Exterior and interior alterations to provide improvements, and to change the use of this existing Single Detached House to Single Detached House with Secondary Suite at this existing inside with lane site. Scope of work includes removal of the existing attached carport and construction of a new acce

## Define Text Processing Pipeline

In [10]:
def clean_tokenizer(text):
    """
    Turn a text string into a list of cleaned unigrams followed by bigrams.

    Steps: lowercase the text, delete every character that is not a-z or
    whitespace, split on whitespace, drop stop words (ENGLISH_STOP_WORDS) and
    single-letter words, then join each pair of neighbouring surviving words
    with "_" to form bigrams.

    Note: punctuation and digits are deleted rather than replaced by a space,
    so words separated only by punctuation are glued together
    (e.g. "and/or" becomes "andor").
    """

    # Lowercase, then strip everything except letters and whitespace
    letters_only = re.sub(r"[^a-z\s]", "", text.lower())

    # Keep words that are neither one letter long nor stop words
    unigrams = []
    for word in letters_only.split():
        if len(word) == 1 or word in ENGLISH_STOP_WORDS:
            continue
        unigrams.append(word)

    # Pair each kept word with the next kept word
    bigrams = [f"{first}_{second}" for first, second in zip(unigrams, unigrams[1:])]

    # Unigrams first, bigrams after
    return unigrams + bigrams

In [11]:
# Define vectorizer 

# NOTE(review): the saved outputs further down show 500 word_ columns, which
# does not match max_features=100 -- confirm the intended value and re-run.
vectorizer = TfidfVectorizer(
    tokenizer=clean_tokenizer,  # Custom tokenizer: cleaned unigrams plus bigrams
    token_pattern=None,         # Unused with a custom tokenizer; None silences the warning
    max_features=100,           # Keep the top 100 most frequent features
    min_df=10                   # Minimum document frequency of 10
)


In [12]:
# Apply vectorizer

# Learn the vocabulary from the permit descriptions and encode them in one step
descriptions = permits_df["project_description"]
word_cols = vectorizer.fit_transform(descriptions)


The parameter 'token_pattern' will not be used since 'tokenizer' is not None'



In [14]:
# Add word columns

# Add 'word' prefix to feature names
word_feature_names = [f"word_{word}" for word in vectorizer.get_feature_names_out()]

# Reuse permits_df's index so the concat below lines up row-for-row even if
# permits_df does not carry a default 0..n-1 index
word_df = pd.DataFrame(
    word_cols.toarray(),
    columns=word_feature_names,
    index=permits_df.index,
)

# Drop word_ columns left over from a previous run of this cell so that
# re-running it replaces them instead of appending duplicates
# (assumes no source column name starts with "word_")
stale_word_cols = [col for col in permits_df.columns if str(col).startswith("word_")]
permits_df = pd.concat([permits_df.drop(columns=stale_word_cols), word_df], axis=1)

In [16]:
# Show the terms the fitted vectorizer kept
vocabulary_terms = vectorizer.get_feature_names_out()
print(vocabulary_terms)

['ac' 'ac_unit' 'access' 'access_lane' 'access_provided' 'accessory'
 'accessory_building' 'accommodate' 'accommodate_improved' 'accordance'
 'accordance_building' 'achieve' 'achieve_required' 'add' 'addition'
 'addition_alteration' 'address' 'address_assigned' 'address_information'
 'address_note' 'address_number' 'address_suite' 'addressed' 'addresses'
 'addresses_assigned' 'addresses_posted' 'addressing' 'alteration'
 'alteration_exterior' 'alteration_interior' 'alterations'
 'alterations_provide' 'andor' 'appliances' 'applicable'
 'applicable_sprinklered' 'application' 'application_section' 'applied'
 'applied_application' 'apply' 'apply_recycling' 'approval' 'approved'
 'approved_plans' 'architectural' 'area' 'assigned' 'assigned_approved'
 'assurance' 'attached' 'attached_garage' 'av' 'av_laneway' 'av_secondary'
 'ave' 'bar' 'bar_sink' 'basement' 'bathroom' 'bldg' 'bldg_addressed' 'bp'
 'builder' 'building' 'building_addresses' 'building_attached'
 'building_bylaw' 'building_cons

In [15]:
# Re-inspect the dataframe now that the word columns have been appended
examine_df(name='permits dataframe', df=permits_df)



Number of records in the permits dataframe is: 25445

The columns in the permits dataframe are: Index(['issue_date', 'project_description', 'geom', 'project_value', 'nbhd',
       'zone', 'duplex_w_secondary_suite', 'laneway_house', 'duplex',
       'multiple_conversion_dwelling',
       ...
       'word_walls', 'word_west', 'word_window', 'word_windows', 'word_work',
       'word_work_include', 'word_work_includes', 'word_yard', 'word_zoning',
       'word_zoning_development'],
      dtype='object', length=519)


 Other info about permits dataframe:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25445 entries, 0 to 25444
Columns: 519 entries, issue_date to word_zoning_development
dtypes: float64(501), int64(13), object(5)
memory usage: 100.8+ MB


None



Sample of records in the permits dataframe:


Unnamed: 0,issue_date,project_description,geom,project_value,nbhd,zone,duplex_w_secondary_suite,laneway_house,duplex,multiple_conversion_dwelling,...,word_walls,word_west,word_window,word_windows,word_work,word_work_include,word_work_includes,word_yard,word_zoning,word_zoning_development
0,2017-04-12,"Low Density Housing - New Building - To construct a 2 storey laneway house building \nproviding 1 on grade parking space, having vehicular access \nfrom the lane. \n \n1. No A/C unit proposed \n \nNote: Bldg 1 (principal bldg) addressed 3286 Renfrew St retained on site.",POINT (-123.0438851 49.2545202),250115.294118,Renfrew,Mount Pleasant/Renfrew Heights,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-11-14,"Low Density Housing - Demolition / Deconstruction - To demolish the existing one family dwelling building ($15,000) on this site by means of deconstruction.\n\nDemo Declaration – Bhullar Excavating and Demolition (778-891-4556)\n\nThis permit is subject to the Green Demolition Bylaw (11023)\n\nPre-1950 Green Demolition Conditions Apply\n75% Recycling Rate of Building Materials Required",POINT (-123.0677731 49.2249324),15625.427204,Fraser View/Killarny,Southeast Vancouver,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2017-08-23,"Field Review - Demolition / Deconstruction - To demolish this existing Multiple Dwelling building containing six (6) dwelling unit at this existing Multiple Dwelling buildings site.\n\nOK for field review as per Howie Chow, May 2, 2017.\n\nReference to:\ndb-2017-02307, S&A bp-2017-02308\ndb-2017-02311, S&A bp-2017-02312\ndb-2017-02314, S&A bp-2017-02315\ndb-2017-02316, S&A bp-2017-02317\ndb-2017-02318, S&A bp-2017-02319",POINT (-123.1211653 49.2588709),75669.289412,South Granville,South Granville/Oak,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2017-09-28,"Low Density Housing - New Building - To construct a 2 storey laneway house building providing 1 on grade parking space, having vehicular access from the lane. \n \n1. No A/C unit proposed \n2. B1/B2. E.H.Y. Man P.Eng (604-874-3237) Structural\n3. HPO - JET Demolition & Excavating Ltd. \n \n******THIS PERMIT HAS BEEN ISSUED UNDER THE REQUIREMENTS OF VBBL 2014***** \n \nNote: Bldg 1 (principal bldg) addressed 1004 E 37th Av retained on site.",POINT (-123.0846679 49.2365573),183866.117647,Sunset,Southeast Vancouver,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018-08-03,Low Density Housing - Demolition / Deconstruction - To demolish the existing one family dwelling building on this site \nby means of deconstruction.\n\nThis permit is subject to the Green Demolition Bylaw (11023)\n\nNote: Pre-1940 recycling requirement: 75% of non-hazardous construction waste,POINT (-123.0894871 49.239411),17490.436113,Sunset,Southeast Vancouver,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
