# Cleaning and Preprocessing: Dwelling Permits Data

## Preliminaries

In [6]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings

# Important functionality for this lesson
from scipy import stats
import statsmodels.api as sm
import os

In [7]:
# Set directory
os.chdir("C:/Users/emshe/Desktop/BRAINSTATION/CAPSTONE/GIT_REPO/DATA/PERMITS")

## Define base data processor class

In [12]:
# Define base data processor class


class BaseDataProcessor:
        
    def __init__(self, data_path):
         
        """
        Initialize with just a file path
        """
        
        self.data_path = data_path
        self.original_data = pd.read_csv(self.data_path)
        self.data = pd.read_csv(self.data_path)
        self.cleaned = False
        
    def reset_data(self):
        
        """
        Reset data to original read of the csv
        """
        
        self.data = pd.read_csv(self.data_path)
        self.cleaned = False
        return self,self.data
        
    def is_clean(self):
                
        """
        Check if data is clean
        """
        
        print(f'Data is clean: {self.cleaned}')
        return self
        
    def preprocess_data(self):
        pass

    def load_and_process_data(self):
        pass

    def save_processed_data(self,file_path):

        """
        Save processed file if data is cleaned
        """
        
        if not self.cleaned:
            raise ValueError("Data has not been cleaned. Run self.clean_data() before saving.")
        self.data.to_csv(file_path, index=False)
        print(f"Processed data saved to {file_path}.")

In [13]:
# Define permits processor child class

class PermitsProcessor(BaseDataProcessor):
    
    def clean_data(self):

        """
        Clean data
        """
        
        if self.cleaned is False:
            self.data = self.update_data_types()
            self.data = self.drop_unnecessary_cols()
            self.data = self.remove_ProjectValue_zeros()
            self.data = self.impute_unknowns()
            self.data = self.drop_rows_with_key_nulls()
            self.cleaned = True
        else: print("Data is already clean.")
        
        return self, self.data
        
    def update_data_types(self):

        """
        Update data types as necessary
        """
        
        for date_col in ['IssueDate','PermitNumberCreatedDate']:
            self.data[date_col] = pd.to_datetime(self.data[date_col],format='%Y-%m-%d') # Change date columns to datetime format
        for date_col in ['YearMonth']:
            self.data[date_col] = pd.to_datetime(self.data[date_col],format = '%Y-%m')
        return self.data
    
    def drop_unnecessary_cols(self):
        
        """
        Drop all unnecessary or redundant columns
        """
        
        if self.cleaned is False:
            
            
            irrelevant_cols = ['ApplicantAddress', 'BuildingContractorAddress']
            self.data = self.data.drop(columns = irrelevant_cols) # Drop irrelevant columns
            
            # Drop IssueYear if there are no conflicts
            if (self.data['IssueDate'].dt.year != self.data['IssueYear']).sum() == 0: # Check if IssueDate and IssueYear are consistent
                        self.data = self.data.drop(columns = ['IssueYear']) # Drop redundant IssueYear column
                        # return self.data
            # Drop YearMonth from data set if there are no conflicts
            YYConflicts = (self.data['IssueDate'].dt.year != self.data['YearMonth'].dt.year)
            MMConflicts = (self.data['IssueDate'].dt.month != self.data['YearMonth'].dt.month)
            conflict_sum = YYConflicts.sum() + MMConflicts.sum()
            if conflict_sum == 0:
                self.data = self.data.drop(columns = ['YearMonth'])
            return self.data
            
    def drop_rows_with_key_nulls(self):

        """
        Drop rows with important nulls as necessary
        """
        
        # Columns with essential values
        # SpecificUseCategory is included as it only has one null value to drop
        columns_to_check = ['GeoLocalArea', 'Geom', 'geo_point_2d', 'Address','SpecificUseCategory']
        # Drop rows where any of these columns have nulls
        self.data = self.data.dropna(subset=columns_to_check)
        return self.data
        
    def impute_unknowns(self):

        """
        Impute 'unknown' strings into data as necessary
        """
        
        # List of categorical columns with nulls to replace with 'unknown'
        categorical_cols = ['PermitCategory', 'BuildingContractor']
        # Fill NaN values in these columns with 'Unknown'
        self.data[categorical_cols] = self.data[categorical_cols].fillna('unknown')
        return self.data

    def remove_ProjectValue_zeros(self):

        """
        Remove projects with value equal to zero
        """
        
        self.data = self.data[self.data['ProjectValue'] != 0.0]
        return self.data

In [8]:
# Initialize dwelling permits instance
dwelling_permits = PermitsProcessor("issued_building_permits_filter_dwelling_purposes.csv")

# Reset data using .reset_data() method
dwelling_permits.reset_data()

# Check if data is clean
print(f'Data is clean: {dwelling_permits.cleaned}')

Data is clean: False


In [90]:
# store dwelling_permits_og_df using .original_data method
dwelling_permits_og_df = dwelling_permits.original_data
dwelling_permits_og_df.columns

Index(['PermitNumber', 'PermitNumberCreatedDate', 'IssueDate',
       'PermitElapsedDays', 'ProjectValue', 'TypeOfWork', 'Address',
       'ProjectDescription', 'PermitCategory', 'Applicant', 'ApplicantAddress',
       'PropertyUse', 'SpecificUseCategory', 'BuildingContractor',
       'BuildingContractorAddress', 'IssueYear', 'GeoLocalArea', 'Geom',
       'YearMonth', 'geo_point_2d'],
      dtype='object')

In [91]:
# Uncomment to run .clean_data() method
# Set dwelling_permits_df as numpy array returned by .clean_data() method
    # dwelling_permits_df = dwelling_permits.clean_data()
    # dwelling_permits_df.info()

In [92]:
# Clean data
dwelling_permits, dwelling_permits_df = dwelling_permits.clean_data()

In [93]:
# Check if data is clean
dwelling_permits.is_clean()

Data is clean: True


<__main__.PermitsProcessor at 0x2a3c3c144d0>

## Step 1: update data types

In [94]:
# Update date columns of dwelling_permits_df to datetime format with .update_data_types method
# dwelling_permits_df = dwelling_permits.update_data_types()
# dwelling_permits_df.info()

In [95]:
# Update data types 
# dwelling_permits_df = dwelling_permits.update_data_types()
# dwelling_permits_df.info()

## Step 2: Drop unnecessary columns (IssueYear, YearMonth)

In [96]:
# Drop irrelevant columns

# 'ApplicantAddress' and 'BuildingContractorAddress' are not relevant as these columns just contain contact info
# irrelevant_cols = ['ApplicantAddress', 'BuildingContractorAddress']
# dwelling_permits_df.drop(columns = irrelevant_cols, axis = 1,inplace = True)

In [97]:
# Check if IssueYear is present as column
issue_year_present = ('IssueYear' in dwelling_permits_df.columns)
print(f'Issue Year in Columns:  {issue_year_present}' )

Issue Year in Columns:  False


In [98]:
# Reset data and check if cleaned (should be False)
dwelling_permits.reset_data()
dwelling_permits_df = dwelling_permits.data
print(f'Data is clean: {dwelling_permits.cleaned}')

Data is clean: False


In [99]:
# Check column info
display(dwelling_permits_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32744 entries, 0 to 32743
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PermitNumber               32744 non-null  object 
 1   PermitNumberCreatedDate    32744 non-null  object 
 2   IssueDate                  32744 non-null  object 
 3   PermitElapsedDays          32744 non-null  int64  
 4   ProjectValue               32744 non-null  float64
 5   TypeOfWork                 32744 non-null  object 
 6   Address                    32581 non-null  object 
 7   ProjectDescription         32744 non-null  object 
 8   PermitCategory             16931 non-null  object 
 9   Applicant                  32744 non-null  object 
 10  ApplicantAddress           32653 non-null  object 
 11  PropertyUse                32744 non-null  object 
 12  SpecificUseCategory        32743 non-null  object 
 13  BuildingContractor         22636 non-null  obj

None

In [100]:
# Drop Issue Year from data set if there are no conflicts
# YearConflicts = (dwelling_permits_df['IssueDate'].dt.year != dwelling_permits_df['IssueYear'])
# conflict_sum = YearConflicts.sum()
# if conflict_sum == 0:
#     print('There are no conflicts between issue date and issue year. Issue year will be dropped.')
#     dwelling_permits_df.drop(['IssueYear'], axis = 1,inplace = True)
#     print(f'''Column IssueYear Present: {'IssueYear' in dwelling_permits_df.columns}''')
#     # display(dwelling_permits_df.head())
# else: Print(f'''There is a conflict between issue data and issue year. Issue year will not be dropped and should be investigated.''')
                

In [101]:
# Drop YearMonth from data set if there are no conflicts
# YYConflicts = (dwelling_permits_df['IssueDate'].dt.year != dwelling_permits_df['YearMonth'].dt.year)
# MMConflicts = (dwelling_permits_df['IssueDate'].dt.month != dwelling_permits_df['YearMonth'].dt.month)
# conflict_sum = YYConflicts.sum() + MMConflicts.sum()
# if conflict_sum == 0:
#     print('There are no conflicts between issue date and issue month/year. Issue monthyear will be dropped.')
#     dwelling_permits_df.drop(['YearMonth'], axis = 1,inplace = True)
#     print(f'''Column IssueYear Present: {'YearMonth' in dwelling_permits_df.columns}''')
#     # display(dwelling_permits_df.head())
# else: Print(f'''There is a conflict between issue data and issue month/year. Issue year will not be dropped and should be investigated.''')
                

## Step 3: Deal with projects with zero project value

In [102]:
# Restrict to permits with 0.0 dollar value
zero_value_df = dwelling_permits_df[dwelling_permits_df['ProjectValue'] == 0.0]
print(f'The zero_value_df is length {len(zero_value_df)}.')
display(zero_value_df.sample(10))

The zero_value_df is length 6920.


Unnamed: 0,PermitNumber,PermitNumberCreatedDate,IssueDate,PermitElapsedDays,ProjectValue,TypeOfWork,Address,ProjectDescription,PermitCategory,Applicant,ApplicantAddress,PropertyUse,SpecificUseCategory,BuildingContractor,BuildingContractorAddress,IssueYear,GeoLocalArea,Geom,YearMonth,geo_point_2d
12458,BP-2019-00265,2019-01-21,2019-07-03,163,0.0,New Building,"258 E 17TH AVENUE, Vancouver, BC V5V 1A7",High Density Housing / Commercial - New Buildi...,,Richard Hazell,"419 W 19th Av\nVANCOUVER, BC V5Y 2B8",Dwelling Uses,Infill,Novell Construction Ltd,"1177 W BROADWAY \nUnit 303\nVancouver, BC V6...",2019,Riley Park,"{""coordinates"":[-123.099376,49.2556252],""type""...",2019-07,"49.2556252, -123.099376"
19077,BP-2025-00618,2025-02-07,2025-02-11,4,0.0,Salvage and Abatement,"3141 E 19TH AVENUE, Vancouver, BC V5M 2T2",Low Density Housing - Salvage and Abatement - ...,,Payam G Taghavi DBA: High Cliff Homes Ltd.,"312 27th Street E\nNorth Vancouver, BC V7N 1B8",Dwelling Uses,Single Detached House,Bhullar Excavating and Demolition,,2025,Renfrew-Collingwood,"{""coordinates"":[-123.0379814,49.2534035],""type...",2025-02,"49.2534035, -123.0379814"
29480,BP-2018-05682,2018-10-30,2019-02-20,113,0.0,Salvage and Abatement,"605 W 31ST AVENUE, Vancouver, BC V5Z 2J9",Enquiry Centre - Salvage and Abatement - Salva...,,Steffie Lintag DBA: Cressey Developments,"555 W 8th Avenue\nSuite 200\nVancouver, BC V5...",Dwelling Uses,Single Detached House,Fleck Contracting Ltd,"9013 SHAUGHNESSY ST \nVancouver, BC V6P 6R9",2019,South Cambie,"{""coordinates"":[-123.1183578,49.2432777],""type...",2019-02,"49.2432777, -123.1183578"
2985,BP-2017-02262,2017-04-28,2017-06-12,45,0.0,Salvage and Abatement,"5120 DUNBAR STREET, Vancouver, BC V6N 1V7",Low Density Housing - Salvage and Abatement - ...,,LUCIO PICCIANO DBA: DLP Architecture Inc,"202-460 Nanaimo St\nvancouver, BC V5L4W3",Dwelling Uses,Single Detached House,Gator Excavating Ltd,"9423 163 Street \nSurrey, BC V4N 3C5",2017,Dunbar-Southlands,"{""coordinates"":[-123.1848912,49.240241],""type""...",2017-06,"49.240241, -123.1848912"
32737,BP-2021-06004,2021-11-17,2021-12-14,27,0.0,Salvage and Abatement,"825 COMMERCIAL DRIVE, Vancouver, BC",Enquiry Centre - Salvage and Abatement - To pr...,,Jerry Rakhra DBA: Vandwell Living Inc,"303 E 40th Ave\nVancouver, BC V5M 1M1",Dwelling Uses,Infill Single Detached House,R B Excavating 2000 Ltd,"8047 Prince Albert St \nVancouver, BC V5X 3Z9",2021,Grandview-Woodland,"{""coordinates"":[-123.0707004,49.2773113],""type...",2021-12,"49.2773113, -123.0707004"
5106,BP-2021-05705,2021-11-02,2022-04-20,169,0.0,Salvage and Abatement,"3603 PRICE STREET, Vancouver, BC V5R 5R4",Low Density Housing - Salvage and Abatement - ...,,Harjeet Dhaliwal DBA: Owner,"13032\n61ave\nSurrey, BC V3X2H5",Dwelling Uses,Single Detached House,WEST DEMOLITION SERVICE LTD,,2022,Renfrew-Collingwood,"{""coordinates"":[-123.0257189,49.2409232],""type...",2022-04,"49.2409232, -123.0257189"
20511,BP-2021-02060,2021-05-05,2021-06-29,55,0.0,Salvage and Abatement,"788 E 30TH AVENUE, Vancouver, BC V5V 2V9",Low Density Housing - Salvage and Abatement - ...,,Don Piner DBA: Intarsia Design Ltd.,"7561 Barrymore Drive\nDelta, BC V4C6X5",Dwelling Uses,Duplex,,,2021,Kensington-Cedar Cottage,"{""coordinates"":[-123.0882296,49.2432595],""type...",2021-06,"49.2432595, -123.0882296"
29055,BP-2018-01900,2018-04-10,2018-05-01,21,0.0,Salvage and Abatement,"944 E 8TH AVENUE, Vancouver, BC V5T 1T8",Enquiry Centre - Salvage and Abatement - Inter...,,Dinesh Chand,"302-2818 MAIN ST\nVANCOUVER, BC V5T0C1",Dwelling Uses,Multiple Dwelling,Kingsman Excavating Ltd.,"Unit 1104, 7360 137 ST\n15559 59 Ave\nSurrey, ...",2018,Mount Pleasant,"{""coordinates"":[-123.084221,49.2631303],""type""...",2018-05,"49.2631303, -123.084221"
15417,BP-2018-06465,2018-12-13,2018-12-14,1,0.0,Salvage and Abatement,"1619 E BROADWAY, Vancouver, BC V5N 1V9",Enquiry Centre - Salvage and Abatement - Inter...,,Ken Cheung DBA: K E Concepts 2001 Ltd,"628 E 56th Ave.\nVancouver, BC V5X 1R7",Dwelling Uses,Multiple Dwelling,KE Concepts 2001 Ltd,,2018,Grandview-Woodland,"{""coordinates"":[-123.0713385,49.2626373],""type...",2018-12,"49.2626373, -123.0713385"
26624,BP-2020-03578,2020-11-13,2020-11-19,6,0.0,Salvage and Abatement,"414 E 5TH AVENUE, Vancouver, BC V5T 1H7",Enquiry Centre - Salvage and Abatement - Salva...,,Jeremy Anderson,"3455 Porter St\nVancouver, BC V5N 4H2",Dwelling Uses,Single Detached House,,,2020,Mount Pleasant,"{""coordinates"":[-123.0951421,49.2659919],""type...",2020-11,"49.2659919, -123.0951421"


In [103]:
# drop all rows with value = zero
dwelling_permits_df = dwelling_permits_df[dwelling_permits_df['ProjectValue'] != 0.0]

## Step 4: Deal with columns with null values one by one

In [122]:
# Find columns with nulls

# Find total rows for percentage calculation
total_rows = len(dwelling_permits_df)

# List of dictionaries to hold null info
null_info = []

for col in dwelling_permits_df.columns:
    null_count = dwelling_permits_df[col].isna().sum()
    if null_count > 0:
        null_info.append({
            'Column': col,
            'Null Count': null_count,
            'Percent Null': round((null_count / total_rows) * 100, 2)
        })

# Create and display DataFrame sorted by % null
if len(null_info) > 0:
    null_summary_df = pd.DataFrame(null_info).sort_values(by='Percent Null', ascending=False)
    display(null_summary_df)
else:
    print('Your data set has no null values.')

Your data set has no null values.


In [105]:
# List all columns
display(dwelling_permits_df.columns)

Index(['PermitNumber', 'PermitNumberCreatedDate', 'IssueDate',
       'PermitElapsedDays', 'ProjectValue', 'TypeOfWork', 'Address',
       'ProjectDescription', 'PermitCategory', 'Applicant', 'ApplicantAddress',
       'PropertyUse', 'SpecificUseCategory', 'BuildingContractor',
       'BuildingContractorAddress', 'IssueYear', 'GeoLocalArea', 'Geom',
       'YearMonth', 'geo_point_2d'],
      dtype='object')

In [106]:
# Define function to analyze null values for particular columns
def examine_col_nulls(df,col,output = True):
    null_df = dwelling_permits_df.isna()
    null_count = null_df[col].sum()
    null_perc = 100 * null_count / len(null_df)
    col_null_df = df[df[col].isnull()]
    if output == True:
        print(f'\nPermitCategory has the following value counts.\n')
        display(dwelling_permits_df['PermitCategory'].value_counts())
        print(f'\nPermitCategory has {null_count} null values.\n')
        print(f'\nPermitCategory nulls make up has {null_perc} % of the dataset.\n') 
        print(f'\nWe present an example of the subdataset where {col} is null\n')
        display(col_null_df.sample(5))
        col_null_df_restricted = col_null_df[['PermitNumberCreatedDate',
        'ProjectValue', 'TypeOfWork', 'Address', 'Applicant','PropertyUse', 
        'SpecificUseCategory', 'BuildingContractor','BuildingContractorAddress',
                                            'GeoLocalArea']]
        for feature in col_null_df_restricted:
            print(f'\nIn the col_null_df, the feature {feature} has the following value counts.\n') 
            display(col_null_df[feature].value_counts())
    return null_count, null_perc, col_null_df

### Step 4.1: imputing 'unknown' into categorical columns

In [107]:
# Examine PermitCategory nulls
examine_col_nulls(dwelling_permits_df,"PermitCategory")


PermitCategory has the following value counts.



PermitCategory
Renovation - Residential - Lower Complexity              7995
New Build - Low Density Housing                          6680
New Build - Standalone Laneway                           2037
Renovation - Commercial/ Mixed Use - Lower Complexity     110
Name: count, dtype: int64


PermitCategory has 9002 null values.


PermitCategory nulls make up has 34.8590458488228 % of the dataset.


We present an example of the subdataset where PermitCategory is null



Unnamed: 0,PermitNumber,PermitNumberCreatedDate,IssueDate,PermitElapsedDays,ProjectValue,TypeOfWork,Address,ProjectDescription,PermitCategory,Applicant,ApplicantAddress,PropertyUse,SpecificUseCategory,BuildingContractor,BuildingContractorAddress,IssueYear,GeoLocalArea,Geom,YearMonth,geo_point_2d
29531,BP-2023-01642,2023-05-12,2024-01-04,237,3900000.0,New Building,"1550 E 3RD AVENUE, Vancouver, BC V5N 1G9",Certified Professional Program - New Building ...,,Sayed Pouriya Hosseini DBA: Psquare Engineerin...,"602 W Hastings Street\nUnit 402\nVancouver, BC...",Dwelling Uses,Multiple Dwelling,WBCM DEVELOPMENT LTD.,"1542 Prairie Avenue \nPort Coquitlam, BC V3B...",2024,Grandview-Woodland,"{""coordinates"":[-123.0723602,49.2677791],""type...",2024-01,"49.2677791, -123.0723602"
13841,DB-2025-00435,2025-01-30,2025-04-08,68,40000.0,Demolition / Deconstruction,"2256 E PENDER STREET, Vancouver, BC V5L 1X4",Low Density Housing - Demolition / Deconstruct...,,Gurmandeep Bal,"2147 E 49th Ave\nVancouver, BC V5P1T6",Dwelling Uses,Single Detached House,JB Siteworks Inc.,,2025,Grandview-Woodland,"{""coordinates"":[-123.0584832,49.2799155],""type...",2025-04,"49.2799155, -123.0584832"
13882,BP-2017-04237,2017-08-15,2018-03-09,206,50000.0,Demolition / Deconstruction,"7325 BLENHEIM STREET, Vancouver, BC V6N 1S2",Enquiry Centre - Demolition / Deconstruction -...,,Drew Grimson DBA: Cavendish Contracting Ltd,Attn: Drew Grimston\n5520 Alma St \nVancouver...,Dwelling Uses,Single Detached House,J & R Excavation & Demolition Ltd.,"7782 Progress Way\nDelta, BC V4G 1A4",2018,Dunbar-Southlands,"{""coordinates"":[-123.1799357,49.2194302],""type...",2018-03,"49.2194302, -123.1799357"
17198,DB-2022-03621,2022-07-15,2023-05-25,314,15000.0,Demolition / Deconstruction,"3334 WINLAW PLACE, Vancouver, BC V5M 3G4",Low Density Housing - Demolition / Deconstruct...,,Mo Maani,"5200-4000 No 3 Rd\nRichmond, BC V6X 0J8",Dwelling Uses,Single Detached House,J&R Excavation & Demolition Ltd.,,2023,Renfrew-Collingwood,"{""coordinates"":[-123.0478633,49.2546021],""type...",2023-05,"49.2546021, -123.0478633"
10249,DB-2018-01789,2018-04-04,2018-08-29,147,37500.0,Demolition / Deconstruction,"392 W 41ST AVENUE, Vancouver, BC V5Y 2S7",Enquiry Centre - Demolition / Deconstruction -...,,Niall O'Shea DBA: Kindred Construction,"#308-2150 W Broadway\nVancouver, BC V6K 4L9",Dwelling Uses,Single Detached House,Fleck Contracting Ltd,"9013 SHAUGHNESSY ST \nVancouver, BC V6P 6R9",2018,Oakridge,"{""coordinates"":[-123.112523,49.2333483],""type""...",2018-08,"49.2333483, -123.112523"



In the col_null_df, the feature PermitNumberCreatedDate has the following value counts.



PermitNumberCreatedDate
2021-05-27    22
2022-05-05    22
2016-12-22    21
2021-04-13    19
2024-05-31    16
              ..
2024-07-04     1
2023-04-04     1
2024-03-14     1
2020-09-22     1
2019-01-03     1
Name: count, Length: 2182, dtype: int64


In the col_null_df, the feature ProjectValue has the following value counts.



ProjectValue
15000.0      4719
40000.0       253
20000.0       186
30000.0       123
50000.0       116
             ... 
420050.0        1
122400.0        1
1515000.0       1
362000.0        1
121050.0        1
Name: count, Length: 1342, dtype: int64


In the col_null_df, the feature TypeOfWork has the following value counts.



TypeOfWork
Demolition / Deconstruction             5727
Addition / Alteration                   2278
New Building                             934
Salvage and Abatement                     30
Outdoor Uses (No Buildings Proposed)      22
Temporary Building / Structure            11
Name: count, dtype: int64


In the col_null_df, the feature Address has the following value counts.



Address
8460 ASH STREET, Vancouver, BC V6P 3M2           8
1501 HARO STREET, Vancouver, BC V6G 1G4          4
2901 E HASTINGS STREET, Vancouver, BC V5K 5J1    4
3620 W 20TH AVENUE, Vancouver, BC V6S 1E8        3
124 DUNLEVY AVENUE, Vancouver, BC V6A 3T6        3
                                                ..
2083 W 48TH AVENUE, Vancouver, BC V6M 2P4        1
2209 E 2ND AVENUE, Vancouver, BC V5N 1G1         1
1604 SALSBURY DRIVE, Vancouver, BC V5L 4B8       1
3934 LILLOOET STREET, Vancouver, BC V5R 2E8      1
6272 BUTLER STREET, Vancouver, BC V5S 3K4        1
Name: count, Length: 8597, dtype: int64


In the col_null_df, the feature Applicant has the following value counts.



Applicant
QI LI DBA: LQ Design GROUP Ltd                    212
Vincent Wan DBA: D.V. Design Ltd.                 198
Carman Kwan DBA: Architectural Collective Inc.    174
Michael Lu DBA: DWG Design Work Group Ltd.        101
Khang Nguyen DBA: Architrix Design Studio          98
                                                 ... 
Winga LAM                                           1
HUEN KEE LIEW                                       1
RAJWINDER MANN                                      1
Matt Stogryn DBA: iFortune Homes Inc.               1
Garinder Deo                                        1
Name: count, Length: 2875, dtype: int64


In the col_null_df, the feature PropertyUse has the following value counts.



PropertyUse
Dwelling Uses                                                                        8647
Dwelling Uses, Retail Uses                                                             87
Dwelling Uses, Parking Uses                                                            53
Dwelling Uses, Parking Uses, Retail Uses                                               38
Dwelling Uses, Office Uses                                                             21
Dwelling Uses, Office Uses, Retail Uses                                                19
Dwelling Uses, Retail Uses, Service Uses                                               18
Dwelling Uses, Parking Uses, Retail Uses, Service Uses                                 17
Dwelling Uses, Service Uses                                                            17
Dwelling Uses, Institutional Uses                                                      11
Dwelling Uses, Institutional Uses, Parking Uses                                         


In the col_null_df, the feature SpecificUseCategory has the following value counts.



SpecificUseCategory
Single Detached House                                                                                        4922
Single Detached House w/Sec Suite                                                                            1459
Multiple Dwelling                                                                                            1081
Duplex                                                                                                        413
Multiple Conversion Dwelling                                                                                  191
                                                                                                             ... 
Printing or Publishing, Dwelling Unit                                                                           1
Dwelling Unit w/ Other Use, Restaurant - Class 1                                                                1
Miscellaneous Products Mfg-Class A, General Office, Restaurant - Cla


In the col_null_df, the feature BuildingContractor has the following value counts.



BuildingContractor
Canadian Excavating Ltd              412
Bhullar Excavating and Demolition    260
East West Excavating Ltd             236
JVT EXCAVATING & DEMOLITION LTD      231
Kingsman Excavating Ltd.             177
                                    ... 
A-1 Window Mfg Ltd                     1
Thi M T Tran                           1
PROFORM CONSTRUCTION GROUP LTD         1
Cornat Construction Ltd                1
JPC Services Inc                       1
Name: count, Length: 1022, dtype: int64


In the col_null_df, the feature BuildingContractorAddress has the following value counts.



BuildingContractorAddress
6898 130 St\nSurrey, BC  V3W 4J5                            412
968 E 53RD AV  \nVancouver, BC  V5X 1J6                     204
Unit 1104, 7360 137 ST\n15559 59 Ave\nSurrey, BC  V3S4N8    177
12498 55 Ave\nSurrey, BC  V3X 3V5                           172
5649 ASH ST  \nVancouver, BC  V5Z 3G8                       144
                                                           ... 
2115 W 34TH AV  \nVancouver, BC  V6M 1G3                      1
4830 INVERNESS ST  \nVancouver, BC  V5V 4X6                   1
6498 DUMFRIES ST  \nVancouver, BC  V5P 3B4                    1
4418 JAMES ST  \nVancouver, BC  V5V 3J1                       1
12306 McTavish Rd  \nPitt Meadows, BC  V3Y 1Z1                1
Name: count, Length: 686, dtype: int64


In the col_null_df, the feature GeoLocalArea has the following value counts.



GeoLocalArea
Kensington-Cedar Cottage    966
Renfrew-Collingwood         749
Hastings-Sunrise            730
Riley Park                  634
Dunbar-Southlands           614
Sunset                      537
Kitsilano                   533
Grandview-Woodland          448
Victoria-Fraserview         418
Marpole                     405
West Point Grey             331
Killarney                   307
Kerrisdale                  301
Mount Pleasant              297
Oakridge                    275
Arbutus Ridge               270
Shaughnessy                 255
South Cambie                240
West End                    220
Downtown                    165
Fairview                    135
Strathcona                  106
Name: count, dtype: int64

(9002,
 34.8590458488228,
         PermitNumber PermitNumberCreatedDate   IssueDate  PermitElapsedDays  \
 2      BP-2022-02723              2022-05-25  2022-11-14                173   
 4      DB-2017-02311              2017-05-02  2017-08-23                113   
 10     BP-2017-05079              2017-09-27  2018-08-03                310   
 12     DB-2019-02085              2019-05-13  2019-06-24                 42   
 15     BP-2017-00509              2017-01-30  2017-06-12                133   
 ...              ...                     ...         ...                ...   
 32713  BP-2022-03253              2022-06-22  2022-06-28                  6   
 32730  BP-2022-04148              2022-08-25  2023-06-22                301   
 32736  BP-2022-05542              2022-12-16  2023-08-22                249   
 32740  DB-2019-02157              2019-05-16  2020-06-22                403   
 32742  BP-2022-01383              2022-03-16  2022-10-14                212   
 
        Proj

In [108]:
# Value counts for the TypeOfWork column
dwelling_permits_df['TypeOfWork'].value_counts()

TypeOfWork
Addition / Alteration                   10383
New Building                             9651
Demolition / Deconstruction              5727
Salvage and Abatement                      30
Outdoor Uses (No Buildings Proposed)       22
Temporary Building / Structure             11
Name: count, dtype: int64

##### NOTE: the categorical columns will be carefully examined later on to ensure this is an appropriate imputation

In [109]:
# Impute 'unknown' into selected categorical columns

# List of categorical columns you want to impute
categorical_cols = ['PermitCategory', 'BuildingContractor']

# Fill NaN values in these columns with 'Unknown'
dwelling_permits_df[categorical_cols] = dwelling_permits_df[categorical_cols].fillna('unknown')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dwelling_permits_df[categorical_cols] = dwelling_permits_df[categorical_cols].fillna('unknown')


### Step 4.2: removing rows with null geographic info

##### NOTE: later on, I will carefully investigate to see if we can avoid removing all these rows

In [115]:
# Drop rows with nulls in important columns

# List of columns to check for nulls
# We include essential geographic columns as well as SpecificUseCategory which only has one null value
columns_to_check = ['GeoLocalArea', 'Geom', 'geo_point_2d', 'Address','SpecificUseCategory']

# Drop rows where any of these columns have nulls
dwelling_permits_df = dwelling_permits_df.dropna(subset=columns_to_check)

## Testing function: test_sequence()

In [9]:
# Define function to streamline testing
def test_sequence():

    """
    Function to test class functionality
    """
    
    # Instantiate dwelling permits 
    dwelling_permits = PermitsProcessor("issued_building_permits_filter_dwelling_purposes.csv")
    print('Dwelling permits instantiated.\n')
    
    # Clean data 
    dwelling_permits, dwelling_permits_df = dwelling_permits.clean_data()
    print('Data has been cleaned.\n')
    display(dwelling_permits_df.info())
    print('')
    dwelling_permits.save_processed_data("issued_building_permits_filter_dwelling_purposes_cleaned.csv")
    return dwelling_permits, dwelling_permits_df

In [127]:
# Run test sequence: initation, cleaning, etc.

# Running this code block sets dwelling_permits_df as a clean data set

dwelling_permits, dwelling_permits_df = test_sequence()

Dwelling permits instantiated.

Data has been cleaned.

<class 'pandas.core.frame.DataFrame'>
Index: 25508 entries, 0 to 32743
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   PermitNumber             25508 non-null  object        
 1   PermitNumberCreatedDate  25508 non-null  datetime64[ns]
 2   IssueDate                25508 non-null  datetime64[ns]
 3   PermitElapsedDays        25508 non-null  int64         
 4   ProjectValue             25508 non-null  float64       
 5   TypeOfWork               25508 non-null  object        
 6   Address                  25508 non-null  object        
 7   ProjectDescription       25508 non-null  object        
 8   PermitCategory           25508 non-null  object        
 9   Applicant                25508 non-null  object        
 10  PropertyUse              25508 non-null  object        
 11  SpecificUseCategory      25508 non-null  o

None

Processed data saved to issued_building_permits_filter_dwelling_purposes_cleaned.csv.


## Miscellaneous

In [113]:
# Display dwelling permits info
display(dwelling_permits_df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 25824 entries, 0 to 32743
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   PermitNumber             25824 non-null  object        
 1   PermitNumberCreatedDate  25824 non-null  datetime64[ns]
 2   IssueDate                25824 non-null  datetime64[ns]
 3   PermitElapsedDays        25824 non-null  int64         
 4   ProjectValue             25824 non-null  float64       
 5   TypeOfWork               25824 non-null  object        
 6   Address                  25686 non-null  object        
 7   ProjectDescription       25824 non-null  object        
 8   PermitCategory           25824 non-null  object        
 9   Applicant                25824 non-null  object        
 10  PropertyUse              25824 non-null  object        
 11  SpecificUseCategory      25823 non-null  object        
 12  BuildingContractor       25824 non-nu

None