In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Read in the construction permits data
permits_filepath = Path("../resources/dan/Permit_Data/permit_data.csv")

permits_df = pd.read_csv(permits_filepath)

In [3]:
# spot check permits_df data
permits_df.head()

Unnamed: 0,zip_code,land_use,count,date,permit_type
0,77091.0,Single Family Residential (Shared Driveways),1,1/23/2020,Residential
1,77091.0,Single Family Residential (Shared Driveways),1,1/23/2020,Residential
2,77072.0,Unrestricted,1,1/23/2020,Unrestricted
3,77048.0,Drainage or Detention; Landscape or Open Space...,1,1/23/2020,Residential
4,77045.0,Unrestricted Intended for Multifamily,1,1/23/2020,Unrestricted


In [4]:
# check data field counts
permits_df.count()

zip_code       8847
land_use       8859
count          8859
date           8859
permit_type    8859
dtype: int64

In [5]:
# drop rows with nulls
permits_df.dropna(inplace=True)

# repeat spot check of data field counts
permits_df.count()

zip_code       8847
land_use       8847
count          8847
date           8847
permit_type    8847
dtype: int64

In [6]:
# check dtypes
permits_df.dtypes

zip_code       float64
land_use        object
count            int64
date            object
permit_type     object
dtype: object

In [7]:
# check zip_code to int
permits_df = permits_df.astype({'zip_code': 'int64'})

# recheck dtypes
permits_df.dtypes

zip_code        int64
land_use       object
count           int64
date           object
permit_type    object
dtype: object

In [8]:
# spot check permits_df data again
permits_df.head()

Unnamed: 0,zip_code,land_use,count,date,permit_type
0,77091,Single Family Residential (Shared Driveways),1,1/23/2020,Residential
1,77091,Single Family Residential (Shared Driveways),1,1/23/2020,Residential
2,77072,Unrestricted,1,1/23/2020,Unrestricted
3,77048,Drainage or Detention; Landscape or Open Space...,1,1/23/2020,Residential
4,77045,Unrestricted Intended for Multifamily,1,1/23/2020,Unrestricted


In [9]:
# set index to date with format of year and month
permits_df.set_index(pd.to_datetime(permits_df['date'], format='%Y%m', infer_datetime_format=True), inplace=True)

# drop non-index date column
permits_df.drop('date', axis=1, inplace=True)

# preview data
permits_df.head()

Unnamed: 0_level_0,zip_code,land_use,count,permit_type
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-23,77091,Single Family Residential (Shared Driveways),1,Residential
2020-01-23,77091,Single Family Residential (Shared Driveways),1,Residential
2020-01-23,77072,Unrestricted,1,Unrestricted
2020-01-23,77048,Drainage or Detention; Landscape or Open Space...,1,Residential
2020-01-23,77045,Unrestricted Intended for Multifamily,1,Unrestricted


In [10]:
# sort by date (index), oldest to newest, then by zip, lowest to highest
permits_df = permits_df.sort_values(by=['date','zip_code'])

# preview_data
permits_df.head(15)

Unnamed: 0_level_0,zip_code,land_use,count,permit_type
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-05,77002,Unrestricted,1,Unrestricted
2017-01-05,77004,Single Family Residential (Shared Driveways),1,Residential
2017-01-05,77004,Unrestricted,1,Unrestricted
2017-01-05,77004,Single Family Residential (Public Street),1,Residential
2017-01-05,77006,Single Family Residential (Public Street),1,Residential
2017-01-05,77006,Single Family Residential (Public Street),1,Residential
2017-01-05,77006,Commercial,1,Commercial
2017-01-05,77007,Unrestricted,1,Unrestricted
2017-01-05,77007,Single Family Residential (Shared Driveways),1,Residential
2017-01-05,77007,Unrestricted,1,Unrestricted


In [11]:
# reorder columns for visibility to most pertinent data
permits_df = permits_df[['zip_code', 'permit_type', 'count', 'land_use']]

# preview new column order
permits_df.columns.values

array(['zip_code', 'permit_type', 'count', 'land_use'], dtype=object)

In [12]:
# preview permits_df data
permits_df.head()

Unnamed: 0_level_0,zip_code,permit_type,count,land_use
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-05,77002,Unrestricted,1,Unrestricted
2017-01-05,77004,Residential,1,Single Family Residential (Shared Driveways)
2017-01-05,77004,Unrestricted,1,Unrestricted
2017-01-05,77004,Residential,1,Single Family Residential (Public Street)
2017-01-05,77006,Residential,1,Single Family Residential (Public Street)


In [13]:
# groupby zip_code and date and sum each zip column of permit counts
permit_counts = permits_df.groupby(["zip_code", "date"]).sum()
#permit_counts.head()

#pivot so index is date, columns are zip, and values for each are permit counts
permits_by_zip = pd.pivot_table(permit_counts, index = "date", values = "count", columns = "zip_code")

#preview dataframe
permits_by_zip.head()

zip_code,77002,77003,77004,77005,77006,77007,77008,77009,77010,77011,...,77484,77489,77493,77494,77498,77530,77532,77545,77546,77598
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-05,1.0,,3.0,,3.0,4.0,7.0,,,,...,1.0,,7.0,,2.0,,,2.0,,
2017-01-19,1.0,,8.0,,1.0,8.0,3.0,1.0,,,...,,,8.0,1.0,2.0,,,1.0,1.0,
2017-02-02,1.0,,6.0,,1.0,1.0,,,,,...,,,4.0,1.0,1.0,2.0,5.0,1.0,1.0,1.0
2017-02-16,,2.0,3.0,,,3.0,2.0,1.0,,,...,1.0,,3.0,,1.0,,,,,
2017-03-13,1.0,2.0,3.0,,1.0,3.0,5.0,2.0,,2.0,...,1.0,,2.0,1.0,1.0,1.0,,1.0,,


In [14]:
#consolidate index of weekly data into year-month and sum per month
permits_by_zip.groupby([permits_by_zip.index.year, permits_by_zip.index.month]).sum()

# reset multi-index (year and month) to a year-month single index format
permits_by_zip.index = permits_by_zip.index.to_period('M')


# preview dataframe
permits_by_zip.head()

zip_code,77002,77003,77004,77005,77006,77007,77008,77009,77010,77011,...,77484,77489,77493,77494,77498,77530,77532,77545,77546,77598
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01,1.0,,3.0,,3.0,4.0,7.0,,,,...,1.0,,7.0,,2.0,,,2.0,,
2017-01,1.0,,8.0,,1.0,8.0,3.0,1.0,,,...,,,8.0,1.0,2.0,,,1.0,1.0,
2017-02,1.0,,6.0,,1.0,1.0,,,,,...,,,4.0,1.0,1.0,2.0,5.0,1.0,1.0,1.0
2017-02,,2.0,3.0,,,3.0,2.0,1.0,,,...,1.0,,3.0,,1.0,,,,,
2017-03,1.0,2.0,3.0,,1.0,3.0,5.0,2.0,,2.0,...,1.0,,2.0,1.0,1.0,1.0,,1.0,,


In [15]:
# clean up data: fill NaN with 0
permits_by_zip.fillna(0, inplace=True)

# clean up data: change values to intefet (from float)
permits_by_zip = permits_by_zip.astype(int)

# preview dataframe
permits_by_zip.head()

zip_code,77002,77003,77004,77005,77006,77007,77008,77009,77010,77011,...,77484,77489,77493,77494,77498,77530,77532,77545,77546,77598
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01,1,0,3,0,3,4,7,0,0,0,...,1,0,7,0,2,0,0,2,0,0
2017-01,1,0,8,0,1,8,3,1,0,0,...,0,0,8,1,2,0,0,1,1,0
2017-02,1,0,6,0,1,1,0,0,0,0,...,0,0,4,1,1,2,5,1,1,1
2017-02,0,2,3,0,0,3,2,1,0,0,...,1,0,3,0,1,0,0,0,0,0
2017-03,1,2,3,0,1,3,5,2,0,2,...,1,0,2,1,1,1,0,1,0,0


In [16]:
# # save out cleansed construction permit data
# cleaned_permits_filepath = Path("../resources/dan/Permit_Data/permits_by_zip_cleaned.csv")

# # write dataframe to csv with index (keep date) and header info
# permits_by_zip.to_csv(cleaned_permits_filepath, index=True, header=True)
