In [14]:
import pandas as pd
from pathlib import Path

In [15]:
# Load the raw construction-permit CSV into a DataFrame.
# The location is relative to the directory the notebook runs from.
permits_filepath = Path("../resources/dan/Permit_Data/permit_data.csv")
permits_df = pd.read_csv(filepath_or_buffer=permits_filepath)

In [16]:
# Eyeball the first five raw rows to confirm the file loaded as expected.
permits_df.head(n=5)

Unnamed: 0,zip_code,land_use,count,date,permit_type
0,77091.0,Single Family Residential (Shared Driveways),1,1/23/2020,Residential
1,77091.0,Single Family Residential (Shared Driveways),1,1/23/2020,Residential
2,77072.0,Unrestricted,1,1/23/2020,Unrestricted
3,77048.0,Drainage or Detention; Landscape or Open Space...,1,1/23/2020,Residential
4,77045.0,Unrestricted Intended for Multifamily,1,1/23/2020,Unrestricted


In [17]:
# Non-null count per column; a column with a lower count than the others
# contains missing values.
permits_df.count(axis='index')

zip_code       8847
land_use       8859
count          8859
date           8859
permit_type    8859
dtype: int64

In [18]:
# Remove every row that has a null in any column (mutates permits_df in place).
permits_df.dropna(axis=0, how='any', inplace=True)

# Re-run the per-column non-null counts; they should now all be equal.
permits_df.count()

zip_code       8847
land_use       8847
count          8847
date           8847
permit_type    8847
dtype: int64

In [19]:
# Inspect the dtype pandas assigned to each column
# (the output below shows zip_code as float64 and date as object).
permits_df.dtypes

zip_code       float64
land_use        object
count            int64
date            object
permit_type     object
dtype: object

In [20]:
# Cast zip_code from float to a 64-bit integer so it no longer displays
# with a trailing ".0".
zip_code_dtype = {'zip_code': 'int64'}
permits_df = permits_df.astype(zip_code_dtype)

# Confirm the new dtype took effect.
permits_df.dtypes

zip_code        int64
land_use       object
count           int64
date           object
permit_type    object
dtype: object

In [21]:
# Look at the first five rows again after the zip_code cast.
permits_df.head(n=5)

Unnamed: 0,zip_code,land_use,count,date,permit_type
0,77091,Single Family Residential (Shared Driveways),1,1/23/2020,Residential
1,77091,Single Family Residential (Shared Driveways),1,1/23/2020,Residential
2,77072,Unrestricted,1,1/23/2020,Unrestricted
3,77048,Drainage or Detention; Landscape or Open Space...,1,1/23/2020,Residential
4,77045,Unrestricted Intended for Multifamily,1,1/23/2020,Unrestricted


In [22]:
# Parse the date strings and use them as the DataFrame's DatetimeIndex.
#
# The raw values look like "1/23/2020", so the matching strptime format is
# month/day/four-digit-year. A '%Y%m' format does not match this data; it only
# appeared to work when combined with infer_datetime_format=True, which let
# older pandas fall back to generic parsing after the format failed. That
# keyword is deprecated in pandas 2.x, where `format` is applied strictly.
# With an explicit, correct format any malformed date raises instead of being
# guessed at.
#
# .assign() replaces the string column with parsed timestamps and .set_index()
# then moves that column into the index (removing it from the columns), so no
# separate drop is needed and nothing is mutated in place.
permits_df = (
    permits_df
    .assign(date=pd.to_datetime(permits_df['date'], format='%m/%d/%Y'))
    .set_index('date')
)

# preview data
permits_df.head()

Unnamed: 0_level_0,zip_code,land_use,count,permit_type
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-23,77091,Single Family Residential (Shared Driveways),1,Residential
2020-01-23,77091,Single Family Residential (Shared Driveways),1,Residential
2020-01-23,77072,Unrestricted,1,Unrestricted
2020-01-23,77048,Drainage or Detention; Landscape or Open Space...,1,Residential
2020-01-23,77045,Unrestricted Intended for Multifamily,1,Unrestricted


In [23]:
# Order rows chronologically by the 'date' index level (oldest first) and,
# within each date, by ascending zip_code.
sort_keys = ['date', 'zip_code']
permits_df = permits_df.sort_values(by=sort_keys, ascending=True)

# Preview the first 15 sorted rows.
permits_df.head(n=15)

Unnamed: 0_level_0,zip_code,land_use,count,permit_type
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-05,77002,Unrestricted,1,Unrestricted
2017-01-05,77004,Single Family Residential (Shared Driveways),1,Residential
2017-01-05,77004,Unrestricted,1,Unrestricted
2017-01-05,77004,Single Family Residential (Public Street),1,Residential
2017-01-05,77006,Single Family Residential (Public Street),1,Residential
2017-01-05,77006,Single Family Residential (Public Street),1,Residential
2017-01-05,77006,Commercial,1,Commercial
2017-01-05,77007,Unrestricted,1,Unrestricted
2017-01-05,77007,Single Family Residential (Shared Driveways),1,Residential
2017-01-05,77007,Unrestricted,1,Unrestricted


In [24]:
# Put the most pertinent columns first; land_use (the longest text) goes last.
column_order = ['zip_code', 'permit_type', 'count', 'land_use']
permits_df = permits_df[column_order]

# Preview the new column order.
permits_df.columns.values

array(['zip_code', 'permit_type', 'count', 'land_use'], dtype=object)

In [25]:
# Last look at the first five rows of the cleaned frame.
permits_df.head(n=5)

Unnamed: 0_level_0,zip_code,permit_type,count,land_use
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-05,77002,Unrestricted,1,Unrestricted
2017-01-05,77004,Residential,1,Single Family Residential (Shared Driveways)
2017-01-05,77004,Unrestricted,1,Unrestricted
2017-01-05,77004,Residential,1,Single Family Residential (Public Street)
2017-01-05,77006,Residential,1,Single Family Residential (Public Street)


In [13]:
# Destination for the cleansed construction-permit data.
cleaned_permits_filepath = Path("../resources/dan/Permit_Data/permit_data_cleaned.csv")

# Persist the frame as CSV. The index is written so the date survives the
# round trip, and the header row is included.
permits_df.to_csv(
    path_or_buf=cleaned_permits_filepath,
    header=True,
    index=True,
)


In [32]:
# Sanity check that boolean-mask assignment through .loc behaves as expected.
# The check runs on a copy so the placeholder string never overwrites real
# permit_type values in the cleaned permits_df.
loc_check_df = permits_df.copy()
zip_77002_mask = loc_check_df['zip_code'] == 77002
loc_check_df.loc[zip_77002_mask, 'permit_type'] = "is this working"

# Show the rows that were rewritten on the copy.
loc_check_df.loc[zip_77002_mask, 'permit_type']

date
2017-01-05    is this working
2017-01-19    is this working
2017-02-02    is this working
2017-03-13    is this working
2017-05-25    is this working
2017-06-22    is this working
2017-07-27    is this working
2017-07-27    is this working
2017-11-30    is this working
2017-12-14    is this working
2018-01-04    is this working
2018-01-18    is this working
2018-03-01    is this working
2018-03-15    is this working
2018-03-29    is this working
2018-03-29    is this working
2018-04-26    is this working
2018-07-05    is this working
2018-07-30    is this working
2018-08-27    is this working
2018-09-10    is this working
2018-10-08    is this working
2019-02-14    is this working
2019-03-14    is this working
2019-03-14    is this working
2019-07-25    is this working
2019-08-08    is this working
2019-08-08    is this working
2019-08-22    is this working
2019-12-19    is this working
2019-12-19    is this working
2020-02-27    is this working
Name: permit_type, dtype: object