In [1]:
#Load dependencies
from datetime import datetime as dt, timedelta
import pandas as pd
import numpy as np
from uszipcode import SearchEngine

#Placeholder close date substituted for service requests that have no 'date closed' value (cases still open)
opencaseclosedate=np.datetime64('2021-01-01')

In [2]:
#Source extracts: one pipe-delimited text file per year (the 2020 name points at the monthly extract)
url2017 = 'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2017.txt'
url2018 = 'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2018.txt'
url2019 = 'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2019.txt'
url2020 = 'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-monthly.txt'

#Lookup of latitude/longitude pairs with an already calculated zipcode; calczip is read as text
nullzip = pd.read_csv(
    '../Clean Data Files/311latlngzipcodes.csv',
    dtype={'calczip': str},
)

#Column names applied to every extract, in file order
cols = [
    'case number', 'sr location', 'county', 'district', 'neighborhood', 'tax id',
    'trash quad', 'recycle quad', 'trash day', 'heavy trash day', 'recycle day', 'key map',
    'management district', 'department', 'division', 'sr type', 'queue', 'sla', 'status',
    'sr create date', 'due date', 'date closed', 'overdue', 'title', 'x', 'y',
    'latitude', 'longitude', 'channel type',
]
#Columns converted to numbers and to datetimes after loading
numcols = ['latitude', 'longitude']
datecols = ['sr create date', 'due date', 'date closed']

#Zipcode search engine queried by the zipinfo lookup defined below
search = SearchEngine(simple_zipcode=False)
def zipinfo(lat,lng):
    """Look up the zipcode for a latitude/longitude pair.

    Queries the module-level ``search`` engine with ``radius=3`` and
    ``returns=1`` and hands back the ``zipcode`` attribute of the first
    result. Returns None when the search produces no results.
    """
    matches=iter(search.by_coordinates(lat,lng,radius=3,returns=1))
    try:
        first=next(matches)
    except StopIteration:
        #No result for these coordinates
        return None
    return first.zipcode

In [3]:
#Create 2017 data frames
#Read the pipe-delimited extract (header=5, unparseable lines skipped), drop the first row and renumber the index
data2017=(
    pd.read_csv(url2017,sep='|',header=5,error_bad_lines=False)
    .pipe(lambda frame:frame.drop(frame.index[0]))
    .reset_index(drop=True)
)
data2017.columns=cols
#Trim surrounding whitespace in every field and turn blank strings into NaN
data2017[cols]=(
    data2017[cols]
    .apply(lambda field:field.str.strip())
    .replace(r'^\s*$',np.nan,regex=True)
)
#Convert date and coordinate columns; values that cannot be parsed are coerced to NaT/NaN
data2017[datecols]=data2017[datecols].apply(lambda field:pd.to_datetime(field,format='%Y-%m-%d %H:%M:%S',errors='coerce'))
data2017[numcols]=data2017[numcols].apply(lambda field:pd.to_numeric(field,errors='coerce'))
#Service request types with at least 10000 requests in this extract
top2017=data2017['sr type'].value_counts().loc[lambda counts:counts>=10000].index.tolist()
data2017.shape

b'Skipping line 9979: expected 29 fields, saw 30\nSkipping line 16339: expected 29 fields, saw 30\n'
b'Skipping line 211068: expected 29 fields, saw 30\n'
b'Skipping line 294299: expected 29 fields, saw 30\n'
b'Skipping line 327924: expected 29 fields, saw 30\n'


(364664, 29)

In [4]:
#Create 2018 data frames
#Read the pipe-delimited extract (header=5, unparseable lines skipped), drop the first row and renumber the index
data2018=(
    pd.read_csv(url2018,sep='|',header=5,error_bad_lines=False)
    .pipe(lambda frame:frame.drop(frame.index[0]))
    .reset_index(drop=True)
)
data2018.columns=cols
#Trim surrounding whitespace in every field and turn blank strings into NaN
data2018[cols]=(
    data2018[cols]
    .apply(lambda field:field.str.strip())
    .replace(r'^\s*$',np.nan,regex=True)
)
#Convert date and coordinate columns; values that cannot be parsed are coerced to NaT/NaN
data2018[datecols]=data2018[datecols].apply(lambda field:pd.to_datetime(field,format='%Y-%m-%d %H:%M:%S',errors='coerce'))
data2018[numcols]=data2018[numcols].apply(lambda field:pd.to_numeric(field,errors='coerce'))
#Service request types with at least 10000 requests in this extract
top2018=data2018['sr type'].value_counts().loc[lambda counts:counts>=10000].index.tolist()
data2018.shape

b'Skipping line 124864: expected 29 fields, saw 30\n'
  interactivity=interactivity, compiler=compiler, result=result)


(399953, 29)

In [5]:
#Create 2019 data frames
#Read the pipe-delimited extract (header=5, unparseable lines skipped), drop the first row and renumber the index
data2019=(
    pd.read_csv(url2019,sep='|',header=5,error_bad_lines=False)
    .pipe(lambda frame:frame.drop(frame.index[0]))
    .reset_index(drop=True)
)
data2019.columns=cols
#Trim surrounding whitespace in every field and turn blank strings into NaN
data2019[cols]=(
    data2019[cols]
    .apply(lambda field:field.str.strip())
    .replace(r'^\s*$',np.nan,regex=True)
)
#Convert date and coordinate columns; values that cannot be parsed are coerced to NaT/NaN
data2019[datecols]=data2019[datecols].apply(lambda field:pd.to_datetime(field,format='%Y-%m-%d %H:%M:%S',errors='coerce'))
data2019[numcols]=data2019[numcols].apply(lambda field:pd.to_numeric(field,errors='coerce'))
#Service request types with at least 10000 requests in this extract
top2019=data2019['sr type'].value_counts().loc[lambda counts:counts>=10000].index.tolist()
data2019.shape

b'Skipping line 86859: expected 29 fields, saw 31\n'
b'Skipping line 124913: expected 29 fields, saw 30\n'
b'Skipping line 144497: expected 29 fields, saw 30\n'
b'Skipping line 218652: expected 29 fields, saw 31\n'
b'Skipping line 349873: expected 29 fields, saw 30\n'


(395258, 29)

In [6]:
#Create 2020 data frames
#Read the pipe-delimited extract (header=5, unparseable lines skipped), drop the first row and renumber the index
data2020=(
    pd.read_csv(url2020,sep='|',header=5,error_bad_lines=False)
    .pipe(lambda frame:frame.drop(frame.index[0]))
    .reset_index(drop=True)
)
data2020.columns=cols
#Trim surrounding whitespace in every field and turn blank strings into NaN
data2020[cols]=(
    data2020[cols]
    .apply(lambda field:field.str.strip())
    .replace(r'^\s*$',np.nan,regex=True)
)
#Convert date and coordinate columns; values that cannot be parsed are coerced to NaT/NaN
data2020[datecols]=data2020[datecols].apply(lambda field:pd.to_datetime(field,format='%Y-%m-%d %H:%M:%S',errors='coerce'))
data2020[numcols]=data2020[numcols].apply(lambda field:pd.to_numeric(field,errors='coerce'))
#Service request types with at least 10000 requests in this extract
top2020=data2020['sr type'].value_counts().loc[lambda counts:counts>=10000].index.tolist()
data2020.shape

(25483, 29)

In [7]:
#Create all complete years dataframe
#Stack the yearly frames; pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0
tempdata311=pd.concat([data2017,data2018,data2019,data2020])
#Attach previously calculated zipcodes (calczip) to rows with a matching latitude/longitude pair
data311=pd.merge(tempdata311,nullzip,on=['latitude','longitude'],how='left')
#Text versions of the creation date and its year and month parts
data311['date']=data311['sr create date'].dt.strftime('%Y-%m-%d')
data311['year']=data311['sr create date'].dt.strftime('%Y')
data311['month']=data311['sr create date'].dt.strftime('%m')
#Keep rows that have a latitude; .copy() makes the result an independent frame so the
#column assignments below do not operate on a slice of the merged frame
data311=data311[pd.notnull(data311['latitude'])].copy()
#Requests without a close date are given the placeholder close date
data311['date closed']=data311['date closed'].fillna(pd.Timestamp(opencaseclosedate))
#Zipcode written in the location text (a '77' prefix plus the matched digits); expand=False returns a
#Series rather than a one-column DataFrame
data311['truezip']='77'+data311['sr location'].str.extract(r'77(\d{3}\-?\d{0,4})',expand=False)
#Prefer the zipcode from the location text and fall back to the calculated one
data311['zipcode']=data311['truezip'].fillna(data311['calczip'])
#Days from creation to close and from creation to due date, as fractional days
data311['daystoclose']=(data311['date closed']-data311['sr create date'])/timedelta(days=1)
data311['daysdue']=(data311['due date']-data311['sr create date'])/timedelta(days=1)
#0 when the due date is later than the close date, otherwise 1 (a missing due date compares False, so it yields 1)
data311['missedduedate']=np.where(data311['due date']>data311['date closed'],0,1)
data311=data311.drop(columns=['x','y','calczip','truezip'])
#Count of requests per type with one column per year; clearing the column-axis name leaves the labels as
#'sr type' followed by whichever years are present instead of relying on a hard-coded list of years
types311=data311.groupby(['sr type','year'])['case number'].count().unstack('year').reset_index()
types311.columns.name=None
data311.shape

(1092423, 34)

In [8]:
#Create dataframe containing the service request types that reached 10000 or more requests in at least one extract
toprequests=sorted(np.unique([*top2017,*top2018,*top2019,*top2020]))
#Keep only rows of those types and renumber the index from zero
topdata=data311[data311['sr type'].isin(toprequests)].reset_index(drop=True)

In [9]:
#Write the per-year count of service requests by type to csv (header row included, index omitted)
types311.to_csv(
    '../Clean Data Files/Houston 311 SR Types by Year.csv',
    header=True,
    index=False,
)

In [10]:
#Count the null values in each column of topdata
topdata.isna().sum()

case number                 0
sr location                 0
county                   2031
district                 2946
neighborhood             3968
tax id                   2021
trash quad             111787
recycle quad           114051
trash day              111787
heavy trash day        112422
recycle day            114049
key map                     0
management district    391664
department                  0
division                    0
sr type                     0
queue                       0
sla                         0
status                      0
sr create date              0
due date                    6
date closed                 0
overdue                  5032
title                       0
latitude                    0
longitude                   0
channel type                0
date                        0
year                        0
month                       0
zipcode                     7
daystoclose                 0
daysdue                     6
misseddued

In [None]:
#Create dataframe of the distinct latitude/longitude pairs that still have no zipcode
missingzip=(
    topdata.loc[topdata['zipcode'].isnull(),['latitude','longitude']]
    .reset_index(drop=True)
    .drop_duplicates()
)
missingzip

In [None]:
#Get missing zipcodes
#otypes=[str] fixes the output type to text up front instead of letting np.vectorize infer it from the
#first lookup: a lookup that returns None is then always stored as the string 'None' (the value the
#uncodable-zip check compares against), and an empty missingzip no longer makes np.vectorize raise
missingzip['calczip']=np.vectorize(zipinfo,otypes=[str])(missingzip['latitude'].values,missingzip['longitude'].values)

In [None]:
#List the coordinate pairs whose lookup gave no zipcode (stored as the text 'None')
nonezips=missingzip[missingzip['calczip'].eq('None')]
nonezips

In [None]:
#Show the (rows, columns) shape of the nullzip lookup dataframe before the new rows are added
nullzip.shape

In [None]:
#Show the (rows, columns) shape of the missingzip dataframe
missingzip.shape

In [None]:
#Append the newly looked-up coordinates to the existing lookup, drop duplicate rows and verify the new shape
#pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0
newnullzip=pd.concat([nullzip,missingzip],sort=False).drop_duplicates()
newnullzip.shape

In [None]:
#Overwrite the zipcode lookup csv with the updated table (header row included, index omitted)
newnullzip.to_csv(
    '../Clean Data Files/311latlngzipcodes.csv',
    header=True,
    index=False,
)