In [1]:
#Load dependencies
from datetime import datetime as dt, timedelta
import pandas as pd
import numpy as np
from uszipcode import SearchEngine

#Open case close date
opencaseclosedate=np.datetime64('2021-01-01')

In [2]:
#Define data files to import
url2017='https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2017.txt'
url2018='https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2018.txt'
url2019='https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2019.txt'
url2020='https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-monthly.txt'
nullzip=pd.read_csv('../Clean Data Files/311latlngzipcodes.csv',dtype={'calczip':str})

#Define dataframe column names and select numeric and date columns
cols=['case number','sr location','county','district','neighborhood','tax id','trash quad','recycle quad','trash day','heavy trash day','recycle day','key map',
      'management district','department','division','sr type','queue','sla','status','sr create date','due date','date closed','overdue','title','x','y','latitude',
      'longitude','channel type']
numcols=['latitude','longitude']
datecols=['sr create date','due date','date closed']

#Create zipcode retrieval function
search=SearchEngine(simple_zipcode=False)
def zipinfo(lat,lng):
    zipdata=search.by_coordinates(lat,lng,radius=3,returns=1)
    for zipcode in zipdata:
        return zipcode.zipcode
    
#Create function to assign season based on month
def season(row_number, assigned_value): 
    return assigned_value[row_number] 

In [3]:
#Create 2017 data frames
data2017=pd.read_csv(url2017,header=5,sep='|',error_bad_lines=False)
data2017=data2017.drop(data2017.index[0]).reset_index(drop=True)
data2017.columns=cols
data2017[cols]=data2017[cols].apply(lambda x:x.str.strip()).replace(r'^\s*$',np.nan,regex=True)
data2017[datecols]=data2017[datecols].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S',errors='coerce')
data2017[numcols]=data2017[numcols].apply(pd.to_numeric,errors='coerce')
top2017=data2017['sr type'].value_counts()[lambda x:x>=10000].index.tolist()
data2017.shape

b'Skipping line 9979: expected 29 fields, saw 30\nSkipping line 16339: expected 29 fields, saw 30\n'
b'Skipping line 211068: expected 29 fields, saw 30\n'
b'Skipping line 294299: expected 29 fields, saw 30\n'
b'Skipping line 327926: expected 29 fields, saw 30\n'


(364666, 29)

In [4]:
#Create 2018 data frames
data2018=pd.read_csv(url2018,header=5,sep='|',error_bad_lines=False)
data2018=data2018.drop(data2018.index[0]).reset_index(drop=True)
data2018.columns=cols
data2018[cols]=data2018[cols].apply(lambda x:x.str.strip()).replace(r'^\s*$',np.nan,regex=True)
data2018[datecols]=data2018[datecols].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S',errors='coerce')
data2018[numcols]=data2018[numcols].apply(pd.to_numeric,errors='coerce')
top2018=data2018['sr type'].value_counts()[lambda x:x>=10000].index.tolist()
data2018.shape

b'Skipping line 124864: expected 29 fields, saw 30\n'
  interactivity=interactivity, compiler=compiler, result=result)


(399955, 29)

In [5]:
#Create 2019 data frames
data2019=pd.read_csv(url2019,header=5,sep='|',error_bad_lines=False)
data2019=data2019.drop(data2019.index[0]).reset_index(drop=True)
data2019.columns=cols
data2019[cols]=data2019[cols].apply(lambda x:x.str.strip()).replace(r'^\s*$',np.nan,regex=True)
data2019[datecols]=data2019[datecols].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S',errors='coerce')
data2019[numcols]=data2019[numcols].apply(pd.to_numeric,errors='coerce')
top2019=data2019['sr type'].value_counts()[lambda x:x>=10000].index.tolist()
data2019.shape

b'Skipping line 86859: expected 29 fields, saw 31\n'
b'Skipping line 124913: expected 29 fields, saw 30\n'
b'Skipping line 144497: expected 29 fields, saw 30\n'
b'Skipping line 218652: expected 29 fields, saw 31\n'
b'Skipping line 349873: expected 29 fields, saw 30\n'


(395258, 29)

In [6]:
#Create 2020 data frames
data2020=pd.read_csv(url2020,header=5,sep='|',error_bad_lines=False)
data2020=data2020.drop(data2020.index[0]).reset_index(drop=True)
data2020.columns=cols
data2020[cols]=data2020[cols].apply(lambda x:x.str.strip()).replace(r'^\s*$',np.nan,regex=True)
data2020[datecols]=data2020[datecols].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S',errors='coerce')
data2020[numcols]=data2020[numcols].apply(pd.to_numeric,errors='coerce')
top2020=data2020['sr type'].value_counts()[lambda x:x>=10000].index.tolist()
data2020.shape

(26987, 29)

In [7]:
#Create all complete years dataframe
tempdata311=data2017.append([data2018,data2019,data2020])
data311=pd.merge(tempdata311,nullzip,on=['latitude','longitude'],how='left')
data311['date']=data311['sr create date'].dt.strftime('%Y-%m-%d')
data311['year']=data311['sr create date'].dt.strftime('%Y')
data311['month']=data311['sr create date'].dt.strftime('%m')
seasondict={'01':'winter','02':'winter','03':'spring','04':'spring','05':'spring','06':'summer',
        '07':'summer','08':'summer','09':'fall','10':'fall','11':'fall','12':'winter','NaT':'none'}
data311['season']=data311['month'].apply(season,args=(seasondict,))
data311=data311[pd.notnull(data311['latitude'])]
data311['date closed']=np.where(data311['date closed'].isnull()==True,opencaseclosedate,data311['date closed'])
data311['truezip']='77'+data311['sr location'].str.extract(r'77(\d{3}\-?\d{0,4})')
data311['zipcode']=np.where(data311['truezip'].isnull()==True,data311['calczip'],data311['truezip'])
data311['openclosetime']=data311['date closed']-data311['sr create date']
data311['daystoclose']=data311['openclosetime']/timedelta(days=1)
data311['openduetime']=data311['due date']-data311['sr create date']
data311['daysdue']=data311['openduetime']/timedelta(days=1)
data311['missedduedate']=np.where(data311['due date']>data311['date closed'],0,1)
data311.drop(['x','y','calczip','truezip','openclosetime','openduetime'],axis=1,inplace=True)
types311=data311.groupby(['sr type','year'])['case number'].count().unstack('year').reset_index()
types311.columns=['sr type','2017','2018','2019','2020']
data311.shape

(1094514, 35)

In [8]:
#Create dataframe containing service requests with around 10000 a year
toprequests=sorted(np.unique(top2017+top2018+top2019+top2020))
topdata=data311.loc[data311['sr type'].isin(toprequests)].reset_index()
topdata.drop(['index'],axis=1,inplace=True)

In [9]:
toprequests

['Container Problem',
 'Drainage',
 'Missed Garbage Pickup',
 'Missed Heavy Trash Pickup',
 'Missed Recycling Pickup',
 'Nuisance On Property',
 'SWM Escalation',
 'Sewer Wastewater',
 'Storm Debris Collection',
 'Street Condition',
 'Street Hazard',
 'Traffic Signal Maintenance',
 'Traffic Signs',
 'Water Leak',
 'Water Service']

In [10]:
#Create csv of number of service requests by type per year
types311.to_csv('../Clean Data Files/Houston 311 SR Types by Year.csv',index=False,header=True)

In [11]:
#Display null values in dataset
topdata.isnull().sum()

case number                 0
sr location                 0
county                   2034
district                 2949
neighborhood             3975
tax id                   2024
trash quad             112208
recycle quad           114424
trash day              112208
heavy trash day        112794
recycle day            114422
key map                     0
management district    392317
department                  0
division                    0
sr type                     0
queue                       0
sla                         0
status                      0
sr create date              0
due date                    6
date closed                 0
overdue                  5037
title                       0
latitude                    0
longitude                   0
channel type                0
date                        0
year                        0
month                       0
season                      0
zipcode                   393
daystoclose                 0
daysdue   

In [12]:
#Create dataframe of missing zipcodes
missingzip=topdata[topdata.zipcode.isnull()]
missingzip=missingzip[['latitude','longitude']].reset_index()
missingzip.drop(['index'],axis=1,inplace=True)
missingzip.drop_duplicates(inplace=True)
missingzip

Unnamed: 0,latitude,longitude
0,29.736628,-95.5812
1,29.675718,-95.232803
2,29.739761,-95.378532
4,29.665729,-95.469205
6,29.841055,-95.416438
7,29.761276,-95.3666
9,29.866869,-95.30056
13,29.598766,-95.19799
15,29.773882,-95.307419
16,29.724491,-95.438095


In [13]:
#Get missing zipcodes
#zipmiss=missingzip[0:5000]
missingzip['calczip']=np.vectorize(zipinfo)(missingzip['latitude'].values,missingzip['longitude'].values)

In [14]:
#Check for zips that cannot be coded
nonezips=missingzip.loc[missingzip['calczip']=='None']
nonezips

Unnamed: 0,latitude,longitude,calczip


In [15]:
#Check nullzip file size
nullzip.shape

(17248, 3)

In [16]:
#Check missingzip file size
missingzip.shape

(36, 3)

In [17]:
#Append and verify newnullzip file size
newnullzip=nullzip.append([missingzip],sort=False)
newnullzip.drop_duplicates(inplace=True)
newnullzip.shape

(16237, 3)

In [18]:
#Create updated zip csv file
#newnullzip.to_csv('../Clean Data Files/311latlngzipcodes.csv',index=False,header=True)

In [19]:
dups=nullzip.append([missingzip],sort=False)
dups.duplicated()
dups.shape

(17284, 3)

In [20]:
#Create updated zip csv file
#dups.to_csv('../Clean Data Files/311latlngzipcodes.csv',index=False,header=True)

In [21]:
check=dups[['latitude', 'longitude', 'calczip']]
check.drop_duplicates(inplace=True)
check.shape

(16237, 3)

In [22]:
#Create Houston zipcode file
zips=[]
polygons=[]
hzips=search.by_prefix('77',returns=0)
for zipcode in hzips:
        zips.append(zipcode.zipcode)
        polygons.append(zipcode.polygon)
        
hzips_dict={'zipcode':zips,'polygons':polygons}
htownzips=pd.DataFrame(hzips_dict)
htownzips

Unnamed: 0,zipcode,polygons
0,77002,"[[[-95.388083, 29.729348], [-95.388223, 29.729..."
1,77003,"[[-95.363636, 29.747351], [-95.363053, 29.7480..."
2,77004,"[[[-95.372277, 29.746044], [-95.371729, 29.746..."
3,77005,"[[-95.44781, 29.725236], [-95.447869, 29.72711..."
4,77006,"[[-95.402073, 29.72665], [-95.402082, 29.72783..."
...,...,...
314,77983,"[[[-96.622859, 28.273454], [-96.622728, 28.273..."
315,77984,"[[-97.235916, 29.380135], [-97.236357, 29.3805..."
316,77990,"[[-97.00638, 28.498274], [-97.004785, 28.49758..."
317,77994,"[[-97.493419, 29.199753], [-97.493222, 29.1998..."


In [23]:
hzips=search.by_zipcode('77003')
#hzips.to_dict()
hzips

Zipcode(zipcode_type='Standard', major_city='Houston', post_office_city='Houston, TX', common_city_list=['Houston'], county='Harris County', state='TX', lat=29.75, lng=-95.35, timezone='Central', radius_in_miles=1.0, area_code_list=['713'], population=10508, population_density=4144.0, land_area_in_sqmi=2.54, water_area_in_sqmi=0.03, housing_units=4713, occupied_housing_units=3894, median_home_value=162600, median_household_income=36412, bounds_west=-95.367436, bounds_east=-95.326427, bounds_north=29.766512, bounds_south=29.731823, zipcode='77003', polygon=[[-95.363636, 29.747351], [-95.363053, 29.748099], [-95.3639, 29.748618], [-95.363295, 29.749371], [-95.362703, 29.750121], [-95.361864, 29.749602], [-95.36126, 29.75033], [-95.362115, 29.750862], [-95.361526, 29.751601], [-95.360936, 29.752343], [-95.360632, 29.752754], [-95.360581, 29.7525], [-95.360496, 29.752278], [-95.360416, 29.75214], [-95.36011, 29.751913], [-95.360057, 29.751883], [-95.360013, 29.751858], [-95.359487, 29.7515