In [1]:
#Load dependencies
from datetime import datetime as dt, timedelta
import pandas as pd
import numpy as np
from uszipcode import SearchEngine

In [2]:
#Source files: one 311 public data extract per year plus the 'monthly' extract
url2017 = 'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2017.txt'
url2018 = 'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2018.txt'
url2019 = 'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2019.txt'
url2020 = 'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-monthly.txt'

#Local latitude/longitude to zipcode lookup; calczip is read as text
nullzip = pd.read_csv(
    '../Clean Data Files/311latlngzipcodes.csv',
    dtype={'calczip': str},
)

#Column labels assigned positionally to each yearly extract (29 fields)
cols=[
    'case number','sr location','county','district','neighborhood','tax id',
    'trash quad','recycle quad','trash day','heavy trash day','recycle day','key map',
    'management district','department','division','sr type','queue','sla','status',
    'sr create date','due date','date closed','overdue','title',
    'x','y','latitude','longitude','channel type',
]
#Columns converted to timestamps after loading
datecols=['sr create date','due date','date closed']
#Columns converted to numbers after loading
numcols=['latitude','longitude']

#Build the uszipcode search engine queried by the zipcode retrieval function
search = SearchEngine(simple_zipcode=False)
def zipinfo(lat,lng):
    """Return the zipcode of the first search result for a coordinate.

    Asks the module-level ``search`` engine for at most one result within a
    radius of 3 (units as defined by uszipcode) around (lat, lng).
    Returns None when the search yields no results.
    """
    matches=iter(search.by_coordinates(lat,lng,radius=3,returns=1))
    try:
        return next(matches).zipcode
    except StopIteration:
        #No result came back for this coordinate
        return None

In [3]:
#Load and tidy the 2017 extract
#Pipe-delimited file with the header on row index 5; rows with the wrong field count are skipped
#NOTE: error_bad_lines is deprecated in newer pandas (on_bad_lines replaces it); kept to match the original environment
data2017=pd.read_csv(url2017,sep='|',header=5,error_bad_lines=False)
#Discard the first record after the header (presumably a separator row - confirm against the raw file)
data2017=data2017.drop(index=data2017.index[0]).reset_index(drop=True)
data2017.columns=cols
#Trim whitespace in every column and turn empty strings into NaN
data2017[cols]=(
    data2017[cols]
    .apply(lambda column:column.str.strip())
    .replace(r'^\s*$',np.nan,regex=True)
)
#Parse timestamps and coordinates; unparseable values become NaT/NaN
data2017[datecols]=data2017[datecols].apply(lambda column:pd.to_datetime(column,format='%Y-%m-%d %H:%M:%S',errors='coerce'))
data2017[numcols]=data2017[numcols].apply(lambda column:pd.to_numeric(column,errors='coerce'))
#Request types with 10000 or more requests in 2017
top2017=data2017['sr type'].value_counts().loc[lambda counts:counts>=10000].index.tolist()
data2017.shape

b'Skipping line 9979: expected 29 fields, saw 30\nSkipping line 16339: expected 29 fields, saw 30\n'
b'Skipping line 211068: expected 29 fields, saw 30\n'
b'Skipping line 294299: expected 29 fields, saw 30\n'
b'Skipping line 327924: expected 29 fields, saw 30\n'


(364664, 29)

In [4]:
#Load and tidy the 2018 extract
#Pipe-delimited file with the header on row index 5; rows with the wrong field count are skipped
#NOTE: error_bad_lines is deprecated in newer pandas (on_bad_lines replaces it); kept to match the original environment
data2018=pd.read_csv(url2018,sep='|',header=5,error_bad_lines=False)
#Discard the first record after the header (presumably a separator row - confirm against the raw file)
data2018=data2018.drop(index=data2018.index[0]).reset_index(drop=True)
data2018.columns=cols
#Trim whitespace in every column and turn empty strings into NaN
data2018[cols]=(
    data2018[cols]
    .apply(lambda column:column.str.strip())
    .replace(r'^\s*$',np.nan,regex=True)
)
#Parse timestamps and coordinates; unparseable values become NaT/NaN
data2018[datecols]=data2018[datecols].apply(lambda column:pd.to_datetime(column,format='%Y-%m-%d %H:%M:%S',errors='coerce'))
data2018[numcols]=data2018[numcols].apply(lambda column:pd.to_numeric(column,errors='coerce'))
#Request types with 10000 or more requests in 2018
top2018=data2018['sr type'].value_counts().loc[lambda counts:counts>=10000].index.tolist()
data2018.shape

b'Skipping line 124864: expected 29 fields, saw 30\n'
  interactivity=interactivity, compiler=compiler, result=result)


(399953, 29)

In [5]:
#Load and tidy the 2019 extract
#Pipe-delimited file with the header on row index 5; rows with the wrong field count are skipped
#NOTE: error_bad_lines is deprecated in newer pandas (on_bad_lines replaces it); kept to match the original environment
data2019=pd.read_csv(url2019,sep='|',header=5,error_bad_lines=False)
#Discard the first record after the header (presumably a separator row - confirm against the raw file)
data2019=data2019.drop(index=data2019.index[0]).reset_index(drop=True)
data2019.columns=cols
#Trim whitespace in every column and turn empty strings into NaN
data2019[cols]=(
    data2019[cols]
    .apply(lambda column:column.str.strip())
    .replace(r'^\s*$',np.nan,regex=True)
)
#Parse timestamps and coordinates; unparseable values become NaT/NaN
data2019[datecols]=data2019[datecols].apply(lambda column:pd.to_datetime(column,format='%Y-%m-%d %H:%M:%S',errors='coerce'))
data2019[numcols]=data2019[numcols].apply(lambda column:pd.to_numeric(column,errors='coerce'))
#Request types with 10000 or more requests in 2019
top2019=data2019['sr type'].value_counts().loc[lambda counts:counts>=10000].index.tolist()
data2019.shape

b'Skipping line 86859: expected 29 fields, saw 31\n'
b'Skipping line 124913: expected 29 fields, saw 30\n'
b'Skipping line 144497: expected 29 fields, saw 30\n'
b'Skipping line 218652: expected 29 fields, saw 31\n'
b'Skipping line 349873: expected 29 fields, saw 30\n'


(395258, 29)

In [6]:
#Create all complete years dataframe
#pd.concat is used instead of DataFrame.append, which was removed in pandas 2.0
tempdata311=pd.concat([data2017,data2018,data2019])
#Attach the looked-up zipcode (calczip) for each latitude/longitude pair
data311=pd.merge(tempdata311,nullzip,on=['latitude','longitude'],how='left')
#Calendar fields stored as strings; 'date' is the key used for the weather merge
data311['date']=data311['sr create date'].dt.strftime('%Y-%m-%d')
data311['year']=data311['sr create date'].dt.strftime('%Y')
data311['month']=data311['sr create date'].dt.strftime('%m')
#Keep only rows with a latitude; .copy() makes the assignments below act on an
#independent frame rather than a filtered slice (avoids SettingWithCopyWarning)
data311=data311[data311['latitude'].notnull()].copy()
#Zipcode as written in the address text; expand=False returns a Series, not a one-column DataFrame
data311['truezip']='77'+data311['sr location'].str.extract(r'77(\d{3}\-?\d{0,4})',expand=False)
#Prefer the zipcode from the address and fall back to the coordinate lookup
data311['zipcode']=data311['truezip'].fillna(data311['calczip'])
#Durations in fractional days from creation to closure and to the due date
data311['daystoclose']=(data311['date closed']-data311['sr create date'])/timedelta(days=1)
data311['daysdue']=(data311['due date']-data311['sr create date'])/timedelta(days=1)
#0 when the request closed before its due date, otherwise 1
#NOTE: rows missing either date compare False and are therefore flagged 1
data311['missedduedate']=np.where(data311['due date']>data311['date closed'],0,1)
#Remove helper columns that are not needed downstream
data311=data311.drop(columns=['x','y','calczip','truezip'])
#Request counts per type, one column per year
types311=data311.groupby(['sr type','year'])['case number'].count().unstack('year').reset_index()
types311.columns=['sr type','2017','2018','2019']
data311.shape

(1059491, 34)

In [7]:
#Keep only the request types that reached 10000 or more requests in at least one year
toprequests=sorted(np.unique([*top2017,*top2018,*top2019]))
#Filter to those types and renumber the rows from zero
topdata=data311[data311['sr type'].isin(toprequests)].reset_index(drop=True)

In [8]:
#Load the prepared census and weather tables from local csv files
censusdata = pd.read_csv('../Clean Data Files/census_data.csv')
weatherdata = pd.read_csv('../static/data/Final_selected_weather_data.csv')

In [9]:
#Apply a few fixes to weather and census dataframes for merging
#Every step is guarded and non-inplace so the cell can be re-run without a KeyError
#Remove the 'Unnamed: 0' column (presumably a saved index column) when it is present
weatherdata=weatherdata.drop(columns=['Unnamed: 0'],errors='ignore')
#Replace 'Zipcode' with a text 'zipcode' column, the key used to merge with the 311 data
if 'Zipcode' in censusdata.columns:
    censusdata['zipcode']=censusdata['Zipcode'].astype(str)
    censusdata=censusdata.drop(columns=['Zipcode'])

In [10]:
#Build the modeling dataframe with two left joins
#First add the weather rows whose date_field matches the request date
tempmodeldata=topdata.merge(weatherdata,how='left',left_on='date',right_on='date_field')
#Then add the census columns for the request zipcode
modeldata=tempmodeldata.merge(censusdata,how='left',on='zipcode')

In [11]:
#Count the missing values in each column of the modeling dataframe
modeldata.isna().sum()

case number                  0
sr location                  0
county                    1968
district                  2798
neighborhood              3747
tax id                    1958
trash quad              106401
recycle quad            108552
trash day               106401
heavy trash day         107003
recycle day             108550
key map                      0
management district     380646
department                   0
division                     0
sr type                      0
queue                        0
sla                          0
status                       0
sr create date               0
due date                     6
date closed               6803
overdue                   4981
title                        0
latitude                     0
longitude                    0
channel type                 0
date                         0
year                         0
month                        0
zipcode                  10744
daystoclose               6803
daysdue 

In [12]:
#Create modeling data csv and json files (both export calls are commented out; uncomment them to write the files)
#modeldata.to_csv('../Clean Data Files/modeldata.csv',index=False,header=True)
#modeldata.to_json('../static/data/modeldata.json',orient='records')

KeyboardInterrupt: 

In [13]:
#Report the (rows, columns) dimensions of the modeling dataframe
modeldata.shape

(656909, 47)

In [14]:
#List the column labels of the modeling dataframe
modeldata.columns

Index(['case number', 'sr location', 'county', 'district', 'neighborhood',
       'tax id', 'trash quad', 'recycle quad', 'trash day', 'heavy trash day',
       'recycle day', 'key map', 'management district', 'department',
       'division', 'sr type', 'queue', 'sla', 'status', 'sr create date',
       'due date', 'date closed', 'overdue', 'title', 'latitude', 'longitude',
       'channel type', 'date', 'year', 'month', 'zipcode', 'daystoclose',
       'daysdue', 'missedduedate', 'date_field', 'tempMax', 'tempAvg',
       'tempMin', 'precipitation', 'Population', 'Median Age',
       'Household Income', 'Per Capita Income', 'Poverty Rate',
       'Total Households', 'Total Owner Occupied', '% Owner Occupied'],
      dtype='object')

In [15]:
#Preview the modeling dataframe (the notebook renders the first and last rows)
modeldata

Unnamed: 0,case number,sr location,county,district,neighborhood,tax id,trash quad,recycle quad,trash day,heavy trash day,...,tempMin,precipitation,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Total Households,Total Owner Occupied,% Owner Occupied
0,101002444726,Intersection 3900 S GESSNER RD&10000 WESTPARK DR,Harris County,F,MID WEST,,,,,,...,64,0.3,38931.0,31.9,44957.0,32805.0,22.824998,38751.0,9667.0,24.946453
1,12091836-101002444730,"3303 SAGE, HOUSTON TX 77056",HARRIS,G,GREATER UPTOWN,0451400060009,,,,,...,64,0.3,21732.0,39.1,107003.0,89180.0,5.268728,21641.0,10643.0,49.179798
2,101002444733,Intersection 1400 CAROLINE ST&1300 CLAY ST,Harris County,I,DOWNTOWN,,,,,,...,64,0.3,915.0,44.6,250001.0,196722.0,6.666667,915.0,78.0,8.524590
3,12091839-101002444736,"7701 APPLETON, HOUSTON TX 77022",HARRIS,H,NORTHSIDE/NORTHLINE,0710210010015,NE,NW,MONDAY,3rd Monday,...,64,0.3,27364.0,34.1,30164.0,14924.0,29.399942,27186.0,13143.0,48.344736
4,12091840-101002444737,"7701 APPLETON, HOUSTON TX 77022",HARRIS,H,NORTHSIDE/NORTHLINE,0420050000055,NE,NW,MONDAY,3rd Monday,...,64,0.3,27364.0,34.1,30164.0,14924.0,29.399942,27186.0,13143.0,48.344736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656904,101003785782,"3614 S MACGREGOR, HOUSTON TX 77021",HARRIS,D,MACGREGOR,0611350550011,SE,SW,TUESDAY,3rd Thursday,...,38,0.0,26654.0,33.6,35126.0,21531.0,28.674871,26327.0,10572.0,40.156493
656905,101003785783,"3547 TAMPA, HOUSTON TX 77021",HARRIS,D,MACGREGOR,0741210010025,SE,SW,TUESDAY,3rd Thursday,...,38,0.0,26654.0,33.6,35126.0,21531.0,28.674871,26327.0,10572.0,40.156493
656906,101003785785,"3415 WENTWORTH, HOUSTON TX 77004",HARRIS,D,MACGREGOR,0700280060004,SE,SW,TUESDAY,3rd Friday,...,38,0.0,37642.0,28.3,48592.0,31067.0,19.733277,28125.0,9997.0,35.544889
656907,20024521-101003785786,"11035 AVON BROOK, HOUSTON TX 77034",HARRIS,E,SOUTH BELT / ELLINGTON,1319170030001,SE,SE,THURSDAY,2nd Thursday,...,38,0.0,40183.0,28.1,47252.0,19317.0,19.767066,40162.0,19810.0,49.325233
