# MODELING DATAFRAME CREATION

In [1]:
#Load dependencies
from datetime import datetime as dt, timedelta
import pandas as pd
import numpy as np
from uszipcode import SearchEngine

#Stand-in close date used for service requests that have no close date yet
opencaseclosedate = np.datetime64('2021-01-01')

In [2]:
#Locations of the yearly 311 public data extracts (the last one is the monthly extract)
url2017, url2018, url2019, url2020 = (
    'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2017.txt',
    'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2018.txt',
    'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-2019.txt',
    'https://hfdapp.houstontx.gov/311/311-Public-Data-Extract-monthly.txt',
)
#Zip code lookup table (joined later on latitude/longitude); calczip is read as text
nullzip = pd.read_csv(
    '../Clean Data Files/311latlngzipcodes.csv',
    dtype={'calczip': str},
)

#Column names applied to every yearly extract, in file order
cols = [
    'case number', 'sr location', 'county', 'district', 'neighborhood', 'tax id',
    'trash quad', 'recycle quad', 'trash day', 'heavy trash day', 'recycle day',
    'key map', 'management district', 'department', 'division', 'sr type',
    'queue', 'sla', 'status',
    'sr create date', 'due date', 'date closed',
    'overdue', 'title', 'x', 'y',
    'latitude', 'longitude',
    'channel type',
]
#Columns converted to numbers after loading
numcols = ['latitude', 'longitude']
#Columns converted to timestamps after loading
datecols = ['sr create date', 'due date', 'date closed']

#Shared zipcode search engine used by zipinfo
search = SearchEngine(simple_zipcode=False)


def zipinfo(lat, lng):
    """Return the zipcode of the first search result for a coordinate.

    The shared ``search`` engine is queried with ``radius=3`` and
    ``returns=1``. When the query yields no result, None is returned.
    """
    nearby = search.by_coordinates(lat, lng, radius=3, returns=1)
    no_match = object()
    first = next(iter(nearby), no_match)
    return None if first is no_match else first.zipcode

#Create function to assign season based on month
def season(row_number, assigned_value):
    """Look up the season for one month value.

    Parameters
    ----------
    row_number : hashable
        The value to look up. Despite the name, this is the cell value that
        ``Series.apply`` passes in (e.g. the month text '01'), not a row index.
    assigned_value : dict
        Mapping from month value to season name. It may carry a 'NaT' entry
        that names the season to use when the date is missing.

    Returns
    -------
    The mapped value from ``assigned_value``.

    Raises
    ------
    KeyError
        If a non-missing value has no entry, or a missing value is looked up
        and the mapping has no 'NaT' entry.
    """
    try:
        return assigned_value[row_number]
    except KeyError:
        # A missing date may arrive as NaN/None/NaT rather than the literal
        # 'NaT' text; none of those can match a dictionary key, so route them
        # to the mapping's own 'NaT' entry instead of failing.
        is_missing = pd.api.types.is_scalar(row_number) and pd.isnull(row_number)
        if is_missing and 'NaT' in assigned_value:
            return assigned_value['NaT']
        raise

In [3]:
#Create 2017 data frames
#dtype=str reads every field as text. With inferred types a column can end up holding a
#mix of strings and numbers, and the .str.strip() below silently turns non-strings into NaN.
#NOTE(review): error_bad_lines is deprecated in newer pandas (on_bad_lines='skip' replaces it);
#it is kept so the cell still runs on the pandas version this notebook was written for.
data2017=pd.read_csv(url2017,header=5,sep='|',dtype=str,error_bad_lines=False)
#Discard the first row below the header and renumber the rows from zero
data2017=data2017.drop(data2017.index[0]).reset_index(drop=True)
data2017.columns=cols
#Trim surrounding whitespace and treat blank fields as missing
data2017[cols]=data2017[cols].apply(lambda x:x.str.strip()).replace(r'^\s*$',np.nan,regex=True)
#Values that cannot be parsed become NaT/NaN instead of raising
data2017[datecols]=data2017[datecols].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S',errors='coerce')
data2017[numcols]=data2017[numcols].apply(pd.to_numeric,errors='coerce')
#Service request types with at least 10000 requests in this extract
top2017=data2017['sr type'].value_counts()[lambda x:x>=10000].index.tolist()
data2017.shape

b'Skipping line 9979: expected 29 fields, saw 30\nSkipping line 16339: expected 29 fields, saw 30\n'
b'Skipping line 211068: expected 29 fields, saw 30\n'
b'Skipping line 294299: expected 29 fields, saw 30\n'
b'Skipping line 327926: expected 29 fields, saw 30\n'


(364666, 29)

In [4]:
#Create 2018 data frames
#dtype=str reads every field as text. With inferred types a column can end up holding a
#mix of strings and numbers, and the .str.strip() below silently turns non-strings into NaN.
#NOTE(review): error_bad_lines is deprecated in newer pandas (on_bad_lines='skip' replaces it);
#it is kept so the cell still runs on the pandas version this notebook was written for.
data2018=pd.read_csv(url2018,header=5,sep='|',dtype=str,error_bad_lines=False)
#Discard the first row below the header and renumber the rows from zero
data2018=data2018.drop(data2018.index[0]).reset_index(drop=True)
data2018.columns=cols
#Trim surrounding whitespace and treat blank fields as missing
data2018[cols]=data2018[cols].apply(lambda x:x.str.strip()).replace(r'^\s*$',np.nan,regex=True)
#Values that cannot be parsed become NaT/NaN instead of raising
data2018[datecols]=data2018[datecols].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S',errors='coerce')
data2018[numcols]=data2018[numcols].apply(pd.to_numeric,errors='coerce')
#Service request types with at least 10000 requests in this extract
top2018=data2018['sr type'].value_counts()[lambda x:x>=10000].index.tolist()
data2018.shape

b'Skipping line 124864: expected 29 fields, saw 30\n'
  interactivity=interactivity, compiler=compiler, result=result)


(399955, 29)

In [5]:
#Create 2019 data frames
#dtype=str reads every field as text. With inferred types a column can end up holding a
#mix of strings and numbers, and the .str.strip() below silently turns non-strings into NaN.
#NOTE(review): error_bad_lines is deprecated in newer pandas (on_bad_lines='skip' replaces it);
#it is kept so the cell still runs on the pandas version this notebook was written for.
data2019=pd.read_csv(url2019,header=5,sep='|',dtype=str,error_bad_lines=False)
#Discard the first row below the header and renumber the rows from zero
data2019=data2019.drop(data2019.index[0]).reset_index(drop=True)
data2019.columns=cols
#Trim surrounding whitespace and treat blank fields as missing
data2019[cols]=data2019[cols].apply(lambda x:x.str.strip()).replace(r'^\s*$',np.nan,regex=True)
#Values that cannot be parsed become NaT/NaN instead of raising
data2019[datecols]=data2019[datecols].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S',errors='coerce')
data2019[numcols]=data2019[numcols].apply(pd.to_numeric,errors='coerce')
#Service request types with at least 10000 requests in this extract
top2019=data2019['sr type'].value_counts()[lambda x:x>=10000].index.tolist()
data2019.shape

b'Skipping line 86859: expected 29 fields, saw 31\n'
b'Skipping line 124913: expected 29 fields, saw 30\n'
b'Skipping line 144497: expected 29 fields, saw 30\n'
b'Skipping line 218652: expected 29 fields, saw 31\n'
b'Skipping line 349873: expected 29 fields, saw 30\n'


(395258, 29)

In [6]:
#Create all complete years dataframe
#pd.concat stacks the yearly frames; it replaces DataFrame.append, which newer pandas no longer provides
tempdata311=pd.concat([data2017,data2018,data2019])
#Attach the looked-up zip code (calczip) for each latitude/longitude pair
data311=pd.merge(tempdata311,nullzip,on=['latitude','longitude'],how='left')
#Calendar fields derived from the creation timestamp, stored as text
data311['date']=data311['sr create date'].dt.strftime('%Y-%m-%d')
data311['year']=data311['sr create date'].dt.strftime('%Y')
data311['month']=data311['sr create date'].dt.strftime('%m')
seasondict={'01':'winter','02':'winter','03':'spring','04':'spring','05':'spring','06':'summer',
        '07':'summer','08':'summer','09':'fall','10':'fall','11':'fall','12':'winter','NaT':'none'}
data311['season']=data311['month'].apply(season,args=(seasondict,))
#Keep only requests that have a latitude. The .copy() makes the filtered frame independent,
#so the column assignments below do not raise SettingWithCopyWarning
data311=data311[data311['latitude'].notnull()].copy()
#Requests without a close date are treated as closed on opencaseclosedate
data311['date closed']=np.where(data311['date closed'].isnull(),opencaseclosedate,data311['date closed'])
#Prefer the zip code written in the address text and fall back to calczip
#expand=False makes str.extract return a Series for the single capture group
#NOTE(review): the pattern can also capture a trailing hyphen or ZIP+4 suffix (e.g. '77056-1234');
#such values would not equal a five-digit zip in a later join on zipcode - confirm whether they occur
data311['truezip']='77'+data311['sr location'].str.extract(r'77(\d{3}\-?\d{0,4})',expand=False)
data311['zipcode']=np.where(data311['truezip'].isnull(),data311['calczip'],data311['truezip'])
#Elapsed days from creation to close, and from creation to the due date
data311['daystoclose']=(data311['date closed']-data311['sr create date'])/timedelta(days=1)
data311['daysdue']=(data311['due date']-data311['sr create date'])/timedelta(days=1)
#0 only when the due date is strictly later than the close date; otherwise 1
#(a missing due date compares as False and therefore also yields 1)
data311['missedduedate']=np.where(data311['due date']>data311['date closed'],0,1)
#Remove helper columns that are not part of the final frame
data311=data311.drop(columns=['x','y','calczip','truezip'])
#Request counts per service request type, one column per year
types311=data311.groupby(['sr type','year'])['case number'].count().unstack('year').reset_index()
types311.columns=['sr type','2017','2018','2019']
data311.shape

(1067511, 35)

In [7]:
#Keep only the service request types that reached 10000 requests in at least one year
toprequests = sorted(np.unique(top2017 + top2018 + top2019))
#Rows are renumbered from zero; the old index is discarded rather than kept as a column
topdata = data311.loc[data311['sr type'].isin(toprequests)].reset_index(drop=True)

In [8]:
#Display the sorted list of service request types kept for modeling
toprequests

['Container Problem',
 'Drainage',
 'Missed Garbage Pickup',
 'Missed Heavy Trash Pickup',
 'Missed Recycling Pickup',
 'Nuisance On Property',
 'SWM Escalation',
 'Sewer Wastewater',
 'Storm Debris Collection',
 'Street Condition',
 'Street Hazard',
 'Traffic Signal Maintenance',
 'Traffic Signs',
 'Water Leak',
 'Water Service']

In [9]:
#Load the cleaned weather and census tables
weatherdata = pd.read_csv('../Clean Data Files/weatherdata.csv')
censusdata = pd.read_csv('../Clean Data Files/census_data.csv')

In [10]:
#Prepare the census dataframe for merging: add a text 'zipcode' column
#built from 'Zipcode', then remove the original column
censusdata = (
    censusdata
    .assign(zipcode=censusdata['Zipcode'].astype(str))
    .drop(columns=['Zipcode'])
)

In [11]:
#Build the modeling dataframe: add weather by date, then census data by zipcode
#Left joins keep every service request even when no weather or census row matches
tempmodeldata = topdata.merge(weatherdata, how='left', on='date')
modeldata = tempmodeldata.merge(censusdata, how='left', on='zipcode')

In [12]:
#Count the missing values in each column of the modeling dataframe
modeldata.isna().sum()

case number                  0
sr location                  0
county                    1968
district                  2885
neighborhood              3877
tax id                    1958
trash quad              109280
recycle quad            111444
trash day               109280
heavy trash day         109843
recycle day             111442
key map                      0
management district     383293
department                   0
division                     0
sr type                      0
queue                        0
sla                          0
status                       0
sr create date               0
due date                     6
date closed                  0
overdue                   5037
title                        0
latitude                     0
longitude                    0
channel type                 0
date                         0
year                         0
month                        0
season                       0
zipcode                    378
daystocl

In [13]:
#Write the modeling dataframe to csv with a header row and without the index
modeldata.to_csv(
    'c://temp/modeldata.csv',
    header=True,
    index=False,
)
#modeldata.to_json('../static/data/modeldata.json',orient='records')

In [14]:
#Display the (rows, columns) size of the modeling dataframe
modeldata.shape

(661926, 52)

In [15]:
#Display the column names of the modeling dataframe
modeldata.columns

Index(['case number', 'sr location', 'county', 'district', 'neighborhood',
       'tax id', 'trash quad', 'recycle quad', 'trash day', 'heavy trash day',
       'recycle day', 'key map', 'management district', 'department',
       'division', 'sr type', 'queue', 'sla', 'status', 'sr create date',
       'due date', 'date closed', 'overdue', 'title', 'latitude', 'longitude',
       'channel type', 'date', 'year', 'month', 'season', 'zipcode',
       'daystoclose', 'daysdue', 'missedduedate', 'maxtemp', 'avgtemp',
       'mintemp', 'precip', 'week', 'weekmaxtemp', 'weekavgtemp',
       'weekmintemp', 'weekprecip', 'Population', 'Median Age',
       'Household Income', 'Per Capita Income', 'Poverty Rate',
       'Total Households', 'Total Owner Occupied', '% Owner Occupied'],
      dtype='object')

In [16]:
#Display the modeling dataframe (the rendered output is abbreviated, not the full table)
modeldata

Unnamed: 0,case number,sr location,county,district,neighborhood,tax id,trash quad,recycle quad,trash day,heavy trash day,...,weekmintemp,weekprecip,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Total Households,Total Owner Occupied,% Owner Occupied
0,101002444726,Intersection 3900 S GESSNER RD&10000 WESTPARK DR,Harris County,F,MID WEST,,,,,,...,42.714286,0.265714,38931.0,31.9,44957.0,32805.0,22.824998,38751.0,9667.0,24.946453
1,12091836-101002444730,"3303 SAGE, HOUSTON TX 77056",HARRIS,G,GREATER UPTOWN,0451400060009,,,,,...,42.714286,0.265714,21732.0,39.1,107003.0,89180.0,5.268728,21641.0,10643.0,49.179798
2,101002444733,Intersection 1400 CAROLINE ST&1300 CLAY ST,Harris County,I,DOWNTOWN,,,,,,...,42.714286,0.265714,915.0,44.6,250001.0,196722.0,6.666667,915.0,78.0,8.524590
3,12091839-101002444736,"7701 APPLETON, HOUSTON TX 77022",HARRIS,H,NORTHSIDE/NORTHLINE,0710210010015,NE,NW,MONDAY,3rd Monday,...,42.714286,0.265714,27364.0,34.1,30164.0,14924.0,29.399942,27186.0,13143.0,48.344736
4,12091840-101002444737,"7701 APPLETON, HOUSTON TX 77022",HARRIS,H,NORTHSIDE/NORTHLINE,0420050000055,NE,NW,MONDAY,3rd Monday,...,42.714286,0.265714,27364.0,34.1,30164.0,14924.0,29.399942,27186.0,13143.0,48.344736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
661921,101003785782,"3614 S MACGREGOR, HOUSTON TX 77021",HARRIS,D,MACGREGOR,0611350550011,SE,SW,TUESDAY,3rd Thursday,...,42.333333,0.276667,26654.0,33.6,35126.0,21531.0,28.674871,26327.0,10572.0,40.156493
661922,101003785783,"3547 TAMPA, HOUSTON TX 77021",HARRIS,D,MACGREGOR,0741210010025,SE,SW,TUESDAY,3rd Thursday,...,42.333333,0.276667,26654.0,33.6,35126.0,21531.0,28.674871,26327.0,10572.0,40.156493
661923,101003785785,"3415 WENTWORTH, HOUSTON TX 77004",HARRIS,D,MACGREGOR,0700280060004,SE,SW,TUESDAY,3rd Friday,...,42.333333,0.276667,37642.0,28.3,48592.0,31067.0,19.733277,28125.0,9997.0,35.544889
661924,20024521-101003785786,"11035 AVON BROOK, HOUSTON TX 77034",HARRIS,E,SOUTH BELT / ELLINGTON,1319170030001,SE,SE,THURSDAY,2nd Thursday,...,42.333333,0.276667,40183.0,28.1,47252.0,19317.0,19.767066,40162.0,19810.0,49.325233


# MODELING

In [17]:
#Pairwise correlation of the numeric and boolean columns.
#They are selected explicitly because newer pandas no longer skips text and
#date columns inside corr() and raises on them instead.
modeldata.select_dtypes(include=['number','bool']).corr()

Unnamed: 0,latitude,longitude,daystoclose,daysdue,missedduedate,maxtemp,avgtemp,mintemp,precip,weekmaxtemp,...,weekmintemp,weekprecip,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Total Households,Total Owner Occupied,% Owner Occupied
latitude,1.0,0.117848,0.014901,0.008979,-0.050008,-0.0039,-0.002714,-0.000698,0.012206,-0.003846,...,-0.001418,0.00568,0.011664,-0.007618,-0.007544,-0.007564,-0.000951,0.013336,0.100039,0.121941
longitude,0.117848,1.0,0.041796,0.023205,0.044617,0.004165,0.004763,0.003669,-0.00292,0.003598,...,0.003929,0.00102,-0.402733,-0.010623,-0.010719,-0.010769,0.110825,-0.404371,-0.117478,0.301577
daystoclose,0.014901,0.041796,1.0,0.491616,0.281237,0.059456,0.058042,0.056205,-0.002306,0.066519,...,0.061936,-0.001377,-0.018245,0.010906,0.010848,0.010861,0.070146,-0.018598,-0.01218,0.007272
daysdue,0.008979,0.023205,0.491616,1.0,-0.117486,0.076263,0.079878,0.078168,-0.014326,0.083721,...,0.084803,-0.016835,-0.017091,0.010009,0.009949,0.009969,0.077327,-0.019256,-0.021538,-0.014276
missedduedate,-0.050008,0.044617,0.281237,-0.117486,1.0,0.008556,0.008034,0.007549,0.004105,0.006814,...,0.0075,0.02006,0.006217,0.020814,0.020755,0.020752,0.045256,0.006644,0.019649,0.033785
maxtemp,-0.0039,0.004165,0.059456,0.076263,0.008556,1.0,0.971378,0.899916,-0.022007,0.896618,...,0.866543,0.051876,0.001953,0.004633,0.004625,0.004622,0.001573,0.002414,0.008756,0.010406
avgtemp,-0.002714,0.004763,0.058042,0.079878,0.008034,0.971378,1.0,0.967183,0.007866,0.904536,...,0.90358,0.083174,0.00124,0.003934,0.003927,0.003924,0.001229,0.001633,0.007976,0.00989
mintemp,-0.000698,0.003669,0.056205,0.078168,0.007549,0.899916,0.967183,1.0,0.048769,0.873196,...,0.904111,0.119923,0.001071,0.002694,0.002688,0.002685,0.000876,0.001432,0.006812,0.008026
precip,0.012206,-0.00292,-0.002306,-0.014326,0.004105,-0.022007,0.007866,0.048769,1.0,0.029269,...,0.073088,0.445849,-0.003331,-0.005531,-0.005528,-0.005527,-0.002822,-0.002866,-0.003818,-0.001999
weekmaxtemp,-0.003846,0.003598,0.066519,0.083721,0.006814,0.896618,0.904536,0.873196,0.029269,1.0,...,0.966516,0.051846,0.0028,0.004038,0.004033,0.004029,-0.0009,0.003103,0.007763,0.008153
