# PLAN

- [x] Acquisition
    - [x] read the csv into a dataframe
- [ ] Preparation
    - [ ] no missing values
    - [x] drop columns that are not needed
    - [x] change case to lower case
    - [x] make sure everything has right dtype
    - [ ] normalize what needs to be normalized
    - [x] rename columns for clarification
- [ ] Exploration
    - [ ] answer ALL questions raised
        - [x] Which locations are the most frequent sites of SSO?
        - [x] Which location have the most volume of overflow?
        - [x] What are most common root causes of SSO?
        - [x] Where do the majority of overflow go?

    - [ ] visualize important findings
    - [ ] decide what TODO items to keep
- [ ] Modeling
    - [ ] predict 
- [ ] Delivery
    - [ ] report
    - [ ] prezi slides
    - [ ] website

# ENVIRONMENT

In [1]:
import os
import acquire_sso as acquire
import prepare_sso as prepare
import pandas as pd
import numpy as np

# data visualization 
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import statsmodels.api as sm

from datetime import timedelta, datetime
from pylab import rcParams

# to explode the DataFrames and avoid truncation
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from fbprophet import Prophet

import warnings
warnings.filterwarnings('ignore')

# ACQUIRE

#### _Let's read in the data from the csv file and take a peek at te first five records._

In [2]:
df = acquire.read_data('saws-sso.csv')

In [3]:
df.head()

Unnamed: 0,SSO_ID,INSPKEY,SERVNO,REPORTDATE,SPILL_ADDRESS,SPILL_ST_NAME,TOTAL_GAL,GALSRET,GAL,SPILL_START,SPILL_STOP,HRS,CAUSE,COMMENTS,ACTIONS,WATERSHED,UNITID,UNITID2,DISCHARGE_TO,DISCHARGE_ROUTE,COUNCIL_DISTRICT,FERGUSON,Month,Year,Week,EARZ_ZONE,Expr1029,PIPEDIAM,PIPELEN,PIPETYPE,INSTYEAR,DWNDPTH,UPSDPTH,Inches_No,RainFall_Less3,SPILL ADDRESS,SewerAssetExp,NUM_SPILLS_COMPKEY,NUM_SPILLS_24MOS,PREVSPILL_24MOS,UNITTYPE,ASSETTYPE,LASTCLND,ResponseTime,ResponseDTTM,Public Notice,TIMEINT,Root_Cause,STEPS_TO_PREVENT,SPILL_START_2,SPILL_STOP_2,HRS_2,GAL_2,SPILL_START_3,SPILL_STOP_3,HRS_3,GAL_3
0,6582,567722.0,,3/10/19,3200,THOUSAND OAKS DR,2100,2100.0,2100.0,3/10/2019 1:16:00 PM,3/10/2019 2:40:00 PM,1.4,Grease,Spill ContainedReturned to SystemArea Cleaned ...,CLEANED MAIN,SALADO CREEK,66918,66917,STREET,,,172A2,3,2019,11,0.0,,8.0,16.55,PVC,1997.0,,,,,3200 THOUSAND OAKS DR,,1,1.0,,GRAVITY,Sewer Main,,0.45,10-Mar-19,False,24.0,,,,,0.0,0.0,,,0.0,0.0
1,6583,567723.0,,3/10/19,6804,S FLORES ST,80,0.0,80.0,3/10/2019 2:25:00 PM,3/10/2019 3:45:00 PM,1.333333,Grease,Spill ContainedArea Cleaned and Disinfected,CLEANED MAIN,DOS RIOS,24250,24193,STORMDRAIN,,3.0,251A3,3,2019,11,0.0,,8.0,157.0,PVC,1988.0,,,,,6804 S FLORES,,1,1.0,,GRAVITY,Sewer Main,,1.08,10-Mar-19,False,120.0,,,,,0.0,0.0,,,0.0,0.0
2,6581,567714.0,,3/9/19,215,AUDREY ALENE DR,79,0.0,10.0,3/9/2019 6:00:00 PM,3/9/2019 7:30:00 PM,1.5,Structural,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,DOS RIOS,2822,3351,ALLEY,,1.0,190E4,3,2019,10,0.0,,8.0,350.0,CP,1955.0,,,,,215 Audrey Alene Dr,,1,1.0,,GRAVITY,Sewer Main,,1.0,09-Mar-19,False,24.0,,,03/10/2019 09:36,03/10/2019 10:45,1.15,69.0,,,0.0,0.0
3,6584,567713.0,,3/9/19,3602,SE MILITARY DR,83,0.0,83.0,3/9/2019 3:37:00 PM,3/9/2019 5:00:00 PM,1.383333,Grease,Spill ContainedArea Cleaned and DisinfectedFlu...,,SALADO CREEK,92804,92805,EASEMENT,,3.0,252C3,3,2019,10,0.0,,8.0,213.91,PVC,1983.0,,,,,3602 SE MILITARY DR,,1,1.0,,GRAVITY,Sewer Main,,0.55,09-Mar-19,False,120.0,,,,,0.0,0.0,,,0.0,0.0
4,6580,567432.0,,3/6/19,100,PANSY LN,75,0.0,75.0,3/6/2019 9:40:00 AM,3/6/2019 9:55:00 AM,0.25,Structural,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,SALADO CREEK,61141,49543,STREET,,2.0,192A7,3,2019,10,0.0,,12.0,291.9,CP,1952.0,,,,,100 PANSY LN,,2,2.0,15-Dec-18,GRAVITY,Sewer Main,,0.0,06-Mar-19,False,3.0,,,,,0.0,0.0,,,0.0,0.0


In [4]:
df['UNITTYPE'].value_counts()

GRAVITY       2508
OUTFALL        222
SIPHON         144
ABOVE           83
LATERAL         57
SUBMERSED       51
FORCE           44
STANDARD        29
                 9
A/R              6
JUNCTION         2
PMPCENTSP        2
TANKHOLDIN       2
IN SIPHON        2
OUT SIPHON       2
SLUDGE           1
OTHER            1
PMPSUBM          1
Name: UNITTYPE, dtype: int64

In [5]:
df['ASSETTYPE'].value_counts()

Sewer Main               2925
Sewer Lift Station        108
Sewer Service Line         62
Sewer Manhole              34
Sewer Liftstation          30
Sewer Valve                 7
Sewer Pump                  3
Sewer Treatment Plant       3
Sewer Node                  3
Plant Equipment             2
Sewer Force Main            1
BYPASS LINE                 1
Sewer Miscellaneous         1
Bypass                      1
Name: ASSETTYPE, dtype: int64

In [6]:
df.WATERSHED.value_counts()

DOS RIOS             1572
SALADO CREEK          790
LEON CREEK            668
MEDIO CREEK           146
CCMA (Subscriber)       4
Leon Creek              1
Dos Rios                1
Salado Creek            1
Name: WATERSHED, dtype: int64

In [7]:
(df.WATERSHED == 'LEON CREEK').sum()

668

In [8]:
df[df.WATERSHED == 'Leon Creek']

Unnamed: 0,SSO_ID,INSPKEY,SERVNO,REPORTDATE,SPILL_ADDRESS,SPILL_ST_NAME,TOTAL_GAL,GALSRET,GAL,SPILL_START,SPILL_STOP,HRS,CAUSE,COMMENTS,ACTIONS,WATERSHED,UNITID,UNITID2,DISCHARGE_TO,DISCHARGE_ROUTE,COUNCIL_DISTRICT,FERGUSON,Month,Year,Week,EARZ_ZONE,Expr1029,PIPEDIAM,PIPELEN,PIPETYPE,INSTYEAR,DWNDPTH,UPSDPTH,Inches_No,RainFall_Less3,SPILL ADDRESS,SewerAssetExp,NUM_SPILLS_COMPKEY,NUM_SPILLS_24MOS,PREVSPILL_24MOS,UNITTYPE,ASSETTYPE,LASTCLND,ResponseTime,ResponseDTTM,Public Notice,TIMEINT,Root_Cause,STEPS_TO_PREVENT,SPILL_START_2,SPILL_STOP_2,HRS_2,GAL_2,SPILL_START_3,SPILL_STOP_3,HRS_3,GAL_3
1703,2795,162383.0,,6/7/12,7523,OLD PEARSALL RD,124,124.0,124.0,6/7/2012 11:33:00 AM,6/7/2012 12:25:00 PM,0.866667,Grease,APPLIED HTH,CLEANED MAIN,Leon Creek,41296,41265,DRAINAGE CULVERT,,4.0,679D4,6,2012,23,0.0,GREASE,8.0,400.0,PVC,1993.0,,,0.0,0.0,7523 OLD PEARSALL RD,,2,,,GRAVITY,Sewer Main,,,,False,,GREASE,,,,,,,,,


# Work with geocoded_saws.csv to clean up zip codes and lat and long

In [9]:
df_geocoded = acquire.read_data('geocoded_saws.csv')

In [10]:
df_geocoded.sewershed.value_counts()

dos rios             1566
salado creek          789
leon creek            671
medio creek           153
ccma (subscriber)       4
Name: sewershed, dtype: int64

In [11]:
df_geocoded.Postal[df_geocoded.sewershed == 'leon creek'].value_counts()

78227    122
78242     99
78228     77
78240     54
78251     45
78238     43
78250     31
78229     29
78243     25
78249     24
78254     19
78257     17
78211     13
78252     11
78221     11
78245      9
78255      8
78253      6
78237      6
78256      6
78015      3
78230      3
78224      3
78264      2
78073      2
1          2
78023      1
Name: Postal, dtype: int64

In [12]:
wrong_zips = [78219, 78218, 78208, 78154, 78244, 77377]

In [13]:
temp = df_geocoded[['spill_street_address', 'Postal', 'sewershed']][df_geocoded['Postal'].isin(wrong_zips)]
temp.sort_values(by='sewershed')

Unnamed: 0,spill_street_address,Postal,sewershed
386,"1112 muncey, san antonio, tx",78208,dos rios
1323,"4000 ih 10 e, san antonio, tx",78219,dos rios
1324,"700 at and t center pkwy, san antonio, tx",78219,dos rios
1325,"3099 houston st e, san antonio, tx",78219,dos rios
1326,"4065 ih 10 e, san antonio, tx",78219,dos rios
401,"2535 ih 35 n, san antonio, tx",78208,dos rios
400,"2004 23rd st, san antonio, tx",78208,dos rios
398,"1300 ervin, san antonio, tx",78208,dos rios
397,"300 jim st, san antonio, tx",78208,dos rios
396,"1700 rogers ave, san antonio, tx",78208,dos rios


In [14]:
temp[temp['sewershed']=='leon creek']

Unnamed: 0,spill_street_address,Postal,sewershed


#### I changed the following data so that it no longer shows up as errors.
- 32	12014 canyon rock ln, san antonio, tx	77377
    - Closest is in Spring Branch, 78070, but that's north of SAWS area, removed city and changed zip to 1 and removed long and lat

- 257	10710 spirit roam, san antonio, tx	78154
    - Changed to 10710 spirit roam, san antonio, tx 78254
    - (long: -98.704260, lat: 29.525840

- 778	29621 elkhorn ridge, san antonio, tx	78218
    - Changed to 29621 elkhorn ridge, boerne, tx 78015
    - (long: -98.662880, lat: 29.734010)
- also changed 7707 lost creek gap, san antonio, tx
    - changed to 7707 lost creek gap, boerne, tx 78015
    - (long: -98.636370, lat: 29.682290)
- also changed 8219 mystic chase, san antonio, tx
    - changed to 8219 mystic chase, boerne, tx 78015
    - (long: -98.643900, lat: 29.702590)

- 796	8400 redwing dr, san antonio, tx	78219
    - There is no Redwing Drive in SA, closest is north of Medina Lake... ??? removed city and changed zip to 1 and removed long and lat

- 1097	4707 crystal hill, san antonio, tx	78244
    - Changed to 4707 crystal hill, san antonio, tx 78238
    - (long: -98.620500, lat: 29.479150)

- 1131	89 grey, san antonio, tx	78208
    - changed to 89 Grey Bluff, San Antonio, TX 78252
    - (long: -98.62897, lat: 29.343267)

In [15]:
df_geocoded.Postal[df_geocoded.sewershed == 'dos rios'].value_counts()

78201    169
78228    162
78223    156
78207    124
78212    123
78213     83
78221     77
78216     75
78230     74
78210     61
78209     60
78237     54
78214     41
78220     34
78211     34
78229     32
78202     28
78226     26
78203     26
78224     25
78225     22
78204     17
78208     16
78205     12
78249      9
78215      8
78240      6
78219      4
78231      4
78264      3
78257      1
Name: Postal, dtype: int64

In [16]:
wrong_for_dos_rios = [77701, 81120, 78232, 78236, 78251, 78252, 
                      78255, 78257, 78259, 78606, 78064]

In [17]:
temp = df_geocoded[['spill_street_address', 'Postal', 'sewershed']][df_geocoded['Postal'].isin(wrong_for_dos_rios)]
temp.sort_values(by=['sewershed', 'Postal'])

Unnamed: 0,spill_street_address,Postal,sewershed
3086,"17203 nw military hwy, san antonio, tx",78257,dos rios
2946,"4315 rogers rd, san antonio, tx",78251,leon creek
2947,"9765 potranco rd, san antonio, tx",78251,leon creek
2948,"2310 elva forest, san antonio, tx",78251,leon creek
2949,"9926 sable arrow, san antonio, tx",78251,leon creek
2950,"8985 grissom rd, san antonio, tx",78251,leon creek
2951,"9765 potranco rd, san antonio, tx",78251,leon creek
2952,"9765 potranco rd, san antonio, tx",78251,leon creek
2953,"3302 timber view dr, san antonio, tx",78251,leon creek
2954,"1900 ashprington, san antonio, tx",78251,leon creek


In [18]:
temp[temp['sewershed']=='dos rios'].sort_values(by='Postal')

Unnamed: 0,spill_street_address,Postal,sewershed
3086,"17203 nw military hwy, san antonio, tx",78257,dos rios


I changed the following data so that it no longer shows up as errors.
Except that one address really is at the top of the dos rios sewershed.

- 2600 cima, san antonio, tx	77701	dos rios
    - changed to 2600 cima st, san antonio, tx 78228
    - (long: -98.54377, lat: 29.44466)
    
NOT SURE IF THE NEXT TWO LISTSINGS ARE CORRECT...
- 7500 us hwy 281 n, san antonio, tx	78064	dos rios
    - changed to 7500 us hwy 281 n, san antonio, tx	78216
    - (long: -98.48198, lat: 29.48261)
    
- 7900 us hwy 281 n, san antonio, tx	78232	dos rios
    - changed to 7900 us hwy 281 n, san antonio, tx	78216
    - (long: -98.482997, lat: 29.48876)
    
    
- 100 chappie james way, san antonio, tx	78236	dos rios
    - 100 chappie james way, san antonio, tx	78243
    - (long: -98.59991, lat: 29.3869)
    - also changed 108 chappie james way, san antonio 78243
    - (long: -98.59896, lat: 29.38401)
    
- 105 dillon walk, san antonio, tx	78251	dos rios
    - there is no dillon walk...should this be dillon road???
    - changed to 105 dillon road, san antonio, tx 78002
    - (long: -98.6987, lat:	29.31755)
    
- 12519 switchgrass, san antonio, tx	78252	dos rios
    - looks like correct zip code, but wrong sewershed
    - changed sewershed to medio creek
    - (long: -98.738300, lat: 29.360230)
    
- 6700 mountain top, san antonio, tx	78255	dos rios
    - looks like correct zip code, but wrong sewershed
    - changed sewershed to leon creek
    - (long: -98.660072, lat: 29.688419)
    
- 17203 nw military hwy, san antonio, tx	78257	dos rios
    - all looks correct... top of dos rios sewershed
    
- 800 hanover, san antonio, tx	78259	dos rios
    - changed to 800 hanover, san antonio, tx	78228
    - (long: -98.557060, lat: 29.448790)
    
- 100 pyle, san antonio, tx	78606	dos rios
    - changed to 100 pyle st, san antonio, tx	78223
    - (long: -98.445580, lat: 29.365900)
    
- 500 st agatha, san antonio, tx	81120	dos rios
    - changed to 500 st agatha, san antonio, tx	78207
    - (long: -98.532380, lat: 29.415290)
    

#### Other errors detected through Tableau...
# need to be corrected in csv file

Postal	Sewershed	Spill Street Address	X	Y

Should be medio creek:
- 78245	dos rios	2007 ares cv, san antonio, tx	-98.78637663	29.41156555
- 78245	dos rios	12503 ranch summit, san antonio, tx	-98.74192235	29.42448298
- 78245	dos rios	2380 grosenbacher rd, san antonio, tx	-98.72656996	29.40217192
- 78245	dos rios	2553 struck silver, san antonio, tx	-98.71700874	29.40479113



Should be dos rios:
- 78225	salado creek	4000 ih 35 n, san antonio, tx	-98.512	29.38424129
    - changed sewershed to dos rios
    - changed address to 4000 ih 35 n, san antonio, tx 78214
    - (long: -98.511903, lat: 29.382335)



Should be leon creek:
- 78228	salado creek	83 arrowhead dr, san antonio, tx	-98.57819317	29.47707174
    - changed sewershed to leon creek
    - (long: -98.573485, lat: 29.482164)





Changed to leon creek to match sewershed of others on this street:
- 10/24/18 0:00	2448400	0	2448400	10/24/18 13:50	10/28/18 13:29	95.65	i/i		monitored area	leon creek	1210	42319	creek bed - spilled into leon creek	leon creek		2.28E+08	10	2018	43	0	RAIN EVENT	54	1335	rcp	1965	0.79	0.49	10	9	outfall	sewer main		TRUE	RAIN EVENT	Capacity Project, 			0	0			0	0	108 chappie james way, san antonio, tx	78243	122	-98.59896	29.38401


Changed zip code:
- 78233	dos rios	2535 ih 35 n, san antonio, tx	-98.36316044	29.54880533
    - changed to 2535 ih 35 n, san antonio, tx 78208
    - (long: -98.452467, lat: 29.439920)
- 78233 dos rios  2623 ih 35 n, san antonio, tx -98.36316044	29.54880533
    - changed to 2623 ih 35 n, san antonio, tx 78211
    - (long: -98.450922, lat: 29.439939)
    
- 78237 salado creek 132 afton oaks e, san antonio, tx
    - changed to 132 afton oaks e, san antonio, tx 78232
    - (long: -98.482517, lat: 29.608570)
    
- 78249	salado creek	4100 kingston, san antonio, tx	-98.60637544	29.569293
    - changed to 4100 kingston, san antonio, tx 78218
    - (long: -98.409455, lat: 29.502832)
    
- 78249	salado creek	6011 kingston, san antonio, tx	-98.60416452	29.57100288
    - changed to 6011 kingston, san antonio, tx 78218
    - (long: -98.410942, lat: 29.492495)
    
- 78249	salado creek	6000 kingston, san antonio, tx	-98.60377135	29.57104561
    - changed to 6000 kingston, san antonio, tx 78218
    - (long: -98.409455, lat: 29.502832)
    
- 78249	salado creek	5800 kingston, san antonio, tx	   
    - changed to 5800 kingston, san antonio, tx 78218
    - (long: -98.409455, lat: 29.502832)
    
- 78238	salado creek	4906 wurzbach pkwy, san antonio, tx	-98.61201237	29.47850521
    - changed to 4906 wurzbach pkwy, san antonio, tx 78233
    - (long: -98.38278, lat: 29.545654)
    
- 78260	medio creek	1605 dove canyon, san antonio, tx	-98.46736392	29.68591598
    - changed to 1605 dove canyon, san antonio, tx 78245 
    - (long: -98.718503, lat: 29.419498)
    
- 78249	medio creek	8541 lake vista, san antonio, tx	-98.65468727	29.55706988
    - changed to 8541 lake vista, san antonio, tx 78227
    - (long: -98.650678, lat: 29.394092)
    
Lat and Long and sewershed were wrong:
- 78245	leon creek	2380 grosenbacher rd, san antonio, tx	-98.72656996	29.40217192
    - changed (long: -98.72651, lat: 29.407868)
    - changed sewershed to medio creek

In [19]:
df = acquire.read_data('SAWS_SSOs_2009-2018Mar_UploadData.csv')

In [20]:
df.Root_Cause.value_counts()

STRUCTURAL           1237
GREASE                610
DEBRIS                443
RAIN EVENT            353
CONTRACTOR            164
LIFT STATION          125
OTHER                  61
VANDALISM              48
RAIN EVENT             48
VANDALISM              24
ROOTS                  22
LIFT STATION           15
I/I                    12
CONTRACTOR              8
BY PASS PUMP LEAK       3
OTHER                   1
Grease                  1
Debris                  1
Name: Root_Cause, dtype: int64

In [21]:
df.CAUSE.value_counts()

Grease                                                  956
Debris                                                  479
I/I                                                     465
Debris/Rock                                             254
Structural                                              244
Rags                                                    236
Break In Main                                           164
Contractor                                               91
Break in Main                                            68
Lift Station                                             42
Vandalism                                                38
Lift Station Power Out                                   29
Other                                                    29
Roots                                                    28
Lift Station Pump Out                                    21
Equipment Failure                                         5
Other - Capacity                        

In [22]:
df = pd.read_excel('data/SAWS_SSOs_2009-2018Mar_UploadData.xlsx', sep=', ', keep_default_na=True)

In [23]:
df.columns = map(str.lower, df.columns)

In [24]:
df.rename(index=str, columns={'public notice':'public_notice', 'spill address': 'spill_address_full'}, inplace=True)

In [25]:
def create_full_address():    
    df[['full_address_spill']] = df[['spill_address']]
    rows = list(df.index)
    for row in rows:
        df.full_address_spill[row] = str(df.spill_address[row]) + ' ' + df.spill_st_name[row]
    return df

In [26]:
create_full_address()

Unnamed: 0,sso_id,inspkey,servno,reportdate,spill_address,spill_st_name,total_gal,galsret,gal,spill_start,spill_stop,hrs,cause,comments,actions,watershed,unitid,unitid2,discharge_to,discharge_route,council_district,ferguson,month,year,week,earz_zone,expr1029,pipediam,pipelen,pipetype,instyear,dwndpth,upsdpth,inches_no,rainfall_less3,spill_address_full,sewerassetexp,num_spills_compkey,num_spills_24mos,prevspill_24mos,unittype,assettype,lastclnd,responsetime,responsedttm,public_notice,timeint,root_cause,steps_to_prevent,spill_start_2,spill_stop_2,hrs_2,gal_2,spill_start_3,spill_stop_3,hrs_3,gal_3,full_address_spill
0,6582,567722.0,,2019-03-10,3200,THOUSAND OAKS DR,2100,2100.0,2100.0,3/10/2019 1:16:00 PM,3/10/2019 2:40:00 PM,1.400000,Grease,Spill ContainedReturned to SystemArea Cleaned ...,CLEANED MAIN,SALADO CREEK,66918,66917,STREET,,,172A2,3,2019,11,0.0,,8.0,16.550000,PVC,1997.0,,,,,3200 THOUSAND OAKS DR,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,0.450000,2019-03-10 13:43:00,False,24.0,,,,,0.000000,0.0,,,0.00,0.0,3200 THOUSAND OAKS DR
1,6583,567723.0,,2019-03-10,6804,S FLORES ST,80,0.0,80.0,3/10/2019 2:25:00 PM,3/10/2019 3:45:00 PM,1.333333,Grease,Spill ContainedArea Cleaned and Disinfected,CLEANED MAIN,DOS RIOS,24250,24193,STORMDRAIN,,3.0,251A3,3,2019,11,0.0,,8.0,157.000000,PVC,1988.0,,,,,6804 S FLORES,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,1.083333,2019-03-10 15:30:00,False,120.0,,,,,0.000000,0.0,,,0.00,0.0,6804 S FLORES ST
2,6581,567714.0,,2019-03-09,215,AUDREY ALENE DR,79,0.0,10.0,3/9/2019 6:00:00 PM,3/9/2019 7:30:00 PM,1.500000,Structural,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,DOS RIOS,2822,3351,ALLEY,,1.0,190E4,3,2019,10,0.0,,8.0,350.000000,CP,1955.0,,,,,215 Audrey Alene Dr,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,1.000000,2019-03-09 19:00:00,False,24.0,,,03/10/2019 09:36,03/10/2019 10:45,1.150000,69.0,,,0.00,0.0,215 AUDREY ALENE DR
3,6584,567713.0,,2019-03-09,3602,SE MILITARY DR,83,0.0,83.0,3/9/2019 3:37:00 PM,3/9/2019 5:00:00 PM,1.383333,Grease,Spill ContainedArea Cleaned and DisinfectedFlu...,,SALADO CREEK,92804,92805,EASEMENT,,3.0,252C3,3,2019,10,0.0,,8.0,213.910000,PVC,1983.0,,,,,3602 SE MILITARY DR,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,0.550000,2019-03-09 16:10:00,False,120.0,,,,,0.000000,0.0,,,0.00,0.0,3602 SE MILITARY DR
4,6580,567432.0,,2019-03-06,100,PANSY LN,75,0.0,75.0,3/6/2019 9:40:00 AM,3/6/2019 9:55:00 AM,0.250000,Structural,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,SALADO CREEK,61141,49543,STREET,,2.0,192A7,3,2019,10,0.0,,12.0,291.900000,CP,1952.0,,,,,100 PANSY LN,NaT,2,2.0,2018-12-15,GRAVITY,Sewer Main,NaT,0.000000,2019-03-06 09:40:00,False,3.0,,,,,0.000000,0.0,,,0.00,0.0,100 PANSY LN
5,6579,567274.0,,2019-03-05,3200,S HACKBERRY ST,250,0.0,250.0,3/5/2019 2:22:00 PM,3/5/2019 2:32:00 PM,0.166667,Grease,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,DOS RIOS,38907,26117,STREET,,3.0,231C6,3,2019,10,0.0,,8.0,315.000000,RL,1992.0,,,,,3200 S Hackberry St,NaT,2,2.0,2017-04-11,GRAVITY,Sewer Main,NaT,0.000000,2019-03-05 14:22:00,False,12.0,,,,,0.000000,0.0,,,0.00,0.0,3200 S HACKBERRY ST
6,6577,567046.0,,2019-03-02,9910,SUGARLOAF DR,73,0.0,73.0,3/2/2019 1:42:00 PM,3/2/2019 2:55:00 PM,1.216667,Grease,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,MEDIO CREEK,85120,85363,DRAINAGE CULVERT,,4.0,207A6,3,2019,9,0.0,GREASE,8.0,264.470000,PVC,1985.0,,,,,9910 Sugarloaf Dr,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,0.730000,2019-03-02 14:26:00,False,120.0,GREASE,"Increase FCS,",,,0.000000,0.0,,,0.00,0.0,9910 SUGARLOAF DR
7,6576,567004.0,,2019-03-01,3507,PIEDMONT AVE,76,0.0,76.0,3/1/2019 6:34:00 PM,3/1/2019 7:50:00 PM,1.266667,Grease,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,DOS RIOS,26128,24334,STORMDRAIN,,3.0,231C6,3,2019,9,0.0,,8.0,60.000000,RL,2015.0,,,,,3507 Piedmont Ave,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,0.433333,2019-03-01 19:00:00,False,120.0,,"Increase FCS,",,,0.000000,0.0,,,0.00,0.0,3507 PIEDMONT AVE
8,6573,566487.0,,2019-02-26,349,ALICIA,3750,0.0,3750.0,2/26/2019 9:00:00 AM,2/26/2019 10:15:00 AM,1.250000,Structural,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,LEON CREEK,47292,47293,STORMDRAIN,,7.0,209A4,2,2019,9,0.0,STRUCTURAL,8.0,175.390000,CP,1956.0,,,,,349 Alicia,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,0.000000,2019-02-26 09:00:00,False,120.0,STRUCTURAL,"Design Request,",,,0.000000,0.0,,,0.00,0.0,349 ALICIA
9,6575,566539.0,,2019-02-26,1502,W MISTLETOE AVE,66,0.0,66.0,2/26/2019 5:24:00 PM,2/26/2019 6:30:00 PM,1.100000,Grease,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,DOS RIOS,14241,14896,STREET,,1.0,210C4,2,2019,9,0.0,DEBRIS,8.0,194.100000,PVC,1992.0,,,,,1502 W Mistletoe Ave,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,0.430000,2019-02-26 17:50:00,False,120.0,DEBRIS,"Increase FCS,",,,0.000000,0.0,,,0.00,0.0,1502 W MISTLETOE AVE


In [27]:
df.head()

Unnamed: 0,sso_id,inspkey,servno,reportdate,spill_address,spill_st_name,total_gal,galsret,gal,spill_start,spill_stop,hrs,cause,comments,actions,watershed,unitid,unitid2,discharge_to,discharge_route,council_district,ferguson,month,year,week,earz_zone,expr1029,pipediam,pipelen,pipetype,instyear,dwndpth,upsdpth,inches_no,rainfall_less3,spill_address_full,sewerassetexp,num_spills_compkey,num_spills_24mos,prevspill_24mos,unittype,assettype,lastclnd,responsetime,responsedttm,public_notice,timeint,root_cause,steps_to_prevent,spill_start_2,spill_stop_2,hrs_2,gal_2,spill_start_3,spill_stop_3,hrs_3,gal_3,full_address_spill
0,6582,567722.0,,2019-03-10,3200,THOUSAND OAKS DR,2100,2100.0,2100.0,3/10/2019 1:16:00 PM,3/10/2019 2:40:00 PM,1.4,Grease,Spill ContainedReturned to SystemArea Cleaned ...,CLEANED MAIN,SALADO CREEK,66918,66917,STREET,,,172A2,3,2019,11,0.0,,8.0,16.55,PVC,1997.0,,,,,3200 THOUSAND OAKS DR,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,0.45,2019-03-10 13:43:00,False,24.0,,,,,0.0,0.0,,,0.0,0.0,3200 THOUSAND OAKS DR
1,6583,567723.0,,2019-03-10,6804,S FLORES ST,80,0.0,80.0,3/10/2019 2:25:00 PM,3/10/2019 3:45:00 PM,1.333333,Grease,Spill ContainedArea Cleaned and Disinfected,CLEANED MAIN,DOS RIOS,24250,24193,STORMDRAIN,,3.0,251A3,3,2019,11,0.0,,8.0,157.0,PVC,1988.0,,,,,6804 S FLORES,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,1.083333,2019-03-10 15:30:00,False,120.0,,,,,0.0,0.0,,,0.0,0.0,6804 S FLORES ST
2,6581,567714.0,,2019-03-09,215,AUDREY ALENE DR,79,0.0,10.0,3/9/2019 6:00:00 PM,3/9/2019 7:30:00 PM,1.5,Structural,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,DOS RIOS,2822,3351,ALLEY,,1.0,190E4,3,2019,10,0.0,,8.0,350.0,CP,1955.0,,,,,215 Audrey Alene Dr,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,1.0,2019-03-09 19:00:00,False,24.0,,,03/10/2019 09:36,03/10/2019 10:45,1.15,69.0,,,0.0,0.0,215 AUDREY ALENE DR
3,6584,567713.0,,2019-03-09,3602,SE MILITARY DR,83,0.0,83.0,3/9/2019 3:37:00 PM,3/9/2019 5:00:00 PM,1.383333,Grease,Spill ContainedArea Cleaned and DisinfectedFlu...,,SALADO CREEK,92804,92805,EASEMENT,,3.0,252C3,3,2019,10,0.0,,8.0,213.91,PVC,1983.0,,,,,3602 SE MILITARY DR,NaT,1,1.0,NaT,GRAVITY,Sewer Main,NaT,0.55,2019-03-09 16:10:00,False,120.0,,,,,0.0,0.0,,,0.0,0.0,3602 SE MILITARY DR
4,6580,567432.0,,2019-03-06,100,PANSY LN,75,0.0,75.0,3/6/2019 9:40:00 AM,3/6/2019 9:55:00 AM,0.25,Structural,Spill ContainedArea Cleaned and DisinfectedFlu...,CLEANED MAIN,SALADO CREEK,61141,49543,STREET,,2.0,192A7,3,2019,10,0.0,,12.0,291.9,CP,1952.0,,,,,100 PANSY LN,NaT,2,2.0,2018-12-15,GRAVITY,Sewer Main,NaT,0.0,2019-03-06 09:40:00,False,3.0,,,,,0.0,0.0,,,0.0,0.0,100 PANSY LN


# Data is from 1/1/2010 through 9/9/2018

In [44]:
df.spill_start.min()

'1/1/2010 2:30:00 PM'

In [45]:
df.spill_start.max()

'9/9/2018 9:58:00 AM'

In [28]:
df.num_spills_24mos[df.num_spills_24mos > 1].value_counts()

2.0     180
3.0      75
4.0      32
5.0      27
6.0      19
7.0      12
8.0       7
9.0       5
10.0      1
11.0      1
12.0      1
13.0      1
Name: num_spills_24mos, dtype: int64

In [29]:
df.full_address_spill.value_counts()

6606 SWISS OAKS               24
108 CHAPPIE JAMES WAY         20
700 HOLBROOK                  16
10800 ENTRANCE RD NE          14
6305 CAGNON RD                13
1427 HARRY WURZBACH           12
6785 CAGNON RD                12
23500 IH 10 W                 12
1800 AUSTIN HWY               12
668 HOLBROOK                  10
10102 US HWY 90 W              9
3847 THOUSAND OAKS             8
409 CHERYL DR E                7
7930 FLORES ST S               7
1043 KENTUCKY AVE              7
10762 PLEASANTON RD            7
3714 NEER AVE                  7
5410 MOREY RD                  6
1300 ELMENDORF N               6
111 YOLANDA                    6
6102 ROSE VALLEY               6
1114 ONSLOW                    6
5802 KIM VALLEY DR             6
8600 MISSION PKWY              6
11400 STARCREST DR             6
902 HOLBROOK                   6
1703 DONALDSON AVE             6
2627 CULEBRA RD                6
9765 POTRANCO RD               5
874 NEW VALLEY HI DR           5
1612 MARTI

In [46]:
df['root_cause'][df.full_address_spill == '6606 SWISS OAKS'].value_counts()

RAIN EVENT    20
OTHER          3
GREASE         1
Name: root_cause, dtype: int64

In [47]:
df['root_cause'][df.full_address_spill == '108 CHAPPIE JAMES WAY'].value_counts()

RAIN EVENT    20
Name: root_cause, dtype: int64

In [48]:
df['root_cause'][df.full_address_spill == '100 CHAPPIE JAMES WAY'].value_counts()

RAIN EVENT    3
STRUCTURAL    1
Name: root_cause, dtype: int64

# Should we add this one in, too?

In [95]:
df['root_cause'][df.full_address_spill == '0 CHAPPIE JAMES WAY'].value_counts()

RAIN EVENT     1
Name: root_cause, dtype: int64

In [96]:
df['root_cause'][df.spill_st_name == 'CHAPPIE JAMES WAY'].value_counts()

RAIN EVENT     23
RAIN EVENT      1
STRUCTURAL      1
Name: root_cause, dtype: int64

In [49]:
df['root_cause'][df.full_address_spill == '700 HOLBROOK'].value_counts()

RAIN EVENT    16
Name: root_cause, dtype: int64

In [59]:
df['root_cause'][df.full_address_spill == '668 HOLBROOK'].value_counts()

RAIN EVENT    9
OTHER         1
Name: root_cause, dtype: int64

In [50]:
df['root_cause'][df.full_address_spill == '10800 ENTRANCE RD NE'].value_counts()

OTHER         8
RAIN EVENT    4
DEBRIS        2
Name: root_cause, dtype: int64

In [51]:
df['root_cause'][df.full_address_spill == '6305 CAGNON RD'].value_counts()

LIFT STATION     11
LIFT STATION      2
Name: root_cause, dtype: int64

In [53]:
df['root_cause'][df.full_address_spill == '6305 CAGNON RD']

1194     LIFT STATION
1411     LIFT STATION
1610     LIFT STATION
1612     LIFT STATION
1654     LIFT STATION
2394     LIFT STATION
2742    LIFT STATION 
2748    LIFT STATION 
2806     LIFT STATION
2882     LIFT STATION
2886     LIFT STATION
3001     LIFT STATION
3015     LIFT STATION
Name: root_cause, dtype: object

In [55]:
df['root_cause'][df.full_address_spill == '1800 AUSTIN HWY'].value_counts()

RAIN EVENT    11
DEBRIS         1
Name: root_cause, dtype: int64

In [56]:
df['root_cause'][df.full_address_spill == '1427 HARRY WURZBACH'].value_counts()

RAIN EVENT    8
STRUCTURAL    2
GREASE        1
CONTRACTOR    1
Name: root_cause, dtype: int64

In [57]:
df['root_cause'][df.full_address_spill == '23500 IH 10 W'].value_counts()

RAIN EVENT      11
LIFT STATION     1
Name: root_cause, dtype: int64

In [60]:
df['root_cause'][df.full_address_spill == '10102 US HWY 90 W'].value_counts()

LIFT STATION     7
LIFT STATION     1
DEBRIS           1
Name: root_cause, dtype: int64

In [62]:
df['root_cause'][df.full_address_spill == '3847 THOUSAND OAKS'].value_counts()

LIFT STATION     6
LIFT STATION     2
Name: root_cause, dtype: int64

In [64]:
df['cause'][df.full_address_spill == '6606 SWISS OAKS'].value_counts()

I/I                                                     17
Structural                                               4
Other - Cause To Be Determined                           1
Other                                                    1
Other - Spill Was Caused By A Surcharging Sewer Main     1
Name: cause, dtype: int64

In [65]:
df['cause'][df.full_address_spill == '108 CHAPPIE JAMES WAY'].value_counts()

I/I    20
Name: cause, dtype: int64

In [66]:
df['cause'][df.full_address_spill == '100 CHAPPIE JAMES WAY'].value_counts()

I/I       3
Debris    1
Name: cause, dtype: int64

# Should we add this in, too?

In [120]:
df['cause'][df.full_address_spill == '0 CHAPPIE JAMES WAY'].value_counts()

I/I    1
Name: cause, dtype: int64

In [67]:
df['cause'][df.full_address_spill == '700 HOLBROOK'].value_counts()

I/I    16
Name: cause, dtype: int64

In [68]:
df['cause'][df.full_address_spill == '668 HOLBROOK'].value_counts()

I/I      9
Other    1
Name: cause, dtype: int64

In [69]:
df['cause'][df.full_address_spill == '10800 ENTRANCE RD NE'].value_counts()

I/I                 5
Other - Capacity    4
Debris              4
Other               1
Name: cause, dtype: int64

In [70]:
df['cause'][df.full_address_spill == '6305 CAGNON RD'].value_counts()

Lift Station              4
Lift Station Power Out    3
Lift Station Pump Out     2
I/I                       2
Rags                      2
Name: cause, dtype: int64

In [72]:
df['cause'][df.full_address_spill == '1800 AUSTIN HWY'].value_counts()

I/I       11
Grease     1
Name: cause, dtype: int64

In [73]:
df['cause'][df.full_address_spill == '1427 HARRY WURZBACH'].value_counts()

I/I    12
Name: cause, dtype: int64

In [74]:
df['cause'][df.full_address_spill == '23500 IH 10 W'].value_counts()

I/I    12
Name: cause, dtype: int64

In [75]:
df['cause'][df.full_address_spill == '10102 US HWY 90 W'].value_counts()

Lift Station Pump Out     4
Lift Station              3
Lift Station Power Out    2
Name: cause, dtype: int64

In [76]:
df['cause'][df.full_address_spill == '3847 THOUSAND OAKS'].value_counts()

Lift Station              3
Lift Station Pump Out     3
Lift Station Power Out    2
Name: cause, dtype: int64

In [115]:
df.groupby(['spill_st_name'])['total_gal']\
  .sum()\
  .reset_index()\
  .sort_values(by = ['total_gal'], ascending=[False])

Unnamed: 0,spill_st_name,total_gal
1331,SWISS OAKS,23636682
614,HOLBROOK,17560029
243,CHAPPIE JAMES WAY,15518896
79,AUSTIN HWY,5304575
865,MISSION PKWY,4537445
1030,PETALUMA BLVD E,3663000
866,MISSION RD,2574350
636,IH 10 W,1931931
967,OLD CORPUS CHRISTI RD,1252900
283,COMMERCE ST W,1166574
