# PLAN

- [ ] Acquisition
    - [x] read the csv into a dataframe
- [ ] Preparation
    - [ ] no missing values
    - [ ] drop columns that are not needed
    - [x] change case to lower case
    - [ ] make sure everything has right dtype
    - [ ] normalize what needs to be normalized
    - [x] rename columns for clarification
- [ ] Exploration
    - [ ] answer ALL questions raised
    - [ ] visualize important findings
    - [ ] decide what TODO items to keep
- [ ] Modeling
    - [ ] predict 
- [ ] Delivery
    - [ ] report
    - [ ] prezi slides
    - [ ] website

# ENVIRONMENT

In [1]:
import os
import acquire
import pandas as pd

# data visualization 
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import statsmodels.api as sm

from datetime import timedelta, datetime
from pylab import rcParams

# ACQUIRE

_Let's read in the data from the csv file and take a peek at te first five records._

In [2]:
df = acquire.read_data('saws-ssos.csv')

In [3]:
df.head()

Unnamed: 0,SSO_ID,INSPKEY,SERVNO,REPORTDATE,SPILL_ADDRESS,SPILL_ST_NAME,TOTAL_GAL,GALSRET,GAL,SPILL_START,...,Root_Cause,STEPS_TO_PREVENT,SPILL_START_2,SPILL_STOP_2,HRS_2,GAL_2,SPILL_START_3,SPILL_STOP_3,HRS_3,GAL_3
0,6582,567722.0,,3/10/19,3200,THOUSAND OAKS DR,2100,2100.0,2100.0,3/10/2019 1:16:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
1,6583,567723.0,,3/10/19,6804,S FLORES ST,80,0.0,80.0,3/10/2019 2:25:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
2,6581,567714.0,,3/9/19,215,AUDREY ALENE DR,79,0.0,10.0,3/9/2019 6:00:00 PM,...,,,03/10/2019 09:36,03/10/2019 10:45,1.15,69.0,,,0.0,0.0
3,6584,567713.0,,3/9/19,3602,SE MILITARY DR,83,0.0,83.0,3/9/2019 3:37:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
4,6580,567432.0,,3/6/19,100,PANSY LN,75,0.0,75.0,3/6/2019 9:40:00 AM,...,,,,,0.0,0.0,,,0.0,0.0


# PREPARE

In [4]:
def missing_values_col(df):
    """
    This functions returns the total missing values and
    the percent missing values by column.
    """
    null_count = df.isnull().sum()
    null_percentage = (null_count / df.shape[0]) * 100
    empty_count = pd.Series(((df == ' ') | (df == '')).sum())
    empty_percentage = (empty_count / df.shape[0]) * 100
    nan_count = pd.Series(((df == 'nan') | (df == 'NaN')).sum())
    nan_percentage = (nan_count / df.shape[0]) * 100
    return pd.DataFrame({'num_missing': null_count, 'missing_percentage': null_percentage,
                         'num_empty': empty_count, 'empty_percentage': empty_percentage,
                         'nan_count': nan_count, 'nan_percentage': nan_percentage})


def missing_values_row(df):
    """
    This functions returns the total missing values and
    the percent missing values by row.
    """
    null_count = df.isnull().sum(axis=1)
    null_percentage = (null_count / df.shape[1]) * 100
    return pd.DataFrame({'num_missing': null_count, 'percentage': null_percentage})


def handle_missing_threshold(df, prop_required_column = .3, prop_required_row = .9):
    """
    This functions removes columns and rows whose
    count of missing values exceeds threshold.
    """
    threshold = int(round(prop_required_column*len(df.index),0))
    df.dropna(axis=1, thresh=threshold, inplace=True)
    threshold = int(round(prop_required_row*len(df.columns),0))
    df.dropna(axis=0, thresh=threshold, inplace=True)
    return df


def count_values(df):
    """
    This function counts the value of columns in a dataframe.
    """
    for col in df.columns:
        n = df[col].unique().shape[0]
        col_bins = min(n, 10)
        print(f"{col}:")
        if df[col].dtype in ['int64', 'float64'] and n > 10:
            print(df[col].value_counts(bins=col_bins, sort=False))
        else:
            print(df[col].value_counts())
        print("\n")

def remove_columns(df, columns):
    return df.drop(columns=columns)


def fill_with_zeroes(df, *cols):
    """
    This functions returns the column names as input and
    return the dataframe with the
    null values in those columns replace by 0.
    """
    for col in cols:
        df[col] = df[col].fillna(0)
    return df


def fill_with_median(df, *cols):
    """
    This function fills the NaN values with
    respective median values.
    """
    for col in cols:
        df[col] = df[col].fillna(df[col].median())
    return df


def fill_with_none(df, *cols):
    """
    This function fills the NaN values with
    'None' string value.
    """
    for col in cols:
        df[col] = df[col].fillna('None')
    return df

def fill_with_unknown(df, *cols):
    """
    This functions fills the NaN values with
    'Unknown' string value.
    """
    for col in cols:
        df[col] = df[col].fillna('Unknown')
    return df

def lowercase_columm(df, *columns):
    """
    This function returns a lowercase version of the column values.
    """
    for col in columns:
        df[col] = df[col].str.lower() 
    return df

_Let's convert the column to lowercase to make them easier to work with._

In [5]:
df.columns = map(str.lower, df.columns)
df.head()

Unnamed: 0,sso_id,inspkey,servno,reportdate,spill_address,spill_st_name,total_gal,galsret,gal,spill_start,...,root_cause,steps_to_prevent,spill_start_2,spill_stop_2,hrs_2,gal_2,spill_start_3,spill_stop_3,hrs_3,gal_3
0,6582,567722.0,,3/10/19,3200,THOUSAND OAKS DR,2100,2100.0,2100.0,3/10/2019 1:16:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
1,6583,567723.0,,3/10/19,6804,S FLORES ST,80,0.0,80.0,3/10/2019 2:25:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
2,6581,567714.0,,3/9/19,215,AUDREY ALENE DR,79,0.0,10.0,3/9/2019 6:00:00 PM,...,,,03/10/2019 09:36,03/10/2019 10:45,1.15,69.0,,,0.0,0.0
3,6584,567713.0,,3/9/19,3602,SE MILITARY DR,83,0.0,83.0,3/9/2019 3:37:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
4,6580,567432.0,,3/6/19,100,PANSY LN,75,0.0,75.0,3/6/2019 9:40:00 AM,...,,,,,0.0,0.0,,,0.0,0.0


_Let's take a look at the missing values._

In [6]:
missing_values_col(df)

Unnamed: 0,num_missing,missing_percentage,num_empty,empty_percentage,nan_count,nan_percentage
sso_id,0,0.0,0,0.0,0,0.0
inspkey,797,25.039271,0,0.0,0,0.0
servno,2715,85.29689,0,0.0,0,0.0
reportdate,0,0.0,0,0.0,0,0.0
spill_address,0,0.0,0,0.0,0,0.0
spill_st_name,0,0.0,0,0.0,0,0.0
total_gal,0,0.0,0,0.0,0,0.0
galsret,475,14.923029,0,0.0,0,0.0
gal,0,0.0,0,0.0,0,0.0
spill_start,0,0.0,0,0.0,0,0.0


_Let's look at a few of the rows._

In [7]:
df.head(7).T

Unnamed: 0,0,1,2,3,4,5,6
sso_id,6582,6583,6581,6584,6580,6579,6577
inspkey,567722,567723,567714,567713,567432,567274,567046
servno,,,,,,,
reportdate,3/10/19,3/10/19,3/9/19,3/9/19,3/6/19,3/5/19,3/2/19
spill_address,3200,6804,215,3602,100,3200,9910
spill_st_name,THOUSAND OAKS DR,S FLORES ST,AUDREY ALENE DR,SE MILITARY DR,PANSY LN,S HACKBERRY ST,SUGARLOAF DR
total_gal,2100,80,79,83,75,250,73
galsret,2100,0,0,0,0,0,0
gal,2100,80,10,83,75,250,73
spill_start,3/10/2019 1:16:00 PM,3/10/2019 2:25:00 PM,3/9/2019 6:00:00 PM,3/9/2019 3:37:00 PM,3/6/2019 9:40:00 AM,3/5/2019 2:22:00 PM,3/2/2019 1:42:00 PM


_Now let's take a peek into the value counts of the columns._

In [8]:
count_values(df)

sso_id:
(-5.5840000000000005, 659.3]    310
(659.3, 1317.6]                  38
(1317.6, 1975.9]                326
(1975.9, 2634.2]                644
(2634.2, 3292.5]                587
(3292.5, 3950.8]                464
(3950.8, 4609.1]                  0
(4609.1, 5267.4]                  6
(5267.4, 5925.7]                157
(5925.7, 6584.0]                651
Name: sso_id, dtype: int64


inspkey:
(3724.564, 60631.5]     541
(60631.5, 116975.0]     372
(116975.0, 173318.5]    392
(173318.5, 229662.0]    246
(229662.0, 286005.5]    134
(286005.5, 342349.0]    174
(342349.0, 398692.5]    118
(398692.5, 455036.0]    125
(455036.0, 511379.5]    151
(511379.5, 567723.0]    133
Name: inspkey, dtype: int64


servno:
(274101.378, 515352.1]     18
(515352.1, 754214.2]       14
(754214.2, 993076.3]       72
(993076.3, 1231938.4]      38
(1231938.4, 1470800.5]    101
(1470800.5, 1709662.6]     62
(1709662.6, 1948524.7]     20
(1948524.7, 2187386.8]     17
(2187386.8, 2426248.9]     31
(24262

(0.979, 3.0]    2840
(3.0, 5.0]       179
(5.0, 7.0]        80
(7.0, 9.0]        35
(9.0, 11.0]        9
(11.0, 13.0]      28
(13.0, 15.0]       0
(15.0, 17.0]       2
(17.0, 19.0]       4
(19.0, 21.0]       6
Name: num_spills_compkey, dtype: int64


num_spills_24mos:
(0.987, 2.2]    1325
(2.2, 3.4]        75
(3.4, 4.6]        32
(4.6, 5.8]        27
(5.8, 7.0]        31
(7.0, 8.2]         7
(8.2, 9.4]         5
(9.4, 10.6]        1
(10.6, 11.8]       1
(11.8, 13.0]       2
Name: num_spills_24mos, dtype: int64


prevspill_24mos:
26-Sep-16    11
19-May-16    10
25-May-13     7
17-May-15     6
02-Jun-16     6
16-May-16     6
25-Apr-15     5
28-Mar-18     5
09-Jul-18     5
20-Feb-17     5
17-Apr-10     5
03-Dec-16     5
11-May-16     5
07-Aug-17     4
08-Sep-10     4
22-Sep-18     4
26-Oct-09     3
03-Feb-10     3
05-Dec-16     3
23-May-15     3
25-Jun-14     3
04-Nov-14     3
15-Jan-10     3
24-Oct-15     3
07-Sep-10     3
31-May-16     3
09-Oct-11     3
15-Aug-16     3
19-Jun-15     3
2

_Let's rename the columns for clarity._

In [9]:
df = df.rename(index=str, columns={'inspkey':'inspection_key',
                                   'servno':'service_number',
                                   'reportdate':'report_date',
                                   'spill_st_name':'spill_street_name',
                                   'total_gal':'total_gallons',
                                   'galsret':'gallons_returned',
                                   'gal':'gallons_1',
                                   'spill_start':'spill_start_1',
                                   'spill_stop':'spill_stop_1',
                                   'hrs':'hours_1',
                                   'unitid':'unit_id_1',
                                   'unitid2':'unit_id_2',
                                   'earz_zone':'edwards_zone',
                                   'expr1029':'expr_1029',
                                   'pipediam':'pipe_diameter',
                                   'pipelen':'pipe_length',
                                   'pipetype':'pipe_type',
                                   'instyear':'installation_year',
                                   'dwndpth':'downstream_depth',
                                   'upsdpth':'upstream_depth',
                                   'rainfall_less3':'rainfall_less_3',
                                   'spill address': 'spill_address_full',
                                   'sewerassetexp':'sewer_asset_exp',
                                   'prevspill_24mos':'previous_spill_24mos',
                                   'unittype':'unit_type',
                                   'assettype':'asset_type',
                                   'lastclnd':'last_cleaned',
                                   'responsetime':'response_time',
                                   'responsedttm':'response_datetime',
                                   'public notice':'public_notice',
                                   'timeint':'time_int',
                                   'hrs_2':'hours_2',
                                   'gal_2':'gallons_2',
                                   'hrs_3':'hours_3',
                                   'gal_3':'gallons_3'
                                   })

In [10]:
df.head(4).T

Unnamed: 0,0,1,2,3
sso_id,6582,6583,6581,6584
inspection_key,567722,567723,567714,567713
service_number,,,,
report_date,3/10/19,3/10/19,3/9/19,3/9/19
spill_address,3200,6804,215,3602
spill_street_name,THOUSAND OAKS DR,S FLORES ST,AUDREY ALENE DR,SE MILITARY DR
total_gallons,2100,80,79,83
gallons_returned,2100,0,0,0
gallons_1,2100,80,10,83
spill_start_1,3/10/2019 1:16:00 PM,3/10/2019 2:25:00 PM,3/9/2019 6:00:00 PM,3/9/2019 3:37:00 PM


## _Let's make copies of the original dataframe before dropping some columns and rows to cover scenarios where we uncover more information about the variables._

In [11]:
df1 = df.copy()
df2 = df.copy()
df3 = df.copy()

_Let's work with df1..._

_Let's remove columns that do not add information._

In [12]:
df1 = remove_columns(df1, columns=['sso_id',
                                   'inspection_key',
                                   'service_number',
                                   'downstream_depth',
                                   'upstream_depth',
                                   'sewer_asset_exp',
                                   'previous_spill_24mos',
                                   'response_time',
                                   'response_datetime',
                                   'time_int'
                                ])
df1.head(4).T

Unnamed: 0,0,1,2,3
report_date,3/10/19,3/10/19,3/9/19,3/9/19
spill_address,3200,6804,215,3602
spill_street_name,THOUSAND OAKS DR,S FLORES ST,AUDREY ALENE DR,SE MILITARY DR
total_gallons,2100,80,79,83
gallons_returned,2100,0,0,0
gallons_1,2100,80,10,83
spill_start_1,3/10/2019 1:16:00 PM,3/10/2019 2:25:00 PM,3/9/2019 6:00:00 PM,3/9/2019 3:37:00 PM
spill_stop_1,3/10/2019 2:40:00 PM,3/10/2019 3:45:00 PM,3/9/2019 7:30:00 PM,3/9/2019 5:00:00 PM
hours_1,1.4,1.33333,1.5,1.38333
cause,Grease,Grease,Structural,Grease


_Let's make the fields lowercase as well..._

In [14]:
df1 = lowercase_columm(df1, 'spill_street_name',
                 'spill_address_full',
                 'unit_type',
                 'asset_type',
                 'cause',
                 'comments',
                 'actions',
                 'watershed',
                 'discharge_to',
                 'discharge_route',
                 'pipe_type'
                )
df1.head(4).T

Unnamed: 0,0,1,2,3
report_date,3/10/19,3/10/19,3/9/19,3/9/19
spill_address,3200,6804,215,3602
spill_street_name,thousand oaks dr,s flores st,audrey alene dr,se military dr
total_gallons,2100,80,79,83
gallons_returned,2100,0,0,0
gallons_1,2100,80,10,83
spill_start_1,3/10/2019 1:16:00 PM,3/10/2019 2:25:00 PM,3/9/2019 6:00:00 PM,3/9/2019 3:37:00 PM
spill_stop_1,3/10/2019 2:40:00 PM,3/10/2019 3:45:00 PM,3/9/2019 7:30:00 PM,3/9/2019 5:00:00 PM
hours_1,1.4,1.33333,1.5,1.38333
cause,grease,grease,structural,grease


_Creating a new address column with a better format._

In [15]:
df1[['spill_street_address']] = df1[['spill_address']]
rows = list(df1.index)
for row in rows:
    df1.spill_street_address[row] = str(df1.spill_address[row]) + ' ' + df1.spill_street_name[row]
df1.head(4).T

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,0,1,2,3
report_date,3/10/19,3/10/19,3/9/19,3/9/19
spill_address,3200,6804,215,3602
spill_street_name,thousand oaks dr,s flores st,audrey alene dr,se military dr
total_gallons,2100,80,79,83
gallons_returned,2100,0,0,0
gallons_1,2100,80,10,83
spill_start_1,3/10/2019 1:16:00 PM,3/10/2019 2:25:00 PM,3/9/2019 6:00:00 PM,3/9/2019 3:37:00 PM
spill_stop_1,3/10/2019 2:40:00 PM,3/10/2019 3:45:00 PM,3/9/2019 7:30:00 PM,3/9/2019 5:00:00 PM
hours_1,1.4,1.33333,1.5,1.38333
cause,grease,grease,structural,grease


In [16]:
df1 = df1.drop(columns=['spill_address_full', 'spill_address', 'spill_street_name'])
df1.head(4).T

Unnamed: 0,0,1,2,3
report_date,3/10/19,3/10/19,3/9/19,3/9/19
total_gallons,2100,80,79,83
gallons_returned,2100,0,0,0
gallons_1,2100,80,10,83
spill_start_1,3/10/2019 1:16:00 PM,3/10/2019 2:25:00 PM,3/9/2019 6:00:00 PM,3/9/2019 3:37:00 PM
spill_stop_1,3/10/2019 2:40:00 PM,3/10/2019 3:45:00 PM,3/9/2019 7:30:00 PM,3/9/2019 5:00:00 PM
hours_1,1.4,1.33333,1.5,1.38333
cause,grease,grease,structural,grease
comments,spill containedreturned to systemarea cleaned ...,spill containedarea cleaned and disinfected,spill containedarea cleaned and disinfectedflu...,spill containedarea cleaned and disinfectedflu...
actions,cleaned main,cleaned main,cleaned main,


# EXPLORE

### Looking for the repeat offenders...

In [17]:
df1.num_spills_24mos[df1.num_spills_24mos > 1].value_counts()

2.0     180
3.0      75
4.0      32
5.0      27
6.0      19
7.0      12
8.0       7
9.0       5
10.0      1
11.0      1
12.0      1
13.0      1
Name: num_spills_24mos, dtype: int64

### Location of the most frequent SSOs

In [18]:
df1.spill_street_address[df1.num_spills_24mos >= 7]

33            6606 swiss oaks
40            6606 swiss oaks
41            6606 swiss oaks
47            6606 swiss oaks
73            1800 austin hwy
84      108 chappie james way
93      108 chappie james way
97      108 chappie james way
141     108 chappie james way
153           6606 swiss oaks
158     108 chappie james way
166     108 chappie james way
171           6606 swiss oaks
190           6606 swiss oaks
241       1427 harry wurzbach
243           6606 swiss oaks
303     100 chappie james way
384     108 chappie james way
591              700 holbrook
593      10800 entrance rd ne
602              700 holbrook
727      10800 entrance rd ne
738      10800 entrance rd ne
751      10823 entrance rd ne
762      10700 entrance rd ne
770      10800 entrance rd ne
1654           6305 cagnon rd
2394           6305 cagnon rd
Name: spill_street_address, dtype: object

- [ ] **TODO:** Find a way to flesh out the address using regex to account for typos etc.
- [ ] **TODO:** Maybe try using unit id's instead of addresses.
- [ ] **TODO:** Drill down to only the top 3-5 locations.
- [ ] **TODO:** Compare predictions between preventing SSO on the most frequents versus not preventing.
- [ ] **TODO:** What is causing the spills on these top 3-5 locations?

In [20]:
df1.head(4).T

Unnamed: 0,0,1,2,3
report_date,3/10/19,3/10/19,3/9/19,3/9/19
total_gallons,2100,80,79,83
gallons_returned,2100,0,0,0
gallons_1,2100,80,10,83
spill_start_1,3/10/2019 1:16:00 PM,3/10/2019 2:25:00 PM,3/9/2019 6:00:00 PM,3/9/2019 3:37:00 PM
spill_stop_1,3/10/2019 2:40:00 PM,3/10/2019 3:45:00 PM,3/9/2019 7:30:00 PM,3/9/2019 5:00:00 PM
hours_1,1.4,1.33333,1.5,1.38333
cause,grease,grease,structural,grease
comments,spill containedreturned to systemarea cleaned ...,spill containedarea cleaned and disinfected,spill containedarea cleaned and disinfectedflu...,spill containedarea cleaned and disinfectedflu...
actions,cleaned main,cleaned main,cleaned main,


In [23]:
df1[['spill_street_address', 'unit_id_1','unit_id_2', 'unit_type', 'asset_type']].head(15)

Unnamed: 0,spill_street_address,unit_id_1,unit_id_2,unit_type,asset_type
0,3200 thousand oaks dr,66918,66917.0,gravity,sewer main
1,6804 s flores st,24250,24193.0,gravity,sewer main
2,215 audrey alene dr,2822,3351.0,gravity,sewer main
3,3602 se military dr,92804,92805.0,gravity,sewer main
4,100 pansy ln,61141,49543.0,gravity,sewer main
5,3200 s hackberry st,38907,26117.0,gravity,sewer main
6,9910 sugarloaf dr,85120,85363.0,gravity,sewer main
7,3507 piedmont ave,26128,24334.0,gravity,sewer main
8,349 alicia,47292,47293.0,gravity,sewer main
9,1502 w mistletoe ave,14241,14896.0,gravity,sewer main


- [ ] **TODO:** Maybe we can do some kind of clustering to group problem areas.

In [24]:
df.unit_id_1.value_counts()[df.unit_id_1.value_counts() > 7]

52470     15
LS200     14
1187      13
LS199     12
903059    12
1210      10
LS188     10
47822     10
903060    10
62085     10
LS201      8
11989      8
29988      8
424896     8
Name: unit_id_1, dtype: int64

In [25]:
df.unit_id_2.value_counts()[df.unit_id_2.value_counts() > 7]

52677     13
1074      13
71380     12
903059    10
922726     9
46546      9
62088      9
16276      8
79942      8
424898     8
499663     8
Name: unit_id_2, dtype: int64

In [26]:
df1.spill_street_address.value_counts()

6606 swiss oaks          24
108 chappie james way    20
700 holbrook             16
10800 entrance rd ne     14
6305 cagnon rd           13
23500 ih 10 w            12
1427 harry wurzbach      12
6785 cagnon rd           12
1800 austin hwy          12
668 holbrook             10
10102 us hwy 90 w         9
3847 thousand oaks        8
409 cheryl dr e           7
10762 pleasanton rd       7
1043 kentucky ave         7
7930 flores st s          7
3714 neer ave             7
11400 starcrest dr        6
902 holbrook              6
6102 rose valley          6
1300 elmendorf n          6
1114 onslow               6
5410 morey rd             6
1703 donaldson ave        6
2627 culebra rd           6
111 yolanda               6
8600 mission pkwy         6
5802 kim valley dr        6
107 yolanda               5
1612 martinez losoya      5
                         ..
12500 john barry          1
5314 randolph blvd        1
3033 goliad rd            1
6600 merry oaks dr        1
310 commerce st w   