# ENVIRONMENT

In [28]:
import os
import acquire
import pandas as pd

# data visualization 
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import statsmodels.api as sm

from datetime import timedelta, datetime
from pylab import rcParams

# ACQUIRE

_Let's read in the data from the CSV file and take a peek at the first five records._

In [29]:
df = acquire.read_data('saws-ssos.csv')

In [30]:
df.head()

Unnamed: 0,SSO_ID,INSPKEY,SERVNO,REPORTDATE,SPILL_ADDRESS,SPILL_ST_NAME,TOTAL_GAL,GALSRET,GAL,SPILL_START,...,Root_Cause,STEPS_TO_PREVENT,SPILL_START_2,SPILL_STOP_2,HRS_2,GAL_2,SPILL_START_3,SPILL_STOP_3,HRS_3,GAL_3
0,6582,567722.0,,3/10/19,3200,THOUSAND OAKS DR,2100,2100.0,2100.0,3/10/2019 1:16:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
1,6583,567723.0,,3/10/19,6804,S FLORES ST,80,0.0,80.0,3/10/2019 2:25:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
2,6581,567714.0,,3/9/19,215,AUDREY ALENE DR,79,0.0,10.0,3/9/2019 6:00:00 PM,...,,,03/10/2019 09:36,03/10/2019 10:45,1.15,69.0,,,0.0,0.0
3,6584,567713.0,,3/9/19,3602,SE MILITARY DR,83,0.0,83.0,3/9/2019 3:37:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
4,6580,567432.0,,3/6/19,100,PANSY LN,75,0.0,75.0,3/6/2019 9:40:00 AM,...,,,,,0.0,0.0,,,0.0,0.0


# PREPARE

In [31]:
def missing_values_col(df):
    """
    Summarise, per column, how many values are unusable.

    Three kinds of "missing" are counted separately, each as an absolute
    count and as a percentage of the number of rows:

    * null values (``num_missing`` / ``missing_percentage``)
    * empty or single-space strings (``num_empty`` / ``empty_percentage``)
    * the literal strings 'nan' / 'NaN' (``nan_count`` / ``nan_percentage``)

    Returns a DataFrame indexed by column name.
    """
    n_rows = len(df.index)

    def _as_percent(counts):
        # Share of all rows, expressed on a 0-100 scale.
        return (counts / n_rows) * 100

    missing = df.isna().sum()
    blank = ((df == ' ') | (df == '')).sum()
    nan_text = ((df == 'nan') | (df == 'NaN')).sum()

    summary = {
        'num_missing': missing,
        'missing_percentage': _as_percent(missing),
        'num_empty': blank,
        'empty_percentage': _as_percent(blank),
        'nan_count': nan_text,
        'nan_percentage': _as_percent(nan_text),
    }
    return pd.DataFrame(summary)


def missing_values_row(df):
    """
    Summarise, per row, how many cells are null.

    Returns a DataFrame sharing the index of ``df`` with two columns:
    ``num_missing`` (count of null cells in the row) and ``percentage``
    (that count as a percentage of the number of columns).
    """
    n_cols = len(df.columns)
    per_row_missing = df.isna().sum(axis='columns')
    return pd.DataFrame({
        'num_missing': per_row_missing,
        'percentage': per_row_missing / n_cols * 100,
    })


def handle_missing_threshold(df, prop_required_column = .3, prop_required_row = .9):
    """
    Drop sparse columns, then sparse rows, modifying ``df`` in place.

    A column survives only if it holds at least
    ``round(prop_required_column * number_of_rows)`` non-null values.
    Afterwards a row survives only if it holds at least
    ``round(prop_required_row * number_of_remaining_columns)`` non-null values.

    The same (mutated) DataFrame object is returned for convenience.
    """
    # Columns first: the minimum number of populated cells a column must have.
    min_filled_per_column = int(round(prop_required_column * df.shape[0], 0))
    df.dropna(axis='columns', thresh=min_filled_per_column, inplace=True)

    # Rows second: measured against the columns that are still present.
    min_filled_per_row = int(round(prop_required_row * df.shape[1], 0))
    df.dropna(axis='index', thresh=min_filled_per_row, inplace=True)
    return df


def count_values(df):
    """
    Print the value distribution of every column in ``df``.

    int64/float64 columns with more than ten distinct values are summarised
    in ten equal-width bins (kept in bin order); every other column gets a
    plain ``value_counts()``. Nothing is returned.
    """
    for name in df.columns:
        series = df[name]
        distinct = len(series.unique())
        print(f"{name}:")
        is_numeric = series.dtype in ['int64', 'float64']
        if is_numeric and distinct > 10:
            counts = series.value_counts(bins=min(distinct, 10), sort=False)
        else:
            counts = series.value_counts()
        print(counts)
        print("\n")

def remove_columns(df, columns):
    """
    Return a new DataFrame without the given column label(s).

    ``df`` itself is left untouched; the labels are forwarded to
    ``DataFrame.drop``.
    """
    return df.drop(labels=columns, axis='columns')

def fill_with_zeroes(df, *cols):
    """
    Replace null values with 0 in each of the named columns.

    The columns are reassigned on ``df`` itself, and that same DataFrame
    is returned.
    """
    for name in cols:
        filled = df[name].fillna(0)
        df[name] = filled
    return df


def fill_with_median(df, *cols):
    """
    Replace null values in each named column with that column's median.

    The columns are reassigned on ``df`` itself, and that same DataFrame
    is returned.
    """
    for name in cols:
        column = df[name]
        df[name] = column.fillna(column.median())
    return df


def fill_with_none(df, *cols):
    """
    Replace null values in each named column with the string 'None'.

    The columns are reassigned on ``df`` itself, and that same DataFrame
    is returned.
    """
    for name in cols:
        filled = df[name].fillna('None')
        df[name] = filled
    return df

def fill_with_unknown(df, *cols):
    """
    Replace null values in each named column with the string 'Unknown'.

    The columns are reassigned on ``df`` itself, and that same DataFrame
    is returned.
    """
    for name in cols:
        filled = df[name].fillna('Unknown')
        df[name] = filled
    return df

_Let's convert the column names to lowercase to make them easier to work with._

In [32]:
# Lowercase every column label (the source labels mix upper, lower and title case).
df.columns = [str.lower(label) for label in df.columns]
df.head()

Unnamed: 0,sso_id,inspkey,servno,reportdate,spill_address,spill_st_name,total_gal,galsret,gal,spill_start,...,root_cause,steps_to_prevent,spill_start_2,spill_stop_2,hrs_2,gal_2,spill_start_3,spill_stop_3,hrs_3,gal_3
0,6582,567722.0,,3/10/19,3200,THOUSAND OAKS DR,2100,2100.0,2100.0,3/10/2019 1:16:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
1,6583,567723.0,,3/10/19,6804,S FLORES ST,80,0.0,80.0,3/10/2019 2:25:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
2,6581,567714.0,,3/9/19,215,AUDREY ALENE DR,79,0.0,10.0,3/9/2019 6:00:00 PM,...,,,03/10/2019 09:36,03/10/2019 10:45,1.15,69.0,,,0.0,0.0
3,6584,567713.0,,3/9/19,3602,SE MILITARY DR,83,0.0,83.0,3/9/2019 3:37:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
4,6580,567432.0,,3/6/19,100,PANSY LN,75,0.0,75.0,3/6/2019 9:40:00 AM,...,,,,,0.0,0.0,,,0.0,0.0


_Let's take a look at the missing values._

In [33]:
missing_values_col(df)

Unnamed: 0,num_missing,missing_percentage,num_empty,empty_percentage,nan_count,nan_percentage
sso_id,0,0.0,0,0.0,0,0.0
inspkey,797,25.039271,0,0.0,0,0.0
servno,2715,85.29689,0,0.0,0,0.0
reportdate,0,0.0,0,0.0,0,0.0
spill_address,0,0.0,0,0.0,0,0.0
spill_st_name,0,0.0,0,0.0,0,0.0
total_gal,0,0.0,0,0.0,0,0.0
galsret,475,14.923029,0,0.0,0,0.0
gal,0,0.0,0,0.0,0,0.0
spill_start,0,0.0,0,0.0,0,0.0


_Let's look at a few of the rows._

In [34]:
df.head(7).T

Unnamed: 0,0,1,2,3,4,5,6
sso_id,6582,6583,6581,6584,6580,6579,6577
inspkey,567722,567723,567714,567713,567432,567274,567046
servno,,,,,,,
reportdate,3/10/19,3/10/19,3/9/19,3/9/19,3/6/19,3/5/19,3/2/19
spill_address,3200,6804,215,3602,100,3200,9910
spill_st_name,THOUSAND OAKS DR,S FLORES ST,AUDREY ALENE DR,SE MILITARY DR,PANSY LN,S HACKBERRY ST,SUGARLOAF DR
total_gal,2100,80,79,83,75,250,73
galsret,2100,0,0,0,0,0,0
gal,2100,80,10,83,75,250,73
spill_start,3/10/2019 1:16:00 PM,3/10/2019 2:25:00 PM,3/9/2019 6:00:00 PM,3/9/2019 3:37:00 PM,3/6/2019 9:40:00 AM,3/5/2019 2:22:00 PM,3/2/2019 1:42:00 PM


_Now let's take a peek into the value counts of the columns._

In [35]:
count_values(df)

sso_id:
(-5.5840000000000005, 659.3]    310
(659.3, 1317.6]                  38
(1317.6, 1975.9]                326
(1975.9, 2634.2]                644
(2634.2, 3292.5]                587
(3292.5, 3950.8]                464
(3950.8, 4609.1]                  0
(4609.1, 5267.4]                  6
(5267.4, 5925.7]                157
(5925.7, 6584.0]                651
Name: sso_id, dtype: int64


inspkey:
(3724.564, 60631.5]     541
(60631.5, 116975.0]     372
(116975.0, 173318.5]    392
(173318.5, 229662.0]    246
(229662.0, 286005.5]    134
(286005.5, 342349.0]    174
(342349.0, 398692.5]    118
(398692.5, 455036.0]    125
(455036.0, 511379.5]    151
(511379.5, 567723.0]    133
Name: inspkey, dtype: int64


servno:
(274101.378, 515352.1]     18
(515352.1, 754214.2]       14
(754214.2, 993076.3]       72
(993076.3, 1231938.4]      38
(1231938.4, 1470800.5]    101
(1470800.5, 1709662.6]     62
(1709662.6, 1948524.7]     20
(1948524.7, 2187386.8]     17
(2187386.8, 2426248.9]     31
(24262

120.0    321
12.0     169
24.0     165
6.0       63
3.0       37
60.0      23
1.0       12
Name: timeint, dtype: int64


root_cause:
STRUCTURAL           1237
GREASE                610
DEBRIS                443
RAIN EVENT            353
CONTRACTOR            164
LIFT STATION          125
OTHER                  61
RAIN EVENT             48
VANDALISM              48
VANDALISM              24
ROOTS                  22
LIFT STATION           15
I/I                    12
CONTRACTOR              8
BY PASS PUMP LEAK       3
OTHER                   1
Grease                  1
Debris                  1
Name: root_cause, dtype: int64


steps_to_prevent:
Increase FCS,         470
Point Repair,         280
Design Request,       201
Other                 195
I/I Inv,              144
Capacity Project,     114
Bolt down,             24
ED/COM,                11
RPC,                    7
Name: steps_to_prevent, dtype: int64


spill_start_2:
06/02/2016 20:00    2
06/02/2018 09:18    1
06/02/2016 23:41

_Let's rename the columns for clarity._

In [36]:
# Old label -> descriptive snake_case label. Keys that are not present in the
# frame are ignored by DataFrame.rename.
COLUMN_RENAMES = {
    'inspkey': 'inspection_key',
    'servno': 'service_number',
    'reportdate': 'report_date',
    'spill_st_name': 'spill_street_name',
    'total_gal': 'total_gallons',
    'galsret': 'gallons_returned',
    'gal': 'gallons_1',
    'spill_start': 'spill_start_1',
    'spill_stop': 'spill_stop_1',
    'hrs': 'hours_1',
    'unitid': 'unit_id_1',
    'unitid2': 'unit_id_2',
    'earz_zone': 'edwards_zone',
    'expr1029': 'expr_1029',
    'pipediam': 'pipe_diameter',
    'pipelen': 'pipe_length',
    'pipetype': 'pipe_type',
    'instyear': 'installation_year',
    'dwndpth': 'downstream_depth',
    'upsdpth': 'upstream_depth',
    'rainfall_less3': 'rainfall_less_3',
    'spill address': 'spill_address_full',
    'sewerassetexp': 'sewer_asset_exp',
    'prevspill_24mos': 'previous_spill_24mos',
    'unittype': 'unit_type',
    'assettype': 'asset_type',
    'lastclnd': 'last_cleaned',
    'responsetime': 'response_time',
    'responsedttm': 'response_datetime',
    'public notice': 'public_notice',
    'timeint': 'time_int',
    'hrs_2': 'hours_2',
    'gal_2': 'gallons_2',
    'hrs_3': 'hours_3',
    'gal_3': 'gallons_3',
}

# index=str also converts every row label to its string form.
df = df.rename(index=str, columns=COLUMN_RENAMES)

In [37]:
df.head(4).T

Unnamed: 0,0,1,2,3
sso_id,6582,6583,6581,6584
inspection_key,567722,567723,567714,567713
service_number,,,,
report_date,3/10/19,3/10/19,3/9/19,3/9/19
spill_address,3200,6804,215,3602
spill_street_name,THOUSAND OAKS DR,S FLORES ST,AUDREY ALENE DR,SE MILITARY DR
total_gallons,2100,80,79,83
gallons_returned,2100,0,0,0
gallons_1,2100,80,10,83
spill_start_1,3/10/2019 1:16:00 PM,3/10/2019 2:25:00 PM,3/9/2019 6:00:00 PM,3/9/2019 3:37:00 PM


### Looking for the repeat offenders...

In [38]:
# Distribution of the 24-month spill count, restricted to rows with more than one spill.
df.loc[df.num_spills_24mos > 1, 'num_spills_24mos'].value_counts()

2.0     180
3.0      75
4.0      32
5.0      27
6.0      19
7.0      12
8.0       7
9.0       5
10.0      1
11.0      1
12.0      1
13.0      1
Name: num_spills_24mos, dtype: int64

### Location of the most frequent SSOs

In [39]:
# Full addresses of the rows reporting seven or more spills in 24 months.
df.loc[df.num_spills_24mos >= 7, 'spill_address_full']

33             SWISS OAKS 06606
40              6606 Swiss Oaks
41              6606 Swiss Oaks
47             SWISS OAKS 06606
73              1800 AUSTIN HWY
84      CHAPPIE JAMES WAY 00108
93      CHAPPIE JAMES WAY 00108
97      CHAPPIE JAMES WAY 00108
141     CHAPPIE JAMES WAY 00100
153            SWISS OAKS 06606
158     CHAPPIE JAMES WAY 00100
166     CHAPPIE JAMES WAY 00100
171            SWISS OAKS 06606
190            SWISS OAKS 06606
241        HARRY WURZBACH 01427
243            SWISS OAKS 06606
303     CHAPPIE JAMES WAY 00100
384     CHAPPIE JAMES WAY 00108
591              HOLBROOK 00902
593       ENTRANCE RD N E 10800
602           RITTIMAN RD 04034
727        ENTRANCE RD NE 10800
738       10800 N E Entrance Rd
751        ENTRANCE RD NE 10823
762        ENTRANCE RD NE 108/1
770        ENTRANCE RD NE 108/1
1654            CAGNON RD 06305
2394            CAGNON RD 066/1
Name: spill_address_full, dtype: object

- [ ] **TODO:** Find a way to flesh out the address using regex to account for typos etc.
- [ ] **TODO:** Maybe try using unit IDs instead of addresses.
- [ ] **TODO:** Drill down to only the top 3-5 locations.
- [ ] **TODO:** Compare predictions with and without preventing SSOs at the most frequent locations.
- [ ] **TODO:** What is causing the spills on these top 3-5 locations?

In [45]:
# Every record at 10800 ENTRANCE RD NE, matched on the parsed number and street
# columns rather than on the free-text full address.
at_entrance_rd = df.spill_address.eq(10800) & df.spill_street_name.eq('ENTRANCE RD NE')
df1 = df.loc[at_entrance_rd]
df1.T

Unnamed: 0,568,593,726,727,738,767,770,773,783,792,797,799,802,809
sso_id,6025,5984,5854,5855,5838,5808,5802,5806,5794,5786,5779,5776,5775,4773
inspection_key,,,,,406517,,,,,,,,388919,387768
service_number,1.50063e+06,1.44566e+06,1.25265e+06,1.25315e+06,,1.1501e+06,1.14377e+06,1.14444e+06,1.1353e+06,1.12719e+06,1.12378e+06,1.12302e+06,,
report_date,9/26/16,8/16/16,4/18/16,4/18/16,3/24/16,1/23/16,1/19/16,1/18/16,1/9/16,1/2/16,12/27/15,12/24/15,12/19/15,12/6/15
spill_address,10800,10800,10800,10800,10800,10800,10800,10800,10800,10800,10800,10800,10800,10800
spill_street_name,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE,ENTRANCE RD NE
total_gallons,26300,12000,6650,78000,100,24765,39940,3000,139525,267850,25030,30750,49110,24900
gallons_returned,0,,0,0,0,,0,700,,,,,0,0
gallons_1,26300,12000,6650,78000,100,390,11005,3000,58125,120850,22800,15000,21600,24750
spill_start_1,9/26/2016 7:50:00 AM,8/16/2016 10:30:00 PM,4/18/2016 11:02:00 AM,4/18/2016 2:20:00 PM,3/24/2016 2:40:00 PM,1/23/2016 2:58:00 PM,1/16/2016 1:44:00 PM,1/18/2016 10:00:00 PM,1/9/2016 1:00:00 PM,1/2/2016 1:59:00 PM,12/27/2015 3:25:00 PM,12/24/2015 2:00:00 PM,12/19/2015 1:42:00 PM,12/6/2015 2:00:00 PM


In [46]:
# Records whose full address is exactly 'CHAPPIE JAMES WAY 00108', one record per column.
df.loc[df.spill_address_full.eq('CHAPPIE JAMES WAY 00108')].T

Unnamed: 0,84,93,97,110,127,188,209,384,407,504,564,590,595,653,695
sso_id,6499,6488,6485,6469,6462,6393,6370,6205,6170,6062,6021,5990,5983,5922,5887
inspection_key,,,,,,,,,,,,,,,
service_number,2.51675e+06,2.50317e+06,2.49964e+06,2.47077e+06,2.45879e+06,2.35836e+06,2.28821e+06,1.90694e+06,1.80939e+06,1.58214e+06,1.49978e+06,1.45257e+06,1.44422e+06,1.30694e+06,1.29683e+06
report_date,10/24/18,10/15/18,10/10/18,9/22/18,9/16/18,7/9/18,5/22/18,8/7/17,5/31/17,12/3/16,9/26/16,8/21/16,8/15/16,5/31/16,5/16/16
spill_address,108,108,108,108,108,108,108,108,108,108,108,108,108,108,108
spill_street_name,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY,CHAPPIE JAMES WAY
total_gallons,2448400,642250,1500,495500,1323125,96200,1000,421881,1000,1880100,780000,625000,399000,2453340,1527175
gallons_returned,0,,0,,0,0,0,0,0,0,0,0,0,0,0
gallons_1,2.4484e+06,355750,1500,495500,1.32312e+06,96200,1000,421881,1000,751000,780000,625000,399000,75990,117475
spill_start_1,10/24/2018 1:50:00 PM,10/15/2018 2:48:00 PM,10/10/2018 8:58:00 AM,9/22/2018 1:05:00 PM,9/16/2018 5:03:00 PM,7/9/2018 4:00:00 PM,5/22/2018 1:45:00 PM,8/7/2017 11:11:00 AM,5/31/2017 9:15:00 AM,12/3/2016 9:24:00 AM,9/26/2016 8:45:00 AM,8/21/2016 3:40:00 PM,8/15/2016 3:00:00 PM,5/31/2016 5:08:00 PM,5/16/2016 10:44:00 AM


In [47]:
# Records whose full address is exactly 'SWISS OAKS 06606', one record per column.
df.loc[df.spill_address_full.eq('SWISS OAKS 06606')].T

Unnamed: 0,33,47,56,68,115,135,143,153,171,190,243,311,344,359,405,453,501,563,696
sso_id,6551,6536,6528,6516,6466,6455,6442,6428,6413,6395,6345,6267,6236,6219,6172,6127,6081,6020,5889
inspection_key,,,,,,,,,,,,,,,,,,,
service_number,2.60565e+06,2.58795e+06,2.57716e+06,2.55664e+06,2.46636e+06,2.45716e+06,2.45073e+06,2.4471e+06,2.40832e+06,2.35755e+06,2.20841e+06,2.07036e+06,1.98058e+06,1.94532e+06,1.81281e+06,1.67092e+06,1.58829e+06,1.50544e+06,1.28665e+06
report_date,1/6/19,12/16/18,12/7/18,11/22/18,9/20/18,9/14/18,9/10/18,9/7/18,8/12/18,7/9/18,3/28/18,12/7/17,9/28/17,9/5/17,6/1/17,2/20/17,12/5/16,9/26/16,5/16/16
spill_address,6606,6606,6606,6606,6606,6606,6606,6606,6606,6606,6606,6606,6606,6606,6606,6606,6606,6606,6606
spill_street_name,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS,SWISS OAKS
total_gallons,48000,16125,1177800,165000,13304550,1309800,232500,123300,75000,135200,112200,8950,27500,21752,10000,622500,407500,693750,1349550
gallons_returned,0,,,,0,,,,,0,,,,0,0,0,0,0,0
gallons_1,48000,16125,630000,165000,1.33046e+07,1.3098e+06,232500,123300,33750,135200,13200,8950,27500,21752,10000,622500,407500,693750,1.34955e+06
spill_start_1,1/6/2019 10:00:00 PM,12/16/2018 3:30:00 PM,12/7/2018 6:30:00 PM,11/22/2018 2:00:00 PM,9/20/2018 9:45:00 AM,9/14/2018 9:14:00 AM,9/10/2018 2:15:00 PM,9/7/2018 6:45:00 AM,8/12/2018 4:15:00 PM,7/9/2018 3:00:00 PM,3/28/2018 6:14:00 AM,12/7/2017 10:01:00 AM,9/28/2017 9:45:00 PM,9/4/2017 9:45:00 PM,6/1/2017 2:00:00 PM,2/20/2017 7:45:00 AM,12/5/2016 8:00:00 AM,9/26/2016 8:00:00 AM,5/16/2016 2:19:00 PM


In [49]:
# Asset identifiers for the ENTRANCE RD NE records selected into df1.
asset_columns = ['spill_address_full', 'unit_id_1', 'unit_id_2', 'unit_type', 'asset_type']
df1.loc[:, asset_columns]

Unnamed: 0,spill_address_full,unit_id_1,unit_id_2,unit_type,asset_type
568,ENTRANCE RD NE 108/1,52742,52470.0,GRAVITY,Sewer Main
593,ENTRANCE RD N E 10800,52470,52677.0,GRAVITY,Sewer Main
726,ENTRANCE RD NE 10800,930659,930653.0,OUTFALL,Sewer Main
727,ENTRANCE RD NE 10800,52470,52677.0,GRAVITY,Sewer Main
738,10800 N E Entrance Rd,52470,52677.0,GRAVITY,Sewer Main
767,ENTRANCE RD NE 10800,52470,52677.0,GRAVITY,Sewer Main
770,ENTRANCE RD NE 108/1,52470,52677.0,GRAVITY,Sewer Main
773,ENTRANCE RD NE 10800,52470,,STANDARD,Sewer Manhole
783,ENTRANCE RD NE 10823,52470,52677.0,GRAVITY,Sewer Main
792,10800 N E Entrance Rd,52470,52677.0,GRAVITY,Sewer Main


- [ ] **TODO:** Maybe we can do some kind of clustering to group problem areas.

In [None]:
missing_values_row(df).head(30)

In [None]:
# Decided to handle missing a different way, maybe
# some feature engineering or something...
# df = handle_missing_threshold(df)

In [None]:
# `count_val` is not defined anywhere in this notebook and 'ResponseTime' was
# renamed to 'response_time' earlier; use the count_values helper on that column.
count_values(df[['response_time']])

In [None]:
df.columns

_Let's remove variables that do not add information._

In [None]:
# Labels must match the lowercase/renamed schema established earlier in PREPARE;
# the original upper/mixed-case labels no longer exist and DataFrame.drop raises
# KeyError for unknown labels.
# NOTE(review): 'ferguson', 'month', 'year', 'week', 'inches_no' and
# 'council_district' are assumed to be the lowercased source labels - confirm
# against df.columns.
df = remove_columns(df, columns=['inspection_key',
                                 'service_number',
                                 'report_date',
                                 'ferguson',
                                 'month',
                                 'year',
                                 'week',
                                 'edwards_zone',
                                 'downstream_depth',
                                 'upstream_depth',
                                 'inches_no',
                                 'rainfall_less_3',
                                 'sewer_asset_exp',
                                 'unit_id_1',
                                 'unit_id_2',
                                 'council_district',
                                 'installation_year',
                                 'public_notice',
                                 'time_int',
                                 'hours_2',
                                 'gallons_2',
                                 'hours_3',
                                 'gallons_3',
                                 'spill_start_2',
                                 'spill_stop_2',
                                 'spill_start_3',
                                 'spill_stop_3',
                                 'spill_address_full',
                                 'spill_address',
                                 'spill_street_name',
                                ])

In [None]:
df.columns

In [None]:
df.shape

In [None]:
# Text columns: mark missing entries as 'Unknown'. Labels follow the
# lowercase/renamed schema (the original mixed-case labels would raise KeyError).
# NOTE(review): 'discharge_route', 'actions', 'comments' and 'discharge_to' are
# assumed to be the lowercased source labels - confirm against df.columns.
df = fill_with_unknown(df, 'discharge_route',
                      'actions',
                      'comments',
                      'discharge_to',
                      'expr_1029',
                      'pipe_type',
                      'unit_type',
                      'asset_type',
                      'root_cause',
                      'steps_to_prevent',
                      )

In [None]:
# Numeric columns: impute missing entries with the column median. Labels follow
# the renamed schema (GALSRET -> gallons_returned, HRS -> hours_1,
# PIPEDIAM -> pipe_diameter, PIPELEN -> pipe_length).
df = fill_with_median(df, 'gallons_returned',
                     'hours_1',
                     'pipe_diameter',
                     'pipe_length',
                     )

In [None]:
# `count_val` is not defined anywhere in this notebook and 'ResponseTime' was
# renamed to 'response_time' earlier; use the count_values helper on that column.
count_values(df[['response_time']])

In [None]:
# `count_val` is not defined anywhere in this notebook and 'ResponseDTTM' was
# renamed to 'response_datetime' earlier; use the count_values helper on that column.
count_values(df[['response_datetime']])

In [None]:
# Drop the response columns under their renamed labels; the original
# 'ResponseTime' / 'ResponseDTTM' labels no longer exist.
df = remove_columns(df, columns=['response_time',
                                 'response_datetime',
                                 ])

In [None]:
df.shape

In [None]:
missing_values_col(df)

_Let's temporarily remove the columns that need to be feature-engineered later._

In [None]:
# df0 = df minus the columns set aside for later feature engineering.
# Labels follow the lowercase/renamed schema; the upper-case originals would
# raise KeyError.
df0 = remove_columns(df, columns=['pipe_type',
                                  'num_spills_24mos',
                                  'previous_spill_24mos',
                                  'unit_type',
                                  'last_cleaned',
                                 ])

In [None]:
missing_values_col(df0)

In [None]:
df0.head()

In [None]:
# The mixed-case labels this cell used to normalise ("Expr1029", "Root_Cause")
# no longer exist: every label was lowercased and renamed earlier in PREPARE, so
# the rename was a silent no-op. Verify the naming convention instead.
assert all(label == label.lower() for label in df0.columns), \
    "df0 still contains a column label that is not lowercase"

In [None]:
df0.head()

# ANALYZE

In [None]:
# Chronological split on the spill start time.
# At this point df0 is not yet indexed by timestamp, so label slicing with
# '2016' would not select by year; build boolean masks from the parsed
# timestamps instead. The boundary is exclusive on the train side so that 2016
# is not present in both sets.
spill_start = pd.to_datetime(df0['spill_start_1'])
train = df0[spill_start < '2017-01-01']
test = df0[spill_start >= '2017-01-01']
print(train.nunique())
print(test.nunique())

In [None]:
df0.head()

In [None]:
# Parse the first spill's start/stop timestamps.
# - Bracket assignment guarantees the column is replaced; attribute assignment
#   only sets a Python attribute when the label does not exist.
# - infer_datetime_format is deprecated in pandas 2.x (format inference is now
#   the default), so it is no longer passed.
# NOTE(review): assumes both columns use one consistent timestamp format - confirm.
df0['spill_start_1'] = pd.to_datetime(df0['spill_start_1'])
df0['spill_stop_1'] = pd.to_datetime(df0['spill_stop_1'])

In [None]:
df0.head()

In [None]:
# Order the records chronologically and index them by spill start time
# (renamed label 'spill_start_1'), which the resampling cells below rely on.
df0 = df0.sort_values('spill_start_1').set_index('spill_start_1')
df0.head()

In [None]:
# Total gallons spilled per distinct start timestamp, using the renamed labels.
by_date = df0.groupby('spill_start_1')['total_gallons'].sum().reset_index()
by_date.plot(x='spill_start_1', y='total_gallons');

In [None]:
# Same aggregation as the plot above, shown as numbers (renamed labels).
df0.groupby('spill_start_1')['total_gallons'].sum().head()

In [None]:
# Annual means. Restrict to numeric columns first: on current pandas, .mean()
# over the remaining text columns raises instead of silently skipping them.
df0.select_dtypes(include='number').resample('A').mean()

In [None]:
df.shape

In [None]:
df0.shape

In [None]:
df = df0.copy()

In [None]:
# Chronological split by label slicing: everything up to and including 2016
# goes to train, 2017 onwards to test.
train, test = df.loc[:'2016'], df.loc['2017':]
print(train.nunique())
print(test.nunique())

In [None]:
missing_values_col(train)

In [None]:
missing_values_col(test)

In [None]:
# Daily mean of total gallons spilled (renamed label 'total_gallons');
# days without any record come out as NaN.
overflow = train['total_gallons'].resample('D').mean()

In [None]:
overflow

In [None]:
overflow.plot()

In [None]:
overflow.resample('M').mean().plot()

In [None]:
overflow.resample('Q').mean().plot()

In [None]:
overflow.rolling(5).mean().plot(figsize=(12, 4))

In [None]:
overflow.diff(periods=10).plot(figsize=(12, 4))

In [None]:
# Additive trend/seasonal/residual decomposition of the daily series.
# statsmodels replaced the `freq` keyword with `period`; `freq` is rejected by
# current releases.
# NOTE(review): period=12 on a daily series means a 12-observation cycle -
# confirm this is the intended seasonality.
decomposition = sm.tsa.seasonal_decompose(overflow.dropna(), model='additive', period=12)
fig = decomposition.plot()
plt.show()

In [None]:
pd.plotting.lag_plot(overflow)

In [None]:
# Lag-1 autocorrelation: pair each observation with the previous one and
# correlate the two columns.
lagged = overflow.shift(1)
df_corr = pd.concat([lagged, overflow], axis=1, keys=['t-1', 't+1'])
result = df_corr.corr()
print(result)