# ENVIRONMENT

In [1]:
import os
import acquire
import pandas as pd

# data visualization 
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import statsmodels.api as sm

from datetime import timedelta, datetime
from pylab import rcParams

# ACQUIRE

In [2]:
# Load 'saws-ssos.csv' through the project-local acquire module.
# NOTE(review): presumably returns a pandas DataFrame (the cells below call .head() on it) -- confirm in acquire.py.
df = acquire.read_data('saws-ssos.csv')

In [3]:
# Preview the first five raw rows to check that the file loaded as expected.
df.head()

Unnamed: 0,SSO_ID,INSPKEY,SERVNO,REPORTDATE,SPILL_ADDRESS,SPILL_ST_NAME,TOTAL_GAL,GALSRET,GAL,SPILL_START,...,Root_Cause,STEPS_TO_PREVENT,SPILL_START_2,SPILL_STOP_2,HRS_2,GAL_2,SPILL_START_3,SPILL_STOP_3,HRS_3,GAL_3
0,6582,567722.0,,3/10/19,3200,THOUSAND OAKS DR,2100,2100.0,2100.0,3/10/2019 1:16:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
1,6583,567723.0,,3/10/19,6804,S FLORES ST,80,0.0,80.0,3/10/2019 2:25:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
2,6581,567714.0,,3/9/19,215,AUDREY ALENE DR,79,0.0,10.0,3/9/2019 6:00:00 PM,...,,,03/10/2019 09:36,03/10/2019 10:45,1.15,69.0,,,0.0,0.0
3,6584,567713.0,,3/9/19,3602,SE MILITARY DR,83,0.0,83.0,3/9/2019 3:37:00 PM,...,,,,,0.0,0.0,,,0.0,0.0
4,6580,567432.0,,3/6/19,100,PANSY LN,75,0.0,75.0,3/6/2019 9:40:00 AM,...,,,,,0.0,0.0,,,0.0,0.0


# PREPARE

In [None]:
def missing_values_col(df):
    """
    Summarize missing data column by column.

    Three flavours of "missing" are tallied for every column of ``df``:
    true nulls (``isnull``), blank strings (``''`` or ``' '``) and the
    literal strings ``'nan'`` / ``'NaN'``.  Each tally is paired with its
    share of the total row count, expressed as a percentage.

    Returns a DataFrame indexed by the column names of ``df`` with the
    columns num_missing, missing_percentage, num_empty, empty_percentage,
    nan_count and nan_percentage (in that order).
    """
    n_rows = df.shape[0]

    def as_percent(tally):
        # Share of all rows, on a 0-100 scale.
        return (tally / n_rows) * 100

    nulls = df.isnull().sum()

    is_blank = (df == ' ') | (df == '')
    blanks = pd.Series(is_blank.sum())

    is_nan_text = (df == 'nan') | (df == 'NaN')
    nan_text = pd.Series(is_nan_text.sum())

    # Insertion order of the dict fixes the column order of the report.
    report = {}
    report['num_missing'] = nulls
    report['missing_percentage'] = as_percent(nulls)
    report['num_empty'] = blanks
    report['empty_percentage'] = as_percent(blanks)
    report['nan_count'] = nan_text
    report['nan_percentage'] = as_percent(nan_text)
    return pd.DataFrame(report)


def missing_values_row(df):
    """
    Summarize missing data row by row.

    For every row of ``df`` this counts the null cells and expresses that
    count as a percentage of the number of columns.

    Returns a DataFrame sharing the index of ``df`` with the columns
    num_missing and percentage.
    """
    n_cols = df.shape[1]
    nulls_per_row = df.isnull().sum(axis=1)
    share = (nulls_per_row / n_cols) * 100
    return pd.DataFrame({'num_missing': nulls_per_row, 'percentage': share})


def handle_missing_threshold(df, prop_required_column = .3, prop_required_row = .9):
    """
    Drop sparsely populated columns, then sparsely populated rows.

    Parameters
    ----------
    df : DataFrame to filter.  It is NOT modified; a filtered frame is
        returned instead, so re-running a cell that calls this function
        always starts from the same input.
    prop_required_column : minimum proportion of rows that must be non-null
        for a column to be kept.
    prop_required_row : minimum proportion of the *remaining* columns that
        must be non-null for a row to be kept.

    Returns
    -------
    A new DataFrame containing only the columns and rows that meet the
    two thresholds.
    """
    # dropna(thresh=k) keeps a label only if it has at least k non-null values.
    min_per_column = int(round(prop_required_column * len(df.index), 0))
    kept_columns = df.dropna(axis=1, thresh=min_per_column)

    # The row threshold is measured against the columns that survived above.
    min_per_row = int(round(prop_required_row * len(kept_columns.columns), 0))
    return kept_columns.dropna(axis=0, thresh=min_per_row)


def count_val(column, frame=None):
    """
    Frequency table for one column, with missing values counted too.

    Parameters
    ----------
    column : label of the column to tabulate.
    frame : DataFrame to read from.  When omitted, the notebook-level
        ``df`` is used, which keeps existing ``count_val('col')`` calls
        working while allowing other frames to be inspected explicitly.

    Returns
    -------
    Series of value counts (``dropna=False``, so NaN gets its own row).
    """
    source = df if frame is None else frame
    return source[column].value_counts(dropna=False)

def remove_columns(df, columns):
    """
    Return ``df`` without the given column label(s).

    The input frame is left as it is; the result of ``DataFrame.drop`` is
    handed back to the caller.
    """
    trimmed = df.drop(labels=columns, axis=1)
    return trimmed

def fill_with_zeroes(df, *cols):
    """
    Replace nulls with 0 in the named columns.

    Column labels are passed as separate positional arguments after the
    frame.  The columns are reassigned on ``df`` itself, and that same
    frame is returned.
    """
    for label in cols:
        zero_filled = df[label].fillna(value=0)
        df[label] = zero_filled
    return df


def fill_with_median(df, *cols):
    """
    Replace nulls in the named columns with that column's own median.

    Column labels are passed as separate positional arguments after the
    frame.  The columns are reassigned on ``df`` itself, and that same
    frame is returned.
    """
    for label in cols:
        middle = df[label].median()
        df[label] = df[label].fillna(value=middle)
    return df


def fill_with_none(df, *cols):
    """
    Replace nulls in the named columns with the string 'None'.

    Column labels are passed as separate positional arguments after the
    frame.  The columns are reassigned on ``df`` itself, and that same
    frame is returned.
    """
    for label in cols:
        patched = df[label].fillna(value='None')
        df[label] = patched
    return df

def fill_with_unknown(df, *cols):
    """
    Replace nulls in the named columns with the string 'Unknown'.

    Column labels are passed as separate positional arguments after the
    frame.  The columns are reassigned on ``df`` itself, and that same
    frame is returned.
    """
    for label in cols:
        patched = df[label].fillna(value='Unknown')
        df[label] = patched
    return df

_Let's take a look at missing values._

In [None]:
# Per-column tally of nulls, blank strings and literal 'nan' strings in the raw frame.
missing_values_col(df)

In [None]:
# Frequency of each value in the 'SPILL ADDRESS' column (label contains a space).
# NOTE(review): the preview above only shows 'SPILL_ADDRESS' (underscore); the spaced label is
# presumably among the truncated columns -- confirm it exists, otherwise this raises KeyError.
df['SPILL ADDRESS'].value_counts()

In [None]:
# Null count and null percentage for the first 30 rows of the raw frame.
missing_values_row(df).head(30)

In [None]:
# Threshold-based dropping is skipped for now; missing values will be handled
# another way, possibly through feature engineering.
# df = handle_missing_threshold(df)

In [None]:
# Value frequencies (NaN included) for ResponseTime; count_val reads the notebook-level df.
count_val('ResponseTime')

In [None]:
# List every column label so the ones to drop can be picked out.
df.columns

_Let's remove variables that do not add information._

In [None]:
# Drop the columns judged not to add information (identifiers, date parts,
# secondary/tertiary spill segments, address fields, ...).
columns_to_drop = [
    'INSPKEY', 'SERVNO', 'REPORTDATE', 'FERGUSON',
    'Month', 'Year', 'Week',
    'EARZ_ZONE', 'DWNDPTH', 'UPSDPTH',
    'Inches_No', 'RainFall_Less3', 'SewerAssetExp',
    'UNITID', 'UNITID2', 'COUNCIL_DISTRICT', 'INSTYEAR',
    'Public Notice', 'TIMEINT',
    'HRS_2', 'GAL_2', 'HRS_3', 'GAL_3',
    'SPILL_START_2', 'SPILL_STOP_2', 'SPILL_START_3', 'SPILL_STOP_3',
    'SPILL ADDRESS', 'SPILL_ADDRESS', 'SPILL_ST_NAME',
]
df = remove_columns(df, columns=columns_to_drop)

In [None]:
# Confirm which columns remain after the drop.
df.columns

In [None]:
# (rows, columns) after the drop.
df.shape

In [None]:
# Nulls in these columns become the placeholder string 'Unknown'.
unknown_fill_columns = (
    'DISCHARGE_ROUTE',
    'ACTIONS',
    'COMMENTS',
    'DISCHARGE_TO',
    'Expr1029',
    'PIPETYPE',
    'UNITTYPE',
    'ASSETTYPE',
    'Root_Cause',
    'STEPS_TO_PREVENT',
)
df = fill_with_unknown(df, *unknown_fill_columns)

In [None]:
# Nulls in these columns are replaced by each column's own median.
median_fill_columns = ('GALSRET', 'HRS', 'PIPEDIAM', 'PIPELEN')
df = fill_with_median(df, *median_fill_columns)

In [None]:
# Re-check ResponseTime value frequencies (NaN included); the column is dropped in a later cell.
count_val('ResponseTime')

In [None]:
# Same frequency check for ResponseDTTM; the column is dropped in a later cell.
count_val('ResponseDTTM')

In [None]:
# Drop both response-time columns inspected in the two cells above.
df = remove_columns(df, columns=['ResponseTime', 'ResponseDTTM'])

In [None]:
# (rows, columns) after dropping the response-time columns.
df.shape

In [None]:
# Re-run the per-column missing-value summary after the fills and drops.
missing_values_col(df)

_Let's temporarily remove the columns that need to be feature-engineered later._

In [None]:
# df0 is df without the columns set aside for later feature engineering;
# df itself keeps them.
df0 = remove_columns(
    df,
    columns=['PIPETYPE', 'NUM_SPILLS_24MOS', 'PREVSPILL_24MOS', 'UNITTYPE', 'LASTCLND'],
)

In [None]:
# Per-column missing-value summary for the reduced frame df0.
missing_values_col(df0)

In [None]:
# Preview the first rows of df0.
df0.head()

In [None]:
# Upper-case the two mixed-case column labels so all names follow one convention.
# NOTE(review): index=str also converts every index label to a string -- confirm that is intended.
df0 = df0.rename(
    index=str,
    columns={
        "Expr1029": "EXPR1029",
        "Root_Cause": "ROOT_CAUSE",
    },
)

In [None]:
# Check that the renamed columns appear as expected.
df0.head()

# ANALYZE

In [None]:
# Chronological hold-out split.  df0 is not indexed by date at this point in
# the notebook, so slicing it directly with year strings would not select rows
# by date.  Build a SPILL_START-indexed view (df0 itself is left unchanged).
_spills_by_start = (
    df0.assign(SPILL_START=pd.to_datetime(df0['SPILL_START']))
       .sort_values('SPILL_START')
       .set_index('SPILL_START')
)

# Train on everything through 2016 and test on 2017 onward, so that no row
# falls into both sets.
train = _spills_by_start.loc[:'2016']
test = _spills_by_start.loc['2017':]
print(train.nunique())
print(test.nunique())

In [None]:
# Preview df0 before the timestamp columns are parsed.
df0.head()

In [None]:
# Parse the spill start/stop columns into datetime values.
df0['SPILL_START'] = pd.to_datetime(df0['SPILL_START'], infer_datetime_format=True)
df0['SPILL_STOP'] = pd.to_datetime(df0['SPILL_STOP'], infer_datetime_format=True)

In [None]:
# Check the parsed SPILL_START / SPILL_STOP values.
df0.head()

In [None]:
# Order the spills chronologically and make SPILL_START the index, which the
# resampling and year-based slicing below rely on.
df0 = df0.sort_values('SPILL_START').set_index('SPILL_START')
df0.head()

In [None]:
# Total gallons spilled per distinct SPILL_START timestamp, plotted over time.
by_date = (
    df0.groupby(['SPILL_START'])['TOTAL_GAL']
       .sum()
       .reset_index()
)
by_date.plot(x='SPILL_START', y='TOTAL_GAL');

In [None]:
# First few per-timestamp TOTAL_GAL sums (same aggregation as the plot above, shown as numbers).
df0.groupby(['SPILL_START']).TOTAL_GAL.sum().head()

In [None]:
# Yearly means of the numeric/boolean columns.  The columns are selected
# explicitly because df0 also holds text columns: averaging those is
# meaningless, and newer pandas raises instead of silently skipping them.
df0.select_dtypes(include=['number', 'bool']).resample('A').mean()

In [None]:
# (rows, columns) of df, for comparison with df0 in the next cell.
df.shape

In [None]:
# (rows, columns) of df0.
df0.shape

In [None]:
# From here on df is an independent copy of the date-indexed df0.
# NOTE(review): this rebinds df, so the earlier frame (which still had the feature-engineering columns) is no longer reachable under that name.
df = df0.copy()

In [None]:
# Chronological split on the SPILL_START index: rows through the end of 2016
# form the training set, rows from 2017 onward the test set.
train = df.loc[:'2016']
test = df.loc['2017':]

# Distinct-value counts per column for each split.
print(train.nunique())
print(test.nunique())

In [None]:
# Per-column missing-value summary for the training split.
missing_values_col(train)

In [None]:
# Per-column missing-value summary for the test split.
missing_values_col(test)

In [None]:
# Mean TOTAL_GAL per calendar day in the training split; days without a spill come out as NaN.
overflow = train['TOTAL_GAL'].resample('D').mean()

In [None]:
# Display the daily mean series.
overflow

In [None]:
# Line plot of the daily mean overflow.
overflow.plot()

In [None]:
# Monthly average of the daily means.
overflow.resample('M').mean().plot()

In [None]:
# Quarterly average of the daily means.
overflow.resample('Q').mean().plot()

In [None]:
# 5-observation rolling mean of the daily series, on a wide figure.
overflow.rolling(5).mean().plot(figsize=(12, 4))

In [None]:
# Difference between each value and the one 10 observations earlier.
overflow.diff(periods=10).plot(figsize=(12, 4))

In [None]:
# Additive decomposition of the non-null daily means, using a cycle length of 12 observations.
# NOTE(review): newer statsmodels releases name this argument `period` instead of `freq` --
# confirm against the installed version.
decomposition = sm.tsa.seasonal_decompose(
    overflow.dropna(),
    model='additive',
    freq=12,
)
fig = decomposition.plot()
plt.show()

In [None]:
# Lag plot of the daily series (pandas' lag_plot with its default lag).
pd.plotting.lag_plot(overflow)

In [None]:
# Lag-1 autocorrelation: pair each day's mean overflow with the previous day's.
df_corr = pd.concat([overflow.shift(1), overflow], axis=1)
# shift(1) carries the previous observation (t-1); the unshifted series is the
# current observation (t), not a future one.
df_corr.columns = ['t-1', 't']
result = df_corr.corr()
print(result)