# ENVIRONMENT

In [4]:
import acquire_sara as acquire
import prepare_sara as prepare
import pandas as pd
import numpy as np
import pandas_profiling

# data visualization 
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import statsmodels.api as sm

from datetime import timedelta, datetime
from pylab import rcParams

# Raise pandas' display limits so wide/long DataFrames print without truncation
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500
pd.options.display.width = 1000

from fbprophet import Prophet

# ACQUIRE

#### _Let's read in the data from the csv files._

In [5]:
# Load the four CSV exports through the project's reader, in a fixed order
df_flood, df_rain_details, df_rain_summary, df_water_quality = map(
    acquire.read_data,
    (
        'sara-flood-stage-levels.csv',
        'sara-rainfall-details.csv',
        'sara-rainfall-summary.csv',
        'sara-water-quality-bexar.csv',
    ),
)

b'Skipping line 9327: expected 500 fields, saw 501\n'


# PREPARE

#### _Let's get the shape of the dfs and take a peek at their header._

In [6]:
# Report (rows, columns) for each loaded frame
print(f'Flood Stage Levels: {df_flood.shape}')
print(f'Rain Details: {df_rain_details.shape}')
print(f'Rain Summary: {df_rain_summary.shape}')
print(f'Water Quality: {df_water_quality.shape}')

Flood Stage Levels: (1470, 6)
Rain Details: (37182, 5)
Rain Summary: (13395, 5)
Water Quality: (10883, 500)


In [7]:
# First six flood-stage rows, transposed so each column prints as a row
df_flood.head(6).T

Unnamed: 0,0,1,2,3,4,5
location_name,SAR 01 (Lonestar),SAR 01 (Lonestar),SAR 01 (Lonestar),SAR 01 (Lonestar),SAR 01 (Lonestar),SAR 01 (Lonestar)
latitude,29.4019,29.4019,29.4019,29.4019,29.4019,29.4019
longitude,-98.4885,-98.4885,-98.4885,-98.4885,-98.4885,-98.4885
date,2018-06-20,2018-06-21,2018-06-22,2018-06-23,2018-06-24,2018-06-25
daily_average_stage,1.12642,1.09613,0.833789,0.957238,0.935521,0.958099
tranducer_elevation,602.3,602.3,602.3,602.3,602.3,602.3


In [8]:
# First six rainfall-detail rows, transposed so each column prints as a row
df_rain_details.head(6).T

Unnamed: 0,0,1,2,3,4,5
location_name,Calaveras Creek Dam,Calaveras Creek Dam,Calaveras Creek Dam,Calaveras Creek Dam,Calaveras Creek Dam,Calaveras Creek Dam
latitude,29.3697,29.3697,29.3697,29.3697,29.3697,29.3697
longitude,-98.3323,-98.3323,-98.3323,-98.3323,-98.3323,-98.3323
date_time,2018-01-02 11:45:00,2018-01-16 00:30:00,2018-01-16 01:00:00,2018-01-17 00:05:00,2018-01-17 13:25:00,2018-01-18 00:05:00
five_minute_rainfall,0.01,0.01,0.01,0.02,0.01,0.01


In [9]:
# First six rainfall-summary rows, transposed so each column prints as a row
df_rain_summary.head(6).T

Unnamed: 0,0,1,2,3,4,5
location_name,Blanco Road Dam,Blanco Road Dam,Blanco Road Dam,Blanco Road Dam,Blanco Road Dam,Blanco Road Dam
latitude,29.6248,29.6248,29.6248,29.6248,29.6248,29.6248
longitude,-98.5213,-98.5213,-98.5213,-98.5213,-98.5213,-98.5213
date,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06
daily_rainfall_total_inches,0.01,0,0,0,0,0


In [11]:
# First six water-quality rows, transposed; useful here because this frame
# was reported above with 500 columns
df_water_quality.head(6).T

Unnamed: 0,0,1,2,3,4,5
Station ID,12689,12689,12689,12689,12689,12689
Station Description,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...
Latitude,29.3201,29.3201,29.3201,29.3201,29.3201,29.3201
Longitude,-98.4063,-98.4063,-98.4063,-98.4063,-98.4063,-98.4063
End Date,2008-09-18 00:00:00,2008-11-20 00:00:00,2008-12-10 00:00:00,2009-01-15 00:00:00,2009-02-19 00:00:00,2009-03-05 00:00:00
Tag ID,SA10917T,SA11053T,SA11123T,SA11193T,SA11249T,SA11319T
End Time,16:52,14:00,12:15,12:15,13:00,12:32
End Depth,0.28,0.15,0.05,0.08,0.06,0.03
Sample Type,RT,RT,RT,RT,RT,RT
Program Code,IPSW,IPSW,IPSW,IPSW,IPSW,IPSW


#### _Let's convert the column names to lowercase so they are easier to work with, and rename them for clarity._

In [8]:
# NOTE(review): `df` is never assigned in the cells above (only df_flood,
# df_rain_details, df_rain_summary and df_water_quality are), so this call
# raises NameError as shown in the cell output. Confirm which frame
# should be passed to lowercase_and_rename before running the cells below.
df = prepare.lowercase_and_rename(df)

NameError: name 'df' is not defined

In [None]:
# Leading rows of df (head() defaults to five), transposed for readability
df.head().T

#### _Let's make copies of the original dataframe before dropping some columns and rows to cover scenarios where we uncover more information about the variables._

In [None]:
# Three independent copies of df, so each can be altered separately
df1, df2, df3 = (df.copy() for _ in range(3))

#### _Let's prepare df1 for EDA_
1. Remove columns that do not add information
1. Drop the columns that are no longer needed
1. Make a new variable indicating whether the incident involved two or more spills within 24 hours
1. Remove redundant columns
1. Rename spill details to simpler names
1. Change column values to lowercase
1. Change addresses to title case
1. Fix the data types
1. Fill NaNs
1. Change data type to datetime

In [None]:
# Rebind df1 to whatever prepare.ready_df1 returns.
# presumably this applies the preparation steps listed in the markdown above;
# the helper lives in prepare_sara and is not visible here — verify.
df1 = prepare.ready_df1(df1)
df1

In [None]:
# Inspect the dtype of every df1 column
df1.dtypes

#### _Let's assign a variable with all numerical column names._

In [None]:
# Names of the df1 columns whose dtype is numeric
df1_numerical_columns = (
    df1.select_dtypes(include=[np.number])
    .columns
    .tolist()
)
df1_numerical_columns

#### _Let's assign a variable with all non-numerical column names._

In [None]:
# Names of the df1 columns whose dtype is anything other than numeric
df1_non_numerical_columns = (
    df1.select_dtypes(exclude=[np.number])
    .columns
    .tolist()
)
df1_non_numerical_columns

In [None]:
# Frequency of each last_cleaned value; dropna=False keeps missing values
# in the tally instead of silently excluding them
df1.last_cleaned.value_counts(dropna=False)

In [None]:
# First six prepared rows, transposed so each column prints as a row
df1.head(6).T

In [None]:
# presumably summarises missing values per column of df1 — the helper is
# defined in prepare_sara and not visible here; verify its output format
prepare.missing_values_col(df1)

# EXPLORE

#### _Looking for the repeat offenders..._

In [None]:
# Distribution of num_spills_24mos restricted to values greater than one
df1.loc[df1['num_spills_24mos'] > 1, 'num_spills_24mos'].value_counts()

#### _Locations of the most frequent SSOs in 2 years_

In [None]:
# Street addresses of rows with nine or more spills in num_spills_24mos
df1.loc[df1['num_spills_24mos'] >= 9, ['spill_street_address']]

#### _Total number of gallons spilled by the most frequent SSOs in 2 years_

In [None]:
# Sum of total_gallons over rows with nine or more spills in num_spills_24mos
df1.loc[df1['num_spills_24mos'] >= 9, 'total_gallons'].sum()

In [None]:
# Detail columns for rows with nine or more spills in num_spills_24mos
df1.loc[
    df1['num_spills_24mos'] >= 9,
    [
        'spill_street_address', 'total_gallons', 'hours', 'root_cause',
        'unit_type', 'asset_type', 'last_cleaned', 'multiple_spills',
        'discharge_to', 'discharge_route',
    ],
]

#### _Most common root causes of SSOs_

In [None]:
# Number of rows per root_cause value, most frequent first
df1.root_cause.value_counts()

- [ ] **TODO:** Find a way to flesh out the address using regex to account for typos etc.
- [ ] **TODO:** Maybe try using unit IDs instead of addresses.
- [ ] **TODO:** Drill down to only the top 3-5 locations.
- [ ] **TODO:** Compare predictions between preventing SSOs at the most frequent locations versus not preventing them.
- [ ] **TODO:** What is causing the spills at these top 3-5 locations?

In [None]:
# First four rows, transposed so each column prints as a row
df1.head(4).T

In [None]:
# Address alongside the unit/asset identifiers for the first fifteen rows
df1.head(15)[
    ['spill_street_address', 'unit_id_1', 'unit_id_2', 'unit_type', 'asset_type']
]

- [ ] **TODO:** Maybe we can do some kind of clustering to group problem areas.

In [None]:
# unit_id_1 values that occur more than seven times
df1['unit_id_1'].value_counts().loc[lambda tally: tally > 7]

In [None]:
# unit_id_2 values that occur more than seven times
df1['unit_id_2'].value_counts().loc[lambda tally: tally > 7]

In [None]:
# Number of rows per root_cause value, most frequent first
df1['root_cause'].value_counts()

In [None]:
# Street addresses that occur more than seven times
df1['spill_street_address'].value_counts().loc[lambda tally: tally > 7]

#### _Looking for locations with most SSOs that are also caused by grease._

In [None]:
# List df1's column labels
df1.columns

In [None]:
# Attach to every row the number of non-null root_cause entries recorded for
# that row's spill_street_address.
# A single column must be selected before transform(): transforming the whole
# grouped frame yields one result column per non-key column, which cannot be
# assigned to the single 'counts' column. Selecting the column directly also
# removes the need to seed 'counts' with a copy of root_cause first.
df1['counts'] = (
    df1.groupby('spill_street_address')['root_cause'].transform('count')
)
df1

#### _Below shows the most frequent SSOs that are caused by grease._

In [None]:
# Rows whose counts value is at least seven and whose root cause is grease
df1[df1['counts'].ge(7) & df1['root_cause'].eq('grease')]

#### _Below shows the most devastating SSOs by volume._

In [None]:
# Rows with total_gallons above 1.5 million
df1.loc[df1['total_gallons'] > 1_500_000]

In [None]:
# (rows, columns) of the subset with total_gallons above 1.5 million
df1.loc[df1['total_gallons'] > 1_500_000].shape

In [None]:
# Row count per installation_year, ordered by the year value (sort_index)
# rather than by frequency
df1.installation_year.value_counts().sort_index()

#### _Spills by installation year._

In [None]:
# Line plot of row counts per installation_year, keeping only years below 9999
plt.figure(figsize=(12, 8))
plt.plot(
    df1.loc[df1['installation_year'] < 9999]
    .groupby('installation_year')['spill_street_address']
    .count()
)

In [None]:
# Row count per year, ordered by the year value (sort_index) rather than
# by frequency
df1.year.value_counts().sort_index()

#### _Spills by year._

In [None]:
# Line plot of row counts per year, keeping only years before 2019
plt.figure(figsize=(12, 8))
plt.plot(
    df1.loc[df1['year'] < 2019]
    .groupby('year')['spill_street_address']
    .count()
)

#### _All observations grouped by month of the year._

In [None]:
# Line plot of row counts per month across all observations
plt.figure(figsize=(12, 8))
plt.plot(
    df1.groupby('month')['spill_street_address']
    .count()
)

#### _Colder months mean more grease clogs. Grease solidifies in colder temperatures._

In [None]:
# Line plot of monthly row counts for grease-caused rows dated before 2019
plt.figure(figsize=(12, 8))
plt.plot(
    df1.loc[df1['root_cause'].eq('grease') & df1['year'].lt(2019)]
    .groupby('month')['spill_street_address']
    .count()
)