# PLAN

- [x] Acquisition
    - [x] read the csv into a dataframe
- [x] Preparation
    - [x] no missing values
    - [x] drop columns that are not needed
    - [x] change case to lower case
    - [x] make sure everything has right dtype
    - [ ] normalize what needs to be normalized
    - [x] rename columns for clarification
- [ ] Exploration
    - [ ] answer ALL questions raised
        - [x] Which locations are the most frequent sites of SSO?
        - [x] Which locations have the largest volume of overflow?
        - [x] What are most common root causes of SSO?
        - [x] Where does the majority of the overflow go?

    - [ ] visualize important findings
    - [ ] decide what TODO items to keep
- [ ] Modeling
    - [ ] predict 
- [ ] Delivery
    - [ ] report
    - [ ] prezi slides
    - [ ] website

# ENVIRONMENT

In [None]:
import os
import acquire_sso as acquire
import prepare_sso as prepare
import pandas as pd
import numpy as np
import json

# data visualization 
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import statsmodels.api as sm
import seaborn as sns

from datetime import timedelta, datetime
from pylab import rcParams
from requests import get

# to explode the DataFrames and avoid truncation
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from fbprophet import Prophet

# MapQuest Geocoding API key. Prefer the MAPQUEST_API_KEY environment variable
# so the secret does not have to live in the notebook; the literal is kept only
# as a backward-compatible fallback.
# NOTE(review): this key is committed in plain text -- rotate it and drop the fallback.
API_KEY = os.environ.get('MAPQUEST_API_KEY', 'Zd5jr3WZm1PbGobgPDHzLz9LEFDaco1V')

# ACQUIRE

#### _Let's read in the data from the csv file and take a peek at the first five records._

In [None]:
df = acquire.read_data('saws-sso.csv')

In [None]:
df.head()

# PREPARE

#### _Let's convert the column names to lowercase to make them easier to work with and also rename the columns for clarity._

In [None]:
df = prepare.lowercase_and_rename(df)

In [None]:
df.head().T

#### _Let's make copies of the original dataframe before dropping some columns and rows to cover scenarios where we uncover more information about the variables._

In [None]:
df1 = df.copy()
df2 = df.copy()
df3 = df.copy()

#### _Let's prepare df1 for EDA_
0. remove columns that do not add information
0. drop the columns that are no longer needed.
0. Make new variable of whether this incident involved two or more spills within 24 hours
0. Remove redundant columns
0. Rename spill details into simpler names
0. Change column values to lowercase
0. Change address into titlecase
0. Fix the data types
0. Fill nan's
0. Change data type to datetime

In [None]:
df1.shape

In [None]:
df1 = prepare.ready_df(df1)
df1

In [None]:
df1.dtypes

#### _Let's assign a variable with all numerical column names._

In [None]:
df1_numerical_columns = list(df1.select_dtypes(include=[np.number]).columns.values)
df1_numerical_columns

#### _Let's assign a variable with all non-numerical column names._

In [None]:
df1_non_numerical_columns = list(df1.select_dtypes(exclude=[np.number]).columns.values)
df1_non_numerical_columns

#### _Let's get an idea of how often these pipes are cleaned._

In [None]:
df1.set_index('last_cleaned').groupby(pd.Grouper(freq='M')).count()

In [None]:
df1.last_cleaned.value_counts(dropna=False)

In [None]:
df1.head(6).T

In [None]:
prepare.missing_values_col(df1)

In [None]:
df1.shape

### Let's feature engineer a column to show how many months passed between the last cleaning and the current spill.

In [None]:
df2 = prepare.ready_df(df2)
df2_numerical_columns = list(df2.select_dtypes(include=[np.number]).columns.values)
df2_non_numerical_columns = list(df2.select_dtypes(exclude=[np.number]).columns.values)
prepare.missing_values_col(df2)

In [None]:
# Months between the last cleaning and the spill report, as a float.
# A calendar month is not a fixed-length unit, so recent pandas refuses to
# divide by np.timedelta64(1, 'M'). Divide by an explicit average Gregorian
# month instead (365.2425 days / 12 = 30.436875 days).
df2['months_elapsed'] = (df2.report_date - df2.last_cleaned) / pd.Timedelta(days=30.436875)

In [None]:
df2.head().T

In [None]:
df2.isna().sum()

### Drop NaN's in the two columns we want to work with right now

In [None]:
df2.dropna(subset=['last_cleaned', 'months_elapsed'], inplace=True)

In [None]:
df2.shape

In [None]:
df2[df2.cause == 'grease'].months_elapsed.describe()

### Note for below, a pipe can be cleaned and structural damage or weather related events can occur the next day.

In [None]:
sns.distplot(df2[df2.cause!='grease'].months_elapsed)

### These pipes had just been cleaned, yet an SSO due to grease occurred rather quickly.

In [None]:
sns.distplot(df2[df2.cause=='grease'].months_elapsed)

In [None]:
sns.distplot(df2[(df2.cause=='grease') & (df2.months_elapsed <= 10)].months_elapsed)

# EXPLORE

#### _Looking for the repeat offenders..._

In [None]:
df1.num_spills_24mos[df1.num_spills_24mos > 1].value_counts()

#### _Locations of the most frequent SSOs in 2 years_

In [None]:
df1[['spill_street_address']][df1.num_spills_24mos >= 9]

#### _Total number of gallons spilled by the most frequent SSOs in 2 years_

In [None]:
df1.total_gallons[df1.num_spills_24mos >= 9].agg('sum')

In [None]:
df1[['spill_street_address', 'total_gallons', 'hours', 'root_cause',
     'unit_type', 'asset_type', 'last_cleaned', 'multiple_spills',
     'discharge_to', 'discharge_route']][df1.num_spills_24mos >= 9]

#### _Most common root causes of SSOs_

In [None]:
df1.root_cause.value_counts()

- [ ] **TODO:** Find a way to flesh out the address using regex to account for typos etc.
- [ ] **TODO:** Maybe try using unit id's instead of addresses.
- [ ] **TODO:** Drill down to only the top 3-5 locations.
- [ ] **TODO:** Compare predictions between preventing SSOs at the most frequent locations versus not preventing them.
- [ ] **TODO:** What is causing the spills on these top 3-5 locations?

In [None]:
df1.head(4).T

In [None]:
df1[['spill_street_address', 'unit_id_1','unit_id_2', 'unit_type', 'asset_type']].head(15)

- [ ] **TODO:** Maybe we can do some kind of clustering to group problem areas.

In [None]:
# unit_id_1 values that appear in more than 7 rows, with their row counts.
df1.unit_id_1.value_counts().loc[lambda counts: counts > 7]

In [None]:
# unit_id_2 values that appear in more than 7 rows, with their row counts.
df1.unit_id_2.value_counts().loc[lambda counts: counts > 7]

In [None]:
df1['root_cause'].value_counts()

In [None]:
# Street addresses that appear in more than 7 rows, with their row counts.
df1['spill_street_address'].value_counts().loc[lambda counts: counts > 7]

#### _Looking for locations with most SSOs that are also caused by grease._

In [None]:
df1.columns

In [None]:
# For every row, the number of rows at the same street address that have a
# recorded root cause. A single column is selected before transform('count')
# so the result is a Series; transforming the whole grouped frame returns one
# column per original column, which cannot be assigned to the single 'counts'
# column.
df1['counts'] = df1.groupby('spill_street_address')['root_cause'].transform('count')
df1

#### _Below shows the most frequent SSOs that are caused by grease._

In [None]:
df1.loc[(df1['counts'] >= 7) & (df1['root_cause'] == 'grease')]

#### _Below shows the most devastating SSOs by volume._

In [None]:
df1[df1.total_gallons > 1500000]

In [None]:
df1[df1.total_gallons > 1500000].shape

In [None]:
df1.installation_year.value_counts().sort_index()

#### _Spills by installation year._

In [None]:
plt.figure(figsize=(12,8))
plt.plot(df1[df1.installation_year < 9999].groupby('installation_year')['spill_street_address'].count())

In [None]:
df1.year.value_counts().sort_index()

#### _Spills by year._

In [None]:
plt.figure(figsize=(12,8))
plt.plot(df1[df1.year < 2019].groupby('year')['spill_street_address'].count())

#### _All observations grouped by month of the year._

In [None]:
plt.figure(figsize=(12,8))
plt.plot(df1.groupby('month')['spill_street_address'].count())

#### _Colder months mean more grease clogs. Grease solidifies in colder temperatures._

In [None]:
plt.figure(figsize=(12,8))
plt.plot(df1[(df1.root_cause == 'grease') & (df1.year < 2019)].groupby('month')['spill_street_address'].count())

In [None]:
df1.head().T

In [None]:
df1.last_cleaned.value_counts(dropna=False)

In [None]:
df1.info()

In [None]:
# Whole months between the last cleaning and the spill report; rows without a
# recorded cleaning date get 0.
# .astype('timedelta64[M]') is not a supported conversion in recent pandas, so
# the elapsed time is divided by an average Gregorian month (30.436875 days)
# and floored instead.
df1['months_since_cleaned'] = np.where(
    df1['last_cleaned'].isnull(),
    0,
    np.floor((df1.report_date - df1.last_cleaned) / pd.Timedelta(days=30.436875)),
)

In [None]:
df1.months_since_cleaned.value_counts()

#### _Number of observations that were cleaned._

In [None]:
# Count the rows that have a recorded cleaning date directly instead of
# subtracting a hard-coded 2316 (presumably the number of missing last_cleaned
# values when this was first run -- the literal goes stale if the data changes).
df1.last_cleaned.notna().sum()

# GEOCODING API

### _Let's generate a batch file of just the street address_

In [None]:
df1.spill_street_address.to_csv('data/sso-addresses.csv', index=False)

In [None]:
addresses = df1.spill_street_address

In [None]:
type(addresses)

### _Here's the GET request from the Geocoding API_

In [None]:
def get_geocode(url, timeout=30):
    """
    This function sends a GET request to the given url and
    returns the body of the response as text.

    url: full request url, including the query string.
    timeout: seconds to wait for the server before giving up,
             so a stalled request cannot hang the notebook.

    Raises the HTTP error reported by requests when the server
    answers with a 4xx/5xx status instead of returning the
    error page as if it were geocoding data.
    """
    response = get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

### _Let's try one_

In [None]:
# Build the sample request from API_KEY rather than repeating the key inside
# the url literal, so the key is defined in exactly one place.
response = get_geocode(
    'https://www.mapquestapi.com/geocoding/v1/address?key=' + API_KEY
    + '&inFormat=kvp&outFormat=json&location=2430+NW+Military+Hwy&thumbMaps=false&delimiter=%2C'
)
response

In [None]:
obj = json.loads(response)
# All three values come from the first location of the first result.
location = obj['results'][0]['locations'][0]
# Named zip_code rather than zip so the builtin zip() stays usable in later cells.
zip_code = location['postalCode']
lat = location['latLng']['lat']
long = location['latLng']['lng']

### _Let's try a function_

In [None]:
def get_zip(url):
    """
    Geocode the street address carried in the query string of url.

    Returns a tuple of (postal code, latitude, longitude) read from
    the first location of the first result in the JSON response.
    """
    payload = json.loads(get_geocode(url))
    first_location = payload['results'][0]['locations'][0]
    postal_code = first_location['postalCode']
    coordinates = first_location['latLng']
    return postal_code, coordinates['lat'], coordinates['lng']

In [None]:
# Build the sample request from API_KEY rather than repeating the key inside
# the url literal, so the key is defined in exactly one place.
get_zip(
    'https://www.mapquestapi.com/geocoding/v1/address?key=' + API_KEY
    + '&inFormat=kvp&outFormat=json&location=2430+NW+Military+Hwy&thumbMaps=false&delimiter=%2C'
)

### _Let's try looping!_

In [None]:
from urllib.parse import urlencode

# Geocode every address, producing one (zip, lat, long) tuple per row of
# `addresses`, in the same order.
# urlencode escapes characters such as '#', '&' and ',' that would otherwise
# truncate or corrupt the query string; for plain addresses it produces the
# same url as replacing spaces with '+'.
GEOCODE_ENDPOINT = 'https://www.mapquestapi.com/geocoding/v1/address?'
geocode_cache = {}  # address -> (zip, lat, long); repeated addresses are requested once
list_addresses = []
for address in addresses:
    if address not in geocode_cache:
        query = urlencode({
            'key': API_KEY,
            'inFormat': 'kvp',
            'outFormat': 'json',
            'location': address + ' San Antonio TX',
            'thumbMaps': 'false',
            'delimiter': ',',
        })
        geocode_cache[address] = get_zip(GEOCODE_ENDPOINT + query)
    list_addresses.append(geocode_cache[address])

In [None]:
len(list_addresses)

In [None]:
len(addresses)

In [None]:
df_locations = pd.DataFrame()

In [None]:
df_locations['addresses']  = addresses.tolist()

In [None]:
# Uncomment on development; comment on production !
# df_locations = df_locations.head(10)

In [None]:
df_locations

In [None]:
from pandas import DataFrame
df_temp = DataFrame.from_records(list_addresses)

In [None]:
df_temp

In [None]:
df_locations['zip']  = df_temp[0].tolist()

In [None]:
df_locations['lat']  = df_temp[1].tolist()

In [None]:
df_locations['long']  = df_temp[2].tolist()

In [None]:
df_locations

In [None]:
df_locations.shape

In [None]:
df_locations.to_csv('data/sso-loc-info-10.csv')

In [None]:
# df_locations = pd.read_csv('data/sso-loc-info.csv')

In [None]:
df_locations

In [None]:
df_locations

In [None]:
df1.head()

In [None]:
df_locations = df_locations.rename(index=str, columns={'addresses':'spill_street_address'})

In [None]:
df_locations.head()

In [None]:
# df_locations was built with one row per df1 row, so a repeated address shows
# up several times on both sides. Merging as-is pairs every df1 row with every
# duplicate and multiplies the rows. Keep one geocode per address and join on
# the address column explicitly; validate guards against the duplication
# coming back.
df = pd.merge(
    df1,
    df_locations.drop_duplicates(subset='spill_street_address'),
    on='spill_street_address',
    how='left',
    validate='many_to_one',
)

In [None]:
df.to_csv('data/dd-geocoded.csv')

In [None]:
df

In [None]:
df.zip.value_counts()

In [None]:
def plot_rel(df, x, y, h):
    """
    Creates a relplot.

    df: DataFrame holding the data to plot.
    x, y: names of the columns plotted on the x and y axes.
    h: name of the column used to colour the points (hue).
    """
    # Passed by keyword: newer seaborn releases accept x, y and hue as
    # keyword arguments only, and keywords also work on older releases.
    sns.relplot(x=x, y=y, hue=h, data=df)

In [None]:
plot_rel(df, 'long', 'lat', 'months_since_cleaned')

In [None]:
plot_rel(df, 'long', 'lat', 'months_since_cleaned')