# ENVIRONMENT

In [1]:
import acquire_sara as acquire
import prepare_sara as prepare
import pandas as pd
import numpy as np
import pandas_profiling

# data visualization 
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import statsmodels.api as sm

from datetime import timedelta, datetime
from pylab import rcParams

# Raise pandas' display limits so wide/long DataFrame previews are not truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 500
pd.options.display.width = 1000

from fbprophet import Prophet

# ACQUIRE

#### _Let's read in the data from the CSV files._

In [2]:
# Load the four raw SARA datasets through the project's acquire helper, in the
# order: flood stage levels, rainfall details, rainfall summary, water quality.
# NOTE(review): this cell's output reports "Skipping line 9327: expected 500
# fields, saw 501" -- presumably from the 500-column water-quality file; confirm
# that dropping that row is acceptable.
df_flood, df_rain_details, df_rain_summary, df_water_quality = (
    acquire.read_data(csv_name)
    for csv_name in (
        'sara-flood-stage-levels.csv',
        'sara-rainfall-details.csv',
        'sara-rainfall-summary.csv',
        'sara-water-quality-bexar.csv',
    )
)

b'Skipping line 9327: expected 500 fields, saw 501\n'


# PREPARE

#### _Let's get the shape of each DataFrame and take a peek at its first few rows._

In [3]:
# Report the (rows, columns) shape of each raw DataFrame; sep='' keeps the
# label and the shape tuple adjacent, exactly as string concatenation would.
print('Flood Stage Levels: ', df_flood.shape, sep='')
print('Rain Details: ', df_rain_details.shape, sep='')
print('Rain Summary: ', df_rain_summary.shape, sep='')
print('Water Quality: ', df_water_quality.shape, sep='')

Flood Stage Levels: (1470, 6)
Rain Details: (37182, 5)
Rain Summary: (13395, 5)
Water Quality: (10883, 500)


In [4]:
# Preview the first six flood-stage rows, transposed so each column reads as a row.
df_flood.head(6).transpose()

Unnamed: 0,0,1,2,3,4,5
location_name,SAR 01 (Lonestar),SAR 01 (Lonestar),SAR 01 (Lonestar),SAR 01 (Lonestar),SAR 01 (Lonestar),SAR 01 (Lonestar)
latitude,29.4019,29.4019,29.4019,29.4019,29.4019,29.4019
longitude,-98.4885,-98.4885,-98.4885,-98.4885,-98.4885,-98.4885
date,2018-06-20,2018-06-21,2018-06-22,2018-06-23,2018-06-24,2018-06-25
daily_average_stage,1.12642,1.09613,0.833789,0.957238,0.935521,0.958099
tranducer_elevation,602.3,602.3,602.3,602.3,602.3,602.3


In [5]:
# Preview the first six rainfall-detail rows, transposed so each column reads as a row.
df_rain_details.head(6).transpose()

Unnamed: 0,0,1,2,3,4,5
location_name,Calaveras Creek Dam,Calaveras Creek Dam,Calaveras Creek Dam,Calaveras Creek Dam,Calaveras Creek Dam,Calaveras Creek Dam
latitude,29.3697,29.3697,29.3697,29.3697,29.3697,29.3697
longitude,-98.3323,-98.3323,-98.3323,-98.3323,-98.3323,-98.3323
date_time,2018-01-02 11:45:00,2018-01-16 00:30:00,2018-01-16 01:00:00,2018-01-17 00:05:00,2018-01-17 13:25:00,2018-01-18 00:05:00
five_minute_rainfall,0.01,0.01,0.01,0.02,0.01,0.01


In [6]:
# Preview the first six rainfall-summary rows, transposed so each column reads as a row.
df_rain_summary.head(6).transpose()

Unnamed: 0,0,1,2,3,4,5
location_name,Blanco Road Dam,Blanco Road Dam,Blanco Road Dam,Blanco Road Dam,Blanco Road Dam,Blanco Road Dam
latitude,29.6248,29.6248,29.6248,29.6248,29.6248,29.6248
longitude,-98.5213,-98.5213,-98.5213,-98.5213,-98.5213,-98.5213
date,2018-01-01,2018-01-02,2018-01-03,2018-01-04,2018-01-05,2018-01-06
daily_rainfall_total_inches,0.01,0,0,0,0,0


In [7]:
# Preview the first six water-quality rows; transposing turns the frame's many
# columns into rows, which is easier to scan.
df_water_quality.head(6).transpose()

Unnamed: 0,0,1,2,3,4,5
Station ID,12689,12689,12689,12689,12689,12689
Station Description,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...,ROSILLO CREEK 0.1 KM ABOVE SALADO CREEK CONFLU...
Latitude,29.3201,29.3201,29.3201,29.3201,29.3201,29.3201
Longitude,-98.4063,-98.4063,-98.4063,-98.4063,-98.4063,-98.4063
End Date,2008-09-18 00:00:00,2008-11-20 00:00:00,2008-12-10 00:00:00,2009-01-15 00:00:00,2009-02-19 00:00:00,2009-03-05 00:00:00
Tag ID,SA10917T,SA11053T,SA11123T,SA11193T,SA11249T,SA11319T
End Time,16:52,14:00,12:15,12:15,13:00,12:32
End Depth,0.28,0.15,0.05,0.08,0.06,0.03
Sample Type,RT,RT,RT,RT,RT,RT
Program Code,IPSW,IPSW,IPSW,IPSW,IPSW,IPSW


In [8]:
# Columns kept for the analysis: the station/date keys plus the dissolved-oxygen,
# E. coli, precipitation, weather and water-colour measurements. The labels must
# match the raw CSV header text exactly.
WATER_QUALITY_COLUMNS = [
    'Station ID',
    'End Date',
    'OXYGEN, DISSOLVED (MG/L) (00300)',
    'E. COLI, COLILERT, IDEXX METHOD, MPN/100ML (31699)',
    'E COLI,NA+MUG OR EA+MUG,24HRS, 35 DEGREE (#/100M (31700)',
    'E COLI, SEDIMENT, MPN/100G (31702)',
    'E.COLI, COLILERT, IDEXX, HOLDING TIME (31704)',
    'DAYS SINCE PRECIPITATION EVENT (DAYS) (72053)',
    'PRESENT WEATHER (1=CLEAR,2=PTCLDY,3=CLDY,4=RAIN,5=OTHER) (89966)',
    'WATER COLOR 1=BRWN 2=RED 3=GRN 4=BLCK 5=CLR 6=OT (89969)',
]

# Take an explicit copy of the subset so that later renames and assignments act
# on an independent frame instead of one pandas still tracks as derived from the
# wide raw frame (which can raise SettingWithCopyWarning).
df_water_quality = df_water_quality[WATER_QUALITY_COLUMNS].copy()

In [9]:
# Preview the first eleven rows of the narrowed water-quality frame, one column per row.
df_water_quality.head(11).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
Station ID,12689,12689,12689,12689,12689,12689,12689,12689,12689,12689,12689
End Date,2008-09-18 00:00:00,2008-11-20 00:00:00,2008-12-10 00:00:00,2009-01-15 00:00:00,2009-02-19 00:00:00,2009-03-05 00:00:00,2009-05-07 00:00:00,2009-06-03 00:00:00,2009-07-15 00:00:00,2009-08-12 00:00:00,2009-10-21 00:00:00
"OXYGEN, DISSOLVED (MG/L) (00300)",8.2,3.7,7.0,12.4,9.8,9.3,-,7.4,-,-,7.0
"E. COLI, COLILERT, IDEXX METHOD, MPN/100ML (31699)",120,420,55,210,280,410,-,48,-,-,99
"E COLI,NA+MUG OR EA+MUG,24HRS, 35 DEGREE (#/100M (31700)",-,-,-,-,-,-,-,-,-,-,-
"E COLI, SEDIMENT, MPN/100G (31702)",-,-,-,-,-,-,-,-,-,-,-
"E.COLI, COLILERT, IDEXX, HOLDING TIME (31704)",2.38,2.27,2.50,3.38,3.70,3.62,-,3.23,-,-,25.83
DAYS SINCE PRECIPITATION EVENT (DAYS) (72053),5,> 7,< 1,> 7,2,> 7,> 7,7,> 7,> 7,> 7
"PRESENT WEATHER (1=CLEAR,2=PTCLDY,3=CLDY,4=RAIN,5=OTHER) (89966)",2,2,2,1,1,1,3,2,1,1,3
WATER COLOR 1=BRWN 2=RED 3=GRN 4=BLCK 5=CLR 6=OT (89969),3,1,3,3,3,3,3,3,-,-,3


In [10]:
# Run the project's renaming helper on the frame; the preview that follows shows
# short snake_case labels (e.g. 'End Date' appears as 'date').
df_water_quality = df_water_quality.pipe(prepare.lowercase_and_rename)

In [11]:
# Preview the first thirteen rows after renaming, one column per row.
df_water_quality.head(13).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
station_id,12689,12689,12689,12689,12689,12689,12689,12689,12689,12689,12689,12689,12689
date,2008-09-18 00:00:00,2008-11-20 00:00:00,2008-12-10 00:00:00,2009-01-15 00:00:00,2009-02-19 00:00:00,2009-03-05 00:00:00,2009-05-07 00:00:00,2009-06-03 00:00:00,2009-07-15 00:00:00,2009-08-12 00:00:00,2009-10-21 00:00:00,2009-11-18 00:00:00,2009-12-16 00:00:00
oxygen,8.2,3.7,7.0,12.4,9.8,9.3,-,7.4,-,-,7.0,9.8,10.7
ecoli_idexx,120,420,55,210,280,410,-,48,-,-,99,120,25
ecoli_mug,-,-,-,-,-,-,-,-,-,-,-,-,-
ecoli_sediment,-,-,-,-,-,-,-,-,-,-,-,-,-
ecoili_holding_time,2.38,2.27,2.50,3.38,3.70,3.62,-,3.23,-,-,25.83,5.13,4.22
days_since_precipitation,5,> 7,< 1,> 7,2,> 7,> 7,7,> 7,> 7,> 7,> 7,5
present_weather,2,2,2,1,1,1,3,2,1,1,3,1,3
water_color,3,1,3,3,3,3,3,3,-,-,3,3,3


In [12]:
# Sanity check on dimensions: the row count should match the raw load and only
# the ten selected columns should remain.
df_water_quality.shape

(10883, 10)

In [13]:
# Frequency of each distinct ecoli_mug value, to see how much of the column is
# the '-' placeholder versus real readings.
df_water_quality['ecoli_mug'].value_counts()

-           9702
< 2           24
 70           19
 23           16
 47           13
< 7           11
 180          11
 116          11
 71           11
 93           11
< 233         10
< 23          10
 163           9
 50            9
 1000          9
 100           9
 36            9
 90            8
 2500          8
 1500          8
 400           8
 32            8
 29            8
 800           8
 80            8
 3000          8
> 66667        8
> 20000        8
 190           8
 1900          8
 1200          8
 21            8
 140           7
 1400          7
 67            7
 35            7
 120           7
 700           7
 200           7
 14            7
 250           6
< 71           6
 367           6
 4667          6
 53            6
 46            6
 600           6
 300           6
 240           6
 4000          6
 170           6
 7             6
 117           5
 667           5
 2000          5
 260           5
 154           5
 467           5
 57           

In [14]:
# Frequency of each distinct ecoli_sediment value, to see how much of the column
# is the '-' placeholder versus real readings.
df_water_quality['ecoli_sediment'].value_counts()

-      10880
 84        1
 4         1
 39        1
Name: ecoli_sediment, dtype: int64

In [15]:
# Per-column data-quality summary from the project's prepare helper; its output
# reports counts and percentages of missing, empty and '-' (dash) entries.
prepare.missing_values_col(df_water_quality)

Unnamed: 0,num_missing,missing_percentage,num_empty,empty_percentage,dash_count,dash_percentage
station_id,0,0.0,0,0.0,0,0.0
date,0,0.0,0,0.0,0,0.0
oxygen,0,0.0,0,0.0,3321,30.515483
ecoli_idexx,0,0.0,0,0.0,4294,39.456032
ecoli_mug,0,0.0,0,0.0,9702,89.148213
ecoli_sediment,0,0.0,0,0.0,10880,99.972434
ecoili_holding_time,0,0.0,0,0.0,4968,45.649178
days_since_precipitation,0,0.0,0,0.0,1362,12.514932
present_weather,0,0.0,0,0.0,1337,12.285215
water_color,0,0.0,0,0.0,1803,16.567123


In [16]:
# Inspect dtypes and non-null counts.
# NOTE(review): the output shows every column except station_id stored as
# object -- presumably because of the '-' placeholders and '< 2' / '> 7' style
# entries; confirm before converting these columns to numeric.
df_water_quality.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10883 entries, 0 to 10882
Data columns (total 10 columns):
station_id                  10883 non-null int64
date                        10883 non-null object
oxygen                      10883 non-null object
ecoli_idexx                 10883 non-null object
ecoli_mug                   10883 non-null object
ecoli_sediment              10883 non-null object
ecoili_holding_time         10883 non-null object
days_since_precipitation    10883 non-null object
present_weather             10883 non-null object
water_color                 10883 non-null object
dtypes: int64(1), object(9)
memory usage: 935.3+ KB


In [19]:
# `df` is a second name for the same DataFrame object as df_water_quality (plain
# assignment makes no copy), so in-place changes made through either name are
# visible through both.
df = df_water_quality

# Show only the first rows; rendering the whole frame (10,883 rows) as cell
# output bloats the notebook and buries the narrative.
df.head(10)

Unnamed: 0,station_id,date,oxygen,ecoli_idexx,ecoli_mug,ecoli_sediment,ecoili_holding_time,days_since_precipitation,present_weather,water_color
0,12689,2008-09-18 00:00:00,8.2,120,-,-,2.38,5,2,3
1,12689,2008-11-20 00:00:00,3.7,420,-,-,2.27,> 7,2,1
2,12689,2008-12-10 00:00:00,7.0,55,-,-,2.50,< 1,2,3
3,12689,2009-01-15 00:00:00,12.4,210,-,-,3.38,> 7,1,3
4,12689,2009-02-19 00:00:00,9.8,280,-,-,3.70,2,1,3
5,12689,2009-03-05 00:00:00,9.3,410,-,-,3.62,> 7,1,3
6,12689,2009-05-07 00:00:00,-,-,-,-,-,> 7,3,3
7,12689,2009-06-03 00:00:00,7.4,48,-,-,3.23,7,2,3
8,12689,2009-07-15 00:00:00,-,-,-,-,-,> 7,1,-
9,12689,2009-08-12 00:00:00,-,-,-,-,-,> 7,1,-
