# Cleaning the data to build the prototype for crwa

### This data cleans the original sql output and performs cleaning tasks. Also checking validity of the results against original report found at
### https://www.crwa.org/uploads/1/2/6/7/126781580/crwa_ecoli_web_2017_updated.xlsx

In [None]:
import pandas as pd
pd.options.display.max_rows = 999
import numpy as np
import matplotlib.pyplot as plt

In [36]:
df = pd.read_csv("data_for_prototype.csv")

In [37]:
# There are 2 rows with Date = Null so droping those rows
df = df.dropna(subset=['Date_Collected'])


In [38]:
df.isna().sum()

Date_Collected               0
Time_Collected            1691
Component_Name               0
Site_ID                      0
Site_Name                  224
Town                       385
River_Mile_Headwaters    13262
Latitude_DD                862
Longitude_DD               862
Actual_Result               38
Result_Type                  0
Unit_Abbreviation            2
Reporting_Result            10
Result_Type-2                0
Unit_Abbreviation-2          0
dtype: int64

In [39]:
# There are following types of invalids in Site_ID 
invalids = ["N/A","NULL","ND"]

In [40]:
#Removing these invalid Site_IDs 

df["Site_Name"] = df["Site_Name"].map(lambda x: np.nan if x in invalids else x)
df["Site_Name"].fillna("ABCD", inplace=True)

In [41]:
#Removing these invalid Town Names 

df["Town"] = df["Town"].map(lambda x: np.nan if x in invalids else x)
df["Town"].fillna("ABCD", inplace=True)

In [42]:
df["River_Mile_Headwaters"].describe

<bound method NDFrame.describe of 0        52.1 MI
1        53.4 MI
2        56.7 MI
3        62.1 MI
4        64.8 MI
          ...   
46128    69.5 MI
46129    69.5 MI
46130    69.5 MI
46131    69.5 MI
46132    69.5 MI
Name: River_Mile_Headwaters, Length: 46133, dtype: object>

In [43]:
#Removing invalid Miles and selecting only numeric values for miles 


df["River_Mile_Headwaters"] = df["River_Mile_Headwaters"].map(lambda x: np.nan if x in invalids else x)
df["River_Mile_Headwaters"].fillna("00.0 MI", inplace=True)
df["Mile"] = pd.to_numeric(df["River_Mile_Headwaters"].str[0:4])

In [44]:
#Removing invalid entrees and selecting only numeric values 


df["Latitude_DD"] = df["Latitude_DD"].map(lambda x: np.nan if x in invalids else x)
df["Latitude_DD"].fillna("00.0 MI", inplace=True)
df["Longitude_DD"] = df["Longitude_DD"].map(lambda x: np.nan if x in invalids else x)
df["Longitude_DD"].fillna("00.0 MI", inplace=True)

In [45]:
#Removing invalid entrees and selecting only numeric values 


df["Actual_Result"] = df["Actual_Result"].map(lambda x: np.nan if x in invalids else x)
df["Actual_Result"] = df["Actual_Result"].str.lstrip('>')
df["Actual_Result"] = df["Actual_Result"].str.rstrip('>')
df["Actual_Result"] = df["Actual_Result"].str.lstrip('<')
df["Actual_Result"] = df["Actual_Result"].str.rstrip('<')
df["Actual_Result"] = df["Actual_Result"].str.lstrip('*')
df["Actual_Result"] = df["Actual_Result"].str.rstrip('*')
df["Actual_Result"] = df["Actual_Result"].str.replace(',','')
df["Actual_Result"] = df["Actual_Result"].str.replace('%','')
df["Actual_Result"] = df["Actual_Result"].str.replace(' ','')
df["Actual_Result"] = df["Actual_Result"].str.replace('ND','')
df["Actual_Result"] = df["Actual_Result"].str.lstrip('.')
df["Actual_Result"] = df["Actual_Result"].str.rstrip('.')
df["Actual_Result"] = df["Actual_Result"].str.replace('6..25','6.25')
df["Actual_Result"] = df["Actual_Result"].str.replace('480.81546.25291','480.81546')
df["Actual_Result"] = df["Actual_Result"].str.replace('379\r\n379',"379")



In [46]:
#Functiont to check if string can be converted to numeric

#Input --> string
#Output --> 1 if convertable else 0

def isInt_try(v):
    try:     i = float(v)
    except:  return False
    return True

In [47]:
# Applying above function to check any odd strings in Actual_Result Column

for i in df["Actual_Result"]:
    if isInt_try(i) == 0:
        print(i)


In [48]:
# Checking any odd strings in Actual_Result Column

for i in df["Actual_Result"]:
    if str(i).count('.') >= 2:
        print(i)


In [49]:
# Converting  Actual_Result to numeric and Date_Collected to datetime data type

df["Actual_Result"] = pd.to_numeric(df["Actual_Result"])
df["Date_Collected"] = pd.to_datetime(df["Date_Collected"])

In [50]:
"Slicing for E.coli"

df_ecoli = df[df["Component_Name"] == "Escherichia coli"]

In [51]:
df_ecoli.head()

Unnamed: 0,Date_Collected,Time_Collected,Component_Name,Site_ID,Site_Name,Town,River_Mile_Headwaters,Latitude_DD,Longitude_DD,Actual_Result,Result_Type,Unit_Abbreviation,Reporting_Result,Result_Type-2,Unit_Abbreviation-2,Mile
6798,2000-08-25 00:00:00-04:00,,Escherichia coli,2LARZ,Larz Anderson Bridge-Center,Cambridge,00.0 MI,42.3691,-71.1235,50.0,Actual,cfu/100ml,50.0,Actual,cfu/100ml,0.0
9718,2002-06-18 00:00:00-04:00,1899-12-30 05:54:00-05,Escherichia coli,760T,Muddy River at Commonwealth Ave.,Boston,00.0 MI,42.3487,-71.0918,240.0,Actual,cfu/100ml,240.0,Actual,cfu/100ml,0.0
9720,2002-06-18 00:00:00-04:00,1899-12-30 05:58:00-05,Escherichia coli,567S,Nahanton Park,Needham/Newton,56.7 MI,42.2971,-71.2078,130.0,Actual,cfu/100ml,130.0,Actual,cfu/100ml,56.7
9722,2002-06-18 00:00:00-04:00,1899-12-30 05:59:00-05,Escherichia coli,484S,Greendale Ave. / Lyons St. / Dedham Medical Ce...,Dedham/Needham,48.4 MI,42.267,-71.205,190.0,Actual,cfu/100ml,190.0,Actual,cfu/100ml,48.4
9747,2002-06-18 00:00:00-04:00,1899-12-30 06:00:00-05,Escherichia coli,199S,Populatic Pond Boat Launch,Norfolk,19.9 MI,42.1311,-71.3768,210.0,Actual,cfu/100ml,195.0,Calculated,cfu/100ml,19.9


In [52]:
# Validating against the original report

result = df_ecoli.loc[(df_ecoli.Town == "Milford") & (df_ecoli.Date_Collected == pd.to_datetime("2017-11-21 00:00:00-05:00"))]["Actual_Result"]
result

43885    122.0
Name: Actual_Result, dtype: float64