In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

import reverse_geocoder as rg

In [2]:
df = pd.read_csv('data_for_prototype.csv')
df.head()

Unnamed: 0,Date_Collected,Time_Collected,Component_Name,Site_ID,Site_Name,Town,River_Mile_Headwaters,Latitude_DD,Longitude_DD,Reporting_Result,Result_Type,Unit_Abbreviation
0,6/6/1995 0:00,1899-12-30 06:00:00-05,"Temperature, water",521S,Ames St. Bridge,Dedham,52.1 MI,42.2524,-71.1763,27.6,Actual,deg C
1,6/6/1995 0:00,1899-12-30 06:00:00-05,"Temperature, water",534S,Rt. 109 Bridge,Boston/Dedham,53.4 MI,42.2708,-71.1732,23.0,Actual,deg C
2,6/6/1995 0:00,1899-12-30 06:00:00-05,"Temperature, water",567S,Nahanton Park,Needham/Newton,56.7 MI,42.2971,-71.2078,22.3,Actual,deg C
3,6/6/1995 0:00,1899-12-30 06:00:00-05,"Temperature, water",621S,Leo J. Martin Golf Course / Park Rd.,Newton/Weston,62.1 MI,42.3306,-71.268,22.6,Actual,deg C
4,6/6/1995 0:00,1899-12-30 06:00:00-05,"Temperature, water",648S,"Auburndale Park, Lakes Region",Waltham,64.8 MI,42.3615,-71.2501,22.8,Actual,deg C


In [3]:
# Check Null value
df.isnull().sum()

Date_Collected               2
Time_Collected            1693
Component_Name               2
Site_ID                      1
Site_Name                  226
Town                       387
River_Mile_Headwaters    13264
Latitude_DD                864
Longitude_DD               864
Reporting_Result            12
Result_Type                  2
Unit_Abbreviation            2
dtype: int64

In [4]:
# Remove Null value from Date_Collected column
df = df.drop(df[df['Date_Collected'].isnull()].index)

In [5]:
# Leave only date information in Date_Collected column
df['Date_Collected'] = df['Date_Collected'].map(lambda x: x.split(' ')[0])

In [6]:
# Merge the data under the category of 'Escherichia coli' and 'Fecal coliform' as 'Escherichia coli'
df['Component_Name'] = df['Component_Name'].map(lambda x: 'Escherichia coli' if x == 'Fecal coliform' else x)

In [7]:
# Select Categories we need
categories = ['Escherichia coli', 'Phosphorus', 'Chlorophyll a']

In [8]:
# Filter by selected categories
df = df[df['Component_Name'].isin(categories)]

In [9]:
df = df[['Date_Collected', 'Component_Name',
         'Site_ID', 'Site_Name', 'Latitude_DD', 'Longitude_DD',
         'Reporting_Result', 'Unit_Abbreviation']]
df.head(2)

Unnamed: 0,Date_Collected,Component_Name,Site_ID,Site_Name,Latitude_DD,Longitude_DD,Reporting_Result,Unit_Abbreviation
30,7/25/1995,Escherichia coli,591S,Rt. 9 Gaging Station,42.3165,-71.2281,270.0,cfu/100ml
39,7/25/1995,Escherichia coli,229S,"Rt. 115, Baltimore St.",42.1433,-71.3493,380.0,cfu/100ml


In [10]:
# Get coordinates of the sites
coordinates = []
for row in df.pivot_table(index=['Latitude_DD', 'Longitude_DD']).index:
    coordinates.append(row)
coordinates = set(coordinates)

In [11]:
# Get town names based on coordinates
towns = []
for coor in coordinates: # This takes about 3 minutes
    towns.append(rg.search(coor)[0]['name'])

Loading formatted geocoded file...


In [12]:
# Make a dataframe with Latitude, Longitude and Town names
df_town = pd.DataFrame(data = coordinates, columns=['Latitude_DD', 'Longitude_DD'])

In [13]:
df_town['Town'] = towns
df_town.head(2)

Unnamed: 0,Latitude_DD,Longitude_DD,Town
0,42.28221,-71.18829,Needham
1,42.3624,-71.1507,Watertown


In [14]:
df = df.merge(df_town, on=['Latitude_DD', 'Longitude_DD'], how='left')

In [15]:
# Check Null value
# We don't have the record of the Site_Name/ lat & long in Access DB
# for the sites with Site_ID but without Site_Name/ lat & long
df.isnull().sum()

Date_Collected         0
Component_Name         0
Site_ID                0
Site_Name            119
Latitude_DD          742
Longitude_DD         742
Reporting_Result       0
Unit_Abbreviation      0
Town                 742
dtype: int64

In [16]:
df.shape

(15640, 9)

In [17]:
# Drop rows where site name or coordinate is Null (they're null in access table)
df.drop(df[df['Latitude_DD'].isnull()].index, axis=0, inplace=True)

In [18]:
df.isnull().sum()

Date_Collected       0
Component_Name       0
Site_ID              0
Site_Name            0
Latitude_DD          0
Longitude_DD         0
Reporting_Result     0
Unit_Abbreviation    0
Town                 0
dtype: int64

In [19]:
df.shape

(14898, 9)

In [20]:
# Save data respectively by component name
# for category in categories:
#     df[df['Component_Name']==category].to_csv('data_'+category+'.csv')

In [21]:
# Add a column for pie chart
df_ecoli = df[df['Component_Name']=='Escherichia coli']

In [26]:
df_ecoli['Safety'] = df_ecoli['Reporting_Result']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [27]:
df_ecoli['Safety'] = pd.cut(df_ecoli['Safety'], [-1, 235, 1260, 1000000], labels=['Safe', 'No Swimming', 'Not safe for activities'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [28]:
df_ecoli.head()

Unnamed: 0,Date_Collected,Component_Name,Site_ID,Site_Name,Latitude_DD,Longitude_DD,Reporting_Result,Unit_Abbreviation,Town,Safety
0,7/25/1995,Escherichia coli,591S,Rt. 9 Gaging Station,42.3165,-71.2281,270.0,cfu/100ml,Newton,No Swimming
1,7/25/1995,Escherichia coli,229S,"Rt. 115, Baltimore St.",42.1433,-71.3493,380.0,cfu/100ml,Millis-Clicquot,No Swimming
2,7/25/1995,Escherichia coli,35CS,Central Street Bridge,42.1395,-71.5123,414000.0,cfu/100ml,Milford,Not safe for activities
3,7/25/1995,Escherichia coli,387S,Elm Bank / Cheney Dr. Bridge,42.2757,-71.3095,440.0,cfu/100ml,Wellesley,No Swimming
4,7/25/1995,Escherichia coli,447S,"USGS Dover Gage, Mill St.",42.2563,-71.2596,40.0,cfu/100ml,Dover,Safe


In [29]:
df_ecoli.to_csv('data_EColi_w_safety.csv')

In [30]:
df_ecoli.shape

(12019, 10)