

**Use the Dark Sky API to obtain weather data by geolocation and date**

---




In [1]:
import os
import re

import pandas as pd
import requests
import seaborn as sns
from tqdm import tqdm_notebook

### Prepare Dataset for Darksky

#### Import data (all years)

In [2]:
# Every survey year is stored on its own sheet of the same workbook;
# load one DataFrame per sheet.
workbook_path = 'Earthwatch_Data/FWW_full database.xlsm'

df_2019 = pd.read_excel(workbook_path, sheet_name='2019')
df_2018 = pd.read_excel(workbook_path, sheet_name='2018')
df_2017 = pd.read_excel(workbook_path, sheet_name='2017')
df_2016 = pd.read_excel(workbook_path, sheet_name='2016')
df_2015 = pd.read_excel(workbook_path, sheet_name='2015')
df_2014 = pd.read_excel(workbook_path, sheet_name='2014')
df_2013 = pd.read_excel(workbook_path, sheet_name='2013')
df_2012 = pd.read_excel(workbook_path, sheet_name='2012')

#### Combine into one dataframe

In [3]:
# Stack the yearly frames (newest year first) into one frame and renumber
# the rows 0..n-1.
df_total = pd.concat(
    [df_2019, df_2018, df_2017, df_2016, df_2015, df_2014, df_2013, df_2012],
    ignore_index=True,
)

#### Reduce columns

In [4]:
# Survey columns carried forward through the rest of the notebook.
feature_columns = [
    'Sample Date',
    'Site name',
    'Geolocation (Latitude, Longitude)',
    'Freshwater body type',
    'What is the land use in the immediate surroundings?',
    'What is the bank vegetation? (select all that apply)',
    'Is there any of the following on the water surface?',
    'Are there any pollution sources in the immediate surroundings? (select all that apply)',
    'Can you see evidence of the following water uses? (select all that apply)',
    'What aquatic life is there evidence of? (select all that apply)',
    'Is the algae...',
    'Estimate the water flow',
    'Estimate the water level',
    'Nitrate',
    'Phosphate',
]

df_features = df_total[feature_columns]

#### Rename columns

In [5]:
# Map the long survey-question headers to short snake_case column names.
short_names = {
    'Sample Date': 'sample_date',
    'Site name': 'site_name',
    'Geolocation (Latitude, Longitude)': 'geolocation_lat_long',
    'Freshwater body type': 'fw_bodytype',
    'What is the land use in the immediate surroundings?': 'land_use',
    'What is the bank vegetation? (select all that apply)': 'bank_vegetation',
    'Is there any of the following on the water surface?': 'water_surface',
    'Are there any pollution sources in the immediate surroundings? (select all that apply)': 'pollution_sources',
    'Can you see evidence of the following water uses? (select all that apply)': 'water_use',
    'What aquatic life is there evidence of? (select all that apply)': 'aquatic_life',
    'Is the algae...': 'algae',
    'Estimate the water flow': 'water_flow',
    'Estimate the water level': 'water_level',
    'Nitrate': 'nitrate',
    'Phosphate': 'phosphate',
}

# rename() returns a new frame, so later column assignments do not touch df_total.
df_features = df_features.rename(columns=short_names)

#### Add the dates one and two days before the sample date

In [6]:
# Add one column per look-back day: the sample date shifted back by 1 and 2 days.
for days_back in (1, 2):
    df_features[f'sample_date_minus{days_back}'] = (
        df_features['sample_date'] + pd.DateOffset(-days_back)
    )

#### Keep only the date (day, month, year)

In [7]:
# Drop the time-of-day part: replace each timestamp with its calendar date.
for date_column in ('sample_date', 'sample_date_minus1', 'sample_date_minus2'):
    df_features[date_column] = [stamp.date() for stamp in df_features[date_column]]

#### Add latitude, longitude as separate columns

In [8]:
# Latitude is the first signed decimal number in the "lat, long" string.
# The dot is escaped so it matches only a literal decimal point (an unescaped
# '.' matches any character, e.g. it would accept "120" or "53,348").
# expand=False returns a Series rather than a one-column DataFrame.
# NOTE(review): values that matched only through the wildcard now become NaN
# and are removed by the null filter below; re-check the row count against any
# weather chunks that were already downloaded.
df_features['latitude'] = df_features['geolocation_lat_long'].str.extract(
    r'(-?\d+\.\d+)', expand=False
)

In [9]:
# Longitude is the signed decimal number that follows a space in the
# "lat, long" string. The space stays in the pattern (so the same position is
# matched) but outside the capture group, so the stored value has no leading
# whitespace. The dot is escaped so it matches only a literal decimal point.
# expand=False returns a Series rather than a one-column DataFrame.
# NOTE(review): values that matched only through the wildcard now become NaN
# and are removed by the null filter below; re-check the row count against any
# weather chunks that were already downloaded.
df_features['longitude'] = df_features['geolocation_lat_long'].str.extract(
    r' (-?\d+\.\d+)', expand=False
)


In [10]:
# Renumber the rows; the previous index is kept as a new 'index' column.
df_features = df_features.reset_index()

In [11]:
# Preview the first rows to check the renamed columns and the extracted
# latitude / longitude values.
df_features.head()

Unnamed: 0,index,sample_date,site_name,geolocation_lat_long,fw_bodytype,land_use,bank_vegetation,water_surface,pollution_sources,water_use,aquatic_life,algae,water_flow,water_level,nitrate,phosphate,sample_date_minus1,sample_date_minus2,latitude,longitude
0,0,2019-11-05,Shimungalu,"-15.754, 27.784",River,Agriculture,Trees/shrubsGrass,,Industrial discharge,BoatingIrrigationSwimmingFishingPublic water s...,Floating plantsPlants emerging from the waterF...,No algae,Slow,Low,0.35,0.15,2019-11-04,2019-11-03,-15.754,27.784
1,1,2019-11-05,Shimungalu,"-15.754, 27.785",River,Industrial,Trees/shrubsGrass,,Industrial discharge,BoatingIrrigationSwimmingFishingPublic water s...,Floating plantsPlants emerging from the waterF...,No algae,Slow,Low,0.35,0.075,2019-11-04,2019-11-03,-15.754,27.785
2,2,2019-11-05,custom house,"53.348, -6.252",River,Urban Residential,No vegetation cover,LitterOily Sheen,Urban/Road discharge,Boating,Aquatic birds,,Steady,Low,1.5,0.15,2019-11-04,2019-11-03,53.348,-6.252
3,3,2019-11-05,Shimungalu,"-15.754, 27.785",River,Industrial,Trees/shrubsNo vegetation coverGrass,,Industrial discharge,BoatingIrrigationSwimmingFishingPublic water s...,Floating plantsPlants emerging from the waterF...,,Slow,Low,0.75,0.15,2019-11-04,2019-11-03,-15.754,27.785
4,4,2019-11-03,borro bucine,"43.646, 11.169",Stream,Agriculture,Trees/shrubs,,Other,Other,Plants below the surface,No algae,Steady,Average,7.5,0.075,2019-11-02,2019-11-01,43.646,11.169


#### Remove rows with missing coordinates so that the Dark Sky requests can run

In [12]:
# Keep only rows that have a geolocation string and both parsed coordinates.
has_geolocation = df_features['geolocation_lat_long'].notnull()
has_latitude = df_features['latitude'].notnull()
has_longitude = df_features['longitude'].notnull()

df_features = df_features[has_geolocation & has_latitude & has_longitude]

In [13]:
# Store the sample date as text, then pull the digits-dash-digits-dash-digits
# part into its own column.
df_features['sample_date'] = df_features['sample_date'].astype(str)
df_features['new_date'] = df_features['sample_date'].str.extract(r'(\d+-\d+-\d+)')

In [14]:
# Remove the helper 'index' column that reset_index() added earlier.
df_features = df_features.drop(columns='index')

In [15]:
# Preview the cleaned frame: 'index' is gone and 'new_date' has been added.
df_features.head()

Unnamed: 0,sample_date,site_name,geolocation_lat_long,fw_bodytype,land_use,bank_vegetation,water_surface,pollution_sources,water_use,aquatic_life,algae,water_flow,water_level,nitrate,phosphate,sample_date_minus1,sample_date_minus2,latitude,longitude,new_date
0,2019-11-05,Shimungalu,"-15.754, 27.784",River,Agriculture,Trees/shrubsGrass,,Industrial discharge,BoatingIrrigationSwimmingFishingPublic water s...,Floating plantsPlants emerging from the waterF...,No algae,Slow,Low,0.35,0.15,2019-11-04,2019-11-03,-15.754,27.784,2019-11-05
1,2019-11-05,Shimungalu,"-15.754, 27.785",River,Industrial,Trees/shrubsGrass,,Industrial discharge,BoatingIrrigationSwimmingFishingPublic water s...,Floating plantsPlants emerging from the waterF...,No algae,Slow,Low,0.35,0.075,2019-11-04,2019-11-03,-15.754,27.785,2019-11-05
2,2019-11-05,custom house,"53.348, -6.252",River,Urban Residential,No vegetation cover,LitterOily Sheen,Urban/Road discharge,Boating,Aquatic birds,,Steady,Low,1.5,0.15,2019-11-04,2019-11-03,53.348,-6.252,2019-11-05
3,2019-11-05,Shimungalu,"-15.754, 27.785",River,Industrial,Trees/shrubsNo vegetation coverGrass,,Industrial discharge,BoatingIrrigationSwimmingFishingPublic water s...,Floating plantsPlants emerging from the waterF...,,Slow,Low,0.75,0.15,2019-11-04,2019-11-03,-15.754,27.785,2019-11-05
4,2019-11-03,borro bucine,"43.646, 11.169",Stream,Agriculture,Trees/shrubs,,Other,Other,Plants below the surface,No algae,Steady,Average,7.5,0.075,2019-11-02,2019-11-01,43.646,11.169,2019-11-03


In [16]:
# (rows, columns) of the cleaned frame; the row count is the number of
# samples that will be sent to the weather API.
df_features.shape

(24203, 20)

#### Save file

In [17]:
# Persist the prepared feature table without writing the row index.
features_csv = 'Earthwatch_Data/Darksky_df_features.csv'
df_features.to_csv(features_csv, index=False)

### Pull from DarkSky API

In [18]:
# The Dark Sky secret key is read from the DARKSKY_SECRET_KEY environment
# variable so that it never has to be typed into (and saved with) the
# notebook. Falls back to an empty string when the variable is not set.
secret_key = os.environ.get('DARKSKY_SECRET_KEY', '')

In [19]:
# I can only do 2,000 pulls a day free, so needed to run this in chunks:
# change CHUNK_START / CHUNK_END between runs and save each chunk to its own file.
CHUNK_START = 0
CHUNK_END = 10

# Request template: secret key / latitude,longitude,time. `exclude` takes one
# comma-separated list of data blocks; only the `daily` block is read below.
url_template = 'https://api.darksky.net/forecast/{}/{},{},{}T20:00:00Z?exclude=currently,hourly&units=si'

lat_list = []
long_list = []

temperatureHigh = []
temperatureLow = []

summary = []
icon = []

precipIntensity = []
precipIntensityMax = []

humidity = []
pressure = []
windSpeed = []
windGust = []
cloudCover = []
ozone = []

# Fields read from the first entry of the response's daily data, each paired
# with the list that collects it.
daily_fields = {
    'temperatureHigh': temperatureHigh,
    'temperatureLow': temperatureLow,
    'summary': summary,
    'icon': icon,
    'precipIntensity': precipIntensity,
    'precipIntensityMax': precipIntensityMax,
    'humidity': humidity,
    'pressure': pressure,
    'windSpeed': windSpeed,
    'windGust': windGust,
    'cloudCover': cloudCover,
    'ozone': ozone,
}

chunk = df_features[CHUNK_START:CHUNK_END]

for _, row in tqdm_notebook(chunk.iterrows(), total=len(chunk)):
    # Columns are read by name rather than by position, so reordering or
    # adding columns upstream cannot silently pick the wrong field.
    lat = row['latitude']
    long = str(row['longitude']).strip()  # removes whitespace
    date_minus1 = row['sample_date_minus1']  # weather is pulled for the day before sampling

    request_url_1 = url_template.format(secret_key, lat, long, date_minus1)

    # A failed request or a non-JSON body is treated like a response with no
    # data: every field for this row becomes 'None'. Exactly one entry is
    # appended to every list per row, which keeps the lists the same length
    # and in the same order as the rows of the chunk.
    try:
        response = requests.get(request_url_1, timeout=30)
        json_data = response.json()
    except (requests.RequestException, ValueError):
        json_data = {}
    if not isinstance(json_data, dict):
        json_data = {}

    lat_list.append(json_data.get('latitude', 'None'))
    long_list.append(json_data.get('longitude', 'None'))

    # All weather values come from the first entry of daily -> data.
    try:
        daily = json_data['daily']['data'][0]
    except (KeyError, IndexError, TypeError):
        daily = {}
    if not isinstance(daily, dict):
        daily = {}

    # 'None' (a string) is the placeholder for a missing value.
    for field, values in daily_fields.items():
        values.append(daily.get(field, 'None'))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [20]:
# Assemble the per-request lists into one table: one row per API call, in the
# order the calls were made. Each column appears exactly once.
weather = pd.DataFrame({
    'lat_list': lat_list,
    'long_list': long_list,
    'temperatureHigh': temperatureHigh,
    'temperatureLow': temperatureLow,
    'summary': summary,
    'icon': icon,
    'precipIntensity': precipIntensity,
    'precipIntensityMax': precipIntensityMax,
    'humidity': humidity,
    'pressure': pressure,
    'windSpeed': windSpeed,
    'windGust': windGust,
    'cloudCover': cloudCover,
    'ozone': ozone,
})

In [21]:
# Preview the first API results to confirm the fields were parsed.
weather.head()

Unnamed: 0,lat_list,long_list,temperatureHigh,temperatureLow,summary,icon,precipIntensity,precipIntensityMax,humidity,pressure,windSpeed,windGust,cloudCover,ozone
0,-15.754,27.784,36.67,23.1,Clear throughout the day.,clear-day,0.0018,0.0072,0.3,1010.9,4.71,11.54,0.25,282.4
1,-15.754,27.785,36.67,23.1,Clear throughout the day.,clear-day,0.0018,0.0072,0.3,1010.9,4.71,11.54,0.25,282.4
2,53.348,-6.252,10.71,8.82,Light rain throughout the day.,rain,0.7519,2.141,0.95,984.7,5.17,13.85,0.87,327.1
3,-15.754,27.785,36.67,23.1,Clear throughout the day.,clear-day,0.0018,0.0072,0.3,1010.9,4.71,11.54,0.25,282.4
4,43.646,11.169,17.26,14.21,Light rain throughout the day.,rain,0.5085,1.8537,0.94,1009.5,1.82,8.82,1.0,279.1


#### Save to file

In [23]:
# Save this chunk of API results without the row index. The directory is
# spelled 'Earthwatch_Data' like every other path in the notebook; a
# differently-cased name is a different directory on case-sensitive filesystems.
# NOTE(review): the file name encodes rows 23000-24203, so make sure it matches
# the slice that was actually pulled. The combine step reads the chunk files
# from 'Earthwatch_Data/Darksky/'; confirm whether this file is meant to be
# moved there afterwards.
weather.to_csv('Earthwatch_Data/Darksky_23000_24203.csv', index=False)

### 2) Combine dataset with weather features

In [24]:
# Saved API chunks, listed in row order; each file is loaded into its own
# ds_N frame (ds_1 is the first chunk, ds_19 the last).
chunk_paths = [
    'Earthwatch_Data/Darksky/Darksky_200.csv',
    'Earthwatch_Data/Darksky/Darksky_200_400.csv',
    'Earthwatch_Data//Darksky/Darksky_400_800.csv',
    'Earthwatch_Data/Darksky/Darksky_800_1000.csv',
    'Earthwatch_Data/Darksky/Darksky_1000_2000.csv',
    'Earthwatch_Data/Darksky/Darksky_2000_5000.csv',
    'Earthwatch_Data/Darksky/Darksky_5000_6000.csv',
    'Earthwatch_Data/Darksky/Darksky_6000_6500.csv',
    'Earthwatch_Data/Darksky/Darksky_6500_7000.csv',
    'Earthwatch_Data/Darksky/Darksky_7000_9000.csv',
    'Earthwatch_Data/Darksky/Darksky_9000_11000.csv',
    'Earthwatch_Data/Darksky/Darksky_11000_13000.csv',
    'Earthwatch_Data/Darksky/Darksky_13000_14400.csv',
    'Earthwatch_Data/Darksky/Darksky_14400_15000.csv',
    'Earthwatch_Data/Darksky/Darksky_15000_17000.csv',
    'Earthwatch_Data/Darksky/Darksky_17000_19000.csv',
    'Earthwatch_Data/Darksky/Darksky_19000_21000.csv',
    'Earthwatch_Data/Darksky/Darksky_21000_23000.csv',
    'Earthwatch_Data/Darksky/Darksky_23000_24203.csv',
]

(ds_1, ds_2, ds_3, ds_4, ds_5, ds_6, ds_7, ds_8, ds_9, ds_10,
 ds_11, ds_12, ds_13, ds_14, ds_15, ds_16, ds_17, ds_18, ds_19) = (
    pd.read_csv(chunk_path) for chunk_path in chunk_paths
)

In [25]:
# The second chunk carries an extra 'Unnamed: 0' column; remove it so the
# chunk has the same columns as the others.
ds_2 = ds_2.drop('Unnamed: 0', axis='columns')

#### Weather dataframe only

In [26]:
# Stack all chunks in row order and save the complete weather table
# (the row index is not written to the file).
frames = [
    ds_1, ds_2, ds_3, ds_4, ds_5, ds_6, ds_7, ds_8, ds_9, ds_10,
    ds_11, ds_12, ds_13, ds_14, ds_15, ds_16, ds_17, ds_18, ds_19,
]

result = pd.concat(frames)

complete_csv = 'Earthwatch_Data/Darksky/darksky_complete.csv'
result.to_csv(complete_csv, index=False)

In [27]:
# Renumber the stacked rows 0..n-1 and discard the per-chunk indices.
result = result.reset_index(drop=True)

#### Combine weather dataset and Earthwatch dataset

In [28]:
# The two frames are paired by position: the i-th sample row belongs to the
# i-th weather row, because the API was called in row order. pd.concat(axis=1)
# aligns on index labels, and df_features still carries the gaps left by the
# null filtering, so both indices are renumbered here to pair the rows
# positionally instead of by label.
# NOTE(review): both frames are expected to have the same number of rows;
# if they differ, the shorter one is padded with NaN.
df_weather = pd.concat(
    [df_features.reset_index(drop=True), result.reset_index(drop=True)],
    axis=1,
)

#### Save completed data to file

In [29]:
# Write the combined survey + weather table to disk without the row index.
combined_csv = 'Earthwatch_Data/Darksky/df_weather.csv'
df_weather.to_csv(combined_csv, index=False)