# Nenana Ice Classic Data Gathering
This notebook gathers the raw data used in this project.
## Data Sources
* DarkSky's API was queried for daily weather data for Nenana, Alaska (https://darksky.net/poweredby/).
* Nenana Ice Classic's website was scraped for ice thickness data (https://www.nenanaakiceclassic.com/ice.htm).

In [1]:
# imports
import datetime as dt
import gc
import json
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

In [2]:
def get_keys(path):
    """Load API credentials from a JSON file.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    The decoded JSON document (a dict for the key file used in this notebook).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

# Resolve the credentials file under the current user's home directory so the
# path is not tied to one particular machine/account.
keys = get_keys(os.path.expanduser("~/.secret/api_keys.json"))
api_key = keys['darksky_api_key']

In [3]:
# Grid of candidate values the query dates are built from:
# years 2009 through 2019, months April and May, days of month 1 through 31.
years_list = list(range(2009, 2020))
months_list = [4, 5]
days_list = list(range(1, 32))

In [4]:
# make date list
# Build an ISO-8601 (YYYY-MM-DD) string for every real calendar date in the
# year/month/day grid. dt.date() validates each combination, so impossible
# dates such as April 31 are skipped without month-specific special cases,
# and isoformat() zero-pads month and day correctly for any month.
query_dates = []
for year in years_list:
    for month in months_list:
        for day in days_list:
            try:
                query_dates.append(dt.date(year, month, day).isoformat())
            except ValueError:
                # this day does not exist in this month (e.g. April 31)
                continue

In [5]:
len(query_dates)

671

In [6]:
# delete unneeded variables and collect garbage
# query_dates has already been built from these three lists, so they are
# removed from the namespace here. gc.collect() returns the number of
# unreachable objects it found, which is the value shown as the cell output.
del years_list, months_list, days_list
gc.collect()

5

In [7]:
#initialize weather_data dict
weather_data = {}

In [8]:
# define variable for Nenana, Alaska lat & long
nenana_coords = '64.558056,-149.090556'

In [None]:
# test API call
# response = requests.get(f'https://api.darksky.net/forecast/{api_key}/{nenana_coords},2009-04-02T23:59:59?exclude=currently,hourly,minutely,alerts,flags')
# response.json()

**TODO**: get weather data for 2009-2014

In [9]:
#get weather data from DarkSky for dates in query_dates
# One request per date. The `exclude` query parameter drops the
# currently/hourly/minutely/alerts/flags blocks from each response.
for date in query_dates:
    if date in weather_data:
        # already fetched by an earlier run of this cell; don't spend another API call
        continue
    response = requests.get(
        f'https://api.darksky.net/forecast/{api_key}/{nenana_coords},{date}T23:59:59'
        '?exclude=currently,hourly,minutely,alerts,flags',
        # requests applies no timeout by default; bound the wait so a stalled
        # connection cannot hang the notebook indefinitely
        timeout=30,
    )
    if response.status_code == requests.codes.ok:
        weather_data[date] = response.json()
    else:
        # stop at the first failing date; re-running the cell resumes from it
        # because dates already stored in weather_data are skipped above
        print('Status returned: ' + str(response.status_code) + ' for date: ' + date)
        break

In [10]:
# Persist the raw API responses as JSON so they survive a kernel restart
# without having to query the API again.
with open('../data/weather_data_2009-2020.json', 'w') as out_file:
    out_file.write(json.dumps(weather_data))

In [11]:
# Reload the saved API responses from disk, then display the entry for
# 2019-05-31 to check what a single response looks like.
with open('../data/weather_data_2009-2020.json', 'r') as saved_file:
    weather_data = json.loads(saved_file.read())
weather_data['2019-05-31']

{'latitude': 64.558056,
 'longitude': -149.090556,
 'timezone': 'America/Anchorage',
 'daily': {'data': [{'time': 1559289600,
    'summary': 'Light rain starting in the afternoon.',
    'icon': 'rain',
    'sunriseTime': 1559303220,
    'sunsetTime': 1559376240,
    'moonPhase': 0.92,
    'precipIntensity': 0.0067,
    'precipIntensityMax': 0.0279,
    'precipIntensityMaxTime': 1559366340,
    'precipProbability': 0.94,
    'precipType': 'rain',
    'temperatureHigh': 64.39,
    'temperatureHighTime': 1559342820,
    'temperatureLow': 50.98,
    'temperatureLowTime': 1559397060,
    'apparentTemperatureHigh': 63.89,
    'apparentTemperatureHighTime': 1559342820,
    'apparentTemperatureLow': 51.47,
    'apparentTemperatureLowTime': 1559397060,
    'dewPoint': 47.91,
    'humidity': 0.68,
    'pressure': 1005.3,
    'windSpeed': 2.29,
    'windGust': 8.64,
    'windGustTime': 1559358000,
    'windBearing': 343,
    'cloudCover': 0.91,
    'uvIndex': 3,
    'uvIndexTime': 1559339460,
   

In [12]:
query_dates[0]

'2009-04-01'

In [None]:
# temp_df = pd.read_json('weather_data_2009-2020.json',
#                        orient = 'index')

In [None]:
# temp_df.drop(columns = ['latitude', 'longitude', 'timezone', 'offset'], inplace = True)

In [13]:
# Keep only the first record of each response's daily data, keyed by query date.
weather_dict = {
    query_date: payload['daily']['data'][0]
    for query_date, payload in weather_data.items()
}

In [14]:
# from_dict() with its default orientation makes one column per dictionary key
# (here: per date); transposing turns that into one row per date with the
# weather fields as columns.
weather_df = pd.DataFrame.from_dict(weather_dict).T

In [15]:
weather_df.head()

Unnamed: 0,time,summary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,temperatureHigh,...,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone
2009-04-01,1238572800,Partly cloudy throughout the day.,partly-cloudy-day,1238598720,1238647800,0.24,0,0,0,22.56,...,22.56,1238623320,-0.27,1238579940,17.69,1238608560,,,,
2009-04-02,1238659200,Partly cloudy throughout the day.,partly-cloudy-day,1238684940,1238734380,0.27,0,0,0,24.72,...,24.72,1238726100,-19.98,1238684760,20.02,1238735040,,,,
2009-04-03,1238745600,Clear throughout the day.,clear-day,1238771100,1238820960,0.31,0,0,0,31.73,...,31.73,1238804700,-5.11,1238773740,29.2,1238798700,,,,
2009-04-04,1238832000,Clear throughout the day.,clear-day,1238857260,1238907540,0.34,0,0,0,25.51,...,25.51,1238896800,-7.95,1238859660,20.35,1238896920,,,,
2009-04-05,1238918400,Clear throughout the day.,clear-day,1238943480,1238994120,0.38,0,0,0,31.85,...,31.85,1238985180,-7.83,1238929200,28.09,1238989980,,,,


In [16]:
weather_df.to_csv('../data/weather_2009-2020.csv', index_label = 'Date')

In [17]:
weather_df

Unnamed: 0,time,summary,icon,sunriseTime,sunsetTime,moonPhase,precipIntensity,precipIntensityMax,precipProbability,temperatureHigh,...,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,precipIntensityMaxTime,precipType,precipAccumulation,ozone
2009-04-01,1238572800,Partly cloudy throughout the day.,partly-cloudy-day,1238598720,1238647800,0.24,0,0,0,22.56,...,22.56,1238623320,-0.27,1238579940,17.69,1238608560,,,,
2009-04-02,1238659200,Partly cloudy throughout the day.,partly-cloudy-day,1238684940,1238734380,0.27,0,0,0,24.72,...,24.72,1238726100,-19.98,1238684760,20.02,1238735040,,,,
2009-04-03,1238745600,Clear throughout the day.,clear-day,1238771100,1238820960,0.31,0,0,0,31.73,...,31.73,1238804700,-5.11,1238773740,29.2,1238798700,,,,
2009-04-04,1238832000,Clear throughout the day.,clear-day,1238857260,1238907540,0.34,0,0,0,25.51,...,25.51,1238896800,-7.95,1238859660,20.35,1238896920,,,,
2009-04-05,1238918400,Clear throughout the day.,clear-day,1238943480,1238994120,0.38,0,0,0,31.85,...,31.85,1238985180,-7.83,1238929200,28.09,1238989980,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-05-27,1558944000,Partly cloudy throughout the day.,clear-day,1558958340,1559029860,0.8,0.0003,0.0016,0.12,67.58,...,67.58,1558998180,53.2,1558969080,67.08,1558998180,1559012700,rain,,359.8
2019-05-28,1559030400,Drizzle in the morning.,rain,1559044560,1559116500,0.83,0.0008,0.0107,0.73,66.95,...,66.95,1559092740,49.5,1559052180,66.45,1559092740,1559055600,rain,,359
2019-05-29,1559116800,Partly cloudy throughout the day.,partly-cloudy-day,1559130780,1559203080,0.86,0.0001,0.0002,0.07,70.3,...,70.3,1559171220,47.31,1559132340,69.8,1559171220,1559119860,rain,,347.8
2019-05-30,1559203200,Partly cloudy throughout the day.,partly-cloudy-day,1559217000,1559289660,0.89,0.0001,0.0001,0.09,74.71,...,74.71,1559267160,52.49,1559220900,74.21,1559267160,1559264880,rain,,349.5


## Getting info from NIC website

In [None]:
url = 'https://www.nenanaakiceclassic.com/ice.htm'

In [None]:
# requests applies no timeout by default; bound the wait so a stalled server
# cannot hang the notebook.
response = requests.get(url, timeout=30)

In [None]:
# Fail loudly on an HTTP error status before trying to parse the page.
# raise_for_status() returns None on success, so there is nothing useful to print.
response.raise_for_status()
# NOTE(review): no parser is named, so BeautifulSoup picks the best one that is
# installed. Consider pinning one explicitly after re-checking that the link
# indices used later in the notebook still point at the same pages.
soup = BeautifulSoup(response.text)
print(soup.prettify())

In [None]:
link_list = soup.findAll('a')

In [None]:
link_list[10:-1]

In [None]:
base_url = 'https://www.nenanaakiceclassic.com/'

In [None]:
link_list[10]['href']

In [None]:
target_url = base_url + link_list[13]['href']
target_url

In [None]:
# Launch a Chrome browser through Selenium and load the page at target_url.
# NOTE(review): chrome_path is an absolute path on the author's machine; it has
# to be changed to the local chromedriver location before this cell can run
# anywhere else.
chrome_path = '/Users/davidwalkup/Downloads/chromedriver-2'
options = Options()
# NOTE(review): the driver path is passed positionally, which is the older
# Selenium calling convention — confirm the installed Selenium version accepts it.
driver = webdriver.Chrome(chrome_path, 
                          options=options)
driver.set_window_size(1400,1000)
driver.get(target_url)

In [None]:
page_source = driver.page_source
soup = BeautifulSoup(page_source)
tables = soup.findAll('table')
table = tables[0].prettify()

In [None]:
table_text = pd.read_html(table)[0][0][0]
table_text

In [None]:
table_year = table_text[:4]
table_year

In [None]:
table_detail = table_text[21:].strip()
table_detail

In [None]:
driver.quit()

In [None]:
line_split = table_detail.split('  ')
line_split

In [None]:
link_list[-2]['href']

In [None]:
target_url = base_url + link_list[-2]['href']

In [None]:
driver = webdriver.Chrome(chrome_path, 
                          options=options)
driver.set_window_size(1400,1000)
driver.get(target_url)
page_source = driver.page_source
soup = BeautifulSoup(page_source)
tables = soup.findAll('table')
table = tables[0].prettify()

In [None]:
table_text = pd.read_html(table)[0][0][0]
driver.quit()
table_text

In [None]:
table_year = table_text[:4]
table_year

In [None]:
table_detail = table_text[21:].strip()
table_detail

In [None]:
line_split = table_detail.split('  ')
line_split

In [None]:
# Lookup from the month spellings seen on the site to two-digit month numbers.
# The twelve three-letter abbreviations are numbered 01-12 in calendar order;
# the two fully spelled-out names are then added as aliases.
month_fix = {
    abbreviation: f'{number:02d}'
    for number, abbreviation in enumerate(
        'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split(), start=1
    )
}
month_fix['March'] = '03'
month_fix['April'] = '04'

In [None]:
for item in line_split:
    item_split = item.split(' ')
    date_split = item_split[0].split('-')
    item_date = table_year + '-' + date_split[1] + '-' + date_split[0]
    item_data = item_split[1] + ' ' + item_split[2]
    print(item_date , ':', item_data)

In [None]:
for item in item_split:
    date_split = item.split('-')
    print(date_split)

In [None]:
# Empty frame with the two output columns; the scraping loop below adds one
# row per ice-thickness measurement to it.
ice_thickness_df = pd.DataFrame(columns = ['Date', 'Thickness'])

In [None]:
# options.add_argument('--headless')
# Scrape the ice-thickness measurements from each linked yearly page and add
# them to ice_thickness_df as (Date, Thickness) rows.


def _ice_depth(tokens):
    """Return the thickness token: the last token, or the one before a trailing 'Inches'/'inches'."""
    if tokens[-1] in ['Inches', 'inches']:
        return tokens[-2]
    return tokens[-1]


def _pad_day(day):
    """Prefix a day shorter than two characters with '0' (e.g. '5' -> '05')."""
    return '0' + day if len(day) < 2 else day


def _first_day(raw_day):
    """Keep the first day of a range such as '5-6' or '10-11', then zero-pad it.

    Splitting before padding means a single-digit range start ('5-6') is
    padded to '05' as well, so every date comes out as YYYY-MM-DD.
    """
    return _pad_day(raw_day.split('-')[0])


options = Options()
driver = webdriver.Chrome(chrome_path,
                          options=options)
driver.set_window_size(500,300)

# Rows are collected in a plain list and turned into a DataFrame once at the end.
scraped_rows = []
try:
    # for suffix in link_list[15:19]:
    for suffix in link_list[10:21]:
        target_url = base_url + suffix['href']
        driver.get(target_url)
        page_source = driver.page_source
        soup = BeautifulSoup(page_source)
        tables = soup.findAll('table')
        if tables:
            # Pages that contain a <table>: the first cell holds all the text.
            # The first four characters are the year; the entries start at
            # offset 21 and are separated by double spaces.
            table = tables[0].prettify()
            table_text = pd.read_html(table)[0][0][0]
            table_year = table_text[:4]
            table_detail = table_text[21:].strip()
            line_split = table_detail.split('  ')
            for item in line_split:
                item_split = item.split(' ')
                ice_depth = _ice_depth(item_split)
                if '-' in item_split[0]:
                    # entry starts with a dashed token: look for "<day>-<month>"
                    for token in item_split:
                        date_split = token.split('-')
                        if len(date_split) > 1:
                            if date_split[1] in month_fix:
                                item_month = month_fix[date_split[1]]
                                item_day = _pad_day(date_split[0])
                            else:
                                print('check: ', date_split)
                else:
                    # entry starts with "<month> <day>"
                    if item_split[0] in month_fix:
                        item_month = month_fix[item_split[0]]
                        item_day = _first_day(item_split[1])
                # NOTE: if no month was recognised above, item_month/item_day keep
                # the values from the previous entry (NameError if there is none yet).
                ymd = table_year + '-' + item_month + '-' + item_day
                scraped_rows.append({'Date': ymd, 'Thickness': ice_depth})
        else:
            # Pages without a <table>: the first <p> holds the year, and the
            # remaining <p> elements hold newline-separated entries.
            p_data = soup.findAll('p')
            table_year = p_data[0].text
            for datum in p_data[1:]:
                line_split = datum.text.split('\n')
                for line in line_split:
                    line = line.strip()
                    item_split = line.split(' ')
                    if item_split[0].strip() in month_fix:
                        item_month = month_fix[item_split[0].strip()]
                        item_day = _first_day(item_split[1].strip())
                    # NOTE: lines that do not start with a known month reuse the
                    # month/day of the previous line (NameError if there is none yet).
                    ymd = table_year.strip() + '-' + item_month + '-' + item_day
                    ice_depth = _ice_depth(item_split)
                    # blank lines yield an empty depth token and are skipped
                    if ymd and ice_depth:
                        scraped_rows.append({'Date': ymd, 'Thickness': ice_depth})
finally:
    # always close the browser, even if a page fails to parse
    driver.quit()

# Add all rows in one step: DataFrame.append() was removed in pandas 2.0, and
# growing a frame row by row copies it on every iteration.
ice_thickness_df = pd.concat(
    [ice_thickness_df, pd.DataFrame(scraped_rows, columns=['Date', 'Thickness'])],
    ignore_index=True,
)
# list for special treatment: 2015, 2013, 
# 2012 is end of early regime; 2013 starts new regime

In [None]:
# Show the first and last five rows together.
# pd.concat() is used because DataFrame.append() was removed in pandas 2.0.
pd.concat([ice_thickness_df.head(), ice_thickness_df.tail()])

In [None]:
# Write the scraped ice-thickness table to CSV without the integer row index.
# NOTE(review): this path is relative to the current working directory, whereas
# the weather files earlier in the notebook are written under '../data/' —
# confirm which location downstream code expects.
ice_thickness_df.to_csv('ice_thickness_2009-2019.csv', index = False)