# Nenana Ice Classic Data Gathering - Ice Measurements
This notebook gathers the ice thickness measurements used in this project.
## Data Source
* Nenana Ice Classic's website was scraped for ice thickness data (https://www.nenanaakiceclassic.com/ice.htm).

In [1]:
# imports
import datetime as dt
import gc
import json
import os
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

## Getting info from NIC website

In [2]:
# Index page of the ice-measurement section; it links to one page per year.
url = 'https://www.nenanaakiceclassic.com/ice.htm'

In [3]:
# Fetch the index page. Without a timeout, requests waits indefinitely on an
# unresponsive server, which would hang the kernel.
response = requests.get(url, timeout=30)

In [4]:
# raise_for_status() raises on an HTTP error status and returns None otherwise,
# so a printed "None" means the request succeeded.
status_check = response.raise_for_status()
print(status_check)

# Parse the index page and dump the formatted markup for inspection.
soup = BeautifulSoup(response.text)
pretty_html = soup.prettify()
print(pretty_html)

None
<!DOCTYPE html>
<html>
 <head>
  <meta charset="utf-8"/>
  <title>
   Nenana Ice Classic
  </title>
  <style type="text/css">
   h4 {
			text-align:center;
			Width: 900px;
			
		}
  </style>
 </head>
 <body>
  <header>
   <img alt="" height="400" src="banner.jpg" width="960"/>
  </header>
  <h4>
   <bold>
    <a href="index.htm" title="Nenana Ice Classic ">
     Home/
    </a>
    <a href="tickets.htm" title="Tickets">
     Tickets/
    </a>
    <a href="brochures.htm" title="Brochures">
     Brochures/
    </a>
    <a href="poolrules.htm" title="Pool Rules">
     Pool Rules/
    </a>
    <a href="ice.htm" title="Ice Page">
     Ice/
    </a>
    <a href="postercontest.htm" title="Poster Contest">
     Poster Contest/
    </a>
    <a href="souvenirs.htm" title="Souvenir Page">
     Souvenirs/
    </a>
    <a href="contact.htm" title="Contact Nenana Ice Classic">
     Contact/
    </a>
    <a href="organization.htm" title="Organization">
     Organization/
    </a>
    <a href="FA

In [5]:
# Every anchor tag on the index page: site navigation first, then the per-year links.
link_list = soup.find_all('a')

In [6]:
# Skip the ten site-navigation links at the front and the final link,
# leaving the per-year pages (2019 back to 1989).
link_list[10:-1]

[<a href="2019.html">2019</a>,
 <a href="2018.html">2018</a>,
 <a href="2017.htm">2017</a>,
 <a href="2016.htm">2016</a>,
 <a href="2015%20Ice.htm">2015</a>,
 <a href="2014.htm">2014</a>,
 <a href="2013.htm">2013</a>,
 <a href="2012.htm">2012</a>,
 <a href="2011.htm">2011</a>,
 <a href="2010.htm">2010</a>,
 <a href="2009.htm">2009</a>,
 <a href="2008.htm">2008</a>,
 <a href="2007.htm">2007</a>,
 <a href="2006.htm">2006</a>,
 <a href="2005.htm">2005</a>,
 <a href="2004.htm">2004</a>,
 <a href="2003.htm">2003</a>,
 <a href="2002.htm">2002</a>,
 <a href="2001.htm">2001</a>,
 <a href="2000.htm">2000</a>,
 <a href="1999.htm">1999</a>,
 <a href="1998.htm">1998</a>,
 <a href="1997.htm">1997</a>,
 <a href="1996.htm">1996</a>,
 <a href="1995.htm">1995</a>,
 <a href="1994.htm">1994</a>,
 <a href="1993.htm">1993</a>,
 <a href="1992.htm">1992</a>,
 <a href="1991.htm">1991</a>,
 <a href="1990.htm">1990</a>,
 <a href="1989.htm">1989</a>]

In [7]:
# The per-year hrefs are relative, so they are appended to this site root.
base_url = 'https://www.nenanaakiceclassic.com/'

In [8]:
# Sanity check: index 10 is the first per-year link.
link_list[10]['href']

'2019.html'

In [9]:
# Build the absolute URL of one yearly page (index 13 is 2016) to prototype the parsing on.
year_link = link_list[13]
target_url = base_url + year_link['href']
target_url

'https://www.nenanaakiceclassic.com/2016.htm'

In [10]:
# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH environment
# variable to point at a local copy; the original author's path is kept as the
# default so existing setups keep working.
chrome_path = os.environ.get('CHROMEDRIVER_PATH',
                             '/Users/davidwalkup/Downloads/chromedriver-2')
options = Options()
driver = webdriver.Chrome(chrome_path,
                          options=options)
driver.set_window_size(1400,1000)
# Load the page in a real browser; its rendered source is read from the driver in the next cell.
driver.get(target_url)

In [11]:
# Parse the browser-rendered HTML and keep the formatted markup of the first <table>.
page_source = driver.page_source
soup = BeautifulSoup(page_source)
tables = soup.find_all('table')
first_table = tables[0]
table = first_table.prettify()

In [12]:
# Wrapping the markup in StringIO avoids pandas' deprecated "literal HTML string"
# input to read_html. [0][0][0] = first parsed table, column 0, row 0; the
# measurements come back as one long string in that cell.
table_text = pd.read_html(StringIO(table))[0][0][0]
table_text

'2016 Ice Measurements  Jan 13-25 Inches  Feb 8-33.5 Inches  Feb 22-33 Inches  Mar 01-40 Inches  Mar 06 - 40 Inches  Mar 10 - 36 Inches  March 16 - 37.2 Inches  March 21 - 35.3 Inches  March 24 - 38 Inches  March 28 - 36.7 Inches  March 31 - 35.7 Inches  April 4 - 36 Inches  April 7 - 34.8 Inches  April 11 - 34.7 Inches  April 14 - 32.3 Inches'

In [13]:
# The cell text starts with the four-digit year.
table_year = table_text[:4]
table_year

'2016'

In [14]:
# Drop the 21-character '<year> Ice Measurements' heading, then trim the
# surrounding whitespace so only the readings remain.
table_detail = table_text[21:].strip()
table_detail

'Jan 13-25 Inches  Feb 8-33.5 Inches  Feb 22-33 Inches  Mar 01-40 Inches  Mar 06 - 40 Inches  Mar 10 - 36 Inches  March 16 - 37.2 Inches  March 21 - 35.3 Inches  March 24 - 38 Inches  March 28 - 36.7 Inches  March 31 - 35.7 Inches  April 4 - 36 Inches  April 7 - 34.8 Inches  April 11 - 34.7 Inches  April 14 - 32.3 Inches'

In [15]:
# Close the browser now that the page text has been extracted.
driver.quit()

In [16]:
# Individual readings are separated by two spaces.
line_split = table_detail.split('  ')
line_split

['Jan 13-25 Inches',
 'Feb 8-33.5 Inches',
 'Feb 22-33 Inches',
 'Mar 01-40 Inches',
 'Mar 06 - 40 Inches',
 'Mar 10 - 36 Inches',
 'March 16 - 37.2 Inches',
 'March 21 - 35.3 Inches',
 'March 24 - 38 Inches',
 'March 28 - 36.7 Inches',
 'March 31 - 35.7 Inches',
 'April 4 - 36 Inches',
 'April 7 - 34.8 Inches',
 'April 11 - 34.7 Inches',
 'April 14 - 32.3 Inches']

In [17]:
# The second-to-last link is the oldest year page (1989).
link_list[-2]['href']

'1989.htm'

In [18]:
# Repeat the prototype on the oldest page, which uses a different date layout.
target_url = base_url + link_list[-2]['href']

In [19]:
# Open a fresh browser session and load the page currently held in target_url.
driver = webdriver.Chrome(chrome_path, options=options)
driver.set_window_size(1400, 1000)
driver.get(target_url)

# Parse the rendered page and keep the formatted markup of its first <table>.
page_source = driver.page_source
soup = BeautifulSoup(page_source)
tables = soup.find_all('table')
first_table = tables[0]
table = first_table.prettify()

In [20]:
# Wrapping the markup in StringIO avoids pandas' deprecated "literal HTML string"
# input to read_html. [0][0][0] = first parsed table, column 0, row 0.
table_text = pd.read_html(StringIO(table))[0][0][0]
# The browser is no longer needed once the text has been extracted.
driver.quit()
table_text

'1989 Ice Measurements  26-Feb 42 Inches  16-Mar 37.5 Inches  21-Mar 37.5 Inches  25-Mar 40.5 Inches  28-Mar 41.5 Inches  4-Apr 42 Inches  10-Apr 43 Inches  12-Apr 40 Inches'

In [21]:
# The cell text starts with the four-digit year.
table_year = table_text[:4]
table_year

'1989'

In [22]:
# Drop the 21-character '<year> Ice Measurements' heading, then trim whitespace.
table_detail = table_text[21:].strip()
table_detail

'26-Feb 42 Inches  16-Mar 37.5 Inches  21-Mar 37.5 Inches  25-Mar 40.5 Inches  28-Mar 41.5 Inches  4-Apr 42 Inches  10-Apr 43 Inches  12-Apr 40 Inches'

In [23]:
# Individual readings are separated by two spaces. On this page the entries
# read 'DD-Mon depth Inches' rather than the 'Mon DD-depth Inches' seen for 2016.
line_split = table_detail.split('  ')
line_split

['26-Feb 42 Inches',
 '16-Mar 37.5 Inches',
 '21-Mar 37.5 Inches',
 '25-Mar 40.5 Inches',
 '28-Mar 41.5 Inches',
 '4-Apr 42 Inches',
 '10-Apr 43 Inches',
 '12-Apr 40 Inches']

In [24]:
# Lookup from the month spellings seen on the site to two-digit month numbers.
# The three-letter abbreviations are numbered by position; the two full names
# that also appear ('March', 'April') are added afterwards.
_month_abbreviations = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_fix = {}
for _month_number, _abbreviation in enumerate(_month_abbreviations, start=1):
    month_fix[_abbreviation] = str(_month_number).zfill(2)
month_fix['March'] = '03'
month_fix['April'] = '04'

In [25]:
# Prototype parse of the 'DD-Mon depth Inches' entries: print each reading as
# '<year>-<Mon>-<day> : <depth> Inches'.
# NOTE: the loop variables stay defined after the loop; the next cell reads
# `item_split` as left by the final iteration.
for item in line_split:
    item_split = item.split(' ')            # e.g. ['26-Feb', '42', 'Inches']
    date_split = item_split[0].split('-')   # e.g. ['26', 'Feb']
    item_date = table_year + '-' + date_split[1] + '-' + date_split[0]
    item_data = item_split[1] + ' ' + item_split[2]
    print(item_date , ':', item_data)

1989-Feb-26 : 42 Inches
1989-Mar-16 : 37.5 Inches
1989-Mar-21 : 37.5 Inches
1989-Mar-25 : 40.5 Inches
1989-Mar-28 : 41.5 Inches
1989-Apr-4 : 42 Inches
1989-Apr-10 : 43 Inches
1989-Apr-12 : 40 Inches


In [26]:
# Relies on `item_split` left over from the final iteration of the previous
# cell's loop. Splitting every token on '-' shows that only the date token
# yields more than one piece.
for item in item_split:
    date_split = item.split('-')
    print(date_split)

['12', 'Apr']
['40']
['Inches']


In [27]:
# Empty frame with the two columns used for the scraped measurements.
ice_thickness_df = pd.DataFrame(columns = ['Date', 'Thickness'])

In [29]:
# List every anchor found on the index page, one per line.
for anchor in link_list:
    print(anchor)

<a href="index.htm" title="Nenana Ice Classic ">Home/</a>
<a href="tickets.htm" title="Tickets">Tickets/</a>
<a href="brochures.htm" title="Brochures">Brochures/ </a>
<a href="poolrules.htm" title="Pool Rules">Pool Rules/ </a>
<a href="ice.htm" title="Ice Page">Ice/</a>
<a href="postercontest.htm" title="Poster Contest">Poster Contest/ </a>
<a href="souvenirs.htm" title="Souvenir Page">Souvenirs/ </a>
<a href="contact.htm" title="Contact Nenana Ice Classic">Contact/ </a>
<a href="organization.htm" title="Organization">Organization/ </a>
<a href="FAQ.htm" title="Frequently Asked Questions">FAQ</a>
<a href="2019.html">2019</a>
<a href="2018.html">2018</a>
<a href="2017.htm">2017</a>
<a href="2016.htm">2016</a>
<a href="2015%20Ice.htm">2015</a>
<a href="2014.htm">2014</a>
<a href="2013.htm">2013</a>
<a href="2012.htm">2012</a>
<a href="2011.htm">2011</a>
<a href="2010.htm">2010</a>
<a href="2009.htm">2009</a>
<a href="2008.htm">2008</a>
<a href="2007.htm">2007</a>
<a href="2006.htm">2006<

In [32]:
%time
# options.add_argument('--headless')
options = Options()
driver = webdriver.Chrome(chrome_path, 
                      options=options)
driver.set_window_size(500,300)
# for suffix in link_list[15:19]:
for suffix in link_list[10:-1]:
    target_url = base_url + suffix['href']
    driver.get(target_url)
    page_source = driver.page_source
    soup = BeautifulSoup(page_source)
    tables = soup.findAll('table')
    if tables:
        row_dict = {}
        table = tables[0].prettify()
        table_text = pd.read_html(table)[0][0][0]
        table_year = table_text[:4]
        table_detail = table_text[21:].strip()
        line_split = table_detail.split('  ')
        for item in line_split:
            item_split = item.split(' ')
            if item_split[-1] in ['Inches', 'inches']:
                ice_depth = item_split[-2]
            else:
                ice_depth = item_split[-1]
            if '-' in item_split[0]:
                for item in item_split:
                    date_split = item.split('-')
                    if len(date_split) > 1:
                        if date_split[1] in month_fix:
                            item_month = month_fix[date_split[1]]
                            item_day = date_split[0]
                            if len(item_day) < 2:
                                item_day = '0' + item_day
                        else:
                            print('check: ', date_split)
            else:
                if item_split[0] in month_fix:
                    item_month = month_fix[item_split[0]]
                    item_day = item_split[1]
                    if len(item_day) < 2:
                        item_day = '0' + item_day
                    elif len(item_day) > 2:
                        dash_split = item_day.split('-')
                        item_day = dash_split[0]
            ymd = table_year + '-' + item_month + '-' + item_day
            row_dict = {'Date' : [ymd], 'Thickness' : [ice_depth]}
            ice_thickness_df = ice_thickness_df.append(pd.DataFrame.from_dict(row_dict,
                                                                                orient = 'columns'),
                                                         ignore_index = True)
        pass
    else:
        row_dict = {}
        p_data = soup.findAll('p')
        table_year = p_data[0].text
        for datum in p_data[1:]:
            line_split = datum.text.split('\n')
            for line in line_split:
                line = line.strip()
                item_split = line.split(' ')
                if item_split[0].strip() in month_fix:
                    item_month = month_fix[item_split[0].strip()]
                    item_day = item_split[1].strip()
                    if len(item_day) < 2:
                        item_day = '0' + item_day
                    elif len(item_day) > 2:
                        dash_split = item_day.split('-')
                        item_day = dash_split[0]
                ymd = table_year.strip() + '-' + item_month + '-' + item_day
                if item_split[-1] in ['Inches', 'inches']:
                    ice_depth = item_split[-2]
                else:
                    ice_depth = item_split[-1]
                if ymd == '' or ice_depth == '':
                    pass
                else:
                    row_dict = {'Date' : [ymd], 'Thickness' : [ice_depth]}
                    ice_thickness_df = ice_thickness_df.append(pd.DataFrame.from_dict(row_dict,
                                                                                        orient = 'columns'),
                                                                 ignore_index = True)
driver.quit()
# list for special treatment: 2015, 2013, 
# 2012 is end of early regime; 2013 starts new regime

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [33]:
# Preview the first and last five scraped rows. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0; the original row labels
# are kept, as before.
pd.concat([ice_thickness_df.head(), ice_thickness_df.tail()])

Unnamed: 0,Date,Thickness
0,2019-01-16,16.0
1,2019-02-07,16.0
2,2019-02-26,23.5
3,2019-03-04,32.5
4,2019-03-13,25.7
958,1989-03-25,40.5
959,1989-03-28,41.5
960,1989-04-04,42.0
961,1989-04-10,43.0
962,1989-04-12,40.0


In [35]:
# Persist the raw scrape for the cleaning steps that follow.
# NOTE(review): index_label='Date' names the integer index column 'Date', the
# same as the existing 'Date' column, so the CSV header appears to contain two
# 'Date' fields. Confirm that downstream readers expect this.
raw_csv_path = '../data/raw_ice_thickness_1989-2019.csv'
ice_thickness_df.to_csv(raw_csv_path, index_label='Date')