In [1]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd
import io
import urllib
from PIL import Image 
import time 
import numpy as np
import datetime as dt
from twilio.rest import Client

In [2]:
#function to send a Twilio text
def send_text(msg):
    """Send ``msg`` as a text message through the Twilio REST client.

    ``account`` and ``token`` are read as module-level names; no cell shown in
    this notebook defines them, so they must exist in the kernel before this
    is called. The recipient is the placeholder ``"+"`` and the sender number
    is fixed.
    """
    twilio_client = Client(account, token)
    # The created message object is not needed afterwards, so it is not kept.
    twilio_client.messages.create(body=msg, from_="+12055836550", to="+")

In [3]:
#define website and station
station_id = 44008  # buoy station number (the same value main() requests)
root_path = 'https://www.ndbc.noaa.gov'  # site root; retrieve_data() also prefixes it to the camera image src
station_path = '/station_page.php?station='  # page path + query prefix; get_html() appends the station id

In [4]:
#clean up df
def clean_df(df):
    """Return a cleaned copy of ``df`` with a single ``time_stamp`` column.

    The ``MM``, ``DD`` and ``TIME(EDT)`` columns are combined into one
    ``"MM DD TIME"`` string per row, which ``get_date`` turns into the
    ``time_stamp`` value; the three source columns are then dropped.

    The input frame is left untouched. The previous version wrote columns into
    and dropped columns from the caller's frame, so cleaning the same frame a
    second time raised ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``MM``, ``DD`` and ``TIME(EDT)`` columns.

    Returns
    -------
    pandas.DataFrame
        The remaining original columns followed by ``time_stamp``.
    """
    cleaned = df.copy()
    cleaned['TIME(EDT)'] = cleaned['TIME(EDT)'].apply(delete_code)
    cleaned[['MM', 'DD']] = cleaned[['MM', 'DD']].astype(str)  # month and day to string
    # Intermediate "MM DD TIME" text; kept as a local instead of a throwaway column.
    date_stamp = cleaned['MM'] + ' ' + cleaned['DD'] + ' ' + cleaned['TIME(EDT)']
    cleaned['time_stamp'] = date_stamp.apply(get_date)
    return cleaned.drop(['MM', 'DD', 'TIME(EDT)'], axis=1)

In [5]:
#get date as datetime object
def get_date(x, year=2020):
    """Parse a ``"MM DD H:MMam"`` style string into a ``datetime``.

    Parameters
    ----------
    x : str
        Whitespace-separated month, day and 12-hour clock time
        (for example ``"5 17 10:50am"``).
    year : int, optional
        Year to attach, since the input carries none. Defaults to 2020, the
        value that used to be hard-coded.

    Returns
    -------
    datetime.datetime

    Raises
    ------
    IndexError
        If the month or day token is missing.
    ValueError
        If the assembled text does not match ``%m %d %Y %I:%M%p``.
    """
    parts = x.split()
    month, day = parts[0], parts[1]
    # Re-join everything after the day: str.split() also breaks on a
    # non-breaking space, so "10:50\xa0am" arrives here as two tokens.
    clock = ''.join(parts[2:])
    # The clock text stays a str; encoding it to bytes would put "b'...'" into
    # the formatted string on Python 3 and make strptime fail.
    return dt.datetime.strptime('{} {} {} {}'.format(month, day, year, clock), '%m %d %Y %I:%M%p')

In [6]:
#eliminate unicode character
def delete_code(x):
    """Remove non-breaking-space artefacts from ``x`` and return text.

    Both forms are removed: the four-character escape text ``\\xa0`` (a
    backslash followed by ``xa0``) that the original code targeted, and an
    actual U+00A0 character.

    The result is a ``str``. The previous version returned ``bytes``, which
    cannot be concatenated with the month/day strings in ``clean_df``.
    """
    return x.replace('\\xa0', '').replace('\xa0', '')

In [7]:
#get html
def get_html(r_p, s_p, sta_id):
    """Download a station page and return the response body.

    Parameters
    ----------
    r_p : str
        Site root, e.g. ``'https://www.ndbc.noaa.gov'``.
    s_p : str
        Page path and query prefix that the station id is appended to.
    sta_id : int or str
        Station identifier.

    Returns
    -------
    bytes
        ``Response.content`` of the page.

    Raises
    ------
    ValueError
        If the request fails, times out, or the status is anything but 200.
    """
    try:
        filepath = r_p + s_p + '{}'.format(sta_id)
        # A timeout keeps a stalled connection from hanging the hourly loop.
        r = requests.get(url=filepath, timeout=30)
        r.raise_for_status()
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if r.status_code != 200:
            raise ValueError('unexpected status {}'.format(r.status_code))
        return r.content
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C / SystemExit still
        # propagate instead of being reported as a download failure.
        raise ValueError('Could not retrieve HTML')

In [8]:
#get table and return dataframe
def get_df(soup):
    """Parse the first two ``dataTable`` tables of a page into DataFrames.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed station page.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(first_table, second_table)`` in document order.

    Raises
    ------
    ValueError
        If the tables cannot be found or parsed, including when fewer than
        two are present.
    """
    try:
        tables = soup.findAll("table", class_="dataTable")
        markup = ''.join(str(table) for table in tables)
        # Parse once (the original parsed the same markup twice) and hand pandas
        # a file-like object, since newer versions deprecate literal HTML strings.
        frames = pd.read_html(io.StringIO(markup))
        return frames[0], frames[1]
    except Exception:
        # Not a bare except: KeyboardInterrupt / SystemExit must still propagate.
        raise ValueError('Could not get dataframe')

In [9]:
#get src attribute
def get_src_attr(soup, sta_id):
    """Return the ``src`` of the buoy-camera image on a station page.

    The image is located by its ``alt`` text,
    ``'Photos from Buoy Camera at station <sta_id>'``; the first match wins.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed station page.
    sta_id : int or str
        Station identifier used in the ``alt`` text.

    Returns
    -------
    str
        Value of the image's ``src`` attribute.

    Raises
    ------
    ValueError
        If no matching image exists or it has no ``src`` attribute.
    """
    try:
        alt_text = 'Photos from Buoy Camera at station {}'.format(sta_id)
        matches = soup.findAll('img', {'alt': alt_text})
        return matches[0]['src']
    except Exception:
        # Not a bare except: KeyboardInterrupt / SystemExit must still propagate.
        raise ValueError('Could not retrieve src attribute')

In [10]:
#save image
def save_img(img_path, src):
    """Download an image and store it as a JPEG under ``Images/``.

    Parameters
    ----------
    img_path : str
        Full URL of the image.
    src : str
        The image's ``src`` attribute; its last ``/``-separated segment
        becomes the file name.

    Raises
    ------
    ValueError
        If the download, decoding or write fails.
    """
    import os  # local import: only this function needs it

    try:
        file_name = str(src).split('/')[-1]
        # Create the target folder on first use; without it every save failed.
        if not os.path.isdir('Images'):
            os.makedirs('Images')
        # A timeout keeps a stalled connection from hanging the hourly loop.
        raw_img = requests.get(url=img_path, stream=True, timeout=30)
        try:
            raw_img.raise_for_status()
            raw_img.raw.decode_content = True
            with Image.open(raw_img.raw) as img:
                img.save('Images/{}'.format(file_name), 'JPEG')
        finally:
            # Release the streamed connection even when decoding or saving fails.
            raw_img.close()
    except Exception:
        # Not a bare except: KeyboardInterrupt / SystemExit must still propagate.
        raise ValueError('Could not save image')

In [11]:
#gets data and/or image
def retrieve_data(just_image, sta_id):
    """Fetch the station page, then either save its camera image or return its tables.

    Parameters
    ----------
    just_image : bool
        Truthy: save the buoy-camera image and return ``None``.
        Falsy: return the two DataFrames produced by ``get_df``.
    sta_id : int or str
        Station identifier.

    Returns
    -------
    tuple of pandas.DataFrame or None
        ``None`` in image mode, and also when any step raised ``ValueError``
        (the error's ``args`` are printed instead of propagating).
    """
    try:
        page = bs(get_html(root_path, station_path, sta_id), 'html5lib')
        if not just_image:
            first_table, second_table = get_df(page)
            return first_table, second_table
        photo_src = get_src_attr(page, sta_id)
        save_img(root_path + photo_src, photo_src)
    except ValueError as err:
        print(err.args)

In [12]:
#separates wind df
def get_wind_df(df):
    """Clean ``df`` with ``clean_df`` and keep only the wind columns plus ``time_stamp``."""
    wind_columns = ['WDIR', 'WSPDkts', 'GSTkts', 'time_stamp']
    return clean_df(df)[wind_columns]

In [13]:
#separates wave df
def get_wave_df(df):
    """Clean ``df`` with ``clean_df`` and keep only ``WVHTft`` plus ``time_stamp``."""
    wave_columns = ['WVHTft', 'time_stamp']
    return clean_df(df)[wave_columns]

In [14]:
#merges wind, wave df
def merge_dfs(df_1, df_2):
    """Left-join ``df_2`` onto ``df_1`` by matching their ``time_stamp`` columns.

    Every row of ``df_1`` is kept; rows without a partner in ``df_2`` get
    missing values in the columns that come from ``df_2``.
    """
    join_key = 'time_stamp'
    return df_1.merge(df_2, how='left', left_on=join_key, right_on=join_key)

In [15]:
def main(just_image):
    """Run one collection pass for the configured station.

    Parameters
    ----------
    just_image : bool
        Truthy: only download and save the current buoy-camera image.
        Falsy: fetch the wind and wave tables, merge them on ``time_stamp``,
        combine them with the rows already in ``dataset.csv``, drop duplicate
        rows and rewrite ``dataset.csv``.

    Notes
    -----
    * A missing or unreadable ``dataset.csv`` prints ``no dataset`` and is
      treated as "no history yet"; the previous version then failed with
      ``NameError`` on the unbound ``old_df``.
    * When ``retrieve_data`` returns ``None`` (it prints and swallows its own
      errors) the pass is skipped instead of failing while unpacking.
    """
    # get saved df
    try:
        old_df = pd.read_csv('dataset.csv')
    except Exception:
        print('no dataset')
        old_df = None

    if just_image:
        retrieve_data(just_image, station_id)
        return

    tables = retrieve_data(just_image, station_id)
    if tables is None:
        # retrieve_data already printed the reason; leave the saved data as-is.
        return
    df_1, df_2 = tables
    merged_df = merge_dfs(get_wind_df(df_1), get_wave_df(df_2))

    if old_df is not None:
        if 'time_stamp' in old_df.columns:
            # CSV stores timestamps as text; convert them back so previously
            # saved rows compare equal to fresh ones in drop_duplicates.
            old_df['time_stamp'] = pd.to_datetime(old_df['time_stamp'])
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        merged_df = pd.concat([merged_df, old_df], sort=False)

    merged_df = merged_df.drop_duplicates()
    merged_df.to_csv('dataset.csv', index=False)

In [16]:
#runs every hour for duration, gets just df at 0 and 13
def everyhour(total_runs=14, interval_seconds=3600):
    """Run ``main`` repeatedly: data on the first and last pass, images in between.

    Parameters
    ----------
    total_runs : int, optional
        Number of passes. Defaults to 14, the previously hard-coded count.
    interval_seconds : int or float, optional
        Pause after each image pass. Defaults to 3600 (one hour).

    Notes
    -----
    The first and last passes call ``main(False)`` and do not sleep; every
    other pass calls ``main(True)`` and then sleeps for ``interval_seconds``.
    """
    last_run = total_runs - 1
    for run_index in range(total_runs):
        if run_index in (0, last_run):
            main(False)
        else:
            main(True)
            time.sleep(interval_seconds)
# Start the scheduled collection; this call blocks until every pass has run.
everyhour()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


KeyboardInterrupt: 

In [None]:
#notes to self
#Atlantic Daylight Time (ADT) is 3 hours behind Coordinated Universal Time (UTC).
#This time zone is a Daylight Saving Time time zone and is used in: North America, Atlantic.
#This time zone is often called ADT Time Zone.