__W205, Fall 2016__
__Final Project:__ Solar Fields and Weather
__Group:__ Boris Kletser, Maya Miller-Vedam, Geoff Striling, Laura Williams
# NOAA Data Ingest
OVERVIEW: This notebook is a scratchpad for learning how to load data from the NOAA website into a Postgres table. The code will eventually live in a script called __data_ingest_noaa.py__.

In [1]:
# imports
from __future__ import absolute_import, print_function, unicode_literals
import os
import requests
import numpy as np
import pandas as pd
import psycopg2
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT

In [2]:
# Remember the directory the notebook was launched from so it can be
# restored at the end, then switch to the project directory.
# (alternative location used during development:
#  '/Users/mmillervedam/Documents/MIDS/W205/FinalProject')
initial_directory = os.getcwd()
project_dir = '/home/w205/w205_energy'
os.chdir(project_dir)

In [3]:
# import functions for creating/recreating the postgres database & tables
from setup import create_database, create_tables

### Set Up

In [4]:
# Module-level constants
# Location of the tab-separated USCRN station list published by NOAA.
STATIONS_URL = (
    'http://www1.ncdc.noaa.gov/pub/data/uscrn/products/'
    'stations.tsv'
)

In [5]:
# helper function
def get_noaa_url(wban, stations_df):
    """Build the URL of the monthly NOAA data file for one station.

    Args:
        wban: WBAN id of the station; ``str(wban)`` must be a label in the
            index of ``stations_df``.
        stations_df: stations table indexed by WBAN id that has state,
            location and vector columns. The column names are matched
            case-insensitively, so both the upper-case names and their
            lower-cased versions are accepted.

    Returns:
        The base URL followed by ``STATE_LOCATION_VECTOR.txt``, with every
        space replaced by an underscore.

    Raises:
        KeyError: if the WBAN id or one of the three columns is missing.
        ValueError: if the WBAN id matches more than one row, in which case
            no single station file can be chosen.
    """
    base = 'http://www1.ncdc.noaa.gov/pub/data/uscrn/products/monthly01/CRNM0102-'
    # Map upper-cased column name -> actual column name so the lookup works
    # whether or not the columns have been lower-cased.
    by_upper = {str(col).upper(): col for col in stations_df.columns}
    fields = [by_upper[name] for name in ('STATE', 'LOCATION', 'VECTOR')]
    row = stations_df.loc[str(wban), fields]
    # A repeated index label makes .loc return a DataFrame; joining that
    # would concatenate the column names instead of the station values.
    if row.ndim > 1:
        raise ValueError(
            'WBAN id {0!r} matches more than one station'.format(wban)
        )
    station = '_'.join(row)
    return base + station.replace(' ', '_') + '.txt'

### Ingest Stations Table from NOAA

In [6]:
# fetch the USCRN station list, keyed by each station's WBAN ID number
stations_df = pd.read_csv(
    STATIONS_URL,
    sep='\t',
    header=0,
    index_col='WBAN',
)

In [7]:
# keep only the fields the postgres table stores, lower-case their names to
# match the table's columns, and turn the index into string ids named wban_id
cols = ['NAME', 'LOCATION', 'VECTOR', 'STATE',
        'LATITUDE', 'LONGITUDE', 'ELEVATION']
stations_df = stations_df.loc[:, cols]
stations_df = stations_df.rename(index=str, columns=lambda c: c.lower())
stations_df.index.name = 'wban_id'
stations_df.head()

Unnamed: 0_level_0,name,location,vector,state,latitude,longitude,elevation
wban_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3047,Sandhills State Park,Monahans,6 ENE,TX,31.62,-102.8,2724
3048,Sevilleta National Wildlife Refuge (LTER Site),Socorro,20 N,NM,34.35,-106.88,4847
3054,Muleshoe National Wildlife Refuge (Headquarter...,Muleshoe,19 S,TX,33.95,-102.77,3742
3055,OK Panhandle Research & Extn. Center (Native ...,Goodwell,2 E,OK,36.59,-101.59,3266
3060,Black Canyon of the Gunnison National Park (Ve...,Montrose,11 ENE,CO,38.54,-107.69,8402


In [8]:
# TODO: fix the uniqueness problem -- compare the number of rows with the
# number of distinct index labels to see how many ids are repeated
row_total = len(stations_df.index)
distinct_ids = stations_df.index.unique()
print(row_total)
print(len(distinct_ids))

242
233


In [13]:
# list the rows flagged as repeats of an earlier row; these stations have no
# WBAN id and are all in Alaska
repeat_mask = stations_df.duplicated()
stations_df[repeat_mask]

Unnamed: 0_level_0,name,location,vector,state,latitude,longitude,elevation
wban_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
UN,"FWS, Selawik National Wildlife Refuge (Cabin S...",Selawik,28 E,AK,66.56,-159.0,22
UN,"NPS, Denali National Park (Wonder Lake Campgro...",Denali,27 N,AK,63.45,-150.87,2225
UN,"BLM, Paxson Airport",Glennallen,64 N,AK,63.02,-145.5,2669
UN,"FWS, Nowitna National Wildlife Refuge (Lake Site)",Ruby,44 ESE,AK,64.5,-154.12,259
UN,"Arctic Slope Regional Corporation, Ivotuk Airs...",Ivotuk,1 NNE,AK,68.48,-155.75,1909
UN,"AK Department of Natural Resources, Haul Road)",Deadhorse,3 S,AK,70.16,-148.46,30
UN,"The Nature Conservancy, Gustavus Forelands Pre...",Gustavus,2 NE,AK,58.42,-135.69,20
UN,"NPS, Katmai National Park (Contact Creek)",King Salmon,42 SE,AK,58.2,-155.92,661
UN,"FWS, Tetlin National Wildlife Refuge (Seaton R...",Tok,70 SE,AK,62.73,-141.2,2000


In [20]:
# removing duplicates
# A boolean mask has to be inverted with `~`; unary minus on a boolean
# Series raises TypeError with current numpy/pandas.
# NOTE(review): duplicated() compares the column values only, not the
# wban_id index, so repeated index labels may remain -- confirm before
# relying on wban_id as a unique key.
stations_df = stations_df[~stations_df.duplicated()]
# sanity check: number of duplicated rows still present (expected 0)
int(stations_df.duplicated().sum())

0

### Load Stations table into Postgres  
NOTE: postgres must be running for the following code to work

In [None]:
# Uncomment and run if you need to reset the DB
#create_database()

In [56]:
# Uncomment and run if you need to reset the tables
#create_tables() 

In [20]:
from sqlalchemy import create_engine

In [57]:
# connection string to allow pandas to work with psycopg2 & sqlalchemy.
# The URL (including credentials) can be supplied through the
# SOLARENERGY_DB_URL environment variable so it does not have to be edited
# in the code; the previous hard-coded local value remains the default.
db_loc = os.environ.get(
    'SOLARENERGY_DB_URL',
    'postgresql+psycopg2://postgres:pass@localhost:5432/solarenergy',
)
engine = create_engine(db_loc)

In [58]:
# Trial load: append only a small slice of the stations table for now; the
# full load has to wait until the duplicate-station problem is dealt with.
# NOTE(review): positions 1-4 are four rows and skip the first row, while
# the earlier note here spoke of five rows -- confirm which was intended.
test_df = stations_df.iloc[1:5]
test_df.to_sql(name='weather_stations', con=engine, if_exists='append')