# 3 Parts of an API Request

1. Data REQUEST: You access a URL (for example, in your browser) that specifies a particular subset of data.
2. Data PROCESSING: A web server somewhere uses that URL to query the specified dataset.
3. Data RESPONSE: That web server then sends the resulting content back to you.



## Environment Setup

In [None]:
conda list

In [None]:
conda install -c conda-forge pymongo

In [None]:
conda install -c conda-forge dnspython

In [None]:
conda install -c conda-forge datetime # not available from default or forge; not needed, because `datetime` ships with the Python standard library

## Connect to MongoDB Client

In [None]:
import requests
from pymongo import MongoClient

In [None]:
# Connect client to the MongoDB Atlas cluster.
# SECURITY: the connection string contains a username and password, so it is
# read from the MONGODB_URI environment variable instead of being stored in
# the notebook. Any credential that was ever committed in plain text should
# be rotated.
import os

client = MongoClient(os.environ["MONGODB_URI"])


In [None]:
# Get a handle to the "audit" database through the connected client.
# NOTE(review): MongoDB presumably creates the database lazily on the first
# write, so this line alone should not create anything — confirm.
db = client["audit"]


### Pull data into MongoDB database collections

In [None]:

for collection_name in ["stations", "states", "monthly-normals"]:
    # collect the data from the api
    # TODO: requests.get() is called without a URL, so this line raises
    # TypeError as written; the endpoint for each collection_name still has
    # to be supplied, and the response is not yet stored anywhere.
    response = requests.get()

# NCDC NOAA Data 

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import requests
#import datetime

In [None]:
## `datetime` is part of the Python standard library (no conda package is needed); this cell is left disabled because the dates are set manually below
#Use the datetime package to get a year ago today
#lastyear = datetime.datetime.now()-datetime.timedelta(days=365)

In [None]:
#Use the same begin and end date for just one day's data. Format for the API request
#begin_date = lastyear.strftime("%Y-%m-%d")
#end_date = lastyear.strftime("%Y-%m-%d")

In [None]:
# Define variables
# NOAA API token.
# SECURITY: the token is a secret, so it is read from the NOAA_TOKEN
# environment variable instead of being stored in the notebook. Any token
# that was ever committed in plain text should be regenerated.
mytoken = os.environ.get('NOAA_TOKEN', '')
if not mytoken:
    print("WARNING: NOAA_TOKEN environment variable is not set; API requests will likely be rejected")

# Location id for Massachusetts (can be found on NOAA or requested through a different API as well)
locationid = 'FIPS:25'

# Dataset to query. Alternative: 'NORMAL_MLY', the dataset id for "Monthly Normals"
datasetid = 'GHCND'

# Date range of the request, formatted as YYYY-MM-DD
begin_date = '2005-05-01'
end_date = '2005-05-31'

In [None]:
# Endpoints of the NOAA web API used by the request helpers:
# one for observations/data and one for station metadata.
_CDO_API_ROOT = 'https://www.ncdc.noaa.gov/cdo-web/api/v2'
base_url_data = _CDO_API_ROOT + '/data'
base_url_stations = _CDO_API_ROOT + '/stations'

In [None]:
def get_normals(locationid, datasetid, begin_date, end_date, mytoken, base_url):
    """Request data from the NOAA API and return the results as a dataframe.

    The request is built from the arguments (previously a hard-coded URL was
    used and every argument except the token was ignored).

    Args:
        locationid: Location identifier sent as the ``locationid`` query field.
        datasetid: Dataset identifier sent as the ``datasetid`` query field.
        begin_date: Start date sent as ``startdate``.
        end_date: End date sent as ``enddate``.
        mytoken: API token, sent in the ``token`` request header.
        base_url: URL of the data endpoint to query.

    Returns:
        A pandas DataFrame built from the ``results`` entry of the JSON
        response, or ``None`` when the response could not be converted
        (a message is printed in that case).
    """
    token = {'token': mytoken}

    # passing as string instead of dict because NOAA API does not like percent encoding
    params = '&'.join([
        'datasetid=' + str(datasetid),
        'locationid=' + str(locationid),
        'startdate=' + str(begin_date),
        'enddate=' + str(end_date),
    ])

    r = requests.get(base_url, params=params, headers=token)
    print("Request status code: " + str(r.status_code))

    try:
        # results come in json form. Convert to dataframe
        df = pd.DataFrame.from_dict(r.json()['results'])
        print("Successfully retrieved " + str(len(df['station'].unique())) + " stations")
        dates = pd.to_datetime(df['date'])
        print("Last date retrieved: " + str(dates.iloc[-1]))
    # Only the failures a bad request or missing data can cause are handled:
    # a non-JSON body (ValueError), a missing 'results'/'station'/'date' entry
    # (KeyError), an unexpected payload shape (TypeError) or no rows (IndexError).
    except (ValueError, KeyError, TypeError, IndexError) as err:
        print("Error converting normals to dataframe. Missing data?")
        print("Details: " + repr(err))
        return None

    return df

In [None]:
# Fetch the normals dataframe using the query variables defined earlier in the notebook
df_normals = get_normals(
    locationid=locationid,
    datasetid=datasetid,
    begin_date=begin_date,
    end_date=end_date,
    mytoken=mytoken,
    base_url=base_url_data,
)

In [None]:
def get_weather(locationid, datasetid, begin_date, end_date, mytoken, base_url, limit=25):
    """Request weather records from the NOAA API and return them as a dataframe.

    Args:
        locationid: Location identifier sent as the ``locationid`` query field.
        datasetid: Dataset identifier sent as the ``datasetid`` query field.
        begin_date: Start date sent as ``startdate``.
        end_date: End date sent as ``enddate``.
        mytoken: API token, sent in the ``token`` request header.
        base_url: URL of the data endpoint to query.
        limit: Value sent as the ``limit`` query field. Defaults to 25,
            the value that was previously hard-coded.

    Returns:
        A pandas DataFrame built from the ``results`` entry of the JSON
        response, or ``None`` when the response could not be converted
        (a message is printed in that case).
    """
    token = {'token': mytoken}

    # passing as string instead of dict because NOAA API does not like percent encoding
    params = '&'.join([
        'datasetid=' + str(datasetid),
        'locationid=' + str(locationid),
        'startdate=' + str(begin_date),
        'enddate=' + str(end_date),
        'limit=' + str(limit),
        'units=standard',
    ])

    r2 = requests.get(base_url, params=params, headers=token)
    print("Request status code: " + str(r2.status_code))

    try:
        # results come in json form. Convert to dataframe
        df = pd.DataFrame.from_dict(r2.json()['results'])
        print("Successfully retrieved " + str(len(df['station'].unique())) + " stations")
        dates = pd.to_datetime(df['date'])
        print("Last date retrieved: " + str(dates.iloc[-1]))
    # Only the failures a bad request or missing data can cause are handled:
    # a non-JSON body (ValueError), a missing 'results'/'station'/'date' entry
    # (KeyError), an unexpected payload shape (TypeError) or no rows (IndexError).
    except (ValueError, KeyError, TypeError, IndexError) as err:
        print("Error converting weather data to dataframe. Missing data?")
        print("Details: " + repr(err))
        return None

    return df

In [None]:
# Fetch the weather records using the query variables defined earlier in the notebook
df_weather = get_weather(
    locationid=locationid,
    datasetid=datasetid,
    begin_date=begin_date,
    end_date=end_date,
    mytoken=mytoken,
    base_url=base_url_data,
)

In [None]:
# Preview the first rows of the normals data.
# NOTE: get_normals returns None when the response could not be converted,
# in which case this line raises AttributeError.
df_normals.head()

In [None]:
# Stand-alone request against the data endpoint, kept so the raw response can be inspected
auth_header = {'token': mytoken}
r = requests.get(
    "https://www.ncdc.noaa.gov/cdo-web/api/v2/data?datasetid=NORMAL_MLY&startdate=2010-05-01&enddate=2010-05-01",
    headers=auth_header,
)


In [None]:
# Show the HTTP status code of the response stored in r
status_text = str(r.status_code)
print("Request status code: " + status_text)

In [None]:
def get_station_info(locationid, datasetid, mytoken, base_url, limit=1000):
    """Request station metadata from the NOAA API and return it as a dataframe.

    Args:
        locationid: Location identifier sent as the ``locationid`` query field.
        datasetid: Dataset identifier sent as the ``datasetid`` query field.
        mytoken: API token, sent in the ``token`` request header.
        base_url: URL of the stations endpoint to query.
        limit: Value sent as the ``limit`` query field. Defaults to 1000,
            the value that was previously hard-coded.

    Returns:
        A pandas DataFrame built from the ``results`` entry of the JSON
        response, or ``None`` when the response could not be converted
        (a message is printed in that case).
    """
    token = {'token': mytoken}

    # passing as string instead of dict because NOAA API does not like percent encoding
    stations = '&'.join([
        'locationid=' + str(locationid),
        'datasetid=' + str(datasetid),
        'units=standard',
        'limit=' + str(limit),
    ])
    r = requests.get(base_url, headers=token, params=stations)
    print("Request status code: " + str(r.status_code))

    try:
        # results come in json form. Convert to dataframe
        df = pd.DataFrame.from_dict(r.json()['results'])
        print("Successfully retrieved " + str(len(df['id'].unique())) + " stations")
    # Only the failures a bad request or missing data can cause are handled:
    # a non-JSON body (ValueError), a missing 'results'/'id' entry (KeyError)
    # or an unexpected payload shape (TypeError).
    except (ValueError, KeyError, TypeError) as err:
        print("Error converting station data to dataframe. Missing data?")
        print("Details: " + repr(err))
        return None

    # Compare the number of returned rows with the requested limit: a full
    # page suggests the response was truncated.
    if len(df) >= limit:
        print('WARNING: Maximum data limit was reached (limit = ' + str(limit) + ')')
        print('Consider breaking your request into smaller pieces')

    return df

In [None]:
# Fetch the station metadata for the same location and dataset
df_stations = get_station_info(
    locationid=locationid,
    datasetid=datasetid,
    mytoken=mytoken,
    base_url=base_url_stations,
)

In [None]:
# Preview the first rows of the station metadata.
# NOTE: get_station_info returns None when the response could not be
# converted, in which case this line raises AttributeError.
df_stations.head()

In [None]:
# Preview the first rows of the weather records.
# NOTE: get_weather returns None when the response could not be converted,
# in which case this line raises AttributeError.
df_weather.head()

In [None]:
# Attach station metadata to every weather record: the weather 'station'
# column is matched against the station 'id' column, keeping only matches.
df = df_weather.merge(right=df_stations, how='inner', left_on='station', right_on='id')

# Weather records whose station has no entry in the station metadata
has_location = df_weather['station'].isin(df_stations['id'])
location_ismissing = df_weather[~has_location]

# Number of distinct stations without location data (missing values count as one, as with unique())
loc_miss_count = location_ismissing['station'].nunique(dropna=False)
if loc_miss_count == 0:
    print("Successfully retrieved and combined location data")
else:
    print("Missing location data for "+str(loc_miss_count)+" stations")



In [90]:
# Preview the first rows of the merged weather + station dataframe
df.head()

Unnamed: 0,date,datatype,station,attributes,value,elevation,mindate,maxdate,latitude,name,datacoverage,id,elevationUnit,longitude
0,2005-05-01T00:00:00,PRCP,GHCND:USC00190120,",,0,0700",0.54,44.2,1893-01-01,2020-08-31,42.3861,"AMHERST, MA US",0.9836,GHCND:USC00190120,METERS,-72.5374
1,2005-05-01T00:00:00,SNOW,GHCND:USC00190120,"P,,0,",0.0,44.2,1893-01-01,2020-08-31,42.3861,"AMHERST, MA US",0.9836,GHCND:USC00190120,METERS,-72.5374
2,2005-05-01T00:00:00,SNWD,GHCND:USC00190120,"P,,0,",0.0,44.2,1893-01-01,2020-08-31,42.3861,"AMHERST, MA US",0.9836,GHCND:USC00190120,METERS,-72.5374
3,2005-05-01T00:00:00,TMAX,GHCND:USC00190120,",,0,0700",53.0,44.2,1893-01-01,2020-08-31,42.3861,"AMHERST, MA US",0.9836,GHCND:USC00190120,METERS,-72.5374
4,2005-05-01T00:00:00,TMIN,GHCND:USC00190120,",,0,0700",49.0,44.2,1893-01-01,2020-08-31,42.3861,"AMHERST, MA US",0.9836,GHCND:USC00190120,METERS,-72.5374
