# This notebook documents the import of dam removal data into the Mongo and GC2 databases.  The code is a work in progress and will change over time.

In [1]:
import subprocess
import urllib.request as urlreq
import zipfile
import requests, json, time, datetime, uuid

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

from bis2 import gc2
from bis2 import mongo

from DRIP_Functions_Variables import *

# Import and Process Most Recent American Rivers Database

In [2]:
# Determine the newest American Rivers data release listed on the ScienceBase item.
# sbNewestData is provided by the star import from DRIP_Functions_Variables; judging by the
# recorded output it reports the data year and download URL -- confirm against that module.
sbItemUrl = 'https://www.sciencebase.gov/catalog/item/5a1d7403e4b09fc93dd7bd9c?format=json'
# Pass the URL defined in this cell. The original passed `arSbItemUrl`, a name that is not
# defined anywhere in the visible notebook, so the cell depended on hidden state.
maxYearLink, maxYear = sbNewestData(sbItemUrl)

Data Year: 2016
Data Url: https://ndownloader.figshare.com/files/9737656


In [3]:
# Save the most recent American Rivers file into the working directory.
# The local file name embeds the data year found in the previous cell.
fileName = 'arDamRemovals{}.csv'.format(str(maxYear))
urlreq.urlretrieve(url=maxYearLink, filename=fileName)

('arDamRemovals2016.csv', <http.client.HTTPMessage at 0x28210fe8630>)

In [4]:
# Load the downloaded CSV into a pandas DataFrame.
# The file contains characters that need the ISO-8859-1 encoding to be read.
dfArDamRemovals = pd.read_csv(
    fileName,
    sep=',',
    encoding='ISO-8859-1',
)

In [5]:
# Settings for this run and the database connections
thisRun = {
    'instance': 'DataDistillery',
    'db': 'BCB_beta',
}
# Base URL of the GC2 SQL API for the instance/database chosen above
thisRun['baseURL'] = gc2.sqlAPI(thisRun['instance'], thisRun['db'])

# Handle to the Mongo collection that stores the American Rivers dam removals
mongoColl = mongo.getCollection('arDamRemovals')

In [6]:
# Prep for the check of what already exists in GC2
intList = []        # names of GC2 fields whose type is 'integer'
floatList = []      # names of GC2 fields whose type is 'double precision'
arFeatureList = []  # 'properties' dict of every feature returned by the query


# Query the AR table in GC2 to find features already captured and to build the lists of ints and floats
q = (thisRun['baseURL'] + '&q=select * from drip.ardamremoval')

try:
    arInGc2 = requests.get(q).json()
    arTypes = arInGc2['forStore']
    arFeatures = arInGc2['features']

    # Identify field types with int and float (need to specify these as we format data to push into Mongo)
    for field in arTypes:
        if field['type'] == 'integer':
            intList.append(field['name'])
        elif field['type'] == 'double precision':
            floatList.append(field['name'])
    # Create list of features already in the database to help understand what is new.
    for feature in arFeatures:
        arFeatureList.append(feature['properties'])
except (requests.exceptions.RequestException, ValueError, KeyError, TypeError) as err:
    # Only the expected failures are caught: network/HTTP errors, a non-JSON body (ValueError),
    # or a payload without the expected keys/shape. A bare `except:` would also hide
    # KeyboardInterrupt and programming errors. The run stays best-effort, but the cause is shown.
    print ('Gc2 query failed')
    print (repr(err))

# Features currently stored in GC2 (an empty frame when the query failed or returned nothing)
currentArGc2 = pd.DataFrame(arFeatureList)
currentArGc2.head()

Unnamed: 0,_id,ar_id,city_county,dam_height_ft,dam_length_ft,dam_name,ftype,gid,huc8,last_updated,...,original use,registrationcode,registrationdate,river,river_miles_reported,spatialcertainty,state,type_material,year_built,year_removed
0,20e89558-768e-42e8-baeb-a96505b04e6e,AK-001,Kodiak,36.0,251.0,Lake Bettinger Complex Dam,Removed Dam,1,Kodiak-Afognak Islands,11/10/2017,...,Water supply,,2017-11-17 14:46:04.531+00,,,https://www.sciencebase.gov/vocab/term/5822257...,AK,Timber,1959.0,
1,ce8062a1-1c85-4990-94d9-cfda8a2c9cd5,AK-002,Kodiak,,245.0,Bettinger Lower Reservoir Dam B,Removed Dam,2,Kodiak-Afognak Islands,11/10/2017,...,Water supply,,2017-11-17 14:46:04.531+00,,,https://www.sciencebase.gov/vocab/term/5822257...,AK,Timber,1959.0,
2,14e8097c-f462-4463-a4a1-a1d1505fa8bf,AK-003,Matanuska-Susitna,,379.0,Memory Estates Dam #1,Removed Dam,3,Talkeetna River,11/10/2017,...,Recreation,,2017-11-17 14:46:04.531+00,,,https://www.sciencebase.gov/vocab/term/5822257...,AK,Earthen,1973.0,
3,e14316be-c826-4fa9-9b9f-845e5163601c,AK-004,Matanuska-Susitna,,444.0,Memory Estates Dam #2,Removed Dam,4,Talkeetna River,11/10/2017,...,Recreation,,2017-11-17 14:46:04.531+00,,,https://www.sciencebase.gov/vocab/term/5822257...,AK,Earthen,1973.0,
4,19e8e9b1-2a36-4886-ade4-ca82b7cada6a,AK-005,"Tongass National Forest, Juneau",15.0,,Switzer One Dam,Removed Dam,5,Lynn Canal,11/10/2017,...,,,2017-11-17 14:46:04.531+00,Switzer Creek,,https://www.sciencebase.gov/vocab/term/5822257...,AK,,,1988.0


In [10]:
#Connect to Mongo Instance
collName = 'arDamRemovals'
arMongo = mongo.getCollection(collName)

# intList and floatList (built from the GC2 field types) are passed to pandasRowToDict;
# the upload of data to Mongo fails without calling out ints and floats.
gc2Data = []
rMax = len(dfArDamRemovals)
#rMax = 8    #Use for testing subset of data

# Start time to help time the process.
# time.perf_counter() replaces time.clock(), which was removed in Python 3.8.
startTime = time.perf_counter()

# For each row of the dataset build a dictionary and append it to a list
for r in range(rMax):
    # pandasRowToDict comes from DRIP_Functions_Variables; presumably it converts row r to a
    # dict and casts the listed int/float fields -- confirm against that module.
    record = pandasRowToDict(dfArDamRemovals, intList, floatList, r)
    record['_id'] = str(uuid.uuid4())
    record['registrationDate'] = datetime.datetime.utcnow().isoformat()
    record['registrationCode'] = ''    #link to this code showing how data was registered
    record['fType'] = 'Removed Dam'
    record['spatialCertainty'] = 'https://www.sciencebase.gov/vocab/term/5822257ee4b0b3d9add24304'
    gc2Data.append(record)

#Create pandas dataframe
gc2DfAll = pd.DataFrame(gc2Data)

# Keep only records whose AR_ID is not already in GC2. The .copy() gives an independent
# frame, so building the GeoDataFrame below does not operate on a slice of gc2DfAll.
# NOTE(review): currentArGc2 has no ar_id column when the GC2 query failed or returned
# nothing, in which case this line raises AttributeError.
newArDf = gc2DfAll[~gc2DfAll.AR_ID.isin(currentArGc2.ar_id)].copy()

#Revert back to list for mongo upload
newArData = newArDf.to_dict('records')

if newArData:
    # Insert through the collection handle opened in this cell, so the cell does not
    # depend on a handle created elsewhere in the notebook.
    result = arMongo.insert_many(newArData)

    geometry = [Point(xy) for xy in zip(newArDf.Longitude, newArDf.Latitude)]
    crs = {'init': 'epsg:4269'}
    gdf = gpd.GeoDataFrame(newArDf, crs=crs, geometry=geometry)

    # New records: this exports a shapefile to be uploaded into GC2 using append to table.
    # Row-by-row inserts would be better, but they have taken far too long.
    # Records with no x, y need to be deleted somewhere along the way.
    # NOTE(review): '4269' is an EPSG code, not a WKT string -- confirm crs_wkt is doing what is intended.
    gdf.to_file('arDamRemovals2.shp', driver='ESRI Shapefile', crs_wkt='4269')

else:
    print ('No new updates')

print (str((time.perf_counter()-startTime)/60))  #Print processing time in minutes




0.004749951316212749


# Import and Process Most Recent USGS Dam Removal Science Database

### Currently, the Dam Removal Science Database is accessed in MS Access format, and each table was exported as-is to CSV. The tables were uploaded into GC2 through the UI as-is, using encoding 'ISO_8859_5' to deal with special characters. The study design table had symbols in its field names, which were also removed. These methods will be coded in future iterations of DRIP.

In [None]:
# Testing access to the USGS Dam Removal Science Database in its MS Access format. A new format is being considered for a future data release, so the data would no longer be in MS Access.

In [None]:
downloadName = '20150527_USGS_Dam Removal_Science_Database.zip'
downloadUrl = "https://www.sciencebase.gov/catalog/file/get/55071bf9e4b02e76d757c076?f=__disk__5b%2F6c%2F95%2F5b6c95bfff37ca93b9255b14f9ba4ecfa1e88825"
# Download the zipped USGS Dam Removal Science Database to the local directory.
# urllib.request is imported as `urlreq` in this notebook; the original called an undefined `ur`.
urlreq.urlretrieve(downloadUrl, downloadName)

# Unzip the archive into the working directory.
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(downloadName, 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# pyodbc is imported here because this cell runs before the later cell that imports it.
import pyodbc

# Open an ODBC connection to the extracted Access database.
db_file = '20150527_Dam Removal Database.accdb'
# The database is an .accdb file, so use the Access driver that lists *.accdb.
# The braces are part of the ODBC DRIVER value and are included exactly once.
db = '{Microsoft Access Driver (*.mdb, *.accdb)}'
# Format the whole connection string in one call. The original applied .format() only to the
# trailing literal (which began with '}') and had three placeholders for two arguments.
# No credentials are supplied; add Uid/Pwd here if the database turns out to be protected.
cnxn = pyodbc.connect('DRIVER={};DBQ={};'.format(db, db_file))

#query = "SELECT * FROM Dams"
#dataf = pd.read_sql(query, cnxn)
#cnxn.close()

In [None]:
import pyodbc
# Show the installed ODBC drivers whose name begins with 'Microsoft Access Driver'
list(filter(lambda driverName: driverName.startswith('Microsoft Access Driver'), pyodbc.drivers()))