In [23]:
import pandas as pd
import requests as req
from json import loads, dumps
import time 
import uuid

# uncomment to turn off column width
#pd.set_option('display.max_colwidth', -1)

In [24]:
# load data into dataframe
campsites = pd.read_csv('data/campsitesLocationInfo2.csv', sep="|", header=None)
cols = ["country", "state", "city", "fbURL", "gmapsURL", "gmapsJSON"]
campsites.columns = cols

In [25]:
print "check for null values:"
print campsites.info()

check for null values:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1109 entries, 0 to 1108
Data columns (total 6 columns):
country      1109 non-null object
state        1109 non-null object
city         1109 non-null object
fbURL        1109 non-null object
gmapsURL     1109 non-null object
gmapsJSON    1109 non-null object
dtypes: object(6)
memory usage: 60.6+ KB
None


In [26]:
print" check for duplicates"
print "NOTE: duplicate city/state/country names are okay and expected"
print "but would expect fbURL, gmapsURL and gmapsJSON to be unique"
print campsites.describe()

 check for duplicates
NOTE: duplicate city/state/country names are okay and expected
but would expect fbURL, gmapsURL and gmapsJSON to be unique
              country state      city  \
count            1109  1109      1109   
unique            142    86      1071   
top     United States  none  Columbia   
freq              359   651         3   

                                                    fbURL  \
count                                                1109   
unique                                               1099   
top     https://www.facebook.com/groups/free.code.camp...   
freq                                                    4   

                                                 gmapsURL  \
count                                                1109   
unique                                               1109   
top     https://maps.googleapis.com/maps/api/geocode/j...   
freq                                                    1   

                                     

In [27]:
print 'here are some duplicates facebook groups:'
dupFB = campsites[campsites['fbURL'].duplicated(False)]
print dupFB[['city', 'fbURL']]
print "these looks intentional"

here are some duplicates facebook groups:
              city                                              fbURL
560         Gda≈Ñsk  https://www.facebook.com/groups/free.code.camp...
568          Sopot  https://www.facebook.com/groups/free.code.camp...
739      Flagstaff  https://www.facebook.com/groups/free.code.camp...
741        Phoenix  https://www.facebook.com/groups/free.code.camp...
742       Prescott  https://www.facebook.com/groups/free.code.camp...
743     Scottsdale  https://www.facebook.com/groups/free.code.camp...
744         Sedona  https://www.facebook.com/groups/free.code.camp...
746          Tempe  https://www.facebook.com/groups/free.code.camp...
748   Verde Valley  https://www.facebook.com/groups/free.code.camp...
752        Alameda  https://www.facebook.com/groups/free.code.camp...
756       Berkeley  https://www.facebook.com/groups/free.code.camp...
782        Oakland  https://www.facebook.com/groups/free.code.camp...
854      Centralia  https://www.facebook.com/gr

In [28]:
print 'here are some duplicates google map responses:'
dupLocations = campsites['gmapsJSON'].duplicated(False)
print campsites[dupLocations]
print "It looks like:"
print "-Kingston upon Hull and Hull are the same place according to google maps"
print "-Women only group didn't have a city"
print "-Hampton Roads is in Virginia"
print "NOTE: Dropping these for now, but could be handled better"
droppedLocations = campsites[~dupLocations]

here are some duplicates google map responses:
           country      state                city  \
221        England       none                Hull   
223        England       none  Kingston upon Hull   
628   Saudi Arabia       none    Women only group   
730  United States  Ambiguous       Hampton Roads   

                                                 fbURL  \
221  https://www.facebook.com/groups/free.code.camp...   
223  https://www.facebook.com/groups/free.code.camp...   
628      https://www.facebook.com/groups/girlscodeksa/   
730  https://www.facebook.com/groups/free.code.camp...   

                                              gmapsURL  \
221  https://maps.googleapis.com/maps/api/geocode/j...   
223  https://maps.googleapis.com/maps/api/geocode/j...   
628  https://maps.googleapis.com/maps/api/geocode/j...   
730  https://maps.googleapis.com/maps/api/geocode/j...   

                                             gmapsJSON  
221  { "results" : [ { "address_components" : [ 

In [29]:
# MODEL FOR POSTING TO API
## {
##   "id": "string",
##   "url": "string",
##   "createdByUsername": "string",
##   "createdAt": "2016-03-12",
##   "lastUpdatedAt": "2016-03-12",
##   "isApproved": false,
##   "isDeleted": false,
##   "city": "string",
##   "subdivision": "string",
##   "country": "string",
##   "location": "string",
##   "mapURL": "string",
##   "googleId": "string"
## }


# build list of dicts to create
toCreate = []
#loop over cleaned df to build 
for index, row in droppedLocations.iterrows():
    createDict = {}
    # copy info from dataframe
    createDict['city'] = row['city']
    createDict['subdivision'] = row['state']
    createDict['country'] = row['country']
    createDict['url'] = row['fbURL']

    # parse Google maps JSON
    gmapsInfo = loads(row['gmapsJSON'])['results'][0]
    loc = gmapsInfo['geometry']['location']
    createDict['location'] = str(loc['lat']) + ',' + str(loc['lng'])
    createDict['googleId'] = str(gmapsInfo['place_id'])
    # NOTE: mapURL is missing from this API resonse and requires another lookup :/
    
    # add other info
    # unique ID for campsite
    createDict['id'] = uuid.uuid4().hex
    # username to check against in bash script
    createDict['createdByUsername'] = 'BULKCAMPSITEUPLOAD'
    # current time
    now = time.strftime('%c')
    createDict['createdAt'] = now
    createDict['lastUpdatedAt'] = now
    # this automatically approves the campsites, could be set to false 
    # if you want to manually approve 1000+ campsites!
    createDict['isApproved'] = True 
    createDict['isDeleted'] = False 
    # append json to list
    toCreate.append(createDict)
    

In [37]:
import codecs
# write out json to file 
import io
with io.open("data/toWrite.json", "w", encoding="latin-1") as fp:
    for campsite in toCreate:
        data = dumps(campsite)
        fp.write(data + "\n")
    fp.close()

TypeError: must be unicode, not str