# Communicating with the Flickr API

Make calls to the Flickr API and retrieve images for a certain geographical location

In [1]:
import requests
import os
import json
import pandas as pd
import glob
import numpy as np
import datetime
from time import gmtime, strftime


In [2]:
# Grab private data
if __name__ == "__main__":
    FLICKR_API_KEY =  os.environ['FLICKR_API_KEY']
    #print(os.environ["FLICKR_API_KEY"])

In [3]:
# Check if directory exists, if not, create
def checkdir(path):
    try: 
        os.makedirs(path)
    except OSError:
        if not os.path.isdir(path):
            raise

In [4]:
def flickr_photo_search(api_key,api_long='',api_lat='',radius='',response_format='json',
                       per_page='100',page='1',bbox='',extras='',min_upload_date='',
                       max_upload_date=''):
    # set some general API keys
    api_method = 'flickr.photos.search'
    # process function input
    if len(api_long) > 0: 
        api_long = '&lon='+api_long 
    if len(api_lat) > 0: 
        api_lat = '&lat='+api_lat
    if len(radius) > 0:
        radius = '&radius='+radius
    if len(bbox) > 0:
        bbox = '&bbox='+bbox
    if len(extras) > 0:
        extras = '&extras='+extras
    if len(min_upload_date) > 0:
        min_upload_date = '&min_upload_date='+min_upload_date
    if len(max_upload_date) > 0:
        max_upload_date = '&max_upload_date='+max_upload_date
    # build the call string
    api_call = ('https://api.flickr.com/services/rest/?method=' + 
                api_method + 
                '&api_key='+ api_key +
                min_upload_date+
                max_upload_date+
                api_lat+ 
                api_long+ 
                radius+
                bbox+
                extras+
                '&per_page='+per_page+
                '&page='+page+
                '&format='+response_format+ 
                '&nojsoncallback=1')    
    return api_call




In [5]:
def flickr_photo_details(api_key,photo_id,response_format='json'):
    api_method = 'flickr.photos.getInfo'
    photo_id = '&photo_id='+photo_id
    api_call = ('https://api.flickr.com/services/rest/?method=' + 
                api_method + 
                '&api_key='+ api_key +
                photo_id+
                '&format='+response_format+ 
                '&nojsoncallback=1')
    return api_call


In [6]:
def get_bbox(county):
    # get data
    try:
        counties = long_lat['county'].tolist()
    except:
        long_lat = pd.read_csv(os.path.join('..','data',
                                            'county_boundaries',
                                            'min_max.txt'))
        counties = long_lat['county'].tolist()
    # get min/max values
    lat_min = long_lat['lat_min'][long_lat['county'] == county].tolist()[0]
    lat_max = long_lat['lat_max'][long_lat['county'] == county].tolist()[0]
    lng_min = long_lat['long_min'][long_lat['county'] == county].tolist()[0]
    lng_max = long_lat['long_max'][long_lat['county'] == county].tolist()[0]
    # build bbox string
    bbox = '{:f},{:f},{:f},{:f}'.format(lng_min,lat_min,lng_max,lat_max)
    return bbox


In [7]:
def create_timestring(min_time,max_time):
    """
    From datetime objects creates a timestring for Flickr API.
    """
    fmt = "%Y-%m-%d+%H:%M:%S"
    min_upload_date = min_time.strftime(fmt)
    max_upload_date = max_time.strftime(fmt)
    return min_upload_date, max_upload_date


In [8]:
def get_timestamp(time):
    """
    Converts a datetime object to UNIX timestamp.
    """
    _EPOCH = datetime.datetime(1970, 1, 1)
    timestamp = (time - _EPOCH).total_seconds()
    return timestamp

In [20]:
def find_timestep(latest_date,time_step,limit=3999,**kwargs):
    nimg_in_response = limit + 1
    # half time_step
    while nimg_in_response > limit:
        time_step = time_step / 2. # days
        time_min = latest_date - datetime.timedelta(days=time_step)
        time_max = latest_date
        min_upload_date, max_upload_date = create_timestring(time_min,time_max)
        # make first API call
        api_call = flickr_photo_search(api_key=api_key,
                                       bbox=bbox,
                                       min_upload_date=min_upload_date,
                                       max_upload_date=max_upload_date,
                                       per_page=per_page,extras=api_extras)
        response = requests.get(api_call)
        nimg_in_response = float(response.json()['photos']['total'])
    return time_step, response

    

In [10]:
# I start looking for images from this day on, going backwards in time 
latest_date_str = '9/6/2016 00:00' 
fmt = '%m/%d/%Y %H:%M'
latest_date = datetime.datetime.strptime(latest_date_str, fmt)
print latest_date


2016-09-06 00:00:00


In [11]:
# General API settings
# Get API key from API explorer, will require a new key after some time 
# (reload explorer page, make a call, copy API key from call string at bottom
# of the page)
api_key = '89a4669c0553d3d254b50803bff8f692'
# Maxing out the return per page reduces the amount of additional call for subsequent pages
per_page = '500'
# Grab extra information
api_extras='geo,tags,date_upload,date_taken,url_m,description'

In [12]:
# Get search box
bbox = get_bbox('Alameda')

In [13]:
tmp_date_str = '9/6/1968 00:00' 
fmt = '%m/%d/%Y %H:%M'
tmp_date = datetime.datetime.strptime(tmp_date_str, fmt)

_EPOCH = datetime.datetime(1970, 1, 1)
timestamp = (tmp_date - _EPOCH).total_seconds()
print timestamp

-41644800.0


In [14]:
time_min = latest_date - datetime.timedelta(days=time_step)
time_max = latest_date
timestamp_min = int(get_timestamp(time_min))
timestamp_max = int(get_timestamp(time_max))
dest_file = 'data_{:d}_{:d}_{:04d}.json'.format(timestamp_min,timestamp_max,1)
print dest_file



NameError: name 'time_step' is not defined

In [26]:
base_dir = os.path.join('..','data','photo_search')
counties = ['Alameda']
counties = ['Contra Costa','San Francisco','Santa Cruz','Sonoma']
counties = ['Contra Costa']

api_key = '51a5b7e0a4e06336707685506ed2f048'

time_step_initial = 45

for county in counties:
    latest_date_str = '9/6/2016 00:00'
#     if county == 'Contra Costa':
#         latest_date_str = '5/20/2009 08:54'
#         latest_date_str = '10/29/1975 11:54'
#     if county == 'San Francisco':
#         latest_date_str = '10/26/2009 09:33'
    fmt = '%m/%d/%Y %H:%M'
    latest_date = datetime.datetime.strptime(latest_date_str, fmt)
    out_query_info = open(os.path.join(base_dir,county+'_query_output.txt'),"w")
    print '+++++++++++++++++++++++++++++'
    print 'County:\n', county
    out_query_info.write('+++++++++++++++++++++++++++++\n')
    out_query_info.write('County: {}\n'.format(county))
    # check folder
    checkdir(os.path.join(base_dir,county))
    # get bbox string
    bbox = get_bbox(county)
    print '\t Bounding box:', bbox    
    out_query_info.write('\t Bounding box: {}\n'.format(bbox))
    latest_date_timestamp = get_timestamp(latest_date)
    while latest_date_timestamp > 0:
        # set time_step
        time_step = time_step_initial # days
        time_min = latest_date - datetime.timedelta(days=time_step)
        time_max = latest_date
        min_upload_date, max_upload_date = create_timestring(time_min,time_max)
        # make an initial call
        api_call = flickr_photo_search(api_key=api_key,
                                       bbox=bbox,
                                       min_upload_date=min_upload_date,
                                       max_upload_date=max_upload_date,
                                       per_page=per_page,
                                       extras=api_extras)
        response = requests.get(api_call)
        nimg_in_response = float(response.json()['photos']['total'])
        print '\t Initial number of images in response (timestep):', nimg_in_response, '(',time_step_initial,')'
        out_query_info.write('\t Initial number of images in response (timestep): {} ( {} )\n'
                                 .format(nimg_in_response,time_step))

        if nimg_in_response > 3999:
            time_step, response = find_timestep(latest_date,time_step,limit=3999,
                                                api_key=api_key,bbox=bbox,
                                                per_page=per_page,extras=api_extras)
            nimg_in_response = float(response.json()['photos']['total'])
            print '\t Final number of images in response (timestep):', nimg_in_response, '(',time_step,')'
            out_query_info.write('\t Final number of images in response (timestep): {} ( {} )\n'
                                 .format(nimg_in_response,time_step))
        # Use the timestep to make the full call
        time_min = latest_date - datetime.timedelta(days=time_step)
        time_max = latest_date
        min_upload_date, max_upload_date = create_timestring(time_min,time_max)
        print '\t Time window (min/max):', time_min, time_max
        out_query_info.write('\t Time window (min/max): {} / {}\n'.format(time_min,time_max))
        # extract successful API call
        max_page = response.json()['photos']['pages']
        print '\t Max. page number:', max_page
        out_query_info.write('\t Max. page number: {}\n'.format(max_page))
        # save first reponse to disk
        dest_dir = os.path.join('..','data','photo_search',county)
        timestamp_min = int(get_timestamp(time_min))
        timestamp_max = int(get_timestamp(time_max))
        dest_file = 'data_{}_{:d}_{:d}_{:04d}.json'.format(
            county,timestamp_min,timestamp_max,1)
        print '\t First file on disk:', dest_file
        out_query_info.write('\t First file on disk: {}\n'.format(dest_file))
        with open(os.path.join(dest_dir,dest_file), 'w') as outfile:
            json.dump(response.json(), outfile)
        # loop through remaining pages
        for page in range(max_page):
            page += 1
            print '\t ', page, '/', max_page
            out_query_info.write('\t {} / {}\n'.format(page,max_page))
            if page > 1:
                api_call = flickr_photo_search(api_key=api_key,
                                               bbox=bbox,
                                               min_upload_date=min_upload_date,
                                               max_upload_date=max_upload_date,
                                               per_page=per_page,
                                               extras=api_extras,
                                               page=str(page))
                response = requests.get(api_call)
                dest_file = 'data_{}_{:d}_{:d}_{:04d}.json'.format(
                    county,timestamp_min,timestamp_max,page)
                print '\t Name of file on disk:', dest_file
                out_query_info.write('\t Name of file on disk: {}\n'.format(dest_file))
                with open(os.path.join(dest_dir,dest_file), 'w') as outfile:
                    json.dump(response.json(), outfile)
        # set current max time as new latest_date
        print '\t ------------------------------------------'
        out_query_info.write('\t ------------------------------------------\n')
        latest_date = time_min 
        latest_date_timestamp = get_timestamp(latest_date)
    out_query_info.close()



    

+++++++++++++++++++++++++++++
County:
Contra Costa
	 Bounding box: -122.441580,37.718890,-121.536200,38.097710
	 Initial number of images in response (timestep): 14679.0 ( 45 )
	 Final number of images in response (timestep): 3343.0 ( 11.25 )
	 Time window (min/max): 2016-08-25 18:00:00 2016-09-06 00:00:00
	 Max. page number: 7
	 First file on disk: data_Contra Costa_1472148000_1473120000_0001.json
	  1 / 7
	  2 / 7
	 Name of file on disk: data_Contra Costa_1472148000_1473120000_0002.json
	  3 / 7
	 Name of file on disk: data_Contra Costa_1472148000_1473120000_0003.json
	  4 / 7
	 Name of file on disk: data_Contra Costa_1472148000_1473120000_0004.json
	  5 / 7
	 Name of file on disk: data_Contra Costa_1472148000_1473120000_0005.json
	  6 / 7
	 Name of file on disk: data_Contra Costa_1472148000_1473120000_0006.json
	  7 / 7
	 Name of file on disk: data_Contra Costa_1472148000_1473120000_0007.json
	 ------------------------------------------
	 Initial number of images in response (timest

In [10]:
# Get a list of all 'photo_id's
image_id_all = []
for county in counties:
    tmp_dir = os.path.join('..','data','photo_search',county)
    base_dir = os.path.join('..','data','photo_details')
    dest_dir = os.path.join(base_dir,county)
    checkdir(dest_dir)
    json_list = glob.glob(os.path.join(tmp_dir,'*json'))
    for tmp_json in json_list:
        # open json file
        with open(tmp_json, 'r') as f:
            data = json.load(f)
        # get image list
        image_list = data['photos']['photo']
        # get all 'image_id's in list
        for image in image_list:
            image_id_all.append((image['id'],county))

