In [1]:
import flickrapi, os, json, time
import pandas as pd
from dotenv import load_dotenv
from typing import Union, Dict, List

# Load variables defined in the local .env file. 
print(f"Local folder defined env variables found?.... {load_dotenv()=}")

# a recursively typed json object for ints, strs and lists
JSONVal = Union[str, int, 'JSONArray', 'JSONObject']
JSONArray = List[JSONVal]
JSONObject = Dict[str, JSONVal]

Local folder defined env variables found?.... load_dotenv()=True


In [2]:
# Load in dot env variables
secret = os.getenv('SECRET')
key = os.getenv('KEY')
USER_NAME = os.getenv('ORG_USER_NAME')

In [3]:
# Log in and create user ID object
flickr = flickrapi.FlickrAPI(key, secret, format='parsed-json')

In [4]:
# FracTracker Org Credentials
user_info : JSONObject = flickr.people.findByUsername(username=USER_NAME)
user_id = user_info['user']['id']
user_id

'186454571@N08'

In [5]:
user_info

{'user': {'id': '186454571@N08',
  'nsid': '186454571@N08',
  'username': {'_content': 'FracTracker_Alliance'}},
 'stat': 'ok'}

In [6]:
# get the list of Fracktracker's albums
# album URL https://www.flickr.com/photos/fractracker/albums/with/72157713930235593
photosets : JSONObject = flickr.photosets.getList(user_id=user_id)

In [7]:
pic_ct_per_album = {attr['title']['_content']:attr['count_photos'] for attr in photosets['photosets']['photoset']}
pic_ct_per_album

{'Appalachian Buildout': 1099,
 'California': 39,
 'Channels of Life: the Gulf Coast Buildout in TX': 623,
 'Coal Mining': 100,
 'Coastal & Marine Environments': 586,
 'Communities': 747,
 'Compressor Stations': 231,
 'Construction': 761,
 'Cracker Plants & Ethylene': 541,
 'Culture & Livelihoods': 316,
 'Drilling & Fracking': 609,
 'East Palestine Train Derailment': 62,
 'Endless Effects: the Loyalsock Watershed Project': 53,
 'Environmental Justice': 652,
 'Explosions, Fires, & Flaring': 158,
 'Forests': 125,
 'Frac Sand Mining': 509,
 'Frac Sand Storage & Processing': 430,
 'Frac Sand Transportation': 268,
 'Freshwater Ecosystems': 365,
 'Grasslands & Prairies': 75,
 'Gulf Coast Buildout': 598,
 'Illinois': 119,
 'Impoundment Ponds': 254,
 'Injection Wells': 169,
 'Land Preservation & Management': 105,
 'Louisiana': 370,
 'Marshes, Swamps, & Wetlands': 472,
 'Maryland': 9,
 'Methane & Air and Water Quality Concerns': 1017,
 'Michigan': 393,
 'Minnesota': 68,
 'Movement Building': 55

In [7]:
with open('photosets.json', 'w') as file:
    json.dump(photosets, file, indent=4)

In [8]:
# album count
len(photosets['photosets']['photoset'])

55

In [9]:
# Demonstrate how to find the album names.
# photosets['photosets']['photoset'][0] returns one album
for attr in photosets['photosets']['photoset'][0]:
    if attr == 'title':
        print(attr)
        print(photosets['photosets']['photoset'][0][attr])
        print(photosets['photosets']['photoset'][0][attr]['_content'])
        print(type(photosets['photosets']['photoset'][0][attr]['_content']))

title
{'_content': 'Appalachian Buildout'}
Appalachian Buildout
<class 'str'>


In [10]:
# Verify that we understand the JSON structure. Important for define it's type
should_be_55=0

for albumMetaData in photosets['photosets']['photoset']:
        print(albumMetaData['title']['_content'])
        should_be_55+=1    

Appalachian Buildout
California
Channels of Life: the Gulf Coast Buildout in TX
Coal Mining
Coastal & Marine Environments
Communities
Compressor Stations
Construction
Cracker Plants & Ethylene
Culture & Livelihoods
Drilling & Fracking
East Palestine Train Derailment
Endless Effects: the Loyalsock Watershed Project
Environmental Justice
Explosions, Fires, & Flaring
Forests
Frac Sand Mining
Frac Sand Storage & Processing
Frac Sand Transportation
Freshwater Ecosystems
Grasslands & Prairies
Gulf Coast Buildout
Illinois
Impoundment Ponds
Injection Wells
Land Preservation & Management
Louisiana
Marshes, Swamps, & Wetlands
Maryland
Methane & Air and Water Quality Concerns
Michigan
Minnesota
Movement Building
Netherlands
North Dakota
Offshore Drilling
Ohio
Pennsylvania
Pipelines
Plastics & Petrochemicals
Pollution, Toxins, & Disposal Concerns
Power Plants
Refineries & Processing
Southeastern Michigan Sand Mining & Industrial Impacts
Storage
Texas
Transportation
Waste & Wastewater
Water at Risk

In [11]:
album_titles : List[str] = [albumMetaData['title']['_content'] for albumMetaData in photosets['photosets']['photoset']]

In [12]:
album_ids : List[str] = [albumMetaData['id'] for albumMetaData in photosets['photosets']['photoset']]

In [13]:
should_be_55==len(photosets['photosets']['photoset'])==len(album_ids)==len(album_titles)==55

True

### a demonstrative example 
In Flickr lingo, an album is a photoset. The API call...
```python
photosets : JSONObject = flickr.photosets.getList(user_id=user_id) ;
```
The above returns, meta data about each album and a small amount of meta data about the all the albums.
##### Understanding the response
###### Meta data about the call itself
```json
{
    "photosets": {
        "page": 1,
        "pages": 1,
        "perpage": 500,
        "total": 55,
        "photoset": [
            ...
        ]
    },
        "stat": "ok"

}
```
We interpret the above response as, this is the first page of albums (`"page": 1,`) of one page (`"pages": 1,`). There can be up to 500 albums per page (`"perpage": 500,`). This page has 55 albums (`"total": 55,`). The attribute photoset is a list containing meta-data about each album (`"photoset": [...]`).
##### Meta data about the albums
Below is the information provided for each album. There are 55 instances of the below attriubte becuase there are 55 albums. This data is in the ellipsis of the above snippet (`"photoset": [...]`).
```json
            {
                "id":,
                "owner":,
                "username":,
                "primary":,
                "secret":,
                "server":,
                "farm":,
                "count_views":,
                "count_comments":,
                "count_photos":,
                "count_videos":,
                "title": {
                    "_content":
                },
                "description": {
                    "_content":
                },
                "can_comment": ,
                "date_create":,
                "date_update":,
                "sorting_option_id":,
                "photos":,
                "videos":,
                "visibility_can_see_set":,
                "needs_interstitial":
            }

```
Values to each of the above atrriubtes are either `int`, `str`, or `dict`. The only `list` in the JSON reponse is the one containing the albums.
The `"id"` attriubte is used to find the assets (images or videos) within each album. 
### Lets create a list of all the album `id`s (this list should be 55 elements long). 

In [14]:
# dry run... print one id

for attr in photosets['photosets']['photoset'][0]:
    if attr == 'id':
        print(photosets['photosets']['photoset'][0][attr])
        print(type(photosets['photosets']['photoset'][0][attr]))

72157715916543893
<class 'str'>


In [15]:
album_ids = list()

for attr in photosets['photosets']['photoset']:
    print(f" adding {attr['id']} to list of albums ids. list is currently {len(album_ids)}")
    album_ids.append(attr['id'])

 adding 72157715916543893 to list of albums ids. list is currently 0
 adding 72157718955813667 to list of albums ids. list is currently 1
 adding 72157715839488573 to list of albums ids. list is currently 2
 adding 72157714012190006 to list of albums ids. list is currently 3
 adding 72157714222642517 to list of albums ids. list is currently 4
 adding 72157714265378312 to list of albums ids. list is currently 5
 adding 72157713782049298 to list of albums ids. list is currently 6
 adding 72157715838976098 to list of albums ids. list is currently 7
 adding 72157713999613671 to list of albums ids. list is currently 8
 adding 72157714221113491 to list of albums ids. list is currently 9
 adding 72157713805087661 to list of albums ids. list is currently 10
 adding 72177720306424638 to list of albums ids. list is currently 11
 adding 72157715400012723 to list of albums ids. list is currently 12
 adding 72177720305148733 to list of albums ids. list is currently 13
 adding 72157713930502332 to l

In [16]:
# Same as above
a_ids = [attr['id'] for attr in photosets['photosets']['photoset']]
len(a_ids)

55

In [17]:
len(album_ids)

55

In [18]:
len(set(album_ids)) # sanity check, any duplicates? 

55

In [19]:
a_single_album = album_ids[0]

In [20]:
%%time
pic_list =[]
pic_set=set()
pic_ppl_list =[]
pic_ppl_set=set()

#scrape images from photosets and collect unique photo IDs
# Go thruough the list of albums and save each albums id (photoset id). We will use each album id to 
## retrieve the assets of the album
for photoset_id in album_ids:

    #make list of photos in the photoset
    photos = flickr.photosets.getPhotos(photoset_id=photoset_id)

    #find unique photo IDs
    for photo in photos['photoset']['photo']:
        photo_id = photo['id']
        pic_list.append(photo_id)
        pic_set.add(photo_id)

#scrape images from individual uploads and collect unique photo IDs
## another location outsite of albums where assets were uploaded
individual_photos = flickr.people.getPhotos(user_id=user_id, per_page=500)

for photo in individual_photos['photos']['photo']:
    photo_id = photo['id']
    pic_ppl_list.append(photo_id)
    pic_ppl_set.add(photo_id)

CPU times: user 297 ms, sys: 51.5 ms, total: 348 ms
Wall time: 23.5 s


In [21]:
len(pic_list),len(pic_set),len(pic_ppl_list),len(pic_ppl_set)

(16121, 3128, 500, 500)

In [22]:
# union sets
grand_pic_set : set[str] = pic_set | pic_ppl_set

In [23]:
len(grand_pic_set)

3130

## Open Question
_why does `flickr.photosets.getPhotos` and `flickr.people.getPhotos` both need to be used?_ ...just for 2 assets?

In [24]:
# recall the namespace
%whos

Variable            Type                    Data/Info
-----------------------------------------------------
Dict                _SpecialGenericAlias    typing.Dict
JSONArray           _GenericAlias           typing.List[typing.Union[<...>orwardRef('JSONObject')]]
JSONObject          _GenericAlias           typing.Dict[str, typing.U<...>orwardRef('JSONObject')]]
JSONVal             _UnionGenericAlias      typing.Union[str, int, Fo<...>ForwardRef('JSONObject')]
List                _SpecialGenericAlias    typing.List
USER_NAME           str                     FracTracker_Alliance
Union               _SpecialForm            typing.Union
a_ids               list                    n=55
a_single_album      str                     72157715916543893
albumMetaData       dict                    n=21
album_ids           list                    n=55
album_titles        list                    n=55
attr                dict                    n=21
file                TextIOWrapper           <_io.Te

### Pull the all the assets from an album
Given the credentialed API object `flickr` the function call `photosets.getPhotos(photoset_id, per_page, page)` will return a similar `JSONObject`. Each asset is denoted as the following JSON
```json
{'id': '52748254383',
    'secret': '814202740b',
    'server': '65535',
    'farm': 66,
    'title': 'TAuch_Infrastructure-Heritage_HazardousWaste_Incinerator-ColumbianaCounty-OH_March2023',
    'isprimary': '0',
    'ispublic': 1,
    'isfriend': 0,
    'isfamily': 0}
```
There are up to 500 of the above chunks (which correspond to an asset) in a single `photosets.getPhotos()` call. With the asset's `id` value one can use `photos.getInfo(photo_id)` to retrieve complete meta data. The exhaustive attribute list is below

```json
{
  'photo': {'id': '',
  'secret': '',
  'server': '',
  'farm': ,
  'dateuploaded': '',
  'isfavorite': ,
  'license': '',
  'safety_level': '',
  'rotation': ,
  'originalsecret': '',
  'originalformat': '',
  'owner': {},
  'title': {},
  'description': {},
  'visibility': {},
  'dates': {},
  'views': '',
  'editability': {},
  'publiceditability': {},
  'usage': {},
  'comments': {},
  'notes': {},
  'people': {},
  'tags': {},
  'location': {'latitude': '',
   'longitude': '',
   'accuracy': '',
   'context': '',
   'locality': {},
   'county': {},
   'region': {},
   'country': {},
   'neighbourhood': {}},
  'geoperms': {},
  'urls': {},
  'media': ''},
 'stat': ''
}
```
Before and after the series of assets (`'photo': [...]`) contained in a page there is meta data. Before the series of assets there is album id (`id`), etc... After the assets tell you what page of the album the data corresponds to, how many assets per page (`per_page`), total possible assets per page (`perpage`), total assets of the album (`total`), etc...
```json
{'id': '72157715916543893',
  'primary': '49834137413',
  'owner': '186454571@N08',
  'ownername': 'FracTracker_Alliance',
  'photo': [...],
  'page': '1',
  'per_page': 500,
  'perpage': 500,
  'pages': 3,
  'title': 'Appalachian Buildout',
  'sorting_option_id': 'manual-add-to-end',
  'total': 1099},
 'stat': 'ok'}
 ```
### For a single album, pull all assets 


In [25]:
photos : JSONObject = flickr.photosets.getPhotos(photoset_id=a_single_album)
with open('photos.json', 'w') as file:
    json.dump(photosets, file, indent=4)

In [26]:

pg_start,pg_end = photos['photoset']['page'],photos['photoset']['pages']


In [28]:
start = time.time()

# for a single page of the single album
photos = flickr.photosets.getPhotos(photoset_id=a_single_album, per_page=500, page=1)

attributes = []
album_title = photos['photoset']['title']

# Create photo_info dictionary
for photo in photos['photoset']['photo']:
    photo_id = photo['id']
    photo_info = flickr.photos.getInfo(photo_id=photo_id)['photo']

    # Extract basic attributes
    title = photo_info['title']['_content']
    description = photo_info['description']['_content']
    url = f"https://www.flickr.com/photos/fractracker/{photo_id}/in/album-{a_single_album}"

    # Extract latitude and longitude if available
    latitude = None
    longitude = None
    if 'location' in photo_info:
        location_info = photo_info['location']
        if 'latitude' in location_info and 'longitude' in location_info:
            latitude = location_info['latitude']
            longitude = location_info['longitude']

    attributes.append({
        'PhotoID': photo_id,
        'Title': title,
        'Description': description,
        'URL': url,
        'Latitude': latitude,
        'Longitude': longitude,
        'AlbumID': a_single_album,
        'AlbumTitle': album_title
    })

end = time.time()
elapsed = end - start
print(f'Time taken: {elapsed:.6f} seconds')

Time taken: 130.976738 seconds


In [29]:
pic_ct_per_album = {attr['title']['_content']:attr['count_photos'] for attr in photosets['photosets']['photoset']}
pic_ct_per_album

In [30]:
x=sum(pic_ct_per_album.values())

In [31]:
# Given, 500 photos took 241 sec
((x/500)*elapsed)/60

88.93757113891442

In [32]:
attributes

[{'PhotoID': '52748254383',
  'Title': 'TAuch_Infrastructure-Heritage_HazardousWaste_Incinerator-ColumbianaCounty-OH_March2023',
  'Description': 'Photo citation: Ted Auch, FracTracker Alliance, 2023.\n\nEach photo label provides this information, explained below: \n<i>Photographer_topic-sitespecific-siteowner-county-state_partneraffiliation_date(version)</i>\n\nPhoto labels provide information about what the image shows and where it was made. The label may describe the type of infrastructure pictured, the environment the photo captures, or the type of operations pictured. For many images, labels also provide site-specific information, including operators and facility names, if it is known by the photographer. \n\nAll photo labels include location information, at the state and county levels, and at township/village levels if it is helpful. Please make use of the geolocation data we provide - especially helpful if you want to see other imagery made nearby! \n\nWe encourage you to reach 

In [33]:
df = pd.DataFrame(attributes)
df

Unnamed: 0,PhotoID,Title,Description,URL,Latitude,Longitude,AlbumID,AlbumTitle
0,52748254383,TAuch_Infrastructure-Heritage_HazardousWaste_I...,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.631783,-80.546412,72157715916543893,Appalachian Buildout
1,52748254428,TAuch_Infrastructure-Heritage_HazardousWaste_I...,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.630638,-80.545881,72157715916543893,Appalachian Buildout
2,52748254383,TAuch_Infrastructure-Heritage_HazardousWaste_I...,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.631783,-80.546412,72157715916543893,Appalachian Buildout
3,52748254428,TAuch_Infrastructure-Heritage_HazardousWaste_I...,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.630638,-80.545881,72157715916543893,Appalachian Buildout
4,52748254463,TAuch_Infrastructure-Heritage_HazardousWaste_I...,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.631202,-80.545584,72157715916543893,Appalachian Buildout
...,...,...,...,...,...,...,...,...
497,49770461616,TAuch_Infrastructure-ImpoundmentPond_FrackPad-...,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/4977...,41.295671,-77.391300,72157715916543893,Appalachian Buildout
498,49769921923,TAuch_Infrastructure-ImpoundmentPond-LycomingC...,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/4976...,41.291399,-77.392866,72157715916543893,Appalachian Buildout
499,49769921573,TAuch_Infrastructure-ImpoundmentPond-LycomingC...,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/4976...,41.276419,-77.387866,72157715916543893,Appalachian Buildout
500,49749824313,TAuch_Infrastructure-FrackPad-LycomingCounty_P...,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/4974...,41.387659,-77.509059,72157715916543893,Appalachian Buildout


In [None]:
pg_start,pg_end,first_lst=1,1,500
for a_id in album_ids:
    photos : JSONObject = flickr.photosets.getPhotos(photoset_id=a_id, per_page=first_lst, page=pg_start)
    pg_start,pg_end = photos['photoset']['page'],photos['photoset']['pages']
    attributes = []
    album_title = photos['photoset']['title']
    for album_pg in range(pg_start,pg_end+1):