In [1]:
import flickrapi, os, json, time
from flickrapi.exceptions import FlickrError
import pandas as pd
from dotenv import load_dotenv
from typing import Union, Dict, List

# Load variables defined in the local .env file and print load_dotenv()'s return value.
print(f"Local folder defined env variables found?.... {load_dotenv()=}")

# Recursive type aliases for a decoded JSON document.
# A JSON value is a scalar (str, int, float, bool, None), an array of values,
# or an object mapping string keys to values. The forward references are
# strings because JSONArray / JSONObject are defined just below.
JSONVal = Union[str, int, float, bool, None, 'JSONArray', 'JSONObject']
JSONArray = List[JSONVal]
JSONObject = Dict[str, JSONVal]

Local folder defined env variables found?.... load_dotenv()=True


In [2]:
# Load the API credentials and account name from environment variables.
def _require_env(name: str) -> str:
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing
            credential is reported here rather than as an obscure failure
            inside a later API call.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; define it in the local .env file"
        )
    return value


secret : str = _require_env('SECRET')
key : str = _require_env('KEY')
USER_NAME : str = _require_env('ORG_USER_NAME')

In [3]:
# Create the Flickr API client and look up the account's user ID.
# The annotation must name the `flickrapi` module: the previous `flickr.FlickrAPI`
# referred to the variable being assigned and only ran because Python binds the
# value before evaluating the annotation.
flickr : flickrapi.FlickrAPI = flickrapi.FlickrAPI(key, secret, format='parsed-json')
# FracTracker Org Credentials
user_info : JSONObject = flickr.people.findByUsername(username=USER_NAME)
user_id : str = user_info['user']['id']

In [4]:
# Get the list of FracTracker's albums (Flickr "photosets") for the resolved user ID.
photosets : JSONObject = flickr.photosets.getList(user_id=user_id)

In [5]:
# Pull the 'id' field out of every album record in the getList response.
album_ids : List[str] = [
    photoset_record['id']
    for photoset_record in photosets['photosets']['photoset']
]

In [6]:
# Pull metadata for every photo in every album and keep the ones that carry
# location data. One dict per (photo, album) pair is appended to `attributes`.
PER_PAGE = 500             # page size requested from photosets.getPhotos
MAX_INFO_ATTEMPTS = 3      # total photos.getInfo attempts per photo before it is skipped
RETRY_DELAY_SECONDS = 2    # pause between failed attempts


def _fetch_photo_info(photo_id, cache):
    """Return the 'photo' payload of photos.getInfo for `photo_id`, or None.

    Args:
        photo_id: Flickr photo id taken from a photosets.getPhotos record.
        cache: dict mapping photo id -> previously fetched payload. The same
            photo is listed in several albums, so this avoids asking the API
            for identical data again. It is updated in place on success.

    Returns:
        The payload dict, or None when every attempt raised FlickrError.
        Failures are not cached, so a later album gets a fresh try.
    """
    if photo_id in cache:
        return cache[photo_id]
    for attempt in range(1, MAX_INFO_ATTEMPTS + 1):
        try:
            info = flickr.photos.getInfo(photo_id=photo_id)['photo']
        except FlickrError as e:
            if attempt == MAX_INFO_ATTEMPTS:
                # Give up on this photo instead of aborting the whole pull.
                print(f"Encountered an error for photo_id {photo_id}: {e}. Skipping this photo.")
                return None
            print(f"Encountered an error for photo_id {photo_id}: {e}. "
                  f"waiting {RETRY_DELAY_SECONDS} seconds and then trying again")
            time.sleep(RETRY_DELAY_SECONDS)
        else:
            cache[photo_id] = info
            return info
    return None


start_entire = time.time()
attributes = []
photo_info_cache = {}
for a_id in album_ids:
    # The first page is fetched up front because it also reports the page count.
    photos : JSONObject = flickr.photosets.getPhotos(photoset_id=a_id, page=1, per_page=PER_PAGE)
    for pg in range(1, photos['photoset']['pages'] + 1):
        print(f"ON page {pg=} of album id {photos['photoset']['id']}")
        if pg != 1:
            photos = flickr.photosets.getPhotos(photoset_id=a_id, page=pg, per_page=PER_PAGE)
        print(f"\t On this page of the album there are {len(photos['photoset']['photo'])}")
        album_title = photos['photoset']['title']
        start = time.time()
        for pic in photos['photoset']['photo']:
            photo_id = pic['id']
            photo_info = _fetch_photo_info(photo_id, photo_info_cache)
            if photo_info is None:
                continue  # every attempt failed; already reported above

            if 'location' not in photo_info:
                continue  # skip records with no location

            # Coordinates stay None when the location record lacks either one.
            location_info = photo_info['location']
            has_coords = 'latitude' in location_info and 'longitude' in location_info
            latitude = location_info['latitude'] if has_coords else None
            longitude = location_info['longitude'] if has_coords else None

            attributes.append({
                'PhotoID': photo_id,
                'Title': photo_info['title']['_content'],
                'Date_taken': photo_info['dates']['taken'],
                'Description': photo_info['description']['_content'],
                'URL': f"https://www.flickr.com/photos/fractracker/{photo_id}/in/album-{a_id}",
                'Latitude': latitude,
                'Longitude': longitude,
                'AlbumID': a_id,
                'AlbumTitle': album_title,
            })
        end = time.time()
        elapsed = end - start
        elapsed_since_st = end - start_entire
        elapsed_since_st_MIN = elapsed_since_st/60
        print(f'\tTime taken for this page: {elapsed:.6f} seconds')
        print(f'\tTime taken since code start: {elapsed_since_st:.6f} seconds')
        print(f'\tTime taken since code start: {elapsed_since_st_MIN:.6f} min')

end_entire = time.time()
elapsed_entire = end_entire - start_entire
print(f'Time taken for this entire snippet: {elapsed_entire:.6f} seconds')

ON page pg=1 of album id 72157715916543893
	 On this page of the album there are 500
	Time taken for this page: 147.654266 seconds
	Time taken since code start: 148.437316 seconds
	Time taken since code start: 2.473955 min
ON page pg=2 of album id 72157715916543893
	 On this page of the album there are 500
	Time taken for this page: 167.837942 seconds
	Time taken since code start: 316.801038 seconds
	Time taken since code start: 5.280017 min
ON page pg=3 of album id 72157715916543893
	 On this page of the album there are 99
	Time taken for this page: 23.962692 seconds
	Time taken since code start: 341.577994 seconds
	Time taken since code start: 5.692967 min
ON page pg=1 of album id 72157718955813667
	 On this page of the album there are 40
	Time taken for this page: 10.447698 seconds
	Time taken since code start: 352.415282 seconds
	Time taken since code start: 5.873588 min
ON page pg=1 of album id 72157715839488573
	 On this page of the album there are 500
	Time taken for this page: 

In [7]:
# Number of records collected by the pull loop (one per photo per album it was found in).
len(attributes)

18453

In [30]:
# Build a DataFrame from the collected records and save this raw pull to CSV
# (index=False leaves the row index out of the file).
df = pd.DataFrame(attributes)
df.to_csv('simple_pull.csv', index=False)
df

Unnamed: 0,PhotoID,Title,Date_taken,Description,URL,Latitude,Longitude,AlbumID,AlbumTitle
0,52748254383,TAuch_Infrastructure-Heritage_HazardousWaste_I...,2023-03-09 12:24:13,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.631783,-80.546412,72157715916543893,Appalachian Buildout
1,52748254428,TAuch_Infrastructure-Heritage_HazardousWaste_I...,2023-03-09 12:19:38,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.630638,-80.545881,72157715916543893,Appalachian Buildout
2,52748254463,TAuch_Infrastructure-Heritage_HazardousWaste_I...,2023-03-09 12:18:49,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.631202,-80.545584,72157715916543893,Appalachian Buildout
3,52748172745,TAuch_Infrastructure-Heritage_HazardousWaste_I...,2023-03-09 12:18:29,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.630791,-80.545256,72157715916543893,Appalachian Buildout
4,52748007544,TAuch_Infrastructure-Heritage_HazardousWaste_I...,2023-03-09 12:17:51,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.631625,-80.546289,72157715916543893,Appalachian Buildout
...,...,...,...,...,...,...,...,...,...
18448,53706158545,TAuch_Infrastructure-FracSand_Mine-HiCrush-Tre...,2024-05-05 11:05:59,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5370...,44.347402,-91.360048,72157715920546937,Wisconsin
18449,53704822652,TAuch_Infrastructure-FracSand_Mine-HiCrush-Tre...,2024-05-05 11:06:28,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5370...,44.348458,-91.363195,72157715920546937,Wisconsin
18450,53705722411,TAuch_Infrastructure-FracSand_RailSpur-HiCrush...,2024-05-05 11:35:50,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5370...,44.352116,-91.391448,72157715920546937,Wisconsin
18451,53705936348,TAuch_Infrastructure-FracSand_RailSpur-HiCrush...,2024-05-05 11:40:50,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5370...,44.352547,-91.395617,72157715920546937,Wisconsin


In [31]:
# Inspect the full Description text of the first row (the table view truncates it).
df['Description'].iloc[0]

'Photo citation: Ted Auch, FracTracker Alliance, 2023.\n\nEach photo label provides this information, explained below: \n<i>Photographer_topic-sitespecific-siteowner-county-state_partneraffiliation_date(version)</i>\n\nPhoto labels provide information about what the image shows and where it was made. The label may describe the type of infrastructure pictured, the environment the photo captures, or the type of operations pictured. For many images, labels also provide site-specific information, including operators and facility names, if it is known by the photographer. \n\nAll photo labels include location information, at the state and county levels, and at township/village levels if it is helpful. Please make use of the geolocation data we provide - especially helpful if you want to see other imagery made nearby! \n\nWe encourage you to reach out to us about any imagery you wish to make use of, so that we can assist you in finding the best snapshots for your purposes, and so we can furt

In [32]:
# Inspect the full Title of the first row (the table view truncates it).
df['Title'].iloc[0]

'TAuch_Infrastructure-Heritage_HazardousWaste_Incinerator-ColumbianaCounty-OH_March2023'

In [10]:
# Check what value_counts() returns (a pandas Series, per the recorded output).
type(df['PhotoID'].value_counts())

pandas.core.series.Series

In [33]:
# Rows per PhotoID: a count above 1 means the same photo was collected more than once
# (the pull loop appends one row per album a photo is listed in).
df['PhotoID'].value_counts() # get a sense of the duplicate photo ids

PhotoID
53184405947    17
53185048804    17
53184985776    17
53185189725    17
53185356068    17
               ..
52405982252     2
51438195404     1
51437688658     1
51438414330     1
51438414365     1
Name: count, Length: 2718, dtype: int64

In [34]:
# Rows per distinct Description text, most frequent first.
df['Description'].value_counts() # get a sense of the duplicate descriptions 

Description
Photo citation: Ted Auch, FracTracker Alliance, 2023.\n\nEach photo label provides this information, explained below: \n<i>Photographer_topic-sitespecific-siteowner-county-state_partneraffiliation_date(version)</i>\n\nPhoto labels provide information about what the image shows and where it was made. The label may describe the type of infrastructure pictured, the environment the photo captures, or the type of operations pictured. For many images, labels also provide site-specific information, including operators and facility names, if it is known by the photographer. \n\nAll photo labels include location information, at the state and county levels, and at township/village levels if it is helpful. Please make use of the geolocation data we provide - especially helpful if you want to see other imagery made nearby! \n\nWe encourage you to reach out to us about any imagery you wish to make use of, so that we can assist you in finding the best snapshots for your purposes, and so 

In [35]:
# Number of distinct Description texts (47 in the recorded output).
df['Description'].nunique()

47

In [21]:
# Column dtypes. Per the recorded output every column is `object`, including
# Latitude, Longitude and Date_taken, i.e. nothing has been converted to numeric/datetime.
df.dtypes

PhotoID        object
Title          object
Date_taken     object
Description    object
URL            object
Latitude       object
Longitude      object
AlbumID        object
AlbumTitle     object
dtype: object

In [36]:
# For every photo, gather the titles of all albums it was collected from.
album_titles_by_photo = df.groupby('PhotoID')['AlbumTitle']
grouped_df = album_titles_by_photo.agg(list).reset_index()
# One row per distinct PhotoID, so the length matches df['PhotoID'].value_counts().
grouped_df

Unnamed: 0,PhotoID,AlbumTitle
0,49727911618,"[Frac Sand Mining, Wisconsin]"
1,49727914298,"[Frac Sand Mining, Illinois, Impoundment Ponds..."
2,49727914638,"[Frac Sand Mining, Frac Sand Transportation, I..."
3,49727915018,"[Frac Sand Mining, Illinois, Impoundment Ponds..."
4,49727915083,"[Frac Sand Mining, Illinois, Impoundment Ponds..."
...,...,...
2713,53706158340,"[Communities, Culture & Livelihoods, Environme..."
2714,53706158375,"[Communities, Culture & Livelihoods, Environme..."
2715,53706158545,"[Communities, Culture & Livelihoods, Environme..."
2716,53706158630,"[Communities, Culture & Livelihoods, Environme..."


In [37]:
# Reduce df to one row per photo, without the album columns.
# errors='ignore' keeps this cell re-runnable: df is rebound here, so on a second
# run the album columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['AlbumTitle', 'AlbumID'], errors='ignore')
# keep='first' retains the first row seen for each PhotoID, so Description and URL
# come from that row only (the URL embeds the album id of that first row).
df = df.drop_duplicates(subset='PhotoID', keep='first')
df

Unnamed: 0,PhotoID,Title,Date_taken,Description,URL,Latitude,Longitude
0,52748254383,TAuch_Infrastructure-Heritage_HazardousWaste_I...,2023-03-09 12:24:13,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.631783,-80.546412
1,52748254428,TAuch_Infrastructure-Heritage_HazardousWaste_I...,2023-03-09 12:19:38,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.630638,-80.545881
2,52748254463,TAuch_Infrastructure-Heritage_HazardousWaste_I...,2023-03-09 12:18:49,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.631202,-80.545584
3,52748172745,TAuch_Infrastructure-Heritage_HazardousWaste_I...,2023-03-09 12:18:29,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.630791,-80.545256
4,52748007544,TAuch_Infrastructure-Heritage_HazardousWaste_I...,2023-03-09 12:17:51,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5274...,40.631625,-80.546289
...,...,...,...,...,...,...,...
11686,49834943141,TAuch_Transportation-Waste_BrineHauler-Malaga_...,2017-05-03 11:52:06,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/4983...,39.868114,-81.176025
12345,52404379539,TAuch_Infrastructure-Water_Pipeline_GaiaPad-We...,2022-09-30 09:51:45,"Photo citation: Ted Auch, 2022. Photo courtesy...",https://www.flickr.com/photos/fractracker/5240...,40.377602,-79.626473
16784,51438195404,TAuch_Infrastructure_Ship-SarniaHarbor-Ontario...,2021-09-03 09:42:49,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5143...,42.983333,-82.418811
16785,51438414330,TAuch_Infrastructure_Ship-SarniaHarbor-StClair...,2021-09-03 09:42:33,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5143...,42.985735,-82.421514


In [38]:
# grouped_df has one row per PhotoID; df is expected to have been reduced to one
# row per PhotoID as well, so a left join on PhotoID pairs each album list with
# that photo's metadata.
merged_df = grouped_df.merge(df, how='left', on='PhotoID')
merged_df

Unnamed: 0,PhotoID,AlbumTitle,Title,Date_taken,Description,URL,Latitude,Longitude
0,49727911618,"[Frac Sand Mining, Wisconsin]",TAuch_FracSand-Mine-WisconsinProppants-Hixton_...,2019-06-05 14:07:05,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/4972...,44.376922,-91.069461
1,49727914298,"[Frac Sand Mining, Illinois, Impoundment Ponds...",TAuch_FracSand-Mine-USSilica-Ottawa_IL_LightHa...,2018-05-24 09:19:50,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/4972...,41.349366,-88.865747
2,49727914638,"[Frac Sand Mining, Frac Sand Transportation, I...",TAuch_FracSand-Mine-USSilica-Ottawa_IL_LightHa...,2016-06-24 10:47:44,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/4972...,41.349366,-88.865747
3,49727915018,"[Frac Sand Mining, Illinois, Impoundment Ponds...",TAuch_FracSand-Mine-Unimin-NorthUtica_IL_Light...,2016-06-24 10:36:01,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/4972...,41.339217,-89.001553
4,49727915083,"[Frac Sand Mining, Illinois, Impoundment Ponds...",TAuch_FracSand-Mine-Unimin-NorthUtica_IL_June2016,2016-06-24 12:13:10,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/4972...,41.351629,-88.992208
...,...,...,...,...,...,...,...,...
2713,53706158340,"[Communities, Culture & Livelihoods, Environme...",TAuch_Infrastructure-FracSand_RailSpur-Chiefta...,2024-05-03 15:37:18,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5370...,45.257286,-91.609020
2714,53706158375,"[Communities, Culture & Livelihoods, Environme...",TAuch_Infrastructure-FracSand_RailSpur-Norther...,2024-05-03 15:24:13,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5370...,45.281691,-91.626303
2715,53706158545,"[Communities, Culture & Livelihoods, Environme...",TAuch_Infrastructure-FracSand_Mine-HiCrush-Tre...,2024-05-05 11:05:59,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5370...,44.347402,-91.360048
2716,53706158630,"[Communities, Culture & Livelihoods, Environme...",TAuch_Infrastructure-FracSand_Mine-HiCrush-Tre...,2024-05-05 11:03:04,"Photo citation: Ted Auch, FracTracker Alliance...",https://www.flickr.com/photos/fractracker/5370...,44.354636,-91.362434


In [39]:
# Save the tidied table: one row per photo together with the list of albums it
# appears in. Writing `df` here would drop the album information that the merge
# added. AlbumTitle holds Python lists, which to_csv writes as their string form.
merged_df.to_csv('tidied_data.csv', index=False)