In [1]:
import flickrapi, os, json, time
from flickrapi.exceptions import FlickrError
import pandas as pd
from dotenv import load_dotenv
from typing import Union, Dict, List

# Load variables defined in the local .env file and print load_dotenv()'s return value.
# NOTE: load_dotenv() is called inside the f-string, so the load itself happens as a
# side effect of this print; removing the print would also remove the load.
print(f"Local folder defined env variables found?.... {load_dotenv()=}")

# Recursive type aliases for the parsed-JSON payloads used in this notebook.
# Only str and int scalars are listed. The quoted names are forward references
# because JSONArray and JSONObject are defined on the lines that follow.
JSONVal = Union[str, int, 'JSONArray', 'JSONObject']
JSONArray = List[JSONVal]
JSONObject = Dict[str, JSONVal]

Local folder defined env variables found?.... load_dotenv()=True


In [2]:
# Load in dot env variables.
# os.getenv returns None for an unset variable, which would only surface later as an
# obscure failure inside the Flickr client. Check up front and name what is missing.
_REQUIRED_ENV_VARS = ('SECRET', 'KEY', 'ORG_USER_NAME')
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env_vars)}. "
        "Define them in the local .env file before running this notebook."
    )

secret : str = os.environ['SECRET']
key : str = os.environ['KEY']
USER_NAME : str = os.environ['ORG_USER_NAME']

In [3]:
# Log in and create user ID object.
# The annotation names the class on the imported `flickrapi` module; it previously
# read `flickr.FlickrAPI`, i.e. it referred to the very variable being defined here.
flickr : flickrapi.FlickrAPI = flickrapi.FlickrAPI(key, secret, format='parsed-json')
# FracTracker Org Credentials: look the account up by username and keep its id,
# which the album listing call takes as `user_id`.
user_info : JSONObject = flickr.people.findByUsername(username=USER_NAME)
user_id : str = user_info['user']['id']

In [4]:
# Get the list of FracTracker's albums (the API calls them "photosets").
# NOTE(review): only one getList call is made here; if the API paginates this
# response, albums beyond the first page would be missed — confirm for this account.
photosets : JSONObject = flickr.photosets.getList(user_id=user_id)

In [5]:
# Collect the id of every album entry in the photosets.getList response.
album_ids : List[str] = []
for album_meta in photosets['photosets']['photoset']:
    album_ids.append(album_meta['id'])

In [None]:
MAX_ATTEMPTS = 3         # total getInfo tries per photo before the photo is skipped
RETRY_WAIT_SECONDS = 2   # pause between consecutive tries
PER_PAGE = 500           # page size requested from photosets.getPhotos


def fetch_photo_info(photo_id, max_attempts=MAX_ATTEMPTS, wait_seconds=RETRY_WAIT_SECONDS):
    """Fetch the ``photo`` payload of ``flickr.photos.getInfo`` with bounded retries.

    Args:
        photo_id: id of the photo to look up.
        max_attempts: total number of calls to make before giving up.
        wait_seconds: seconds to sleep between a failed call and the next one.

    Returns:
        The ``'photo'`` entry of the parsed response, or ``None`` when every
        attempt raised ``FlickrError`` (the caller then skips the photo).
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return flickr.photos.getInfo(photo_id=photo_id)['photo']
        except FlickrError as e:
            if attempt >= max_attempts:
                # Give up on this one photo instead of aborting the whole run.
                print(f"Encountered an error for photo_id {photo_id}: {e}. "
                      f"Skipping this photo after {attempt} failed attempts.")
                return None
            print(f"Encountered an error for photo_id {photo_id}: {e}. "
                  f"waiting {wait_seconds} seconds and then trying again")
            time.sleep(wait_seconds)
    return None  # only reached when max_attempts < 1


start_entire = time.time()
attributes = []
failed_photo_ids = []  # photos whose metadata could not be fetched; inspect after the run
for a_id in album_ids:
    # The first page is fetched up front because its response carries the page count.
    photos : JSONObject = flickr.photosets.getPhotos(photoset_id=a_id, page=1, per_page=PER_PAGE)
    for pg in range(1, photos['photoset']['pages']+1):
        print(f"ON page {pg=} of album id {photos['photoset']['id']}")
        if pg != 1:
            photos = flickr.photosets.getPhotos(photoset_id=a_id, page=pg, per_page=PER_PAGE)
        print(f"\t On this page of the album there are {len(photos['photoset']['photo'])}")
        album_title = photos['photoset']['title']
        start = time.time()
        for pic in photos['photoset']['photo']:
            photo_id = pic['id']
            photo_info = fetch_photo_info(photo_id)
            if photo_info is None:
                failed_photo_ids.append(photo_id)
                continue

            # Skip records with no location block at all.
            if 'location' not in photo_info:
                continue

            # Latitude/longitude stay None when the location block lacks either key.
            latitude = None
            longitude = None
            location_info = photo_info['location']
            if 'latitude' in location_info and 'longitude' in location_info:
                latitude = location_info['latitude']
                longitude = location_info['longitude']

            attributes.append({
                'PhotoID': photo_id,
                'Title': photo_info['title']['_content'],
                'Date_taken': photo_info['dates']['taken'],
                'Description': photo_info['description']['_content'],
                'URL': f"https://www.flickr.com/photos/fractracker/{photo_id}/in/album-{a_id}",
                'Latitude': latitude,
                'Longitude': longitude,
                'AlbumID': a_id,
                'AlbumTitle': album_title,
            })
        end = time.time()
        elapsed = end - start
        elapsed_since_st = end - start_entire
        elapsed_since_st_MIN = elapsed_since_st/60
        print(f'\tTime taken for this page: {elapsed:.6f} seconds')
        print(f'\tTime taken since code start: {elapsed_since_st:.6f} seconds')
        print(f'\tTime taken since code start: {elapsed_since_st_MIN:.6f} min')

end_entire = time.time()
elapsed_entire = end_entire - start_entire
minn = elapsed_entire/60
print(f'Time taken for this entire snippet: {elapsed_entire:.6f} seconds')
# This value is elapsed_entire / 60, so it is labelled in minutes (was mislabelled "seconds").
print(f'Time taken for this entire snippet: {minn:.6f} min')
if failed_photo_ids:
    print(f'{len(failed_photo_ids)} photo(s) were skipped after repeated errors; see failed_photo_ids')

ON page pg=1 of album id 72157715916543893
	 On this page of the album there are 500
	Time taken for this page: 144.489284 seconds
	Time taken since code start: 145.306736 seconds
	Time taken since code start: 2.421779 min
ON page pg=2 of album id 72157715916543893
	 On this page of the album there are 500
	Time taken for this page: 138.659861 seconds
	Time taken since code start: 285.017826 seconds
	Time taken since code start: 4.750297 min
ON page pg=3 of album id 72157715916543893
	 On this page of the album there are 99
	Time taken for this page: 28.831605 seconds
	Time taken since code start: 314.116588 seconds
	Time taken since code start: 5.235276 min
ON page pg=1 of album id 72157718955813667
	 On this page of the album there are 40
	Time taken for this page: 10.179086 seconds
	Time taken since code start: 324.860964 seconds
	Time taken since code start: 5.414349 min
ON page pg=1 of album id 72157715839488573
	 On this page of the album there are 500
	Time taken for this page: 

In [None]:
# Number of photo records collected by the scrape loop (one dict per appended photo).
len(attributes)

In [None]:
# Turn the list of per-photo dicts into a DataFrame (one row per dict, keys become columns),
# write the raw pull to simple_pull.csv without the index column, then display the frame.
df = pd.DataFrame(attributes)
df.to_csv('simple_pull.csv', index=False)
df

In [None]:
# Spot-check: Description of the first row.
df['Description'].iloc[0]

In [None]:
# Spot-check: Title of the first row.
df['Title'].iloc[0]

In [None]:
# Exploratory check of the object type that value_counts() returns.
type(df['PhotoID'].value_counts())

In [None]:
# Rows per PhotoID: get a sense of the duplicate photo ids.
# A count above 1 means the same photo was appended more than once by the scrape loop.
df['PhotoID'].value_counts()

In [None]:
# Rows per Description: get a sense of the duplicate descriptions.
df['Description'].value_counts()

In [None]:
# Number of distinct descriptions (the original note recorded 47 for the run it was written against).
df['Description'].nunique()

In [None]:
# Column dtypes of the raw pull.
df.dtypes

In [None]:
# Gather, for each PhotoID, the list of album titles it appears under.
album_titles_by_photo = df.groupby('PhotoID')['AlbumTitle']
album_title_lists = album_titles_by_photo.agg(list)
grouped_df = album_title_lists.reset_index()
# Notice the length of this dataframe matches the length of df['PhotoID'].value_counts() from above.
grouped_df

In [None]:
# Piece back the dataframe's attributes without 'AlbumTitle' / 'AlbumID'.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a second
# run the two columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['AlbumTitle', 'AlbumID'], errors='ignore')
# keep='first' retains only the first row per PhotoID; if a photo's rows differed in
# another column (e.g. Description), the later values are discarded.
df = df.drop_duplicates(subset='PhotoID', keep='first')
df

In [None]:
# grouped_df and the de-duplicated df each hold one row per PhotoID, so join them:
# a left join on PhotoID attaches the remaining photo attributes to each album-title list.
merged_df = grouped_df.merge(df, on='PhotoID', how='left')
merged_df

In [None]:
# Persist the merged, one-row-per-photo table to tidied_data.csv without the index column.
merged_df.to_csv('tidied_data.csv', index=False)