Retrieving information via the Flickr API
---

https://www.flickr.com/services/api/

In [3]:
import json
import socket
import webbrowser

import flickrapi
import pandas as pd

# Read the API credentials and the user ids from a local JSON file instead of
# hard-coding them. The context manager closes the file handle as soon as the
# JSON has been parsed.
with open("credentials.json") as f:
    creds = json.load(f)

API_KEY = creds["API_KEY"]
API_SECRET = creds["API_SECRET"]
USER_OF_INTEREST = creds["USER_OF_INTEREST"]
OWN_USER = creds["OWN_USER"]

# The cells below index API responses as nested dicts, hence 'parsed-json'.
flickr = flickrapi.FlickrAPI(API_KEY, API_SECRET, format='parsed-json')

# Only do this if we don't have a valid token already
if not flickr.token_valid(perms='read'):

    # Get a request token ('oob': the verifier is typed in by hand below)
    flickr.get_request_token(oauth_callback='oob')

    # Open a browser at the authentication URL
    authorize_url = flickr.auth_url(perms='read')
    webbrowser.open_new_tab(authorize_url)

    # Get the verifier code from the user; input() already returns a str,
    # strip() drops whitespace picked up when pasting the code
    verifier = input('Verifier code: ').strip()

    # Trade the request token for an access token
    flickr.get_access_token(verifier)

# Step 1: Get the IDs of commented photos and comments
**labeled as 1**

**GO STRAIGHT TO STEP 3, IF YOU HAVE THIS INFORMATION ALREADY IN A CSV FILE**

## A) Get IDs

Function: **flickr.activity.userComments**

This call always returns the authenticated user's **own** comments; it cannot retrieve comments made by another user.

In [None]:
%%time

flickr_activity_userComments = {}

for page in range(1,99999):

    flickr_activity_userComments[page] = flickr.activity.userComments(per_page = 50, page = page)
    
    if flickr_activity_userComments[page]["items"]["total"] == 0:
        break

# number of comments on specific page

for page in flickr_activity_userComments.keys():
    print("page",page,flickr_activity_userComments[page]["items"]["total"])

# the photos I commented on

photo_list = []

for page in flickr_activity_userComments.keys():
    for item in flickr_activity_userComments[page]["items"]["item"]:
        if (item["type"] == "photo") & ((item["owner"],item["id"]) not in photo_list):
            photo_list.append((item["owner"],item["id"]))
            
print(len(photo_list),"photos were commented.")

## B) Get comments to the photos

Function: **flickr.photos.comments.getList**

In [None]:
%%time

photo_list_with_comments = []

for owner,photo_id in photo_list[:]:

    comments = flickr.photos.comments.getList(photo_id = photo_id)

    for com in comments["comments"]["comment"]:
        if com["author"] == OWN_USER:
            photo_list_with_comments.append((owner,photo_id,com["_content"]))
            
print(len(photo_list_with_comments),"comments were added to the list.")

In [None]:
# Tabulate the collected (owner, photo_id, comment) triples, one row per comment.
comment_columns = ["owner", "photo_id", "comment"]
df = pd.DataFrame(data=photo_list_with_comments, columns=comment_columns)
df

# Step 2: Get images from the user's groups that were seen but not commented on

**labeled as 0**

**GO STRAIGHT TO STEP 3, IF YOU HAVE THIS INFORMATION ALREADY IN A CSV FILE**

Function: **flickr.people.getPublicGroups**

In [365]:
# Query the public groups of OWN_USER and reduce each entry of the response
# to an (nsid, name) pair.
groups_response = flickr.people.getPublicGroups(
    user_id=OWN_USER,
    invitation_only=True,
)
groups = [
    (entry["nsid"], entry["name"])
    for entry in groups_response["groups"]["group"]
]
print("found",len(groups),"groups")

found 17 groups


[('14607632@N20', 'Another Planet'),
 ('2673387@N25', 'APF Magazine Street Photography Group'),
 ('1572172@N24', 'Beyond Obvious - Photomind'),
 ('4643780@N22', 'BUDAPESTREET'),
 ('2019080@N21', 'Color Street Photography 365'),
 ('2602096@N23', 'EASTREET'),
 ('472951@N22', 'Fotografi di Strada'),
 ('474929@N22', 'Fotografía en cuarentena  [nombre de calle]'),
 ('1840958@N20', 'Gazpacho Photography'),
 ('94761711@N00', 'HCSP (Hardcore Street Photography)'),
 ('2746094@N20', 'InQuadra Street Photography Evolution'),
 ('868185@N20', 'la familia abrazada'),
 ('2995645@N25', 'Rambles'),
 ('1363754@N20', 'Small Growers Street Association'),
 ('2570428@N22', 'Street Minimalism | Color |'),
 ('1812671@N25', "Street Photographers' Salon"),
 ('1699853@N22', 'un-posed.com')]

In [421]:
%%time

group_photos = {}

for group_id,group_name in groups:

    for page in range (1,9999):

        try:
            len_before = len(group_photos.get(group_id))
        except:
            len_before = 0

        result = flickr.photos.search(group_id = group_id, per_page=500, page=page)
        found_photos = [(photo["owner"],photo["id"]) for photo in result["photos"]["photo"] if (photo["owner"],photo["id"]) not in group_photos]

        if len_before == 0:
            group_photos[group_id] = found_photos
        else:
            group_photos[group_id] = group_photos.get(group_id) + found_photos

        len_after = len(group_photos.get(group_id))

        if len_after == len_before:
            break

    print(group_name,"=>",len(group_photos.get(group_id)),"photos")

Another Planet => 384 photos
APF Magazine Street Photography Group => 616 photos
Beyond Obvious - Photomind => 1864 photos
BUDAPESTREET => 0 photos
Color Street Photography 365 => 1405 photos
EASTREET => 408 photos
Fotografi di Strada => 298 photos
Fotografía en cuarentena  [nombre de calle] => 2693 photos
Gazpacho Photography => 68 photos
HCSP (Hardcore Street Photography) => 3436 photos
InQuadra Street Photography Evolution => 1314 photos
la familia abrazada => 2783 photos
Rambles => 179 photos
Small Growers Street Association => 268 photos
Street Minimalism | Color | => 1439 photos
Street Photographers' Salon => 1257 photos
un-posed.com => 1704 photos
CPU times: user 779 ms, sys: 71.1 ms, total: 850 ms
Wall time: 2min 2s


In [431]:
%%time

# make 1 list with all photos no matter which group they, and drop duplicates

photo_list_groups = []

for group in group_photos:
    for photo in group_photos[group]:
        if photo not in photo_list_groups:
            photo_list_groups.append(photo)
            
print(len(photo_list_groups),"photos from groups added.\n")

16116 photos from groups added.

CPU times: user 6.3 s, sys: 0 ns, total: 6.3 s
Wall time: 6.31 s


In [None]:
# Tabulate the de-duplicated group photos, one (owner, photo_id) row per photo.
group_columns = ["owner", "photo_id"]
df_groups = pd.DataFrame(data=photo_list_groups, columns=group_columns)
df_groups

# Step 3: Merge the two lists and save to disk

**Uncomment what's needed**

In [113]:
# Reuse df_all when it already lives in the kernel; otherwise build it here so
# that this cell also runs after a kernel restart without hand-editing.
if "df_all" not in globals():
    if "df" in globals() and "df_groups" in globals():
        # Steps 1 and 2 were run in this session: merge their results.
        # ignore_index gives every row its own label; for a repeated
        # (owner, photo_id) pair the first occurrence is kept.
        df_all = pd.concat([df,df_groups],sort=False,ignore_index=True)
        df_all = df_all.drop_duplicates(["owner","photo_id"],keep="first")
    else:
        # Steps 1 and 2 were skipped: load the previously saved table.
        df_all = pd.read_csv("./data/flickr_approach_A.csv",sep="\t",dtype="str")

# Uncomment to write a freshly merged table to disk right away
# (this overwrites the existing CSV):
#df_all.to_csv("./data/flickr_approach_A.csv",sep="\t",index=False)

photo_list_all = [tuple(row) for row in df_all[["owner","photo_id"]].values]

# Step 4: Download all the images in JPG

In [106]:
#pip install httplib2
#pip install bs4
#pip install urllib3

import httplib2
from bs4 import BeautifulSoup, SoupStrainer
import urllib.request
# Install a process-wide urllib opener that sends a browser-like User-agent
# header with every request made through urllib.request.
USER_AGENT_HEADER = ('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0')

opener = urllib.request.build_opener()
opener.addheaders = [USER_AGENT_HEADER]
urllib.request.install_opener(opener)

from bs4 import SoupStrainer

# Restrict parsing to <img> tags carrying the class "main-photo".
strainer = SoupStrainer(
    'img',
    attrs={'class': 'main-photo'},
)

class Extractor():
    """Scrape image links from web pages and download image files.

    The methods keep no state on the instance; each call opens its own
    ``httplib2.Http`` client.
    """

    def get_links(self, url):
        """Return the ``src`` value of every ``<img>`` tag found at ``url``.

        Args:
            url: address of the page to fetch.

        Returns:
            list of ``src`` attribute values in document order. ``<img>`` tags
            without a ``src`` attribute are skipped.
        """
        http = httplib2.Http()
        response, content = http.request(url)

        # Name the parser explicitly (same one get_link uses) so the result
        # does not depend on which optional parsers happen to be installed.
        images = BeautifulSoup(content, 'html.parser').find_all('img')

        # Indexing a tag that has no src would raise KeyError, so filter first.
        return [image['src'] for image in images if image.has_attr('src')]

    def get_link(self,url):
        """
        return only main-photo link

        Args:
            url: address of the photo page to fetch.

        Returns:
            ``"https:"`` followed by the ``src`` of the first tag that passes
            the module-level ``strainer``, or ``""`` when no such tag exists
            or it has no ``src`` attribute.
        """
        http = httplib2.Http()
        response, content = http.request(url)

        main_photo = BeautifulSoup(content, 'html.parser', parse_only=strainer).find()

        # find() yields None when nothing matched; report that as "no link"
        # explicitly rather than through a catch-all except.
        if main_photo is None or not main_photo.has_attr("src"):
            return ""

        return "https:" + main_photo["src"]

    def get_images(self, image_links, filename):
        """Download the given links to ``./img_data/<filename>``.

        Every link is prefixed with ``"https:"``. Links ending in ``"_n.jpg"``
        are skipped. All downloads of one call are written to the same target
        path, so a later link overwrites an earlier one. Download errors are
        printed and do not interrupt the loop.

        Args:
            image_links: iterable of link strings without the scheme prefix.
            filename: name of the target file inside ``./img_data/``.

        Returns:
            None
        """
        for link in image_links:

            image_url = "https:" + link

            if link.endswith("_n.jpg"):
                continue

            try:
                urllib.request.urlretrieve(image_url, filename="./img_data/"+filename)

            # Subclasses of URLError come first so they get their own message.
            except urllib.error.ContentTooShortError:
                print("content too short error")
            except urllib.error.HTTPError as e:
                print(e)
            except urllib.error.URLError:
                print("failed to download!")
            # Needs `import socket` at the top of the notebook; without it an
            # error reaching this clause surfaces as a NameError instead.
            except socket.timeout:
                print("socket timeout")
            except Exception as ee:
                print(ee)

## Download

Includes a check for whether the images are already present in the folder.

Alternative approach: first collect only the direct image links and store them in the pandas DataFrame.

Estimated time for scraping 16,300 image urls: 6.3h

(only the missing urls are actually searched)

In [104]:
%%time

import os

file_path = "./img_data/"
files = os.listdir(file_path)
photo_list_missing = [photo for photo in photo_list_all if photo[1]+".jpg" not in files]

myextractor = Extractor()

baseurl = "https://www.flickr.com/photos/"

for i in df_all.index:
    
    if pd.isna(df_all.loc[i,"url"]):
        url = f"""{baseurl}{df_all.loc[i,"owner"]}/{df_all.loc[i,"photo_id"]}"""
        image_link = myextractor.get_link(url)
        df_all.loc[i,"url"] = image_link 
    else:
        ...

KeyboardInterrupt: 

In [120]:
# Write the merged table, including the scraped urls, to disk as a
# tab-separated file without the index column.
df_all.to_csv(
    "./data/flickr_approach_A.csv",
    sep="\t",
    index=False,
)

# Next Step: pre-process images

1) Crop/Scale to same dimensions  
2) Drop portrait and square format images