Retrieving information via the Flickr API
---

https://www.flickr.com/services/api/

In [None]:
import json
import socket
import webbrowser

import flickrapi
import pandas as pd

# Load the API credentials and the user IDs from a local JSON file so that
# no secret is hard-coded in the notebook.
# The context manager closes the file handle even if JSON parsing fails
# (the original left the handle open for the lifetime of the kernel).
with open("credentials.json") as f:
    creds = json.load(f)

API_KEY = creds["API_KEY"]
API_SECRET = creds["API_SECRET"]
USER_OF_INTEREST = creds["USER_OF_INTEREST"]
OWN_USER = creds["OWN_USER"]

# Build the API client. With format='parsed-json' the responses are indexed
# as plain dicts/lists throughout the rest of this notebook.
flickr = flickrapi.FlickrAPI(API_KEY, API_SECRET, format='parsed-json')

# Only do this if we don't have a valid token already
if not flickr.token_valid(perms='read'):

    # Get a request token ('oob' = no callback URL; the user types the code in below)
    flickr.get_request_token(oauth_callback='oob')

    # Open a browser at the authentication URL
    authorize_url = flickr.auth_url(perms='read')
    webbrowser.open_new_tab(authorize_url)

    # Get the verifier code from the user (blocks until something is entered)
    verifier = str(input('Verifier code: '))

    # Trade the request token for an access token
    flickr.get_access_token(verifier)

# Step 1: Get the IDs of a large pool of photos
**not yet labeled**

We use the photos from the pools of the user's public groups.

Function: **flickr.people.getPublicGroups**

**GO STRAIGHT TO STEP 3 IF YOU ALREADY HAVE THIS INFORMATION IN A CSV FILE**

## A) Get IDs

In [None]:
# Ask Flickr for the public groups of USER_OF_INTEREST and keep only the
# (group id, group name) pair of each one.
groups = [
    (entry["nsid"], entry["name"])
    for entry in flickr.people.getPublicGroups(user_id=USER_OF_INTEREST)["groups"]["group"]
]
print("found",len(groups),"groups")

In [None]:
%%time

# Collect the public photos of every group pool.
# Result: group_photos maps group id -> list of (owner, photo_id) tuples.
group_photos = {}

for group_id, group_name in groups:

    try:
        # Cheap probe request: it raises if the pool cannot be read, which
        # sends us to the except branch before any paging starts.
        flickr.photos.search(group_id=group_id, per_page=1, page=1)

        print(group_name)

        pool = []     # photos of this group, in the order they were found
        seen = set()  # same photos as a set, for duplicate detection

        for page in range(1, 100):  # limit to 99 pages

            len_before = len(pool)

            result = flickr.photos.search(group_id=group_id, per_page=500, page=page,
                                          content_type=1, privacy_filter=1)

            for photo in result["photos"]["photo"]:
                key = (photo["owner"], photo["id"])
                # The original tested `key not in group_photos`, i.e. against the
                # dict's group ids, so the check never filtered anything. Testing
                # against the photos already collected makes a repeated page add
                # nothing, which is what the stop condition below relies on.
                if key not in seen and photo["ispublic"] == 1:
                    seen.add(key)
                    pool.append(key)

            group_photos[group_id] = pool

            len_after = len(pool)

            print("page:",page,", photos:",len_after)

            # A page that contributed nothing new means the pool is exhausted.
            if len_after == len_before:
                break

        print(group_name,"=>",len(pool),"photos")

    except Exception as err:
        # Best effort: skip this group and continue with the next one.
        # `Exception` (not a bare except) lets Ctrl-C still interrupt the run, and
        # the error text is shown because the failure may not be a permission issue.
        print(group_name,"=> no permission to view the pool", f"({err})")


In [None]:
%%time

# Make one list with all photos, no matter which group they came from,
# drop duplicates and leave out photos owned by USER_OF_INTEREST.
# dict.fromkeys de-duplicates in linear time and keeps first-seen order; the
# original `photo not in list` test rescanned the growing list for every photo.
photo_list_groups = list(dict.fromkeys(
    photo
    for pool in group_photos.values()
    for photo in pool
    if photo[0] != USER_OF_INTEREST
))

print(len(photo_list_groups),"photos from groups added.\n")

In [None]:
# Persist the photo pool and read it straight back, so that df_groups has the
# same all-string dtypes whether it was just scraped or loaded from disk.
df_groups = pd.DataFrame(photo_list_groups,columns=["owner","photo_id"])
save_path = "./flickr_approach_B_tmp.csv"
df_groups.to_csv(save_path,sep="\t",index=False)
# Read from the path just written (the original referenced an undefined `filename`).
df_groups = pd.read_csv(save_path, sep="\t", dtype="str")
df_groups

## B) Get comments to the photos

Function: **flickr.photos.comments.getList**

In [None]:
%%time

# Gather every comment that USER_OF_INTEREST wrote on a photo of the pool.
# Each hit is stored as (owner, photo_id, comment text).
photo_list_with_comments = []

for owner, photo_id in photo_list_groups:

    response = flickr.photos.comments.getList(photo_id=photo_id)
    entries = response["comments"].get("comment")

    # Photos without any comment have no "comment" key at all.
    if entries is None:
        continue

    photo_list_with_comments.extend(
        (owner, photo_id, entry["_content"])
        for entry in entries
        if entry["author"] == USER_OF_INTEREST
    )

print(len(photo_list_with_comments),"comments were added to the list.")

In [None]:
# Tabulate the collected comments: one row per comment by USER_OF_INTEREST.
df_comments = pd.DataFrame(photo_list_with_comments,columns=["owner","photo_id","comment"])
df_comments

# Step 3: Merge the two lists and save to disk

**Uncomment what's needed**

In [None]:
# Attach the comments to the photo pool. The left join keeps every pooled
# photo; photos without a comment get NaN in the "comment" column.
df = pd.merge(df_groups, df_comments, how="left", on=["owner","photo_id"])

df.to_csv("./data/flickr_approach_B.csv",sep="\t",index=False)
#df=pd.read_csv("./data/flickr_approach_B.csv",sep="\t",dtype="str")

# Rebuild the tuple list from the frame, with "" standing in for a missing comment.
photo_list_with_comments = list(map(tuple, df[["owner","photo_id","comment"]].fillna("").values))

In [None]:
# Show the merged photo/comment frame for inspection.
df

In [None]:
# Number of (owner, photo_id, comment) rows, including photos whose comment is "".
len(photo_list_with_comments)

# Step 4: Scrape more photos

## a) all users from previous search

In [None]:
# Distinct photo owners that are already present in df.
users_so_far = list(df.owner.unique())

## b) all users in the groups

### flickr.groups.members.getList

In [None]:
%%time

# Collect the NSIDs of the members of every group.
# The list is not de-duplicated here; users in several groups appear repeatedly.
users_from_groups = []

for group_id, group_name in groups:

    print(group_name)

    try:
        # Cheap probe request: raises if the member list cannot be read.
        flickr.groups.members.getList(group_id=group_id,per_page=1, page=1)

        # range(1, 100) really yields 99 pages; the original range(1, 99)
        # stopped after 98 although its comment said 99.
        for page in range(1,100): # max 99 pages

            res = flickr.groups.members.getList(group_id=group_id,per_page=500, page=page)["members"]
            members = res.get("member")

            # A missing or empty member list means there are no further pages.
            if not members:
                break

            users_from_groups += [mem["nsid"] for mem in members]

    except Exception as err:
        # Best effort: skip this group, but report the reason instead of
        # silently swallowing it.
        print(group_name,"=> could not list members:",err)

## c) all users that are contacts

In [None]:
# A single request is enough here: the user has fewer than 1000 contacts,
# so all of them fit on one page of per_page=1000.
res = flickr.contacts.getPublicList(user_id=USER_OF_INTEREST,per_page=1000, page=1)

In [None]:
# Number of public contacts returned by the request above.
len(res["contacts"]["contact"])

In [None]:
# NSIDs of all public contacts of USER_OF_INTEREST.
users_from_contact = [con["nsid"] for con in res["contacts"]["contact"]]

## d) all users with photos that were favorited by the `USER_OF_INTEREST`

### flickr.favorites.getList

This is also a good opportunity to add more photos that can be labeled as 1 even though the user did not comment on them, since the user faved them.

In [None]:
# Collect the favorites of USER_OF_INTEREST:
#   users_from_favs - owner of every faved photo (owners may repeat)
#   faved_photos    - (owner, photo_id) of every faved photo
users_from_favs = []
faved_photos = []

# range(1, 10) really yields 9 pages; the original range(1, 9) stopped
# after 8 although its comment said 9.
for page in range(1,10): # max 9 pages

    res = flickr.favorites.getList(user_id=USER_OF_INTEREST,per_page=500, page=page)["photos"]
    favs = res.get("photo")

    # A missing or empty photo list means there are no further pages.
    if not favs:
        break

    for fav in favs:
        users_from_favs.append(fav["owner"])
        faved_photos.append((fav["owner"],fav["id"]))

In [None]:
# Number of faved photos that were collected.
len(faved_photos)

In [None]:
# One row per faved photo, flagged with fav=True so the flag survives a merge.
df_favs = pd.DataFrame(faved_photos,columns=["owner","photo_id"]).assign(fav=True)
df_favs

In [None]:
# Merge df_favs into the main frame (saving to disk happens in a separate cell).
# The outer join also adds faved photos that were not part of the group pools.
# Dropping a "fav" column left over from an earlier run keeps this cell
# re-runnable: otherwise the merge would produce fav_x / fav_y and the next
# line would fail.
df = df.drop(columns=["fav"], errors="ignore").merge(df_favs,on=["owner","photo_id"],how="outer")
# Rows without a match have NaN here; eq(True) turns the column into a real
# boolean one (True only for faved photos).
df["fav"] = df["fav"].eq(True)

**uncomment what's needed**

In [None]:
#df.to_csv("./data/flickr_approach_B.csv",sep="\t",index=False)
#pd.read_csv("./data/flickr_approach_B.csv",sep="\t")

In [None]:
# Number of owner entries taken from the favorites (one per faved photo).
len(users_from_favs)

**Overall number of photos that the user approved of (labeled as 1)**

In [None]:
# Count the rows that either carry a comment or are flagged as a favorite.
len(df.loc[pd.notna(df["comment"]) | (df["fav"])])

## e) merge the pool of newly found users

In [None]:
# Distinct owners known before the additional sources are merged in.
len(users_so_far) # what we had already

In [None]:
# Member entries gathered from all groups (duplicates not yet removed).
len(users_from_groups)

In [None]:
# Number of public contacts.
len(users_from_contact)

In [None]:
# Owner entries from the favorites (one per faved photo, so owners may repeat).
len(users_from_favs)

In [None]:
# Combine all user sources into one pool. dict.fromkeys drops duplicates and
# keeps the order in which each user was first seen.
new_user_pool = list(dict.fromkeys([
    *users_so_far,
    *users_from_groups,
    *users_from_contact,
    *users_from_favs,
]))
len(new_user_pool)

## f) get photos from those users

### flickr.people.getPhotos

In [None]:
%%time

# Collect (owner, photo_id) for every photo of every user in the pool.
new_photo_pool = []

for u in new_user_pool:
    try:
        # The first request returns both the page count and the photos of
        # page 1, so page 1 is not requested a second time.
        first = flickr.people.getPhotos(user_id=u, per_page=500,page=1)["photos"]
        # int() accepts the page count whether the API delivers it as a number
        # or as a numeric string.
        pages = int(first["pages"])
        new_photo_pool += [(photo["owner"],photo["id"]) for photo in first["photo"]]

        for page in range(2,pages+1):
            res = flickr.people.getPhotos(user_id=u, per_page=500,page=page)["photos"]["photo"]
            new_photo_pool += [(photo["owner"],photo["id"]) for photo in res]
    except Exception as err:
        # Best effort: continue with the next user, but report which user was
        # skipped instead of dropping it silently.
        print(u,"=> could not fetch photos:",err)

In [None]:
import json

# Persist the photo pool as JSON (the (owner, photo_id) tuples are stored as lists).
with open("./data/new_photo_pool", "w") as fp:
    json.dump(new_photo_pool, fp)

In [None]:
#with open("./data/new_photo_pool", "r") as fp:
#    new_photo_pool = json.load(fp)

## g) get comments of new photo pool
### 

only for new photos

# Step X: Download all the images in JPG

In [None]:
#pip install httplib2
#pip install bs4
#pip install urllib3

import httplib2
from bs4 import BeautifulSoup, SoupStrainer
import urllib.request
# Install a global urllib opener that sends a browser-like User-Agent header,
# so every later urllib.request call (e.g. urlretrieve) uses it.
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0')]
urllib.request.install_opener(opener)

from bs4 import SoupStrainer

# Restrict parsing to <img class="main-photo"> tags; used by Extractor.get_link.
strainer = SoupStrainer('img', attrs={'class': 'main-photo'})

class Extractor():
    """Scrape image URLs from web pages and download the image files."""

    def get_links(self, url):
        """Return the ``src`` value of every ``<img>`` tag on the page at ``url``.

        ``<img>`` tags without a ``src`` attribute are skipped.
        """
        http = httplib2.Http()
        response, content = http.request(url)

        # Name the parser explicitly (as get_link does) so the result does not
        # depend on which optional parser libraries happen to be installed.
        images = BeautifulSoup(content, 'html.parser').find_all('img')

        # .get() avoids a KeyError for <img> tags that carry no src attribute.
        return [image['src'] for image in images if image.get('src')]

    def get_link(self,url):
        """
        return only main-photo link

        The page at ``url`` is parsed with the module-level ``strainer``
        (``<img class="main-photo">`` only). Returns ``"https:" + src`` of the
        first match, or ``""`` if there is no usable match.
        """
        http = httplib2.Http()
        response, content = http.request(url)

        image_link = BeautifulSoup(content, 'html.parser', parse_only=strainer).find()
        try:
            return "https:" + image_link["src"]
        except (TypeError, KeyError):
            # TypeError: nothing (or no tag) was found; KeyError: tag without src.
            # Other errors are no longer hidden by a bare except.
            return ""

    def get_images(self, image_links, filename):
        """Download every link in ``image_links`` and store it as ``filename``.

        Links ending in ``_n.jpg`` are written to ``./img_data/sm/``, all
        others to ``./img_data/md/``. All links share the same ``filename``, so
        several links of the same kind overwrite each other. Download errors
        are printed and the loop continues with the next link.
        """
        for link in image_links:

            # Links without a scheme get "https:" prepended, as before. Links
            # that are already absolute (e.g. the value returned by get_link)
            # are used unchanged instead of becoming "https:https://...".
            if link.startswith(("http://", "https://")):
                image_url = link
            else:
                image_url = "https:" + link

            if link.endswith("_n.jpg"):
                target = "./img_data/sm/" + filename
            else:
                target = "./img_data/md/" + filename

            try:
                urllib.request.urlretrieve(image_url, filename=target)

            except urllib.error.ContentTooShortError as shortError:
                print("content too short error")
            except urllib.error.HTTPError as e:
                print(e)
            except urllib.error.URLError as ue:
                print("failed to download!")
            except socket.timeout as se:
                # Needs `import socket` at file level; without it this clause
                # itself raised NameError.
                print("socket timeout")
            except Exception as ee:
                print(ee)

## Download

incl. check whether or not images are already present in the folder

Alternative Approach: get first only the direct links and put them into the pandas DataFrame

Estimated time for scraping 16,300 image urls: 6.3h

(only the missing urls are actually searched)

In [None]:
%%time

import os

# NOTE(review): `photo_list_all` and `df_all` are not defined in any cell visible
# in this notebook - confirm where they come from before running this cell.

file_path = "./img_data/md/"
files = os.listdir(file_path)
# Set copy of the directory listing, so the membership test below does not
# rescan the whole list for every photo.
existing_files = set(files)
photo_list_missing = [photo for photo in photo_list_all if photo[1]+".jpg" not in existing_files]

myextractor = Extractor()

baseurl = "https://www.flickr.com/photos/"

# Look up the direct image URL only for rows that do not have one yet; rows
# with a URL are left untouched.
for i in df_all.index:

    if pd.isna(df_all.loc[i,"url"]):
        url = f"""{baseurl}{df_all.loc[i,"owner"]}/{df_all.loc[i,"photo_id"]}"""
        image_link = myextractor.get_link(url)
        df_all.loc[i,"url"] = image_link

In [None]:
# save
#df_all.to_csv("./data/flickr.csv",sep="\t",index=False)

# Next Step: pre-process images

1) Crop/Scale to same dimensions  
2) Drop portrait and square format images