<b>Notebook for retrieving and processing Flickr data from the Flickr API (https://www.flickr.com/services/api/)</b>
* All rights reserved to the respective owners.
* The author of this script is not affiliated with Flickr or any of Flickr's competitors.
* Sensitive information, namely 'username', is removed from the final output file (GeoJSON) to ensure the anonymity of Flickr users

In [None]:
#import libraries
import csv
import datetime
import json
import os
import shutil
import time

import flickrapi
import geopandas as gpd
import numpy as np
import pandas as pd
from geopandas import GeoDataFrame
from shapely.geometry import Point

# Flickr API credentials.
# Set the FLICKR_API_KEY / FLICKR_API_SECRET environment variables to keep the
# secrets out of the notebook; if they are not set, the placeholders below are
# used and must be replaced by hand before running the retrieval cell.
api_key = os.environ.get('FLICKR_API_KEY', 'insert your own api key')
api_secret = os.environ.get('FLICKR_API_SECRET', 'insert your own secret key')

# Client object used by the retrieval cell (flickr.walk / flickr.photos.getInfo).
flickr = flickrapi.FlickrAPI(api_key, api_secret)


<B>Retrieving Flickr data </B>

In [None]:
#create new folder in output (an existing folder from a previous run is wiped first)
out_folder = './output/flickr_data'
if os.path.exists(out_folder):
    shutil.rmtree(out_folder)
os.makedirs(out_folder)

#retrieve Flickr data from the API and write into csv
# The file is written as ISO-8859-1 because that is the encoding the cleaning
# cell uses to read it back; characters outside Latin-1 are replaced with '?'
# instead of aborting the row. newline='' leaves line endings to the csv module.
with open(out_folder + '/sample_flickr_24june2019.csv', 'w',
          encoding='ISO-8859-1', errors='replace', newline='') as csvFile:

    #specify attributes to retrieve and open empty csv file
    fieldnames = ['photo_id','photo_title','username','date_posted','date_taken','user_location','lat','lon']
    writer = csv.DictWriter(csvFile, fieldnames=fieldnames, lineterminator='\n')
    writer.writeheader()

    # access the API and retrieve every photo based on #amsterdam and bbox
    # also write values into csv file
    for photo in flickr.walk(per_page=500, tag_mode='any', tags='amsterdam', extras="geo",
                             bbox="4.702148,52.282442,5.059891,52.459775"):
        photo_id = photo.get('id')
        try:
            photo_title = photo.attrib['title']
            lat = photo.attrib['latitude']
            lon = photo.attrib['longitude']

            # getInfo returns the JSON document as bytes; parse it properly
            # rather than slicing the raw text between neighbouring keys.
            raw_info = flickr.photos.getInfo(photo_id=photo_id, format='json')
            info = json.loads(raw_info.decode('utf8'))['photo']
            owner = info['owner']
            dates = info['dates']

            username = owner['username']
            # 'posted' is a Unix timestamp, 'taken' a formatted date string
            date_posted = datetime.datetime.fromtimestamp(int(dates['posted']))
            date_taken = datetime.datetime.strptime(dates['taken'], '%Y-%m-%d %H:%M:%S')
            # the owner location may be absent or null; store it as an empty cell
            user_location = owner.get('location') or ''

            writer.writerow({'photo_id': photo_id, 'photo_title': photo_title, 'username': username,
                             'date_posted': date_posted, 'date_taken': date_taken,
                             'user_location': user_location, 'lat': lat, 'lon': lon})
            # pause between requests to stay gentle on the API
            time.sleep(1)
            print(username)
        except flickrapi.exceptions.FlickrError as err:
            # API-side failure: report it, back off, then continue with the next photo
            print('Flickr API error for photo', photo_id, '-', err)
            time.sleep(30)
        except (KeyError, ValueError) as err:
            # missing attribute/field or unparsable JSON/date: skip this photo only
            print('Skipping photo', photo_id, '- unexpected data:', repr(err))

<B>Cleaning data </B> <br>
The CSV contains data that was retrieved on 24 June 2019 (3000+ photos). The data cleaning script may not be appropriate for new data.

In [None]:
#cleaning data
# Load the raw CSV written by the retrieval cell and parse both date columns.
df = pd.read_csv('./output/flickr_data/sample_flickr_24june2019.csv', encoding="ISO-8859-1")
df['date_taken'] = pd.to_datetime(df['date_taken'])
df['date_posted'] = pd.to_datetime(df['date_posted'])

# country = whatever follows the first comma of user_location, trimmed.
# Taking element 1 of the split list yields NaN when there is no comma, so this
# also works when not a single location contains a comma.
df['country'] = df['user_location'].str.split(",", n=1).str[1].str.strip()

# Every remaining NaN, in any column, becomes the placeholder '-'.
df.fillna(value='-', inplace=True)

# Filled in by the labelling cells; '' means "not labelled yet".
df["traveler_type"] = ""

# Locations without a comma have no separate country part: use the whole
# location string as the country instead (a missing location stays '-').
df['country'] = np.where(df['country'] == '-', df['user_location'], df['country'])

In [None]:
#label based on country
# Spellings of the Netherlands that are matched exactly against the country column.
nl = ['Nederland','Netherlands','The Netherlands','Holland','NL','the Netherlands','nederland']

# Substring matches on the free-text profile location (case-sensitive).
location = df['user_location']
mentions_amsterdam = location.str.contains('Amsterdam')
mentions_nearby_city = location.str.contains('Utrecht') | location.str.contains('Amersfoort')

# Dutch users are domestic travellers ...
df.loc[df['country'].isin(nl), 'traveler_type'] = 'domestic'
df.loc[mentions_nearby_city, 'traveler_type'] = 'domestic'
# ... unless the location mentions Amsterdam; assigned last so it takes precedence.
df.loc[mentions_amsterdam, 'traveler_type'] = 'local'

In [None]:
# Users whose profile location is missing ('-') are labelled from their activity.
# First collapse to one row per (username, date_taken) combination ...
without_location = df['user_location'] == '-'
subset = df.loc[without_location].groupby(['username', 'date_taken']).size().reset_index()
# ... then count those rows per user and calendar-month number (1-12, as a string).
subset['month_nr'] = subset['date_taken'].map(lambda taken: str(taken.month))
subset = subset.groupby(['username', 'month_nr']).size().reset_index(name='photo_frequency')

# label based on amount of photos per month
# international more than 100 photos, domestic between 10-100, local less than 10
frequency = subset['photo_frequency']
subset['traveler_type'] = np.select(
    [frequency > 100, frequency >= 10],
    ['international', 'domestic'],
    default='local',
)

# Reduce to the distinct (username, traveler_type) pairs, sorted by both keys.
subset = (
    subset.groupby(['username', 'traveler_type'])
    .size()
    .reset_index(name='count')
    .drop(columns='count')
)

# Attach the activity-based label; the merge suffixes the two label columns
# with _x (location-based, from df) and _y (activity-based, from subset).
df = pd.merge(df, subset, on='username', how='outer')
activity_label = df.pop('traveler_type_y')
# Keep the location-based label where one was set, otherwise use the activity one.
df['traveler_type_x'] = df['traveler_type_x'].where(df['traveler_type_x'] != '', activity_label)
df = df.rename(columns={'traveler_type_x': 'traveler_type'})

# Anything still unlabelled is treated as international.
df['traveler_type'] = df['traveler_type'].fillna('international')

# A user with several activity labels duplicates their photos in the merge;
# keep only the first row for each photo.
df = df.drop_duplicates(subset='photo_id', keep='first')


In [None]:
#delete username
# The username must not end up in the exported file.
df = df.drop(columns=['username'])

#export dataframe to geojson
# Dates are written as plain 'YYYY-MM-DD' strings.
date_columns = ['date_posted', 'date_taken']
df[date_columns] = df[date_columns].apply(lambda column: column.dt.strftime('%Y-%m-%d'))

# One point per photo, x = longitude and y = latitude; the lat/lon columns are
# dropped from the attribute table once they have been turned into geometry.
geometry = [Point(lon, lat) for lon, lat in zip(df['lon'], df['lat'])]
# NOTE(review): newer geopandas/pyproj releases appear to deprecate the
# {'init': ...} CRS form in favour of 'EPSG:4326' - confirm against the installed version.
flickr_gdf = GeoDataFrame(df.drop(columns=['lat', 'lon']), crs={'init': 'epsg:4326'}, geometry=geometry)
flickr_gdf.to_file("./output/flickr_data/GeotaggedFlickr_24june2019.geojson", driver="GeoJSON", encoding='utf-8')

In [None]:
#delete csv file
# The intermediate CSV still contains usernames, so it is removed after the export.
filePath = './output/flickr_data/sample_flickr_24june2019.csv'
try:
    # Attempt the removal directly instead of checking for existence first,
    # so the file cannot disappear between the check and the delete.
    os.remove(filePath)
except FileNotFoundError:
    print("Can not delete the file as it doesn't exists")