# Loading the Unsplash Research dataset in Pandas dataframes

This notebook is an example of how to load the Unsplash Research dataset into Pandas dataframes for analysis.


## Loading libraries

In [1]:
import numpy as np
import pandas as pd
import glob

## Loading the datasets in Pandas

Make sure that `path` points to the directory where you extracted the dataset files.

In [2]:
# Directory containing the extracted dataset files. It must end with a path
# separator because it is concatenated directly with the file names below.
path = './unsplash-research-dataset-lite-latest/'
documents = ['photos', 'keywords', 'collections', 'conversions', 'colors']
datasets = {}

for doc in documents:
    # A document may be spread over several files matching "<doc>.tsv*".
    # glob returns matches in arbitrary order, so sort them to keep the
    # concatenated row order reproducible between runs.
    files = sorted(glob.glob(path + doc + ".tsv*"))
    if not files:
        # Without this guard pd.concat([]) fails with the unhelpful
        # "No objects to concatenate".
        raise FileNotFoundError(
            "No files matching '" + path + doc + ".tsv*' were found; "
            "check that `path` points to the extracted dataset."
        )

    # Every file is tab-separated with a header row.
    subsets = [pd.read_csv(filename, sep='\t', header=0) for filename in files]
    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

## Exploring the datasets

Here are the first couple of rows from each dataset, as an example.

Enjoy the exploration!

In [3]:
# Preview the first five rows of the photos table.
datasets['photos'].head(n=5)

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_longitude,photo_location_country,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence
0,2Q8zDWkj0Yw,https://unsplash.com/photos/2Q8zDWkj0Yw,https://images.unsplash.com/photo-141520117961...,2014-11-05 15:26:26.678711,t,4192,2794,1.5,,lanceanderson,...,,,,15854580,146388,school of jellyfish swimming in body of water,,,,
1,tsBDNuCJiLg,https://unsplash.com/photos/tsBDNuCJiLg,https://images.unsplash.com/photo-141768928330...,2014-12-04 10:31:44.647012,t,4324,2880,1.5,Surfer in the ocean,thehungryjpeg,...,,,,3157825,8247,surfer walking on body of water during daytime,,,,
2,A93gsuMxVcE,https://unsplash.com/photos/A93gsuMxVcE,https://images.unsplash.com/photo-142981401899...,2015-04-23 18:33:44.024841,t,2000,1333,1.5,Barren Countryside,jeremydgreat,...,,,,2159829,7064,landscape photo of road beside field of grass,,,,
3,oYIdH6bFssk,https://unsplash.com/photos/oYIdH6bFssk,https://images.unsplash.com/photo-143275722183...,2015-05-27 20:07:28.35245,t,4288,2848,1.51,Lightbulb reflections,chelaxydp,...,,,,2861489,11778,closed up photo of orange lightened lamp,,,,
4,wgLPy2YBXuc,https://unsplash.com/photos/wgLPy2YBXuc,https://images.unsplash.com/photo-143205996405...,2015-05-19 18:26:15.223161,t,5312,2988,1.78,,csogi,...,,,,6037626,52263,white clouds and blue sky at daytime,,,,


In [14]:
# Preview the first five rows of the collections table.
datasets['collections'].head(n=5)

Unnamed: 0,photo_id,collection_id,collection_title,photo_collected_at
0,--2IBUMom1I,162470,Majestical Sunsets,2016-03-15 17:04:25
1,--2IBUMom1I,4668070,Pose,2019-04-18 23:59:25
2,--2IBUMom1I,4172658,Guys,2019-02-02 14:40:14
3,--2IBUMom1I,9832457,business,2020-04-04 14:26:10
4,--2IBUMom1I,2143051,Travel / Places,2018-05-22 23:20:05


In [6]:
# Preview the first five rows of the conversions table.
datasets['conversions'].head(n=5)

Unnamed: 0,converted_at,conversion_type,keyword,photo_id,anonymous_user_id,conversion_country
0,2020-02-28 19:17:37,download,birds,RLLR0oRz16Y,d5f2584b-3cde-4e6f-8875-2dde1373f9bb,NL
1,2020-02-28 19:20:15,download,winter,r6TLRDY4Ll0,ee639d04-e079-4737-a500-d3139cdfae9a,KR
2,2020-02-28 19:32:50,download,island,vh0FucFJ7pw,b97f3bdc-bfd2-478c-9a43-fe743ca4813c,ID
3,2020-02-28 19:39:59,download,full hd wallpaper,SbrZdkLtTCY,5a15ec7e-9bf4-464c-9601-1ad731795a8d,US
4,2020-02-28 19:49:56,download,river,x_gyAYzyeQA,4bb2e232-498c-46d8-a02b-eb647ecff408,IN


In [7]:
# Preview the first five rows of the colors table.
datasets['colors'].head(n=5)

Unnamed: 0,photo_id,hex,red,green,blue,keyword,ai_coverage,ai_score
0,A2mjVkcix-w,101C23,16,28,35,black,0.613267,0.635228
1,0ufkmj46xvU,625946,98,89,70,darkolivegreen,0.037172,0.024936
2,0ufkmj46xvU,C7897A,199,137,122,rosybrown,0.024714,0.111978
3,HY7Az9lZwB4,D18E46,209,142,70,peru,0.005867,0.115786
4,HY7Az9lZwB4,A37343,163,115,67,sienna,0.010667,0.109003


In [19]:
# Show only the first ten rows instead of rendering the whole keywords table.
datasets['keywords'].head(10)

Unnamed: 0,photo_id,keyword,ai_service_1_confidence,ai_service_2_confidence,suggested_by_user
0,zzux2cH-F-A,spring,34.244873,86.833668,f
1,zzux2cH-F-A,compass,26.864105,,f
2,zzux2cH-F-A,nature,99.837990,95.966119,f
3,zzux2cH-F-A,jar,43.128902,,f
4,zzux2cH-F-A,flower,81.635406,,f
5,zzux2cH-F-A,bottle,27.045839,,f
6,zzux2cH-F-A,field,94.293869,,f
7,zzux2cH-F-A,blossom,81.635406,,f
8,zzux2cH-F-A,grove,99.077385,,f
9,zzux2cH-F-A,petal,63.241280,,f


In [35]:
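# Skyscanner geo API endpoint (API key redacted - never commit credentials to a notebook):
# http://partners.api.skyscanner.net/apiservices/geo/v1.0?apikey=<YOUR_API_KEY>
# Unsplash Lite dataset download page:
# https://unsplash.com/data/lite/latest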

In [5]:
import json

In [6]:
# Read the country reference file. The encoding is passed explicitly so the
# file is decoded the same way on every platform instead of depending on the
# default locale encoding (JSON text is UTF-8).
with open('geo.json', encoding='utf-8') as f:
    geoJson = json.load(f)

In [11]:
# Flatten the continent -> country nesting into a single list of
# {'Id', 'Name'} records, one per country.
countries = [
    {'Id': entry['Id'], 'Name': entry['Name']}
    for region in geoJson['Continents']
    for entry in region['Countries']
]

In [12]:
# Turn the list of country records into a two-column frame (Id, Name).
dfCountries = pd.DataFrame(countries)

In [13]:
# Fixed seed so the preview shows the same ten rows on every run.
dfCountries.sample(10, random_state=0)

Unnamed: 0,Id,Name
116,SK,Slovakia
213,PS,Palestinian Territory
11,KZ,Kazakhstan
22,AI,Anguilla
164,,Namibia
121,KE,Kenya
10,UZ,Uzbekistan
18,PK,Pakistan
235,PN,Pitcairn
14,JP,Japan


In [14]:
# Pull every value of the keyword column into a plain Python list.
keys = datasets['keywords']['keyword'].tolist()

In [15]:
# Keep the keywords that begin with 'tajikistan'. Missing keywords are read by
# pandas as float NaN, which has no .startswith, so non-string entries are skipped.
filtered = [
    key for key in keys
    if isinstance(key, str) and key.startswith('tajikistan')
]

In [16]:
# Display the list of keywords held in `filtered`.
filtered

['tajikistan', 'tajikistan']

In [17]:
# Add a lower-cased copy of the country name; it is needed in lower case to
# search the keywords of the Unsplash dataset.
dfCountries = dfCountries.assign(keyword=dfCountries['Name'].str.lower())

In [18]:
# Fixed seed so the preview shows the same ten rows on every run.
dfCountries.sample(10, random_state=0)

Unnamed: 0,Id,Name,keyword
79,LU,Luxembourg,luxembourg
154,DZ,Algeria,algeria
31,SX,St Maarten,st maarten
199,TR,Turkey,turkey
120,CF,Central African Republic,central african republic
237,PG,Papua New Guinea,papua new guinea
234,WF,Wallis and Futuna Islands,wallis and futuna islands
215,CA,Canada,canada
56,GF,French Guiana,french guiana
42,KY,Cayman Islands,cayman islands


In [19]:
# Left join keeps every country, including those with no matching keyword row.
# The join key is named explicitly so the merge cannot silently change if the
# two frames ever come to share another column name.
dfMerged = dfCountries.merge(datasets['keywords'], how='left', on='keyword')

In [20]:
# Fixed seed so the preview shows the same ten rows on every run.
dfMerged.sample(10, random_state=0)

Unnamed: 0,Id,Name,keyword,photo_id,ai_service_1_confidence,ai_service_2_confidence,suggested_by_user
1603,MG,Madagascar,madagascar,eQ_icBB_jFk,,,t
1389,FR,France,france,7EtSw7xOOGs,,,t
1011,IE,Ireland,ireland,LlHgaeBwYVE,,,t
41,IN,India,india,epjI4iHpXqU,,,t
170,JP,Japan,japan,AnFDa08X0U4,,,t
1420,UK,United Kingdom,united kingdom,IACtzClqfEA,,,t
1820,ID,Indonesia,indonesia,MBmLsgcCN3E,,,t
270,PY,Paraguay,paraguay,,,,
1823,ID,Indonesia,indonesia,GtcsP-egN1c,,,t
2067,IS,Iceland,iceland,GuKCcvC830U,,,t


In [21]:
# Attach the photo metadata to every (country, photo) pair. Left join keeps
# countries without any photo; the join key is explicit so the merge does not
# depend on which column names the two frames happen to share.
dfMergedAgain = dfMerged.merge(datasets['photos'], how='left', on='photo_id')

In [22]:
# Fixed seed so the preview shows the same ten rows on every run.
dfMergedAgain.sample(10, random_state=0)

Unnamed: 0,Id,Name,keyword,photo_id,ai_service_1_confidence,ai_service_2_confidence,suggested_by_user,photo_url,photo_image_url,photo_submitted_at,...,photo_location_longitude,photo_location_country,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence
1549,KE,Kenya,kenya,LZpzMEY0pXI,,,t,https://unsplash.com/photos/LZpzMEY0pXI,https://images.unsplash.com/photo-148494324486...,2017-01-20 20:15:41.211145,...,98.583091,Thailand,Hot,1269397.0,3809.0,focus photography of wooden arrow signage,,,,
1854,HK,Hong Kong,hong kong,EyRalaLicv0,,,t,https://unsplash.com/photos/EyRalaLicv0,https://images.unsplash.com/photo-1550795629-4...,2019-02-22 00:34:05.824639,...,,Hong Kong,Hong Kong,5308825.0,16430.0,blue ocean waves on shore,,,,
2334,CA,Canada,canada,dAFk0IjztP0,,,t,https://unsplash.com/photos/dAFk0IjztP0,https://images.unsplash.com/photo-1556150443-7...,2019-04-25 00:04:20.94249,...,,,,436369.0,3775.0,group of penguin walking on mountain,,,,
1325,FR,France,france,gz9XbG7Vc-Y,,,t,https://unsplash.com/photos/gz9XbG7Vc-Y,https://images.unsplash.com/photo-156813132139...,2019-09-10 16:03:06.76762,...,,,,712562.0,1599.0,selective focus photography of short-coated br...,,,,
916,IT,Italy,italy,34OWXRoPNZM,,,t,https://unsplash.com/photos/34OWXRoPNZM,https://images.unsplash.com/photo-154203898465...,2018-11-12 16:12:36.260136,...,13.361487,Italy,Palermo,628024.0,3312.0,standing woman watching tree during daytime,,,,
408,RU,Russia,russia,FRMtK1yfFsY,,,t,https://unsplash.com/photos/FRMtK1yfFsY,https://images.unsplash.com/photo-1552575358-c...,2019-03-14 14:58:48.030944,...,,,,1140811.0,9813.0,close-up photography of ice,,,,
1891,IS,Iceland,iceland,sTF4MmHeQOM,,,t,https://unsplash.com/photos/sTF4MmHeQOM,https://images.unsplash.com/photo-158669797817...,2020-04-12 13:27:39.470545,...,-21.118298,Islande,Reykjavík,600540.0,1671.0,snow covered rocks and rocks,,,,
213,GD,Grenada,grenada,,,,,,,,...,,,,,,,,,,
2011,IS,Iceland,iceland,UiRRFlDSyXs,,,t,https://unsplash.com/photos/UiRRFlDSyXs,https://images.unsplash.com/photo-1547493447-0...,2019-01-14 19:19:43.243402,...,,Iceland,,408566.0,2456.0,ocean waves crashing on rock formation,,,,
1496,DE,Germany,germany,IppB3SB9f6s,,,t,https://unsplash.com/photos/IppB3SB9f6s,https://images.unsplash.com/photo-1552395599-6...,2019-03-12 13:00:44.384753,...,,,,1230082.0,3330.0,brown wooden board,,,,


In [23]:
# Narrow the merged frame down to the country and photo columns used below.
dfFinal = dfMergedAgain.loc[
    :, ['Id', 'Name', 'photo_id', 'photo_image_url', 'photo_aspect_ratio']
]

In [24]:
# Fixed seed so the preview shows the same ten rows on every run.
dfFinal.sample(10, random_state=0)

Unnamed: 0,Id,Name,photo_id,photo_image_url,photo_aspect_ratio
1111,NO,Norway,Bg1hgJEU3Es,https://images.unsplash.com/photo-157920115767...,0.67
2467,MX,Mexico,5A06OWU6Wuc,https://images.unsplash.com/photo-146393657582...,1.58
1907,IS,Iceland,pDeagUyN-Pk,https://images.unsplash.com/photo-150922036896...,0.67
917,IT,Italy,2gMDqim35xw,https://images.unsplash.com/photo-1552561190-c...,1.5
1610,TZ,Tanzania,JVn25VdB-qo,https://images.unsplash.com/photo-158639800176...,0.67
2787,AU,Australia,6rW-AcbSnTA,https://images.unsplash.com/photo-157997238366...,0.67
788,IT,Italy,uFWmQgG4emE,https://images.unsplash.com/photo-148562904972...,1.54
2104,IS,Iceland,BYQaqSDjNR4,https://images.unsplash.com/photo-150688221635...,1.5
1118,NO,Norway,5GXhIja2sh4,https://images.unsplash.com/photo-1546689021-7...,0.67
773,BA,Bosnia and Herzegovina,,,


In [25]:
# Show only the first ten rows instead of rendering the whole frame.
dfFinal.head(10)

Unnamed: 0,Id,Name,photo_id,photo_image_url,photo_aspect_ratio
0,MN,Mongolia,,,
1,CN,China,zF2aLwowymQ,https://images.unsplash.com/photo-1545520778-4...,1.50
2,CN,China,xAns8H5j9rM,https://images.unsplash.com/photo-157810191177...,1.78
3,CN,China,wpTWYBll4_w,https://images.unsplash.com/photo-144022753781...,1.50
4,CN,China,wIZpOCdTb8I,https://images.unsplash.com/photo-1542902093-d...,0.67
5,CN,China,upypO_FbRJ4,https://images.unsplash.com/photo-1550649613-6...,0.75
6,CN,China,tc0FfoFlQJQ,https://images.unsplash.com/photo-1548347781-8...,1.51
7,CN,China,tPhfHSEeGaI,https://images.unsplash.com/photo-1542642839-8...,1.76
8,CN,China,s2cocR07DQs,https://images.unsplash.com/photo-1545575132-c...,1.50
9,CN,China,o6k0ZH1eOwg,https://images.unsplash.com/photo-1552288092-7...,2.71


In [26]:
# Discard every row that has a missing value in any of its columns.
dfFinal = dfFinal.dropna(axis=0, how='any')

In [27]:
# Show only the first ten rows instead of rendering the whole frame.
dfFinal.head(10)

Unnamed: 0,Id,Name,photo_id,photo_image_url,photo_aspect_ratio
1,CN,China,zF2aLwowymQ,https://images.unsplash.com/photo-1545520778-4...,1.50
2,CN,China,xAns8H5j9rM,https://images.unsplash.com/photo-157810191177...,1.78
3,CN,China,wpTWYBll4_w,https://images.unsplash.com/photo-144022753781...,1.50
4,CN,China,wIZpOCdTb8I,https://images.unsplash.com/photo-1542902093-d...,0.67
5,CN,China,upypO_FbRJ4,https://images.unsplash.com/photo-1550649613-6...,0.75
6,CN,China,tc0FfoFlQJQ,https://images.unsplash.com/photo-1548347781-8...,1.51
7,CN,China,tPhfHSEeGaI,https://images.unsplash.com/photo-1542642839-8...,1.76
8,CN,China,s2cocR07DQs,https://images.unsplash.com/photo-1545575132-c...,1.50
9,CN,China,o6k0ZH1eOwg,https://images.unsplash.com/photo-1552288092-7...,2.71
10,CN,China,kmu4AjOBuHo,https://images.unsplash.com/photo-1548850174-4...,0.67


In [28]:
# j = (
#              .to_json(orient='records'))

In [30]:
# One group per country, keyed by the (Id, Name) pair.
grouped = dfFinal.groupby(by=['Id', 'Name'], as_index=False)

In [31]:
# Build one row per country whose 'photos' cell holds that country's photos as
# a list of plain dict records.
# The groups are iterated directly (instead of GroupBy.apply + reset_index +
# rename) so the result does not depend on how apply() wraps list results,
# which varies with `as_index` and the pandas version. The orient is spelled
# out as 'records' because the abbreviation 'r' is rejected by pandas 2.x.
g = pd.DataFrame(
    [
        {
            'Id': country_id,
            'Name': country_name,
            'photos': group[
                ['photo_id', 'photo_image_url', 'photo_aspect_ratio']
            ].to_dict('records'),
        }
        for (country_id, country_name), group in grouped
    ],
    # Explicit columns keep the expected layout even when there are no groups.
    columns=['Id', 'Name', 'photos'],
)

In [32]:
# Serialise the frame to a JSON string (one object per row) and parse it back,
# which yields plain Python lists and dicts.
imagesJson = json.loads(g.to_json(orient='records'))

In [33]:
# Show only the first two countries instead of dumping the entire structure.
imagesJson[:2]

[{'Id': 'AM',
  'Name': 'Armenia',
  'photos': [{'photo_id': 'mVPpRr9K0jk',
    'photo_image_url': 'https://images.unsplash.com/photo-1462380178751-1f807dbf53ac',
    'photo_aspect_ratio': 0.73},
   {'photo_id': 'mQF2vmyV0Zc',
    'photo_image_url': 'https://images.unsplash.com/photo-1497354829124-d3d128a97ee9',
    'photo_aspect_ratio': 1.2}]},
 {'Id': 'AQ',
  'Name': 'Antarctica',
  'photos': [{'photo_id': 'pSFdBw62Hww',
    'photo_image_url': 'https://images.unsplash.com/photo-1470519903472-9bfac69c2556',
    'photo_aspect_ratio': 1.5},
   {'photo_id': 'nvBfwtaUBnI',
    'photo_image_url': 'https://images.unsplash.com/photo-1462888387064-2a2ea232edb8',
    'photo_aspect_ratio': 1.33},
   {'photo_id': 'lpY5UwUM4Us',
    'photo_image_url': 'https://images.unsplash.com/photo-1549382257-0177fa8e7abb',
    'photo_aspect_ratio': 1.78},
   {'photo_id': 'jKiu7sPW7SE',
    'photo_image_url': 'https://images.unsplash.com/photo-1587606605848-7395dfb56d90',
    'photo_aspect_ratio': 0.67},
   {

In [34]:
# Persist the per-country photo lists as JSON indented by two spaces.
with open('images-by-country-id.json', 'w') as outfile:
    outfile.write(json.dumps(imagesJson, indent=2))