In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import glob

In [2]:
# Load each table of the dataset into `datasets`, keyed by table name.
# Every file whose name starts with "<table>.tsv" is read, and the pieces
# are stacked row-wise into a single DataFrame per table.
path = 'unsplash-lite-dataset-25k-nature/'
documents = ['photos', 'keywords', 'collections', 'conversions']
datasets = {}

for doc in documents:
    # glob returns matches in arbitrary order; sorting makes the row order of
    # the concatenated frame identical on every run and every platform.
    files = sorted(glob.glob(path + doc + ".tsv*"))
    if not files:
        # Fail with an actionable message instead of pandas' generic
        # "No objects to concatenate" raised by pd.concat on an empty list.
        raise FileNotFoundError(f"No files matching '{path}{doc}.tsv*' were found")

    subsets = []
    for filename in files:
        df = pd.read_csv(filename, sep='\t', header=0)
        subsets.append(df)

    # ignore_index gives the combined frame a fresh 0..n-1 index.
    datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

# Preliminary data inspection

### Photos

In [24]:
# Non-null count for every column of the photos table, plus the total
# number of photo rows (kept in `num_photos`).
photos_df = datasets['photos']
num_photos = len(photos_df['photo_id'])
print(photos_df.count())

photo_id                          25000
photo_url                         25000
photo_image_url                   25000
photo_submitted_at                25000
photo_featured                    25000
photographer_username             25000
photographer_first_name           25000
photographer_last_name            23344
exif_camera_make                  22170
exif_camera_model                 22126
exif_iso                          21785
exif_aperture_value               21384
exif_focal_length                 21483
exif_exposure_time                21762
photo_location_name                9478
photo_location_latitude            6895
photo_location_longitude           6892
photo_location_country             8563
photo_location_city                5984
stats_views                       25000
stats_downloads                   25000
ai_description                    23607
ai_primary_landmark_name              0
ai_primary_landmark_latitude          0
ai_primary_landmark_longitude         0


### Keywords

In [4]:
# Non-null count per column of the keywords table, followed by the number
# of distinct photo_id values that appear in it.
keywords_df = datasets['keywords']
print(keywords_df.count())
print(keywords_df['photo_id'].unique().size)

photo_id                   2689739
keyword                    2689739
ai_service_1_confidence    2390469
ai_service_2_confidence     219507
suggested_by_user          2689739
dtype: int64
25000


In [42]:
# `keywords` maps each keyword to the number of (non-null) photo_id rows tagged with it.
keywords = datasets['keywords'].groupby('keyword')['photo_id'].count()
top_keyword, top_count = keywords.idxmax(), keywords.max()
print(f"The most used keyword is {top_keyword} which is used {top_count} times.")

The most used keyword is plant which is used 20582 times.


In [43]:
# For each threshold, report how many keywords are used strictly more often
# than that threshold; one line per threshold, printed as a single string.
keyword_counts = [1, 5, 10, 25, 50, 100, 500, 1000, 5000, 10000, 20000]
summary_lines = [
    f"{len(keywords[keywords > threshold])} keywords used more that {threshold} times.\n"
    for threshold in keyword_counts
]
summary = "".join(summary_lines)
print(summary)

11940 keywords used more that 1 times.
5909 keywords used more that 5 times.
4580 keywords used more that 10 times.
3300 keywords used more that 25 times.
2604 keywords used more that 50 times.
2067 keywords used more that 100 times.
1016 keywords used more that 500 times.
654 keywords used more that 1000 times.
107 keywords used more that 5000 times.
23 keywords used more that 10000 times.
1 keywords used more that 20000 times.



In [49]:
# This statistic is the mode (most frequent usage count), not the mean, so the
# message says so. Series.mode() returns the modes sorted ascending; taking the
# first one yields a plain scalar regardless of the installed SciPy version.
most_common_usage = keywords.mode().iloc[0]
print(f"The most common number of times a keyword is used is {most_common_usage}")

The average number of times a keyword is used is [1]


### Collections

In [5]:
# Non-null count per column of the collections table, then the number of
# distinct collections and the number of distinct photos that appear in it.
collections_df = datasets['collections']
print(collections_df.count())
for id_column in ('collection_id', 'photo_id'):
    print(collections_df[id_column].unique().size)

photo_id              1646597
collection_id         1646597
collection_title      1646575
photo_collected_at    1646597
dtype: int64
405906
24954


### Conversions

In [6]:
# Non-null count per column of the conversions table, then the number of
# distinct photo_id values that appear in it.
conversions_df = datasets['conversions']
print(conversions_df.count())
print(conversions_df['photo_id'].unique().size)

converted_at          4075504
conversion_type       4075504
keyword               4075504
photo_id              4075504
anonymous_user_id     4075504
conversion_country    4068248
dtype: int64
23288


Some initial observations from comparing the unique `photo_id` values across the dataframes: there are 25,000 photos in total; every photo appears in the 'Keywords' dataset, but some photos have not been placed in a collection (24,954 have) or been converted (23,288 have). Not all photos have an AI description (23,607 do), which suggests a preliminary separation of the photos into a training set and a testing set.

# Brainstorming - What to do with data

### Machine Learning
* given an image, assign keywords
* suggest additional keywords for images
* suggest new images for collections or, vice versa, collections to add an image to.
* build new collections based on a set of keywords
* suggest photo subjects based on what's popular

### General Data Stuff
* Most popular types of photos based on the conversions df.
* Which countries download what.

# Visualization

In [7]:
# `plt` is the conventional alias for the pyplot interface. Importing the
# top-level package under that name would leave plt.plot / plt.subplots /
# plt.show undefined.
import matplotlib.pyplot as plt

# Database Table Schema

There are a few fields I'm not interested in here, such as the camera info and the photographer's first and last name, so I won't add those to the db. photo_featured is always true in this set, so I'm going to leave it out as well. The photo location info is a bit messy, since photo_location_name does not follow a standard format, but I'm going to store all of the location data because a few ideas I want to try depend on it; I will clean it further when called for.

/* photo_id is the key shared with the keywords, collections and conversions tables; it is stored as text. photo_featured is omitted because it is always true in this set. */\
CREATE TABLE photos(\
    photo_id VARCHAR NOT NULL,\
    photo_url VARCHAR NOT NULL,\
    photo_image_url VARCHAR NOT NULL,\
    photo_submitted_at DATE NOT NULL,\
    photographer_username VARCHAR NOT NULL,\
    photo_location_name VARCHAR,\
    photo_location_latitude NUMERIC,\
    photo_location_longitude NUMERIC,\
    photo_location_country VARCHAR,\
    photo_location_city VARCHAR,\
    stats_views INT NOT NULL,\
    stats_downloads INT NOT NULL,\
    ai_description VARCHAR,\
    PRIMARY KEY (photo_id)\
);