## Tutorial on Extracting Image Metadata using Pillow


In [1]:
!pip install Pillow
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[K     |████████████████████████████████| 212.4 MB 68 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 34.8 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880768 sha256=b627aa16c74595b9838a783a2dbb75b0140d49ecaf20f46268171eea7e8e1a0f
  Stored in directory: /root/.cache/pip/wheels/a5/0a/c1/9561f6fecb759579a7d863dcd846daaa95f598744e71b02c77
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.2


In [2]:
# Importing PIL - the Python Imaging Library

import PIL as pil
from PIL import Image
from PIL.ExifTags import TAGS
print(pil.__version__)

7.1.2


In [3]:
# Importing other needed libraries

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pyspark.ml
import pyspark.mllib
import numpy as np
import json
import os
import pandas as pd

In [4]:
## Set up Google mount to upload images.
# Mounts Google Drive at /content/drive/ so later cells can read the image
# files stored under /content/drive/MyDrive/.
# NOTE(review): the value returned by drive.mount() is bound to drone_1, but no
# visible cell reads drone_1 — confirm whether the assignment is needed.

from google.colab import drive
drone_1 = drive.mount('/content/drive/')

Mounted at /content/drive/


In [5]:
# List the contents of the Drive mount point.
!ls -l /content/drive/
# NOTE(review): gdrive_path is not referenced by any visible cell, and it differs
# from the path_to_name used by the loading cells (no "Team 2 - Drone Database"
# segment, no trailing slash) — confirm which path is current.
gdrive_path = "/content/drive/MyDrive/Data Science/Data Science Teams/data/Part1/Images"

total 4
drwx------ 25 root root 4096 Sep 22 22:40 MyDrive


In [6]:
# Function for ingesting data - Copied the previous code
# You'll need to !pip install Pillow and...
# import PIL as pil, from PIL import Image, from PIL.ExifTags import TAGS

def extract_drone_metadata(filepath_before_name, range_start, range_end):
  """Read the EXIF metadata of a numbered run of images into a DataFrame.

  Each file name is built as
  ``filepath_before_name + "100_0006_NNNN (2).JPG"``, where ``NNNN`` is the
  image number zero-padded to four digits.

  Args:
    filepath_before_name: String prepended verbatim to every file name, so a
      directory path must include its trailing separator.
    range_start: First image number to read (inclusive).
    range_end: Image number at which to stop (exclusive), as in ``range``.

  Returns:
    A pandas DataFrame with one row per image, in image-number order, and one
    column per EXIF tag. Columns are named through ``PIL.ExifTags.TAGS``; a tag
    ID missing from that table is used as the column name unchanged.

  Raises:
    Nothing is caught here: any error raised by ``Image.open`` (for example
    for a missing file) propagates to the caller.
  """
  records = []

  for image_number in range(range_start, range_end):
    # Fixed four-digit width. The previous "00" prefix plus a two-digit pad
    # gave the same names for 1-99 but five digits from image 100 onwards.
    the_path = filepath_before_name + "100_0006_{:04d} (2).JPG".format(image_number)

    # The context manager closes the file handle as soon as the metadata has
    # been copied out, instead of leaving one open handle per image.
    with Image.open(the_path) as image:
      exif = image.getexif()
      # Copy the tags into a plain dict while the file is still open.
      records.append(
        {TAGS.get(tag_id, tag_id): exif.get(tag_id) for tag_id in exif}
      )

  return pd.DataFrame(records)


In [7]:
# The images are processed in two chunks; this cell handles the first one.
# Image numbers 1 through 40 are read (the end bound, 41, is exclusive).

path_to_name = "/content/drive/MyDrive/Data Science/Data Science Teams/Team 2 - Drone Database/data/Part1/Images/"
drone_df = extract_drone_metadata(
  filepath_before_name=path_to_name,
  range_start=1,
  range_end=41,
)
# Show the last two rows of this chunk.
drone_df.tail(2)

Unnamed: 0,ExifVersion,ComponentsConfiguration,CompressedBitsPerPixel,DateTimeOriginal,DateTimeDigitized,ShutterSpeedValue,ApertureValue,ExposureBiasValue,MaxApertureValue,SubjectDistance,MeteringMode,LightSource,Flash,FocalLength,ColorSpace,ExifImageWidth,ExifImageHeight,Contrast,Saturation,Sharpness,SubjectDistanceRange,ExposureIndex,ImageDescription,Make,Model,FileSource,ExposureTime,ExifInteroperabilityOffset,XResolution,FNumber,SceneType,YResolution,XPComment,XPKeywords,ExposureProgram,GPSInfo,CustomRendered,ISOSpeedRatings,ResolutionUnit,ExposureMode,FlashPixVersion,WhiteBalance,BodySerialNumber,Software,DateTime,DigitalZoomRatio,FocalLengthIn35mmFilm,SceneCaptureType,GainControl,Orientation,ExifOffset,YCbCrPositioning,MakerNote
38,b'0230',b'\x00\x03\x02\x01',"(8751077, 2495232)",2020:04:19 14:55:37,2020:04:19 14:55:37,"(-8965, -1000)","(531, 100)","(0, 32)","(297, 100)","(0, 100)",2,1,32,"(880, 100)",1,5472,3648,0,0,0,0,"(0, 0)",DCIM\SURVEY\100_0006\100_0,DJI��������������������������,FC6310R����������������������,b'\x03',"(2000, 1000000)",656,"(72, 1)","(630, 100)",b'\x01',"(72, 1)","b'T\x00y\x00p\x00e\x00=\x00N\x00,\x00 \x00M\x0...",b'v\x000\x001\x00.\x000\x009\x00.\x001\x007\x0...,2,"{0: b'\x02\x03\x00\x00', 1: 'N', 2: ((49, 1), ...",0,100,2,0,b'0010',1,cbb8ce090560181ea0aec57b42a6e31c���������������,v01.09.1754������������,2020:04:19 14:55:38,"(0, 0)",24,0,0,1,182,1,b'$\x00\x01\x00\x02\x00\x04\x00\x00\x00DJI\x00...
39,b'0230',b'\x00\x03\x02\x01',"(8661879, 2495232)",2020:04:19 14:55:40,2020:04:19 14:55:40,"(-8965, -1000)","(531, 100)","(0, 32)","(297, 100)","(0, 100)",2,1,32,"(880, 100)",1,5472,3648,0,0,0,0,"(0, 0)",DCIM\SURVEY\100_0006\100_0,DJI��������������������������,FC6310R����������������������,b'\x03',"(2000, 1000000)",656,"(72, 1)","(630, 100)",b'\x01',"(72, 1)","b'T\x00y\x00p\x00e\x00=\x00N\x00,\x00 \x00M\x0...",b'v\x000\x001\x00.\x000\x009\x00.\x001\x007\x0...,2,"{0: b'\x02\x03\x00\x00', 1: 'N', 2: ((49, 1), ...",0,100,2,0,b'0010',1,cbb8ce090560181ea0aec57b42a6e31c���������������,v01.09.1754������������,2020:04:19 14:55:41,"(0, 0)",24,0,0,1,182,1,b'$\x00\x01\x00\x02\x00\x04\x00\x00\x00DJI\x00...


In [8]:
# Second chunk: image numbers 41 through 80 (the end bound, 81, is exclusive).
path_to_name = "/content/drive/MyDrive/Data Science/Data Science Teams/Team 2 - Drone Database/data/Part1/Images/"
drone_df2 = extract_drone_metadata(
  filepath_before_name=path_to_name,
  range_start=41,
  range_end=81,
)
# Show the first two rows of this chunk.
drone_df2.head(2)

Unnamed: 0,ExifVersion,ComponentsConfiguration,CompressedBitsPerPixel,DateTimeOriginal,DateTimeDigitized,ShutterSpeedValue,ApertureValue,ExposureBiasValue,MaxApertureValue,SubjectDistance,MeteringMode,LightSource,Flash,FocalLength,ColorSpace,ExifImageWidth,ExifImageHeight,Contrast,Saturation,Sharpness,SubjectDistanceRange,ExposureIndex,ImageDescription,Make,Model,FileSource,ExposureTime,ExifInteroperabilityOffset,XResolution,FNumber,SceneType,YResolution,XPComment,XPKeywords,ExposureProgram,GPSInfo,CustomRendered,ISOSpeedRatings,ResolutionUnit,ExposureMode,FlashPixVersion,WhiteBalance,BodySerialNumber,Software,DateTime,DigitalZoomRatio,FocalLengthIn35mmFilm,SceneCaptureType,GainControl,Orientation,ExifOffset,YCbCrPositioning,MakerNote
0,b'0230',b'\x00\x03\x02\x01',"(8175195, 2495232)",2020:04:19 14:55:43,2020:04:19 14:55:43,"(-8965, -1000)","(531, 100)","(0, 32)","(297, 100)","(0, 100)",2,1,32,"(880, 100)",1,5472,3648,0,0,0,0,"(0, 0)",DCIM\SURVEY\100_0006\100_0,DJI��������������������������,FC6310R����������������������,b'\x03',"(2000, 1000000)",656,"(72, 1)","(630, 100)",b'\x01',"(72, 1)","b'T\x00y\x00p\x00e\x00=\x00N\x00,\x00 \x00M\x0...",b'v\x000\x001\x00.\x000\x009\x00.\x001\x007\x0...,2,"{0: b'\x02\x03\x00\x00', 1: 'N', 2: ((49, 1), ...",0,100,2,0,b'0010',1,cbb8ce090560181ea0aec57b42a6e31c���������������,v01.09.1754������������,2020:04:19 14:55:44,"(0, 0)",24,0,0,1,182,1,b'$\x00\x01\x00\x02\x00\x04\x00\x00\x00DJI\x00...
1,b'0230',b'\x00\x03\x02\x01',"(8237147, 2495232)",2020:04:19 14:55:46,2020:04:19 14:55:46,"(-9321, -1000)","(531, 100)","(0, 32)","(297, 100)","(0, 100)",2,1,32,"(880, 100)",1,5472,3648,0,0,0,0,"(0, 0)",DCIM\SURVEY\100_0006\100_0,DJI��������������������������,FC6310R����������������������,b'\x03',"(1562, 1000000)",656,"(72, 1)","(630, 100)",b'\x01',"(72, 1)","b'T\x00y\x00p\x00e\x00=\x00N\x00,\x00 \x00M\x0...",b'v\x000\x001\x00.\x000\x009\x00.\x001\x007\x0...,2,"{0: b'\x02\x03\x00\x00', 1: 'N', 2: ((49, 1), ...",0,100,2,0,b'0010',1,cbb8ce090560181ea0aec57b42a6e31c���������������,v01.09.1754������������,2020:04:19 14:55:46,"(0, 0)",24,0,0,1,182,1,b'$\x00\x01\x00\x02\x00\x04\x00\x00\x00DJI\x00...


In [9]:
# Stack the two 40-image chunks into a single frame. ignore_index=True renumbers
# the rows 0..79; without it both chunks keep their own 0..39 labels, so every
# label would appear twice and label-based lookups would return two rows.
# NOTE: this rebinds drone_df, so re-running this cell without first re-running
# the cell that loads the first chunk appends drone_df2 a second time.
drone_df = pd.concat([drone_df, drone_df2], axis=0, ignore_index=True)
print(drone_df.shape)
# A null check (drone_df.isnull().sum()) was run previously and found no nulls.

# Work on a copy so the combined frame itself is left untouched.
drone_df_cp = drone_df.copy()
drone_df_cp.head()

(80, 53)


Unnamed: 0,ExifVersion,ComponentsConfiguration,CompressedBitsPerPixel,DateTimeOriginal,DateTimeDigitized,ShutterSpeedValue,ApertureValue,ExposureBiasValue,MaxApertureValue,SubjectDistance,MeteringMode,LightSource,Flash,FocalLength,ColorSpace,ExifImageWidth,ExifImageHeight,Contrast,Saturation,Sharpness,SubjectDistanceRange,ExposureIndex,ImageDescription,Make,Model,FileSource,ExposureTime,ExifInteroperabilityOffset,XResolution,FNumber,SceneType,YResolution,XPComment,XPKeywords,ExposureProgram,GPSInfo,CustomRendered,ISOSpeedRatings,ResolutionUnit,ExposureMode,FlashPixVersion,WhiteBalance,BodySerialNumber,Software,DateTime,DigitalZoomRatio,FocalLengthIn35mmFilm,SceneCaptureType,GainControl,Orientation,ExifOffset,YCbCrPositioning,MakerNote
0,b'0230',b'\x00\x03\x02\x01',"(8922615, 2495232)",2020:04:19 14:53:47,2020:04:19 14:53:47,"(-8965, -1000)","(531, 100)","(0, 32)","(297, 100)","(0, 100)",2,1,32,"(880, 100)",1,5472,3648,0,0,0,0,"(0, 0)",DCIM\SURVEY\100_0006\100_0,DJI��������������������������,FC6310R����������������������,b'\x03',"(2000, 1000000)",656,"(72, 1)","(630, 100)",b'\x01',"(72, 1)","b'T\x00y\x00p\x00e\x00=\x00N\x00,\x00 \x00M\x0...",b'v\x000\x001\x00.\x000\x009\x00.\x001\x007\x0...,2,"{0: b'\x02\x03\x00\x00', 1: 'N', 2: ((49, 1), ...",0,100,2,0,b'0010',1,cbb8ce090560181ea0aec57b42a6e31c���������������,v01.09.1754������������,2020:04:19 14:53:48,"(0, 0)",24,0,0,1,182,1,b'$\x00\x01\x00\x02\x00\x04\x00\x00\x00DJI\x00...
1,b'0230',b'\x00\x03\x02\x01',"(8989522, 2495232)",2020:04:19 14:53:50,2020:04:19 14:53:50,"(-8965, -1000)","(531, 100)","(0, 32)","(297, 100)","(0, 100)",2,1,32,"(880, 100)",1,5472,3648,0,0,0,0,"(0, 0)",DCIM\SURVEY\100_0006\100_0,DJI��������������������������,FC6310R����������������������,b'\x03',"(2000, 1000000)",656,"(72, 1)","(630, 100)",b'\x01',"(72, 1)","b'T\x00y\x00p\x00e\x00=\x00N\x00,\x00 \x00M\x0...",b'v\x000\x001\x00.\x000\x009\x00.\x001\x007\x0...,2,"{0: b'\x02\x03\x00\x00', 1: 'N', 2: ((49, 1), ...",0,100,2,0,b'0010',1,cbb8ce090560181ea0aec57b42a6e31c���������������,v01.09.1754������������,2020:04:19 14:53:50,"(0, 0)",24,0,0,1,182,1,b'$\x00\x01\x00\x02\x00\x04\x00\x00\x00DJI\x00...
2,b'0230',b'\x00\x03\x02\x01',"(7965271, 2495232)",2020:04:19 14:53:53,2020:04:19 14:53:53,"(-8965, -1000)","(531, 100)","(0, 32)","(297, 100)","(0, 100)",2,1,32,"(880, 100)",1,5472,3648,0,0,0,0,"(0, 0)",DCIM\SURVEY\100_0006\100_0,DJI��������������������������,FC6310R����������������������,b'\x03',"(2000, 1000000)",656,"(72, 1)","(630, 100)",b'\x01',"(72, 1)","b'T\x00y\x00p\x00e\x00=\x00N\x00,\x00 \x00M\x0...",b'v\x000\x001\x00.\x000\x009\x00.\x001\x007\x0...,2,"{0: b'\x02\x03\x00\x00', 1: 'N', 2: ((49, 1), ...",0,100,2,0,b'0010',1,cbb8ce090560181ea0aec57b42a6e31c���������������,v01.09.1754������������,2020:04:19 14:53:53,"(0, 0)",24,0,0,1,182,1,b'$\x00\x01\x00\x02\x00\x04\x00\x00\x00DJI\x00...
3,b'0230',b'\x00\x03\x02\x01',"(8220861, 2495232)",2020:04:19 14:53:55,2020:04:19 14:53:55,"(-8965, -1000)","(497, 100)","(0, 32)","(297, 100)","(0, 100)",2,1,32,"(880, 100)",1,5472,3648,0,0,0,0,"(0, 0)",DCIM\SURVEY\100_0006\100_0,DJI��������������������������,FC6310R����������������������,b'\x03',"(2000, 1000000)",656,"(72, 1)","(560, 100)",b'\x01',"(72, 1)","b'T\x00y\x00p\x00e\x00=\x00N\x00,\x00 \x00M\x0...",b'v\x000\x001\x00.\x000\x009\x00.\x001\x007\x0...,2,"{0: b'\x02\x03\x00\x00', 1: 'N', 2: ((49, 1), ...",0,100,2,0,b'0010',1,cbb8ce090560181ea0aec57b42a6e31c���������������,v01.09.1754������������,2020:04:19 14:53:56,"(0, 0)",24,0,0,1,182,1,b'$\x00\x01\x00\x02\x00\x04\x00\x00\x00DJI\x00...
4,b'0230',b'\x00\x03\x02\x01',"(8275193, 2495232)",2020:04:19 14:53:58,2020:04:19 14:53:58,"(-8965, -1000)","(531, 100)","(0, 32)","(297, 100)","(0, 100)",2,1,32,"(880, 100)",1,5472,3648,0,0,0,0,"(0, 0)",DCIM\SURVEY\100_0006\100_0,DJI��������������������������,FC6310R����������������������,b'\x03',"(2000, 1000000)",656,"(72, 1)","(630, 100)",b'\x01',"(72, 1)","b'T\x00y\x00p\x00e\x00=\x00N\x00,\x00 \x00M\x0...",b'v\x000\x001\x00.\x000\x009\x00.\x001\x007\x0...,2,"{0: b'\x02\x03\x00\x00', 1: 'N', 2: ((49, 1), ...",0,100,2,0,b'0010',1,cbb8ce090560181ea0aec57b42a6e31c���������������,v01.09.1754������������,2020:04:19 14:53:59,"(0, 0)",24,0,0,1,182,1,b'$\x00\x01\x00\x02\x00\x04\x00\x00\x00DJI\x00...


Great source for [uploading a Pandas DataFrame to MongoDB](https://medium.com/analytics-vidhya/how-to-upload-a-pandas-dataframe-to-mongodb-ffa18c0953c1)!

In [10]:
# Convert the combined frame into a list of dicts, one per image (row), keyed by
# column name. Per the variable name and the article linked above, this list is
# intended for upload to MongoDB.
# NOTE(review): the records still hold raw bytes, tuples and, under 'GPSInfo',
# dicts with integer keys — confirm these are converted before any insert.
mongoDB_upload = drone_df_cp.to_dict('records')

# Preview only the first record; displaying the whole list floods the output.
mongoDB_upload[:1]

[{'ApertureValue': (531, 100),
  'BodySerialNumber': 'cbb8ce090560181ea0aec57b42a6e31c\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
  'ColorSpace': 1,
  'ComponentsConfiguration': b'\x00\x03\x02\x01',
  'CompressedBitsPerPixel': (8922615, 2495232),
  'Contrast': 0,
  'CustomRendered': 0,
  'DateTime': '2020:04:19 14:53:48',
  'DateTimeDigitized': '2020:04:19 14:53:47',
  'DateTimeOriginal': '2020:04:19 14:53:47',
  'DigitalZoomRatio': (0, 0),
  'ExifImageHeight': 3648,
  'ExifImageWidth': 5472,
  'ExifInteroperabilityOffset': 656,
  'ExifOffset': 182,
  'ExifVersion': b'0230',
  'ExposureBiasValue': (0, 32),
  'ExposureIndex': (0, 0),
  'ExposureMode': 0,
  'ExposureProgram': 2,
  'ExposureTime': (2000, 1000000),
  'FNumber': (630, 100),
  'FileSource': b'\x03',
  'Flash': 32,
  'FlashPixVersion': b'0010',
  'FocalLength': (880, 100),
  'FocalLengthIn35mmFilm': 24,
  'GPSInfo': {0: b'\x02\x03\x00\x00',
   1: 'N',
   2: ((49, 1), (15, 1), (110857, 10000)),
   3: 'W',
  