### **Unstructured Data - Images**

<font color="red">File access required:</font> In Colab, this notebook requires first uploading the file **Flags.zip** using the *Files* feature in the left toolbar. If running the notebook on a local computer, make sure the unzipped **Flags** folder is in the same workspace as the notebook, and don't run the first code cell below.

In [None]:
# Only run this cell if using Colab
# Extracts the uploaded Flags.zip archive into the current working directory.
from zipfile import ZipFile
# The handle is named 'archive' rather than 'zip' so the built-in zip()
# is not shadowed in the notebook's global namespace for later cells.
with ZipFile('/content/Flags.zip') as archive:
  archive.extractall()
  print('Unzip done - refresh Files using the folder refresh icon')

In [None]:
# Set-up
from PIL import Image
from io import BytesIO
from IPython.display import display
import os
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.spatial import distance

#### Set up RGB triples for basic colors (easy to add more)

In [None]:
# Lookup table: basic color name -> (R, G, B) triple, each channel 0-255.
# Append further keyword entries to extend the palette.
colordict = dict(
    red=(255, 0, 0),
    purple=(128, 0, 128),
    blue=(0, 0, 255),
    green=(0, 255, 0),
    yellow=(255, 255, 0),
    orange=(255, 165, 0),
    pink=(255, 192, 203),
    white=(255, 255, 255),
    gray=(128, 128, 128),
    black=(0, 0, 0),
)

#### The image dataset is a directory of **png** or **jpg** files. Folder **Flags** contains 206 country flags. Filter on file (country) name to reduce the dataset size.

In [None]:
# If running notebook on local computer, replace first line with:
# folder = 'Flags/'
# (keep the trailing slash: later cells build paths as folder + filename)
folder = '/content/Flags/'
filterstring = 'nia' # Use '' for no filter
# Keep only the file names that contain filterstring. The result is sorted
# because os.listdir() returns entries in arbitrary order, which would make
# the output of the following cells differ between runs and machines.
files = sorted(name for name in os.listdir(folder) if filterstring in name)
for f in files: print(f)

### Find dominant color in images

In [None]:
for filename in files:
    # Show filename and image
    print(filename)
    # Read the bytes inside a with-block so the file handle is closed
    # right away instead of being left open until garbage collection
    with open(folder + filename, mode='rb') as imagefile:
        data = imagefile.read()
    image = Image.open(BytesIO(data))
    display(image)
    # Find dominant color
    # getcolors() parameter (2500) specifies up to 2500 different colors in image;
    #   function returns None if image has more than 2500 different colors,
    #   otherwise a list of (count, color) entries
    colors = image.getcolors(2500)
    if colors is None:
        print('Too many colors')
    else:
        # Entry with the highest pixel count; the first such entry wins ties.
        # The default keeps the (-1,-1,-1) placeholder for an empty list.
        highest, domcolor = max(colors, key=lambda entry: entry[0],
                                default=(0, (-1,-1,-1)))
        print('Dominant color:')
        # Normalize RGB values for imshow function
        # NOTE(review): iterating domcolor assumes getcolors() yields tuple
        #   colors (e.g. RGB/RGBA); confirm the dataset has no single-band images
        normcolor = [float(x)/255 for x in domcolor]
        # Render the color as a 1x1 image
        plt.imshow([[normcolor]])
        plt.show()

### Find closest basic color to dominant color

In [None]:
for filename in files:
    # Show filename and image
    print(filename)
    # Read the bytes inside a with-block so the file handle is closed
    # right away instead of being left open until garbage collection
    with open(folder + filename, mode='rb') as imagefile:
        data = imagefile.read()
    image = Image.open(BytesIO(data))
    display(image)
    # Find dominant color
    # getcolors(2500) returns None when the image has more than 2500 colors,
    #   otherwise a list of (count, color) entries
    colors = image.getcolors(2500)
    if colors is None:
        print('Too many colors')
    else:
        # Entry with the highest pixel count; the first such entry wins ties.
        # The default keeps the (-1,-1,-1) placeholder for an empty list.
        highest, domcolor = max(colors, key=lambda entry: entry[0],
                                default=(0, (-1,-1,-1)))
        # Closest basic color
        # Remove alpha value if present (RGBA to RGB)
        # NOTE(review): slicing assumes getcolors() yields tuple colors
        #   (e.g. RGB/RGBA); confirm the dataset has no single-band images
        domcolor = domcolor[:3]
        # Basic color name with the smallest Euclidean distance in RGB space;
        # the first name wins ties, '' is returned if colordict is empty
        closest = min(colordict,
                      key=lambda name: distance.euclidean(domcolor, colordict[name]),
                      default='')
        print('Closest basic color:', closest, '\n')

### Find weighted average distance from each basic color

In [None]:
for filename in files:
    # Show filename and image
    print(filename)
    # Read the bytes inside a with-block so the file handle is closed
    # right away instead of being left open until garbage collection
    with open(folder + filename, mode='rb') as imagefile:
        data = imagefile.read()
    image = Image.open(BytesIO(data))
    display(image)
    # getcolors(2500) returns None when the image has more than 2500 colors,
    #   otherwise a list of (count, color) entries
    colors = image.getcolors(2500)
    if colors is None:
        print('Too many colors')
    else:
        # Total pixel count (sum of the per-color counts) is the same for
        # every basic color, so compute it once outside the loop
        num = sum(c[0] for c in colors)
        for b in colordict:
            # find weighted average distance from colors:
            #   sum(count * distance to basic color b) / total pixel count
            total = 0
            for c in colors:
                # Remove alpha value if present (RGBA to RGB)
                # NOTE(review): slicing assumes tuple colors (e.g. RGB/RGBA);
                #   confirm the dataset has no single-band images
                color = c[1][:3]
                dist = distance.euclidean(color,colordict[b])
                total += c[0] * dist
            wavg = total/num
            print(b, wavg)
        print('\n')

### <font color="green">**Your Turn**</font>

#### Find the simplest and most complex flags: the simplest is the flag with the fewest colors, the most complex is the flag with the most colors. Don't worry about ties. Warning: may be very slow over entire dataset.

In [None]:
# Exercise template: the loop body below is a placeholder to be replaced with
# your own code, so this cell will not run until that is done.
# Initialize variables to starting values
# NOTE(review): 3000 is presumably meant to exceed any color count the loop
#   will see (earlier cells cap getcolors() at 2500) - confirm against the
#   limit used in the loop you write
fewestcolors = 3000
mostcolors = 0
# File names of the current simplest / most complex flag; the display code
# below appends them to folder to open the images
simpleflag = ''
complexflag = ''
# Find the flags with the fewest and most colors
for filename in files:
#
    YOUR CODE HERE
#
# Display results
print('Simplest flag has', fewestcolors, 'colors:')
print(simpleflag)
# Load the image bytes from disk and show the flag inline
data = open(folder + simpleflag, mode='rb').read()
image = Image.open(BytesIO(data))
display(image)
print('Most complex flag has', mostcolors, 'colors:')
print(complexflag)
# Same load-and-show steps for the most complex flag
data = open(folder + complexflag, mode='rb').read()
image = Image.open(BytesIO(data))
display(image)

#### Write a loop in which the user enters a basic color and the system returns the flag closest to that color, i.e., the flag with the smallest weighted average distance from the entered color. Don't worry about ties. Warning: may be very slow over the entire dataset.

In [None]:
# Exercise template: the placeholder inside the else-branch is to be replaced
# with your own code, so this cell will not run until that is done.
# Here is code for loop of user entering basic color
while True:
    text = input("Enter basic color (or 'quit' to quit): ")
    # Typing 'quit' leaves the loop
    if text == 'quit': break
    # Look up the RGB triple for the entered name; the string 'none' is the
    # sentinel returned when the name is not a key of colordict
    searchcolor = colordict.get(text,'none')
    if searchcolor == 'none':
        print('Color not found, try a different one')
    else:
        print('Finding closest flag...')
        # Find the flag closest to that color
        # Hint: See earlier code for weighted average distance from basic colors
        YOUR CODE HERE
print('goodbye')