# The Project #
1. This is a project with minimal scaffolding. Expect to use the discussion forums to gain insights! It’s not cheating to ask others for opinions or perspectives!
2. Be inquisitive, try out new things.
3. Use the previous modules for insights into how to complete the functions! You'll have to combine Pillow, OpenCV, and Pytesseract
4. There are hints provided in Coursera; feel free to explore the hints if needed. Each hint provides progressively more details on how to solve the issue. This project is intended to be comprehensive and difficult if you do it without the hints.

### The Assignment ###
Take a [ZIP file](https://en.wikipedia.org/wiki/Zip_(file_format)) of images and process them, using a [library built into Python](https://docs.python.org/3/library/zipfile.html) that you need to learn how to use. A ZIP file takes several different files and compresses them, thus saving space, into one single file. The files in the ZIP file we provide are newspaper images (like you saw in week 3). Your task is to write Python code which allows one to search through the images looking for the occurrences of keywords and faces. E.g. if you search for "pizza" it will return a contact sheet of all of the faces which were located on the newspaper page which mentions "pizza".  
This will test your ability to:
- learn a new library
- use OpenCV to detect faces
- use tesseract to do optical character recognition
- use PIL to composite images together into contact sheets

Each page of the newspapers is saved as a single PNG image in a file called [images.zip](../readonly/images.zip). These newspapers are in English, and contain a variety of stories, advertisements and images. Note: This file is fairly large (~200 MB) and may take some time to work with, I would encourage you to use [small_img.zip](../readonly/small_img.zip) for testing.

Here's an example of the output expected. Using the [small_img.zip](../readonly/small_img.zip) file, if I search for the string "Christopher" I should see the following image:
![Christopher Search](../readonly/small_project.png)
If I were to use the [images.zip](../readonly/images.zip) file and search for "Mark" I should see the following image (note that there are times when there are no faces on a page, but a word is found!):
![Mark Search](../readonly/large_project.png)

Note: That big file can take some time to process - for me it took nearly ten minutes! Use the small one for testing.

In [None]:
from collections import Counter
from typing import Set
import zipfile

import math
import numpy as np
import pandas as pd
from io import StringIO

from PIL import Image
from PIL import ImageDraw
from PIL import ImageChops
import cv2 as cv

import pytesseract
import kraken
from kraken import pageseg

import matplotlib.pyplot as plt
from IPython.display import display
from pprint import pprint

%matplotlib inline

In [None]:
# Unpack every file of the image archive into /app
with zipfile.ZipFile("/app/readonly/images.zip") as zip_images:
    zip_images.extractall("/app")

# OpenCV cascade classifiers, built from the Haar cascade XML files
eye_cascade = cv.CascadeClassifier('/app/readonly/haarcascade_eye.xml')
face_cascade = cv.CascadeClassifier('/app/readonly/haarcascade_frontalface_default.xml')

In [None]:
# Read images

def _read_rgb(path: str) -> np.ndarray:
    """Load the image at ``path`` with OpenCV and return it in RGB channel order.

    Raises:
        FileNotFoundError: if OpenCV could not read the file. ``cv.imread``
            signals failure by returning ``None`` instead of raising, which
            would otherwise surface as an opaque error inside ``cv.cvtColor``.
    """
    bgr = cv.imread(path)
    if bgr is None:
        raise FileNotFoundError(f"Could not read image: {path}")
    return cv.cvtColor(bgr, cv.COLOR_BGR2RGB)


# Pages a-0.png .. a-4.png, keyed "a0" .. "a4"
images = {f"a{n}": _read_rgb(f"/app/a-{n}.png") for n in range(5)}
# Cell output: the array shape of every loaded page
{n: img.shape for n, img in images.items()}

In [None]:
def get_words(cv_img: np.ndarray) -> "tuple[pd.DataFrame, Set[str]]":
    """Run Tesseract on an image and return its word table and word set.

    The image is converted to grayscale when it has three dimensions, then
    binarised with a threshold of ``mean - 10`` before being handed to
    ``pytesseract.image_to_data``.

    Args:
        cv_img: image array; a 3-dimensional array is treated as RGB.

    Returns:
        A ``(df, words)`` tuple. ``df`` is the Tesseract output restricted to
        rows with non-empty text, with the text stripped and two extra
        columns: ``len`` (number of characters) and ``char_width``
        (``width / len``). ``words`` is the set of lower-cased texts.
    """
    import csv  # local import: only the QUOTE_NONE constant is needed

    # Convert to black and white
    if len(cv_img.shape) == 3:
        cv_img = cv.cvtColor(cv_img, cv.COLOR_RGB2GRAY)
    _, cv_img_bw = cv.threshold(cv_img, int(cv_img.mean()) - 10, 255, cv.THRESH_BINARY)

    # Run Tesseract & clean output
    r = pytesseract.image_to_data(cv_img_bw)
    df = pd.read_csv(
        StringIO(r),
        sep="\t",
        # OCR tokens may contain stray double quotes; they are plain text,
        # not CSV quoting, and must not swallow the following rows.
        quoting=csv.QUOTE_NONE,
        # Keep words such as "None", "NA" or "null" instead of reading them as NaN.
        keep_default_na=False,
        # Always strings, even when every recognised token looks numeric.
        dtype={"text": str},
    )
    df = df.dropna(subset=["text"]).copy()
    df["text"] = df["text"].str.strip()
    df["len"] = df["text"].str.len()
    # Filter before dividing so empty tokens never produce a division by zero
    df = df[df["len"] > 0].copy()
    df["char_width"] = df["width"] / df["len"]

    return (df, set(df["text"].str.lower()))


def erase_text_on_img(cv_img: np.array, df_tesseract: pd.DataFrame):
    """Return a copy of ``cv_img`` with every box of ``df_tesseract`` set to 255.

    Each row of ``df_tesseract`` must provide ``top``, ``left``, ``width`` and
    ``height``; the matching rectangle is overwritten in the copy. The input
    image itself is left untouched.
    """
    blanked = cv_img.copy()
    for box in df_tesseract.itertuples(index=False):
        bottom = box.top + box.height
        right = box.left + box.width
        blanked[box.top:bottom, box.left:right] = 255
    return blanked


# Try both helpers on page "a0": OCR it, then blank out the word boxes
df_tesseract, words = get_words(images["a0"])
cv_img_no_text = erase_text_on_img(images["a0"], df_tesseract)


# Cell output: number of distinct lower-cased words and up to ten of them
len(words), list(words)[:10]

In [None]:
from typing import NamedTuple

class Face_coord(NamedTuple):
    """Rectangle described by its origin ``(x, y)``, width ``w`` and height ``h``."""

    x: int
    y: int
    w: int
    h: int

    @property
    def center(self):
        """Return the rectangle centre as ``(x + w // 2, y + h // 2)``."""
        half_w, half_h = self.w // 2, self.h // 2
        return (self.x + half_w, self.y + half_h)

In [None]:
def get_faces(cv_img: np.array):
    """Placeholder for face detection on ``cv_img``; not implemented yet (returns ``None``)."""
    pass

# Experiment on page "a4": OCR the page, blank out the recognised words, then
# run the face cascade at several scale factors to compare the detections.
cv_img_rgb = images["a4"]
cv_img_gray = cv.cvtColor(cv_img_rgb, cv.COLOR_RGB2GRAY)
df_tesseract, words = get_words(cv_img_gray)
cv_img_no_text = erase_text_on_img(cv_img_gray, df_tesseract)

# Use the face_cascade classifier
# One (detections, colour) entry per scale factor; the colour outlines that
# run's detections in the preview drawn below.
faces_color = [
    (face_cascade.detectMultiScale(cv_img_no_text, scale_factor), color)
    for scale_factor, color in (
        (2.50, "black"),
        (2.00, "red"),
        (1.70, "white"),
        (1.40, "blue"),
        (1.20, "green"),
    )
]

pil_img = Image.fromarray(cv_img_no_text, mode="L")
drawing = ImageDraw.Draw(pil_img)
# PIL.ImageDraw is looking for (x1, y1, x2, y2)

faces_img = []
for faces, color in faces_color:
    for x, y, w, h in faces:
        # Crop from the untouched grayscale page, not the text-erased copy
        img = cv_img_gray[y: y + h, x: x + w]
        # NOTE: the eye detections are computed but not used to filter faces yet
        eyes = eye_cascade.detectMultiScale(img)
        drawing.rectangle((x, y, x + w, y + h), outline=color, width=4)
        faces_img.append(img)

print("nb words :", df_tesseract.shape[0])
print("nb faces detected :", len(faces_img))
# The OCR table of this page is df_tesseract
print("char width :", df_tesseract["char_width"].median())
display(pil_img)
for img in faces_img:
    display(Image.fromarray(img, mode="L"))

In [None]:
from itertools import combinations

# Flatten the detections of every scale factor into Face_coord records
faces_coords = [
    Face_coord(x, y, w, h)
    for faces, _color in faces_color
    for x, y, w, h in faces
]

# Centre-to-centre distance for each unordered pair of distinct detections.
# Self-pairs (always 0) and mirrored pairs are left out so each distance is
# counted exactly once.
distances = [
    math.hypot(p1.center[0] - p2.center[0], p1.center[1] - p2.center[1])
    for p1, p2 in combinations(faces_coords, 2)
]

n, bins, patches = plt.hist(distances, 5000, density=False, facecolor='g', alpha=0.75)

plt.xlabel('Distance between face centers (pixels)')
plt.ylabel('Count')  # density=False, so the bars are raw counts
plt.title('Histogram of pairwise face-center distances')
plt.xlim(-10, 300)  # zoom on short distances
plt.grid(True)
plt.show()

In [None]:
# Flat list of every detection (all scale factors) as Face_coord records.
# The comprehension yields the records themselves; list.append returns None
# and must not be used as the comprehension's value.
faces_coords = [
    Face_coord(x, y, w, h)
    for faces, color in faces_color
    for x, y, w, h in faces
]

# NOTE: clustering is not implemented yet, so ``clusters`` stays empty.
# For now the loop only walks the detections greedily: take the head, then
# re-sort what is left by squared distance between centres.
clusters = []
remaining = list(faces_coords)  # work on a copy so faces_coords is not consumed
while remaining:
    p0 = remaining[0]
    remaining = sorted(
        remaining[1:],
        key=lambda p: (p0.center[0] - p.center[0])**2 + (p0.center[1] - p.center[1])**2
    )

In [None]:
# Size of each detection, taken as the integer mean of its width and height
values = [
    (p1.w + p1.h) // 2
    for p1 in faces_coords
]

n, bins, patches = plt.hist(values, 100, density=False, facecolor='g', alpha=0.75)

plt.xlabel('Face size, (w + h) // 2 (pixels)')
plt.ylabel('Count')  # density=False, so the bars are raw counts
plt.title('Histogram of detected face sizes')
plt.grid(True)
plt.show()

In [None]:
# Text mask built from the grayscale page (cv_img_gray) and its OCR table
# (df_tesseract):
#   0   -> inside a word box
#   127 -> margin of one median character width left/right of a word box
#   255 -> everything else
x_max = cv_img_gray.shape[1]
y_max = cv_img_gray.shape[0]

char_width = int(df_tesseract["char_width"].median())
mask = np.full(cv_img_gray.shape[:2], 255, dtype=np.uint8)
for row in df_tesseract.itertuples():
    mask[row.top: row.top + row.height, row.left: row.left + row.width] = 0

    # The margins are only painted when they fit entirely inside the image
    if row.left - char_width > 0:
        mask[row.top: row.top + row.height, row.left - char_width: row.left] = 127
    if row.left + row.width + char_width < x_max:
        mask[row.top: row.top + row.height, row.left + row.width: row.left + row.width + char_width] = 127

display(Image.fromarray(mask, "L"))

In [None]:
# Summary of the per-word character width (mean, median, min, max) from the
# OCR table df_tesseract, followed by its five tallest boxes
print(
    df_tesseract["char_width"].mean(),
    df_tesseract["char_width"].median(),
    df_tesseract["char_width"].min(),
    df_tesseract["char_width"].max(),
)
df_tesseract.sort_values("height", ascending=False).head()

In [None]:
# Find lines

# OCR page "a0" and blank out its words so only non-text content remains
cv_img_rgb = images["a0"]
df, words = get_words(cv_img_rgb)
cv_img_no_text = erase_text_on_img(cv_img_rgb, df)

display(Image.fromarray(cv_img_no_text, "RGB"))

# Edge map, plus an RGB copy of it to draw the detected segments on
dst = cv.Canny(cv_img_no_text, 50, 200, None, 3)
cdst = cv.cvtColor(dst, cv.COLOR_GRAY2RGB)

# cv.HoughLinesP(image, rho, theta, threshold, lines, minLineLength, maxLineGap) -> lines
lines = cv.HoughLinesP(dst, rho=1, theta=np.pi / 180, threshold=50, minLineLength=500, maxLineGap=20)

# HoughLinesP returns None (not an empty array) when no segment is found
for line in (lines if lines is not None else []):
    # Each entry wraps one (x1, y1, x2, y2) segment; pass plain int tuples to cv.line
    x1, y1, x2, y2 = (int(v) for v in line[0])
    # cdst is RGB and is displayed through PIL, so red is (255, 0, 0)
    cv.line(cdst, (x1, y1), (x2, y2), (255, 0, 0), 3, cv.LINE_AA)

# Detected segments drawn in red over the edge map
display(Image.fromarray(cdst, "RGB"))