**COLLECTING AND PROCESSING IMAGES FROM VIDEOS**<br>
Note:   This script requires videos in order to work. If there are no videos to process, it will fail.<br>
        It was decided that submitting videos with the scripts was not a good idea, for obvious reasons. Karl will happily Zoom with Professor Fox if he would like to see it in action.<br>
>**Process**
1.  Collect pre-recorded Zoom videos with individual "speaker" frames rather than all persons in small cubes in every frame and put the videos into one folder
2.  For each video in the drop folder...
3.  Look at frames of video, skipping over a set number of frames between selected frames
4.  Snip name part of image and run through text recognition to get the name of the person in image
5.  If the name of the person is on our "PeepMaster" list, file names are created using the person's name + video name + approximage frame number. Image is then cropped, to remove text from the bottom of the image and saved as a .jpg file. 
6.  Report what has been done
7.  A maximum of 50 images are saved per person per video. <br><br>

>**Challenges**<br>
1.  Text recognition was not 100% accurate. Because of backgrounds, scraped names sometimes contained erroneous characters of which it needed to be stripped
2.  Videos tool long time to scrape. Attempts were made to skip more frames during segments where we did not need to collect images, but these sometimes resulted in missing images we wanted
3.  Some people did not include entire face in image
4.  Some people did not speak much during class, so image selection was limited
<br>



In [1]:
# Load packages
import pandas as pd
import numpy as np
from dataclasses import make_dataclass
import glob
from PIL import Image
import re
import os
# use opencv to do the job
#!pip3 install opencv-python
import cv2 # getting text from image
#!pip install pytesseract
import pytesseract
from os import walk
from copy import deepcopy
from datetime import datetime
import pathlib


In [2]:
# uncomment and adapt the code below if you need to release a file that was locked during a crash
# fileLoc = imageFolder + "IST687_Wk2_Raw.mp4"
# thisVid = cv2.VideoCapture(fileLoc)
# thisVid.release()

In [3]:
#####################################################################
# settings
# change the names of these variables to suit your own file structure
#####################################################################

# the name of the video file you are scraping
# I changed the program to scrape every file in the folder
#fileName = "IST687_Test2_Raw.mp4"

# below establishes skip rates between frames at 25 fps
blankSkip = 125 # skip rate after tesseract can't read image name (500 = 20 seconds)
longSkip = 125 # skip rate while on persons we aren't tracking (250 = 10 seconds)
shortSkip = 50 # skip rate while on persons we are tracking (125 = 5 seconds)

# the path of the folder where your downloaded videos are stored
# REMEMBER: if you copy and paste the path, you must change the slashes to forward slashes or otherwise make them palatable to your Python
rawFootageFolder = "D:/Karl/1 Syracuse Masters Program/2022 - Winter/IST 718/Final Project/RawFootage/"

srcFolder = pathlib.Path("FaceRecog_FromCSV.ipynb").parent.resolve()
imageFolder = str(srcFolder) + "\\ToProcess"

tesseractInstallFolder = "C:/Program Files/Tesseract/tesseract.exe"
maxImagesPerPersonPerVideo = 200
showPreview = True # True = show preview image save folder. Setting to False speeds up processing by 12%

# Names of all people we are tracking
# note, first item in row is full name, second is the short version to be found within image text
peepMaster = [
            ["Andrew Dobkowski", 0, 0],
            ["Annie Titus", 0, 0],
            ["Bomin Powers", 0, 0],
            ["Chao Xu", 0, 0],
            ["Christian Bendillo", 0, 0],
            ["Christian (Nick) Bendillo", 0, 0],
            ["David Ladd", 0, 0],
            ["Farahin Choudhury", 0, 0],
            ["Garen Moghoyan", 0, 0],
            ["Jackson Shands", 0, 0],
            ["Jessika Pietryka", 0, 0],
            #["Jon Fox", 0, 0],
            ["Joshua Biggs-Bauer", 0, 0],
            #["Karl Treen", 0, 0],
            #["Katie Haugh", 0, 0],
            ["Lei Cheng", 0, 0],
            ["Matthew Laken", 0, 0],
            ["Nick Waine", 0, 0],
            ["Noah Laraway", 0, 0],
            ["Noe Fernandes", 0, 0],
            ["Sandy Spicer", 0, 0],
            ["Sandra Spicer", 0, 0],
            ["Sarah Goodwyn", 0, 0],
            ["Seth Rissmiller", 0, 0],
            ["Sherhone GRANT", 0, 0],
            ["Jiebin (Alex) Zhu", 0, 0],
            ["Jiebin Zhu", 0, 0],
            ["Jjebin Zhu", 0, 0],
            ["Jjjebin Zhu", 0, 0],
            ["atosis", 0, 0],
            ["atolsia", 0, 0]
]

allPeeps = deepcopy(peepMaster)

# do not change below
tempImageFileLocation = imageFolder + "_tempImgFile.jpg"
streamingImageFileLocation = imageFolder + "_streamingImage.jpg"


In [4]:
#################
# handy functions
#################

def formatNameForCompare(name):
  #name = name[0:3] # modified so it only gets first 3 characters
  return(re.sub('[^a-zA-Z]', '', name).replace(" ", "").lower())

# check if name is in peeps
def showPeep(name):
  for peep in allPeeps:
    nameWeGot = formatNameForCompare(name)  # change 20220302
    print(nameWeGot)                         # change 20220302

    #if formatNameForCompare(name) in formatNameForCompare(peep[0]):
    if nameWeGot in formatNameForCompare(peep[0]): # change 20220302
      
      # increment the number of images stored
      peep[2] = peep[2] + 1

      # if we've collected enough of this person, hide the name
      if(peep[2] > maxImagesPerPersonPerVideo):
        name = ""

      return name != ""

  return False

# function to cut name from image
def getAbbrevNameFromImage(img):

  # crop image
  image_temp = img[330:355, 0:100]

  # write the temp name image to a file
  cv2.imwrite(os.path.join(imageFolder,"_tempImgFile.jpg"), image_temp)     # save frame as JPEG file
  
  ### now get the name from the image ###
  # code from: https://towardsdatascience.com/how-to-extract-text-from-images-with-python-db9b87fe432b# NOTE: tesseract must be installed on your computer. 
  # Find it here: https://github.com/UB-Mannheim/tesseract/wiki
  # I installed the 32 bit version and installed it in the following location
  pytesseract.pytesseract.tesseract_cmd = tesseractInstallFolder
  thisName = pytesseract.image_to_string(tempImageFileLocation)
  return(thisName[0:7])


# function to create file name
def newImageName(vidName, person, curCount):
  vidName = vidName.lower().replace(".mp4", "")
  imgName = person + "_" + vidName + "_" + str(curCount) + ".jpg"
  return(imgName)

# function to get face part from image


In [5]:
#############################
# put peeps into a data frame
#############################
persons = pd.DataFrame(allPeeps)
persons.columns = ["name", "imgTtlCount", "imgCurCount"]

In [6]:
# getting file names. Code from: https://stackoverflow.com/questions/3207219/how-do-i-list-all-files-of-a-directory
from os import walk
fileNames = []
for (x) in walk(rawFootageFolder):
    fileNames.append(x)

# should contain the names of all files in the rawFootageFolder
fileNames = fileNames[0][2]
allImageDfs = []

# get a fresh copy of peeps
allPeeps = deepcopy(peepMaster)

# so now we can loop through these files and extract images from each
# for each file in the raw footage folder...
for fileName in fileNames:

    # get each image
    # inspired by: https://stackoverflow.com/questions/33311153/python-extracting-and-saving-video-frames
    thisPath = rawFootageFolder + fileName
    thisVid = cv2.VideoCapture(thisPath)
    count = 0
    savedImgCount = 0
    skipFrames = longSkip

    trueFrames = 0

    while True:
        secondsBetFrames = skipFrames/25
        thisVid.set(cv2.CAP_PROP_POS_MSEC,(count*skipFrames))

        # get each image
        success, image = thisVid.read()

        # if fail or at end of video, end loop
        if not success:
            break

        personName = ""    
        try: # this function sometimes errors out, move to next iteration of loop
            personName = getAbbrevNameFromImage(image)            
        except:
            print("Exception thrown from getAbbrevNameFromImage() at frame {}".format(trueFrames))
            # make sure to increment the count, so we keep moving forward
            count += 1
            trueFrames = trueFrames + skipFrames + 1
            continue # skip to next iteration

        # create an image in the folder which will tell us where we're at in the video
        if showPreview:
            cv2.imwrite(os.path.join(streamingImageFileLocation), image)     # save current frame as JPEG file (which appears to stream the video during processing)

        # empty person name means we're on a blank image. Set to extra long skip and move to next iteration
        if(personName == ""):            
            skipFrames = blankSkip
            count += 1
            trueFrames = trueFrames + skipFrames + 1
            continue
        else: # set to long skip - this may be changed to short skip if we are showing the image
            skipFrames = longSkip
        
        
        # if the captureed image is one that we want to keep, store it
        if(showPeep(personName)):
            # set to short skip because we're onto someone we want to track
            skipFrames = shortSkip
            
            # crop the image to remove the name from the bottom of it
            # trying without this because it seems to cut off some chins 
            #image = image[0:329, 100:640]   # commented out 20220302

            # get a name for the image we're saving
            imgName = newImageName(fileName, personName, count)

            # write the image to a file
            cv2.imwrite(os.path.join(imageFolder,imgName), image)     # save frame as JPEG file

            # print the name of the file
            #print("{} saved.".format(imgName))
            
            # increment count
            savedImgCount += 1
                            
        trueFrames = trueFrames + skipFrames + 1
        
        count += 1

    # release the video
    thisVid.release()

    # report about video processing
    print("{} processed at {}".format(fileName, datetime.today()))
    print("{} images extacted from {} into {}. Frames processed={} with minimum of {} seconds between frames.".format(savedImgCount, fileName,imageFolder, trueFrames, shortSkip))
    
    # report counts of images stored from this video by person
    print("Images stored per person:")

    # below may be a little buggy
    df = pd.DataFrame(allPeeps)
    df.columns = ["Name", "MaxCols", "ImagesFound"]
    
    # hold onto all dfs that report images found
    allImageDfs.append(df)

    # print results from all files
    for d in allImageDfs:
        print(d.loc[d["ImagesFound"] > 0])

profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
jiebin
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profss
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess
profess