In [2]:
from cv2 import cv2
import numpy as np
from matplotlib import pyplot as plt
import statistics
import mahotas       # Otsu thresholding
import bisect        # Key insert point
import imutils       # Crop and resize images
import pandas as pd

from __future__ import unicode_literals
import youtube_dl
import os            # Folder paths
import sys           # Exit function
import glob          # Folder searching

In [3]:
# Small record type for one detected note
class Note:
    """Pairs a note's horizontal centroid with a vertical reference coordinate.

    Attributes:
        centroid_x: x coordinate of the note's centroid.
        y_dot: y coordinate stored alongside it (callers in this notebook pass
            the centroid's y plus half the component height).
    """

    def __init__(self, centroid_x, y_dot):
        # Both values are stored exactly as received; no validation is done.
        self.centroid_x, self.y_dot = centroid_x, y_dot

# Helper for showing an image in an OpenCV window
def display_img(title, img):
    """Show `img` in a window named `title` and wait for a key press.

    Args:
        title: Name of the OpenCV window.
        img: Image array handed to `cv2.imshow`.

    The call blocks in `cv2.waitKey(0)` until a key is pressed, then closes
    every OpenCV window (not only the one opened here).
    """
    cv2.imshow(title, img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
      
# Return the name of the next key of the same colour (black -> black, white -> white)
def getNextNote(first_note):
    """Return the note that follows `first_note` among keys of the same colour.

    Args:
        first_note: Note name such as "A0" (white key) or "A#0" (black key).
            The octave is read as a single character after the note letters.

    Returns:
        The next note name as a string, or None when the letter part is not
        one of the recognised names.

    The octave number is incremented when stepping from G to A (or G# to A#),
    i.e. octaves are counted from A here, not from C.
    """
    if "#" in first_note:
        # Black keys: two-character name followed by the octave digit.
        name, octave = first_note[:2], first_note[2]
        successors = {"A#": "C#", "C#": "D#", "D#": "F#", "F#": "G#", "G#": "A#"}
        rollover = "G#"
    else:
        # White keys: one-character name followed by the octave digit.
        name, octave = first_note[0], first_note[1]
        successors = {"A": "B", "B": "C", "C": "D", "D": "E", "E": "F", "F": "G", "G": "A"}
        rollover = "G"

    following = successors.get(name)
    if following is None:
        return None

    if name == rollover:
        # Stepping past G / G# moves into the next octave.
        octave = str(int(octave) + 1)
    return following + octave

# Gaussian blur with a square kernel
def gaussianBlurring(gray_img, blur_sq, std_dev = 0):
    """Blur `gray_img` with a `blur_sq` x `blur_sq` Gaussian kernel.

    Args:
        gray_img: Image passed straight to `cv2.GaussianBlur`.
        blur_sq: Side length used for both kernel dimensions.
        std_dev: Passed as the third positional argument (sigmaX) of
            `cv2.GaussianBlur`; defaults to 0.

    Returns:
        Whatever `cv2.GaussianBlur` returns (the blurred image).
    """
    kernel_size = (blur_sq, blur_sq)
    return cv2.GaussianBlur(gray_img, kernel_size, std_dev)

# Function for Canny Edge Detection
def cannyDetection(blurred_img, th1, th2, apertureSize = 3):
    """Run Canny edge detection on `blurred_img`.

    Args:
        blurred_img: Input image handed to `cv2.Canny`.
        th1: First hysteresis threshold.
        th2: Second hysteresis threshold.
        apertureSize: Sobel aperture size forwarded to `cv2.Canny`.

    Returns:
        The edge map returned by `cv2.Canny`.
    """
    # apertureSize must go by keyword: the fourth positional parameter of the
    # Python binding is the optional `edges` output array, so passing it
    # positionally would leave the aperture at its default.
    return cv2.Canny(blurred_img, th1, th2, apertureSize=apertureSize)

# Function to return the top and bottom of the keyboard using HoughLines
def keyboardYCoords(edged_img, rho = 1, theta = np.pi/180, threshold = None):
    """Return up to two y values derived from the Hough lines of `edged_img`.

    Args:
        edged_img: Edge image passed to `cv2.HoughLines`.
        rho: Distance resolution of the accumulator.
        theta: Angle resolution of the accumulator, in radians.
        threshold: Accumulator vote threshold; defaults to half the image
            width (`edged_img.shape[1] // 2`).

    Returns:
        A list with the two largest `rho * sin(theta)` values over all
        detected lines, largest first. The list is shorter (possibly empty)
        when fewer than two lines are detected.
    """
    if threshold is None:
        threshold = edged_img.shape[1]//2 # half the width

    lines = cv2.HoughLines(edged_img, rho, theta, threshold)

    # HoughLines yields None (not an empty array) when nothing is detected.
    if lines is None:
        return []

    # Each entry is [[rho, theta]]; rho * sin(theta) is the y component of
    # the point on the line closest to the origin.
    y_cord = []
    for line in lines:
        rho_l, theta_l = line[0]
        y_cord.append(np.sin(theta_l) * rho_l)

    y_cord.sort(reverse=True)
    return y_cord[0:2]

# Fixed-level thresholding wrapper
def theshold(gray_img, th1 = 90, th2 = 150, thresh_type = cv2.THRESH_BINARY_INV):
    """Threshold `gray_img` with `cv2.threshold` and return only the image.

    Args:
        gray_img: Image to threshold.
        th1: Passed as the threshold level.
        th2: Passed in the `maxval` position of `cv2.threshold`, i.e. it is
            the value written to selected pixels, not a second threshold.
        thresh_type: OpenCV thresholding mode; inverse binary by default.

    Returns:
        The thresholded image (second element of `cv2.threshold`'s result).
    """
    return cv2.threshold(gray_img, th1, th2, thresh_type)[1]


        
# Function for doing connected components
def connectedComponents(binarized_img, img, display_result):
    """Find connected components and keep the ones large enough to be keys.

    Args:
        binarized_img: Binary image passed to
            `cv2.connectedComponentsWithStats`.
        img: Image that detections are drawn on (a copy is used) when
            `display_result` is truthy.
        display_result: When truthy, each accepted component is drawn and
            shown in an "Output" window, waiting for a key press every time.

    Returns:
        (final_labels, key_width) where `final_labels` is a list of
        `[component_index, centroid_x]` for every non-background component
        whose area exceeds 100 pixels, and `key_width` is the median
        bounding-box width of those accepted components (0 if there are none).
    """
    # Keyword arguments are required here: in the Python binding the second
    # and third positional parameters are the `labels` and `stats` output
    # arrays, so positional connectivity/ltype values never reach the
    # algorithm. 8 is the library default, i.e. the connectivity that was
    # effectively in use with the previous positional call.
    num_labels, labels, stats, centroids = cv2.connectedComponentsWithStats(
        binarized_img, connectivity=8, ltype=cv2.CV_32S)

    final_labels = []
    accepted_widths = []

    # Only pay for the copy when something will actually be drawn.
    canvas = img.copy() if display_result else None

    # Label 0 is the background, so start at 1.
    for i in range(1, num_labels):
        area = stats[i, cv2.CC_STAT_AREA]
        if area <= 100:
            continue  # too small to be a key

        (cX, cY) = centroids[i]
        w = stats[i, cv2.CC_STAT_WIDTH]
        final_labels.append([i, cX])
        accepted_widths.append(w)

        if display_result:
            x = stats[i, cv2.CC_STAT_LEFT]
            y = stats[i, cv2.CC_STAT_TOP]
            h = stats[i, cv2.CC_STAT_HEIGHT]
            cv2.rectangle(canvas, (x,y), (x+w, y+h), (255,0,0),1)
            cv2.circle(canvas, (int(cX), int(cY)), 4, (255,255,0), -1)
            cv2.imshow("Output", canvas)
            cv2.waitKey(0)

    # Median over accepted components only: including the background row and
    # rejected noise blobs would skew the estimated key width.
    key_width = statistics.median(accepted_widths) if accepted_widths else 0

    if display_result:
        cv2.destroyAllWindows()
    return final_labels, key_width


def displayCentroid(key_list, img):
    """Draw every key's centroid line and note label on `img`, one at a time.

    Args:
        key_list: Iterable of `(note, centroid)` pairs; `centroid` is the x
            coordinate and `note` the text to draw.
        img: Image that is drawn on in place and shown in the "Key Label"
            window.

    After each key is drawn the window is refreshed and the function waits
    for a key press. All OpenCV windows are closed at the end.
    """
    label_y = img.shape[0]*3//4          # labels sit three quarters down the image
    font = cv2.FONT_HERSHEY_SIMPLEX
    for note, centroid in key_list:
        column = int(centroid)
        # NOTE(review): the line end is fixed at y=900 regardless of image height.
        cv2.line(img, (column, 0), (column, 900), (0,0,255), 1)
        cv2.putText(img, note, (column, label_y), font, 0.5, (0,255,0), 1)
        cv2.imshow("Key Label", img)
        cv2.waitKey(0)
    cv2.destroyAllWindows()

def key_pressed(key_list, key_index):
    """Map an x coordinate to the key whose range contains it.

    Args:
        key_list: 2-D array whose column 0 holds note names and column 1 the
            upper bound of each key's range (convertible to float).
        key_index: x coordinate to look up.

    Returns:
        (note, index): the note name from column 0 and its row index. A
        coordinate beyond every bound maps to the last row.
    """
    upper_bounds = key_list[:,1].astype(float)
    last_row = len(key_list) - 1

    # bisect_left gives the first row whose bound is >= key_index; clamp it so
    # values past the final bound still resolve to the last key.
    index = min(bisect.bisect_left(upper_bounds, key_index), last_row)

    return key_list[index,0], index




# # For downloading YouTube videos
# def my_hook(d):
#     if d['status'] == 'finished':
#         print('Download complete.')
#     elif d['status'] == 'error':
#         print('Error in downloading file - exiting program!')
#         sys.exit()
        
        
# # Function to download a YouTube video
# def downloadYouTube(videourl, path = './videos/video_to_process.%(ext)s', quiet = True):
    
#     ydl_opts = {'outtmpl': path,
#                'quiet': quiet,
#                'progress_hooks': [my_hook]}
#     try:
#         with youtube_dl.YoutubeDL(ydl_opts) as ydl:
#             ydl.download([videourl])
#     except youtube_dl.utils.DownloadError:
#         print('Exiting program!')

- Ask user for YouTube link
- Download (?) video (or at least a temporary copy)
- Using the first few frames, detect the keyboard
    - If 88 keys (36 black, 52 white) are not detected, try next few frames and repeat
    - If they can't be detected at all, end program
- Label black and white keys
- Run algorithm, get array with start/end times for each note
- Convert to .midi
- Convert to sheet music

Thoughts on a 'Piano' class - initialized with an image.
Methods:
-Gray keyboard
-Blur keyboard
-Canny Keyboard, etc. etc.

In [27]:
class Keyboard: 
    """Finds the piano keyboard in a downloaded video and maps x ranges to notes.

    Typical use: `downloadYouTube()` -> `detect_keys(...)` -> `key_ranges()`,
    then read the results through `getKeys()`, `getFrame()` and
    `getFullKeyList()`.

    The methods rely on the module-level helpers `gaussianBlurring`,
    `cannyDetection`, `keyboardYCoords`, `theshold`, `connectedComponents`
    and `getNextNote`.
    """

    def __init__(self, youtubeURL):
        # URL handed to youtube_dl by downloadYouTube()
        self.youtubeURL = youtubeURL
        
    # For downloading YouTube videos
    def my_hook(self, d):
        """youtube_dl progress hook: report completion, exit on error status."""
        if d['status'] == 'finished':
            print('Download complete.')
        elif d['status'] == 'error':
            print('Error in downloading file - exiting program!')
            sys.exit()
            
    # Remove previously downloaded file
    def clear_previous(self, path = "./videos/video_to_process*"):
        """Delete the first file matching `path`, if any.

        Only the first glob match is removed.
        """
        vtp = glob.glob(path)
        if vtp:
            # Open and immediately release a capture on the file before
            # deleting it.
            camera = cv2.VideoCapture(vtp[0])
            camera.release()
            os.remove(vtp[0])

    # Function to download a YouTube video
    def downloadYouTube(self, path = './videos/video_to_process.%(ext)s', quiet = True):
        """Download `self.youtubeURL` to `path` after clearing any earlier file.

        Args:
            path: youtube_dl output template.
            quiet: Forwarded as youtube_dl's 'quiet' option.
        """
        self.clear_previous()
    
        ydl_opts = {'outtmpl': path,
                   'quiet': quiet,
                   'progress_hooks': [self.my_hook]}
        try:
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                print('Downloading video...')
                ydl.download([self.youtubeURL])
        except youtube_dl.utils.DownloadError:
            # NOTE(review): the message announces an exit, but execution
            # continues after this handler.
            print('Exiting program!')

    @staticmethod
    def _label_keys(components, first_note):
        """Order components left to right, then name them starting at `first_note`.

        Each component is a `[label, centroid_x]` list; slot 0 is overwritten
        with the note name. Sorting happens before naming so that the names
        follow the physical key order.
        """
        ordered = sorted(components, key=lambda comp: comp[1])
        note = first_note
        for comp in ordered:
            comp[0] = note
            note = getNextNote(note)
        return ordered
            
    # Use the file called video_to_process and detect our 88 keys
    def detect_keys(self, resize_width, bl_blur_sq, bl_canny_th1, bl_canny_th2, bl_thresh1, bl_thresh2,
                wh_blur_sq, path = "./videos/video_to_process*",num_bl_keys = 36, num_wh_keys = 52):
        """Scan the video frame by frame until all keys are found in one frame.

        Args:
            resize_width: Width every frame is resized to before processing.
            bl_blur_sq: Gaussian kernel size used before edge detection.
            bl_canny_th1, bl_canny_th2: Canny thresholds for locating the keyboard.
            bl_thresh1, bl_thresh2: Arguments to `theshold` for the black keys.
            wh_blur_sq: Gaussian kernel size used before Otsu thresholding
                for the white keys.
            path: Glob pattern of the video file; the first match is used.
            num_bl_keys: Number of black keys that must be detected.
            num_wh_keys: Number of white keys that must be detected.

        On success sets `black_keys`, `white_keys`, `black_width`,
        `white_width` and `keyboard_img`. Calls `sys.exit()` when the file is
        missing or unreadable, or when no frame yields the expected key counts.
        """
        vtp = glob.glob(path)
        if not vtp:
            # Glob did not find a valid file
            print('Error - video path could not be accessed!')
            sys.exit()

        camera = cv2.VideoCapture(vtp[0])
        if not camera.isOpened():
            # Could not access the file
            camera.release()
            print('Error - video file could not be read!')
            sys.exit()

        print('Detecting keys...')
        detected = False
        try:
            while True:
                # grabbed is a boolean that tells us if there is a valid frame
                (grabbed, frame) = camera.read()
                if not grabbed:
                    break

                frame = imutils.resize(frame, width = resize_width)

                # Get the bottom-half of the frame (where the keyboard lies) and process from here
                keys = frame[frame.shape[0]//2:,:]
                gray_keys = cv2.cvtColor(keys, cv2.COLOR_BGR2GRAY)

                # Process
                blurred = gaussianBlurring(gray_keys, blur_sq = bl_blur_sq)
                edges = cannyDetection(blurred, th1 = bl_canny_th1, th2 = bl_canny_th2)

                # Crop; a frame without two usable lines cannot be cropped, so try the next one
                crop_coordinates = keyboardYCoords(edges)
                if len(crop_coordinates) < 2:
                    continue
                top = int(crop_coordinates[1]) + 20
                bottom = int(crop_coordinates[0])
                cropped_keys      = keys[top:bottom]
                cropped_gray_keys = gray_keys[top:bottom]
                if cropped_gray_keys.size == 0:
                    continue

                # Black keys
                thresh_keys = theshold(cropped_gray_keys, th1 = bl_thresh1, th2 = bl_thresh2)
                final_labels_bl, key_width_bl = connectedComponents(thresh_keys, cropped_keys, False)

                # White keys
                blurred_w = gaussianBlurring(cropped_gray_keys, blur_sq = wh_blur_sq)
                T = mahotas.thresholding.otsu(blurred_w)*1.3
                thresh_keys_w = cropped_gray_keys.copy()
                thresh_keys_w[thresh_keys_w>T] = 255
                thresh_keys_w[thresh_keys_w<T] = 0
                final_labels_w, key_width_w = connectedComponents(thresh_keys_w, cropped_keys, False)

                # Determine if they sum to 88 keys (36 black, 52 white) - if not, try next frame
                if len(final_labels_bl) != num_bl_keys or len(final_labels_w) != num_wh_keys:
                    continue

                self.black_keys = self._label_keys(final_labels_bl, "A#0")
                self.white_keys = self._label_keys(final_labels_w, "A0")
                self.black_width = key_width_bl
                self.white_width = key_width_w
                self.keyboard_img = frame
                detected = True
                print('Key detection complete.')
                break
        finally:
            # Always release the capture and close any cv2 windows
            camera.release()
            cv2.destroyAllWindows()

        # Every frame was examined and none produced the expected key counts
        if not detected:
            print('Error in processing file - exiting program!')
            sys.exit()
            
    # Assign end-of-range for each key
    def key_ranges(self):
        """Build `self.keyboard_array`: one `[note, end_of_range_x]` row per key.

        Rows are ordered by centroid. The end of each key's range is:
          * black key: its centroid + black_width / 2
          * white key followed by a black key: that black key's centroid
            - black_width / 2
          * white key followed by a white key: midpoint of the two centroids
          * last key: infinity

        Black keys are recognised by the "#" in their note name.
        """
        ordered = sorted(self.black_keys + self.white_keys, key=lambda entry: float(entry[1]))
        key_array = np.empty([len(ordered), 2], dtype='object')
        half_black = self.black_width/2
        last = len(ordered) - 1

        for i, (name, centroid) in enumerate(ordered):
            key_array[i,0] = name
            if i == last:
                # For the last key, just take it to infinity
                key_array[i,1] = np.inf
            elif "#" in name:
                key_array[i,1] = float(centroid) + half_black
            elif "#" in ordered[i+1][0]:
                key_array[i,1] = float(ordered[i+1][1]) - half_black
            else:
                key_array[i,1] = (float(centroid) + float(ordered[i+1][1]))/2

        self.keyboard_array = key_array
            
    
    # Return the centroid of the black/white keys alongside the median width of each
    def getKeys(self):
        """Return `[black_keys, white_keys, black_width, white_width]`."""
        return [self.black_keys, self.white_keys, self.black_width, self.white_width]
    
    # Return the frame that 88 keys were successfully identified from
    def getFrame(self):
        """Return the resized frame stored by `detect_keys`."""
        return self.keyboard_img
    
    def getFullKeyList(self):
        """Return the array built by `key_ranges`."""
        return self.keyboard_array

In [30]:
#Download a YouTube video and process it to determine where the centroid of the black and white keys are
# NOTE(review): only the constructor below is live. The download, detection and
# accessor calls are commented out, so `black`, `white`, `key_img` and
# `keyboard_array` are not defined by this cell on a fresh kernel.
keyboard = Keyboard('https://www.youtube.com/watch?v=skFugVOqBM4')
# keyboard.downloadYouTube()
# keyboard.detect_keys(resize_width = 600, bl_blur_sq = 5, bl_canny_th1 = 200, bl_canny_th2 = 200, 
#                     bl_thresh1 = 90, bl_thresh2 = 150, wh_blur_sq = 7)
# keyboard.key_ranges()

# [black, white, black_width, white_width] = keyboard.getKeys()
# key_img = keyboard.getFrame()
# keyboard_array = keyboard.getFullKeyList()

Detecting keys...
Key detection complete.


In [32]:
# NOTE(review): `keyboard_array` is only assigned in a commented-out line of an
# earlier cell, so this raises NameError after Restart & Run All.
keyboard_array
#Find the appropriate index for any given centroid via:
# note, index = key_pressed(full_key_list, 999)
# print(index)

87


In [None]:
# Process and output...

In [11]:
# Need to delete the processed video
# keyboard.clear_previous()

# IGNORE BELOW THIS LINE ##########################
___________________________________________________________________________________________________________________________

## 1. Load image in

Always start at the first frame - before ANY keys are clicked. This is because if a note is coloured on the keys themselves, our thresholding set-up doesn't work.

**There is an issue when people have intros...**

In [None]:
# #reading in image
# keys = cv2.imread("images\synthesia.png")
# keys = keys[keys.shape[0]//2:,:]

# #grayscale
# gray_keys = cv2.cvtColor(keys, cv2.COLOR_BGR2GRAY)

# ################################
# ################# edge detection
# std_dev = 0
# k = 5
# t1 = 200
# t2 = 200
# blurred = cv2.GaussianBlur(gray_keys, (k,k), std_dev)
# edges = cv2.Canny(blurred, t1,t2, apertureSize = 3)

# ################################
# ################# hough transform 
# lines = cv2.HoughLines(edges, 1, np.pi/180, 300) 
# y_cord = [] #the y-value of the lines generated from hough transform

# #iterating through lines
# for line in lines: 
#     rho, theta = line[0]
#     a = np.cos(theta)
#     b = np.sin(theta)
#     x0 = a * rho
#     y0 = b * rho
#     y_cord.append(y0) #appending to list

# print(y_cord)

# #Only want to two lines from hough transform that crops out image of piano
# if (len(y_cord) > 2):
#     y_cord.sort(reverse=True)
#     y_cord.pop()
    
    

# #crop gray
# cropped_keys      = keys[int(y_cord[1])+20:int(y_cord[0])]
# cropped_gray_keys = gray_keys[int(y_cord[1])+20:int(y_cord[0])]

# ################################
# ################# thresholding
# _, th1 = cv2.threshold(cropped_gray_keys, 90, 150, cv2.THRESH_BINARY_INV)

# plt.imshow(edges)
# # plt.axis('off')
# plt.show()

In [None]:
# plt.figure(figsize=(10,10))
# plt.imshow(th1, cmap = "gray")
# plt.show()

# 2. Label Black Keys
We don't need to do distance math anymore. If we detect 36 black keys, we know that the first black key is A#.

*final_labels* holds the 36 integer indices that identify which of the returned centroids are black keys. 

For example, if the centroids output contains 39 possible connected components, *final_labels* is the list of the 36 indices that we consider to be black keys.

## 1.1 Get the average width of a black key
### Also get all black key info (centroid etc)
This will be used to determine the discrete range of each key.

In [None]:
# ################################
# ################# Connected Components (for black keys)
# #SWITCH TRUE TO FALSE IF YOU DONT WANT TO SHOW OUTPUT
# final_labels_bl, key_width_bl = connectedComponents(th1, cropped_keys, False)

In [None]:
# ################################
# ################# Labelling (for black keys)
# if len(final_labels_bl) == 36: 
#     first_note = "A#0"
#     for i in range(36):
#         final_labels_bl[i][0] = first_note
#         first_note = getNextNote(first_note)

# final_labels_bl = sorted(final_labels_bl, key=lambda x: x[1])

# #just displaying to test
# # displayCentroid(final_labels_bl, keys)

# 2. Label White Keys
Testing white keys

In [None]:
# ################################
# ################# Connected Components (for white keys)
# k = 7
# blurred = cv2.GaussianBlur(cropped_gray_keys, (k,k), 0)
# T = mahotas.thresholding.otsu(blurred)*1.3
# th2 = cropped_gray_keys.copy()
# th2[th2>T] = 255
# th2[th2<T] = 0
# final_labels_w, key_width_w = connectedComponents(th2, cropped_keys, False)

In [None]:
# ################################
# ################# Labelling (for white keys)
# if len(final_labels_w) == 52: 
#     first_note = "A0"
#     for i in range(52):
#         final_labels_w[i][0] = first_note
#         first_note = getNextNote(first_note)

# final_labels_w = sorted(final_labels_w, key=lambda x: x[1])

# #just displaying to test
# # displayCentroid(final_labels_w, keys)

# 3. Assign Ranges
Order both the white and black keys together. 

For the range x:0 -> end, we assign a specific range to each key. For example, A: 0 - 10, A#: 10 - 15.

Our assumption is that the centroid of the key played will land in a discrete range with no overlap/ambiguity.

In [None]:
# ################################
# ################# Assign ranges
# full_key_list = final_labels_bl + final_labels_w
# full_key_list = sorted(full_key_list, key=lambda x: x[1].astype(float))
# # for ls in full_key_list: 
# #     ls = ls.reverse()
# full_key_list = np.array(full_key_list)
# # print(full_key_list)

The code below maps the actual range for each discrete key.

We know that black keys are skinnier than white keys, and we took the median width of the black keys above. For each black key, its range is ***centroid - black_key_width/2 < x < centroid + black_key_width/2***.

For white keys adjacent to black keys, the above axiom provides one of the bounds.

For white keys adjacent to white keys, we simply take the mid-way point between their centroids as one of the bounds.

In [None]:
# test_list = np.empty([len(full_key_list), 2], dtype='object')

# for i in range(0,len(full_key_list)-1):
#     if len(full_key_list[i,0])==1 and len(full_key_list[i+1,0])>1: # White adjacent to black
#         test_list[i,1] = full_key_list[i+1,1].astype(float) - key_width_bl/2
#     elif len(full_key_list[i,0])>1: # Black key
#         test_list[i,1] = full_key_list[i,1].astype(float) + key_width_bl/2
#     else: # White key adjacent to white key
#         test_list[i,1] = (full_key_list[i,1].astype(float)+full_key_list[i+1,1].astype(float))/2
    
#     # No change to the actual note (only the distances, above)
#     test_list[i,0] = full_key_list[i,0]

# #For the last key, just take it to infinity
# test_list[-1,1] = np.inf
# test_list[-1,0] = full_key_list[-1,0]

# full_key_list = test_list
# #print(full_key_list)

The function below **inserts** any given value between our established key ranges. It returns an index where the given value *would* be inserted, which gives us our corresponding key pressed.

In [None]:
# note, index = key_pressed(full_key_list, 999)
# # print(index)

# 4. Testing with Notes

In [None]:
# #sample piano image from  youtube
# img_notes = cv2.imread("images/IMG_very_close_notes.png")

# #converting to gray
# gray_notes = cv2.cvtColor(img_notes, cv2.COLOR_BGR2GRAY)
# img_notes_rgb = cv2.cvtColor(img_notes, cv2.COLOR_BGR2RGB)

# plt.imshow(img_notes_rgb)
# # plt.axis('off')
# plt.show()

*y_cord[1]* is the position of the top of the keyboard. We calculated this using **half** the original image, so we need to add this back in.

# I added it back paps =3

In [None]:
# #Cropped
# top_keys_index = y_cord[1] + img_notes.shape[0]//2
# crop_img_notes = img_notes[20:(int(top_keys_index)-30)] #Crop the top 20 pixels and bottom 30

# plt.imshow(crop_img_notes)

Identify notes.

In [None]:
# crop_img_notes_gray = cv2.cvtColor(crop_img_notes, cv2.COLOR_BGR2GRAY)
# # # k = 3
# # # blurred = cv2.GaussianBlur(crop_img_notes_gray, (k,k), 0)

# # #Using standard threshold to create contrast between white/black keys
# # _, th_notes = cv2.threshold(blurred, 90, 150, cv2.THRESH_BINARY)

# _, th_notes = cv2.threshold(crop_img_notes_gray, 90, 150, cv2.THRESH_BINARY)

# plt.figure(figsize=(10,10))
# plt.imshow(th_notes, cmap = "gray")
# plt.show()

In [None]:
# #####################################################################################
# ####### using connected component detection algorithm to separate all the black notes
# connectivity = 1
# output = cv2.connectedComponentsWithStats(th_notes, connectivity, cv2.CV_32S)
# num_labels = output[0]
# labels = output[1]
# stats = output[2]
# centroids = output[3]

# final_labels = []
# note_list = [] #creating a list of all the relavent notes. 


# output = img_notes_rgb.copy()
# font = cv2.FONT_HERSHEY_SIMPLEX

# #For loop only used for displaying 
# for i in range(1, num_labels):
#     x = stats[i, cv2.CC_STAT_LEFT]
#     y = stats[i, cv2.CC_STAT_TOP] + 20 # We cropped out the first 20 pixels
#     w = stats[i, cv2.CC_STAT_WIDTH]
#     h = stats[i, cv2.CC_STAT_HEIGHT]
#     area = stats[i, cv2.CC_STAT_AREA]
#     (cX, cY) = centroids[i]
#     cY = cY + 20 # We cropped out the first 20 pixels
#     if (20 < area < np.inf): #filtering out relavent detections (the ones big enough to be black keys)
#         final_labels.append(i)
#         cv2.rectangle(output, (x,y), (x+w, y+h), (255,0,0),1)
#         dist_to_edge = h/2 #getting the distance from centroid to bottom edge for better detection later on
#         cv2.circle(output, (int(cX), int(cY+dist_to_edge)), 1, (0,122,255), 3)
#         componentMask = (labels == i).astype("uint8") * 255
        
#         note = Note(cX, cY+dist_to_edge) #creating note object and adding to list
#         note_list.append(note)
        
#         note_played, _ = key_pressed(full_key_list, note.centroid_x)
#         cv2.putText(output, note_played, (int(note.centroid_x), int(note.y_dot)), font, 0.5, (0,255,0), 1)

# #         display_img("Output", output)
# #         display_img("Connected Component", componentMask)
# #         cv2.waitKey(0)

# # print(final_labels)
# # cv2.destroyAllWindows()
# plt.figure(figsize=(10,10))
# plt.imshow(output)
# plt.show()

In [None]:
# Open the trimmed video and set up the per-key timing containers.
# NOTE(review): this cell also needs `full_key_list` (and, further down,
# `keys`, `top_keys_index`, `key_width_w`, `y_cord`), which are only assigned
# in commented-out cells above.
camera = cv2.VideoCapture('videos/Nocturne Opus 9 No 2_trim.mp4')

frames = camera.get(cv2.CAP_PROP_FRAME_COUNT)
fps = camera.get(cv2.CAP_PROP_FPS)
# Duration of one frame in seconds. A capture that failed to open reports an
# fps of 0, in which case the loop below never runs and 0.0 is a safe value.
seconds_per_frame = 1/fps if fps else 0.0
counter = 0

notes_pressed = []

# One row per key, starting with just the note name; timestamps get appended later.
keys_timed = [[key[0]] for key in full_key_list]
keys_timed_update = [[key[0]] for key in full_key_list]

testing_screen = []
testing_mask = []

while (camera.isOpened()):
    #print(counter)
    #print('-'*100)
    
    #grabbed is a boolean than tells us if there is a valid frame
    (grabbed, frame) = camera.read()

    frame_number = camera.get(cv2.CAP_PROP_POS_FRAMES)
    elapsed = frame_number/fps

    
    if not grabbed:
        break

    counter += seconds_per_frame
        
         
    frame = imutils.resize(frame,width = keys.shape[1]) #resize or else it won't work
    #print(frame.shape)
    
    crop_frame = frame[20:int(top_keys_index)-50] #Crop the top 20 pixels and bottom 50
    
    # threshold the cropped and grayed image
    crop_frame_gray = cv2.cvtColor(crop_frame, cv2.COLOR_BGR2GRAY)
    _, th_crop_frame = cv2.threshold(crop_frame_gray, 90, 150, cv2.THRESH_BINARY)

    
    #We have a thresholded image for use to use ConnectedComponents on
    #####################################################################################
    ####### using connected component detection algorithm to separate all the black notes
    connectivity = 8
    output = cv2.connectedComponentsWithStats(th_crop_frame, connectivity, cv2.CV_32S)
    num_labels = output[0]
    labels = output[1]
    stats = output[2]
    centroids = output[3]

    indices_to_pop = []


    output_img = frame.copy()
    font = cv2.FONT_HERSHEY_SIMPLEX

    
#     print("-"*50)
#     print("new frame")
    i=1
    #Loop through all the connected components
    while i < len(stats):

        curr_connected_w = stats[i, cv2.CC_STAT_WIDTH]

        #Determine if the WIDTH is much bigger than the width of a white key and less than 4x? (So we don't get extraneous video text, etc.)
        if curr_connected_w > key_width_w*1.25 and curr_connected_w < key_width_w*4:
            

            #Threshold just the large component of interest
            componentMask = (labels == i).astype("uint8") * 255
            threshMask = cv2.bitwise_and(crop_frame_gray, crop_frame_gray, mask = componentMask) #Replace this with video frame

#             print("We are here.")
#             testing_screen.append(frame.copy())
#             testing_mask.append(threshMask.copy())
            
            # Histogram segregation of black/white key
            # Grayscale has one channel so we use [0]
                #Possible values range from 0 to 256
            bin_scaler = 4
            hist = cv2.calcHist([threshMask], [0], None, [256/bin_scaler], [1, 256])


            #Use a Histogram to compute the dominant non-black (i.e. not the background) colour. Use ~90% of this to threshold the image.
            T = hist.argmax() * bin_scaler * .9
            th1 = threshMask.copy()
            th1[th1>T] = 255
            th1[th1<T] = 0

            #Detect the first set of keys
            connectivity = 8
            output = cv2.connectedComponentsWithStats(th1, connectivity, cv2.CV_32S)
            num_labels_th1 = output[0]
            labels_th1 = output[1]
            stats_th1 = output[2]
            centroids_th1 = output[3]

            #Loop through components and determine which ones may be keys
            for j in range(1, num_labels_th1):
                area = stats_th1[j, cv2.CC_STAT_AREA]
                if (20 < area < np.inf): #filtering out relavent detections (the ones big enough to be keys)

                    if j > 1:
                        ## We've added another label
                        num_labels +=1 
                        i +=1

                    ##Within labels_th1, we have a matrix that is the same size of the image that holds our split component
                    #First, cut out the original "fat" label
                    fat_mask = labels != i
                    labels = labels * fat_mask

                    #Next, increment each label above the cut one up to accomodate the new label
                    higher_mask = labels > i
                    labels = labels + higher_mask

                    #Then append our segregated key
                    new_mask = labels_th1 == j
                    new_labels = labels_th1 * new_mask
                    new_labels = i * new_labels
                    labels = labels + new_labels

                    ##Remove the original index for the stats and then add the new one
                    if i < len(stats):
                        stats = np.delete(stats,i,0)
                        stats = np.insert(stats,i,stats_th1[j],0)
                    elif j == 1:
                        stats = stats[:-1,:]
                        stats = np.concatenate((stats,stats_th1[j][None,:]),0)
                    else:
                        stats = np.concatenate((stats,stats_th1[j][None,:]),0)

                    ##Remove the original index for the centroids and then add the new one
                    if i < len(centroids):
                        centroids = np.delete(centroids,i,0)
                        centroids = np.insert(centroids,i,centroids_th1[j],0)
                    elif j==1:
                        centroids = centroids[:-1,:]
                        centroids = np.concatenate((centroids,centroids_th1[j][None,:]),0)  
                    else:
                        centroids = np.concatenate((centroids,centroids_th1[j][None,:]),0)  
                        

                        
                    print("i: {}, stats: {}".format(i,len(stats)))
                    #Plot immediately so indexing doesn't get messed up
                    x = stats[i, cv2.CC_STAT_LEFT]
                    y = stats[i, cv2.CC_STAT_TOP] + 20 # We cropped out the first 20 pixels
                    w = stats[i, cv2.CC_STAT_WIDTH]
                    h = stats[i, cv2.CC_STAT_HEIGHT]
                    area = stats[i, cv2.CC_STAT_AREA]
                    (cX, cY) = centroids[i]
                    cY = cY + 20 # We cropped out the first 20 pixels
                    cv2.rectangle(output_img, (x,y), (x+w, y+h), (255,0,0),1)
                    dist_to_edge = h/2 #getting the distance from centroid to bottom edge for better detection later on
                    top_dot = cY-dist_to_edge
                    bottom_dot = cY+dist_to_edge
                    cv2.circle(output_img, (int(cX), int(bottom_dot)), 1, (0,122,255), 3)
                    cv2.circle(output_img, (int(cX), int(top_dot)), 1, (0,122,255), 3)

                    note = Note(cX, cY+dist_to_edge) #creating note object and adding to list
            
            
                    if ( (int(bottom_dot) >= int(y_cord[0])) and (int(bottom_dot) <= int(y_cord[0])+2) ):
                        note_played, index = key_pressed(full_key_list, cX)
                        #print(str(note_played) + " at " + str(counter))
                        keys_timed_update[index].append([elapsed])

                    if ( (int(top_dot) >= int(y_cord[0])) and (int(top_dot) <= int(y_cord[0])+2) ):
                        note_played, index = key_pressed(full_key_list, cX)
                        keys_timed_update[index].append([elapsed])

                    note_played, _ = key_pressed(full_key_list, note.centroid_x)
                    cv2.putText(output_img, note_played, (int(note.centroid_x), int(note.y_dot)), font, 0.5, (0,255,0), 1)




                #Detect the next set of keys
                th2 = threshMask.copy()
                th2[th2>T] = 0        
                k = 3
                blurred_th2 = cv2.GaussianBlur(th2, (k,k), 0)

                #Using standard threshold to create contrast between white/black keys
                _, th2_notes = cv2.threshold(blurred_th2, 90, 150, cv2.THRESH_BINARY)

                #Detect the second set of keys
                connectivity = 8
                output = cv2.connectedComponentsWithStats(th2_notes, connectivity, cv2.CV_32S)
                num_labels_th2 = output[0]
                labels_th2 = output[1]
                stats_th2 = output[2]
                centroids_th2 = output[3]

                #Loop through components and determine which ones may be keys
                for k in range(1, num_labels_th2):
                    area = stats_th2[k, cv2.CC_STAT_AREA]
                    if (20 < area < np.inf): #filtering out relavent detections (the ones big enough to be keys)

                        if k > 1:
                            ## We've added another label
                            num_labels +=1 
                            i+=1

                        ##Within labels_th1, we have a matrix that is the same size of the image that holds our split component
                        #For the second key WE DON'T NEED TO CUT anything
        #                 fat_mask = labels != i
        #                 labels = labels * fat_mask

                        #Next, increment each label above the cut one up to accommodate the new label
                        higher_mask = labels > i + 1
                        labels = labels + higher_mask

                        #Then append our segregated key
                        new_mask = labels_th2 == k
                        new_labels = labels_th2 * new_mask
                        new_labels = (i + 1) * new_labels
                        labels = labels + new_labels

                        ##Add
                        if i < len(stats):
                            stats = np.insert(stats,(i+1),stats_th2[k],0)                       
                        else:
                            stats = np.concatenate((stats,stats_th2[k][None,:]),0)
                        

                        ##Add
                        if i < len(centroids):
                            centroids = np.insert(centroids,(i+1),centroids_th2[k],0)
                        else:
                            centroids = np.concatenate((centroids,centroids_th2[k][None,:]),0)

                        #Plot immediately so indexing doesn't get messed up
                        x = stats[i, cv2.CC_STAT_LEFT]
                        y = stats[i, cv2.CC_STAT_TOP] + 20 # We cropped out the first 20 pixels
                        w = stats[i, cv2.CC_STAT_WIDTH]
                        h = stats[i, cv2.CC_STAT_HEIGHT]
                        area = stats[i, cv2.CC_STAT_AREA]
                        (cX, cY) = centroids[i]
                        cY = cY + 20 # We cropped out the first 20 pixels
                        cv2.rectangle(output_img, (x,y), (x+w, y+h), (255,0,0),1)
                        dist_to_edge = h/2 #getting the distance from centroid to bottom edge for better detection later on
                        top_dot = cY-dist_to_edge
                        bottom_dot = cY+dist_to_edge
                        cv2.circle(output_img, (int(cX), int(bottom_dot)), 1, (0,122,255), 3)
                        cv2.circle(output_img, (int(cX), int(top_dot)), 1, (0,122,255), 3)

                        note = Note(cX, cY+dist_to_edge) #creating note object and adding to list
            
            
                        if ( (int(bottom_dot) >= int(y_cord[0])) and (int(bottom_dot) <= int(y_cord[0])+2) ):
                            note_played, index = key_pressed(full_key_list, cX)
                            #print(str(note_played) + " at " + str(counter))
                            keys_timed_update[index].append([elapsed])

                        if ( (int(top_dot) >= int(y_cord[0])) and (int(top_dot) <= int(y_cord[0])+2) ):
                            note_played, index = key_pressed(full_key_list, cX)
                            keys_timed_update[index].append([elapsed])

                        note_played, _ = key_pressed(full_key_list, note.centroid_x)
                        cv2.putText(output_img, note_played, (int(note.centroid_x), int(note.y_dot)), font, 0.5, (0,255,0), 1)

        else:
            x = stats[i, cv2.CC_STAT_LEFT]
            y = stats[i, cv2.CC_STAT_TOP] + 20 # We cropped out the first 20 pixels
            w = stats[i, cv2.CC_STAT_WIDTH]
            h = stats[i, cv2.CC_STAT_HEIGHT]
            area = stats[i, cv2.CC_STAT_AREA]
            (cX, cY) = centroids[i]
            cY = cY + 20 # We cropped out the first 20 pixels
            if (20 < area < np.inf): #filtering out relavent detections (the ones big enough to be black keys)
                cv2.rectangle(output_img, (x,y), (x+w, y+h), (255,0,0),1)
                dist_to_edge = h/2 #getting the distance from centroid to bottom edge for better detection later on
                top_dot = cY-dist_to_edge
                bottom_dot = cY+dist_to_edge
                cv2.circle(output_img, (int(cX), int(bottom_dot)), 1, (0,122,255), 3)
                cv2.circle(output_img, (int(cX), int(top_dot)), 1, (0,122,255), 3)

                note = Note(cX, cY+dist_to_edge) #creating note object and adding to list
            
                if ( (int(bottom_dot) >= int(y_cord[0])) and (int(bottom_dot) <= int(y_cord[0])+2) ):
                    note_played, index = key_pressed(full_key_list, cX)
                    #print(str(note_played) + " at " + str(counter))
                    keys_timed_update[index].append([elapsed])

                if ( (int(top_dot) >= int(y_cord[0])) and (int(top_dot) <= int(y_cord[0])+2) ):
                    note_played, index = key_pressed(full_key_list, cX)
                    keys_timed_update[index].append([elapsed])

                note_played, _ = key_pressed(full_key_list, note.centroid_x)
                cv2.putText(output_img, note_played, (int(note.centroid_x), int(note.y_dot)), font, 0.5, (0,255,0), 1)

        i+=1
    
    
    #Show the frame + drawn rectangle
    cv2.imshow("Video", output_img)

    #Can break early by pressing "q"
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
# print(keys_timed_update)    
# Frame loop has exited (normally or via the "q" key): release the `camera`
# object and close every OpenCV window that was opened with cv2.imshow.
camera.release()
cv2.destroyAllWindows()

In [None]:
# Flatten every per-key record into a single CSV row.
# Each record keeps its first element as-is (presumably the key's label --
# TODO confirm against where keys_timed_update is built); every later element
# is a one-item list such as [elapsed] appended by the detection loop, so only
# its first item is kept.
new_list = [
    [record[0]] + [stamp[0] for stamp in record[1:]]
    for record in keys_timed_update
]

# One row per key; rows can have different lengths depending on how many
# timestamps were recorded for that key.
df = pd.DataFrame(new_list)
df.to_csv('notes_info.csv', index=False, header=False)