# Script to annotate audio files

1. Write `True` after `practice_run` below to try it. 
2. Click the play button (or shift+Return)
3. You will be prompted to sign in to Google Drive; enter the email associated with the Drive account where you saved the folder.
4. After clicking on the play button, it will prompt you to enter your initials. Then the different audio files will be played in turn. Follow instructions to annotate each audio file.
5. Write `False` after `practice_run` to annotate and save your annotations.
6. After annotating, share the `output` folder with me.

Important:
- Set your computer's sound volume to the highest you're comfortable with (so you can hear background noise) and DO NOT change it during annotation, so that all audio samples are compared under similar conditions (e.g., use the same speaker/headset at the same volume throughout).
- **Save and quit:** You can save and quit at any time by typing "quit" (it will save your work in the `output` folder). When you press play again, it will automatically start where you left off; you only need to press the play button and enter the same initials you entered last time.
- **Edit response:** the idea is you don't review your answers. Do not open the output file after annotating. 
  - In the case of a typo, you can redo your responses for the audio file you are currently annotating: type quit, then click the play button again. It will replay the audio file you just quit so you can redo your annotations for that file (annotations for prior files will be saved).
  - If you want to edit the annotations for the prior audio file (e.g., you notice a big mistake/typo), let me know the file number (above the audio display) and what your edit is and I'll make the edit, but don't review your answers beyond the last one. 

In [None]:
# Mode switch read by the annotation cell below: True restricts the session to a
# small fixed subset of files and saves to a `practicerun_...` CSV; False runs
# the full annotation and saves to an `annotations_...` CSV.
practice_run = True #Write True (will run a few samples) or False (will start full annotation and you cannot edit again)

In [None]:
#@title

import os 
import sys
import random
import pandas as pd
import numpy as np
import random
import datetime
from IPython import display
from scipy.io import wavfile
from google.colab import drive
import ipywidgets as widgets

# On google colab
# Mount GDrive and attach it to the colab for data I/O.
# The input and output directories configured below both live under this
# mount point, so this must succeed before any file is listed or saved.
print('Accessing Google Drive...')
drive.mount('/content/drive')
print('Accessing Google Drive complete.\n')

# config
# ===================================================
# Folder (under the Drive `annotations/input/` directory) holding the audio
# clips, and the folder that receives the annotation CSVs.
data_dir = 'audios_speech_full'
input_dir = f'/content/drive/My Drive/annotations/input/{data_dir}/'
output_dir = '/content/drive/My Drive/annotations/output/'
# Only audio files whose name contains this phrase are annotated.
filter_phrase = 'Speech'

# Banner printed once before the first audio file is played.
instructions_general = "\n=============\nAt any point, enter quit as your response to save and quit\n=============\n\n"

# One (column name, prompt) pair per label, in the order the annotator is asked.
# 'noise' comes first so that it is rated before the annotator reasons about UVFP.
# NOTE(review): the second column is named 'vfp' although its prompt says 'uvfp';
# the column name is what ends up in the saved CSVs.
_instruction_pairs = [
    ('noise', 'noise: 0= NO background noise, 1 = SOME background noise, 2 = HIGH background noise, 99 = unsure'),
    ('vfp', 'uvfp: Healthy voice or UVFP [0 / 1]'),
    ('severity', 'severity: CAPE-V scale  (from 0 to 100)'),
    ('roughness', 'roughness: CAPE-V scale (from 0 to 100)'),
    ('breathiness', 'breathiness: CAPE-V scale (from 0 to 100)'),
    ('strain', 'strain: CAPE-V scale (from 0 to 100)'),
    ('pitch', 'pitch: CAPE-V scale (from 0 to 100)'),
    ('loudness (in person)', 'loudness (in person): CAPE-V scale (from 0 to 100)'),
    ('loudness (recording)', 'loudness (recording): 1: low, 2: medium, 3: high'),
    # ('comments', 'comments: annotation issue or characterize abnormal resonance or features (e.g., diplophonia, fry, falsetto, asthenia, aphonia, pitch instability, tremor, wet/gurgly)'),
]

# Column order of the output table (after the leading 'file' column).
instructions_col_names = [col for col, _ in _instruction_pairs]
# Column name -> prompt shown to the annotator.
instructions_labels = dict(_instruction_pairs)


# functions
# ===================================================

def save_annotation(annotation_current, annotation_past,output_dir, data_dir, annotator_name, instructions_col_names,practice_run=False):
  """Write all annotations (past + current session) to a new timestamped CSV.

  Args:
    annotation_current: list of rows from this session, each row being
      `[file_name, label_1, ..., label_n]` in `instructions_col_names` order.
    annotation_past: previously saved annotations; a DataFrame or anything
      `pd.DataFrame` accepts (an empty list when there are none).
    output_dir: directory the CSV is written to (with or without a trailing
      path separator).
    data_dir: unused; kept so existing callers keep working.
    annotator_name: annotator initials; lower-cased, with spaces and periods
      replaced by '-' before being put in the file name.
    instructions_col_names: label column names (the 'file' column is prepended).
    practice_run: when True the file is named `practicerun_...` instead of
      `annotations_...`.

  Returns:
    None. The path of the written file is printed.
  """
  annotation_df = pd.DataFrame(annotation_current, columns=['file'] + list(instructions_col_names))
  annotation_past = pd.DataFrame(annotation_past)
  if not annotation_past.empty:
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    annotation_df = pd.concat([annotation_past, annotation_df]).reset_index(drop=True)
  # UTC timestamp makes every save a new file; utcnow() is deprecated, so use an aware datetime.
  ts = datetime.datetime.now(datetime.timezone.utc).strftime('%y-%m-%dT%H-%M-%S')
  annotator_name = annotator_name.lower().replace(' ', '-').replace('.', '-')
  prefix = 'practicerun' if practice_run else 'annotations'
  # os.path.join tolerates an output_dir without a trailing separator.
  output_path = os.path.join(output_dir, f'{prefix}_{annotator_name}_{ts}.csv')
  annotation_df.to_csv(output_path)
  # Report the file that was actually written (practice runs used to be reported as 'annotations_...').
  print('SAVED THE FOLLOWING FILE:', output_path)
  return



def text_to_bits(text):
    """
    Convert a string into the list of bits of its encoded bytes.

    The encoded text is read as one big-endian integer and its binary
    digits are left-padded with zeros to a whole number of bytes. The
    caller uses the result to derive a reproducible seed from initials.
    >>> text_to_bits("Hi")
    [0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1]
    """
    as_integer = int.from_bytes(text.encode(), 'big')
    digits = format(as_integer, 'b')
    # Round the digit count up to the next multiple of 8.
    padded_width = -(-len(digits) // 8) * 8
    return [int(digit) for digit in digits.rjust(padded_width, '0')]

# Main script
# ====================================================================
# Flow: list the audio files, shuffle them with a seed derived from the
# annotator's initials, drop files already annotated in earlier sessions,
# then play each remaining file and prompt for every label.

# load all audio file paths
annotation_past = []
# os.listdir returns entries in arbitrary order, so sort before shuffling;
# otherwise the seeded shuffle is not guaranteed to reproduce the same order
# for the same initials from one session to the next.
files = sorted(n for n in os.listdir(input_dir) if filter_phrase in n) #filter certain files
annotator_name = input('\nYour initials in lower case and no period (e.g., mine would be "dml"). Make sure to always use the same one: ')
annotator_name = annotator_name.lower().replace('"', '')
SEED = int(''.join(str(n) for n in text_to_bits(annotator_name))) #unique seed associated to initials, shuffled by initials to bytes
random.Random(SEED).shuffle(files)
n_files_total = len(files) # total before completed files are removed, for the progress message

# reload saved annotations and remove completed files from files to be annotated
# save_annotation() replaces spaces and periods in the initials with '-' when it
# builds the file name, so apply the same normalisation here; otherwise initials
# such as "d.m.l" would never match their own saved files.
annotator_file_tag = annotator_name.replace(' ', '-').replace('.', '-')
annotation_and_initials = f'annotations_{annotator_file_tag}_'
annotation_files = [n for n in os.listdir(output_dir) if annotation_and_initials in n and '.csv' in n]
files_completed = []
if annotation_files:
  # every save writes a full snapshot, so the same rows appear in several files
  annotation_past = [pd.read_csv(output_dir+file_i, index_col = 0) for file_i in annotation_files]
  annotation_past = pd.concat(annotation_past).drop_duplicates().reset_index(drop = True)
  files_completed = annotation_past['file'].values
  completed_lookup = set(files_completed)
  files = [n for n in files if n not in completed_lookup] #remove completed files

print(f'\nfiles_completed: {len(files_completed)} of {n_files_total}')

if practice_run:
  files_subset = ['VFP18_','VFPNorm61_','VFP40_','VFPNorm2_','VFPNorm3_',]#will be played in order of files which is shuffled by initials to bytes
  files = [n for n in files if n.startswith(tuple(files_subset))]
  print(f'\nRunning practice run of {len(files)}...')

# begin
print(instructions_general)
annotation_current = []
for i, file_i in enumerate(files):
  # for each audio file
  annotation_i = [] #labels for single audio file
  samplerate, data = wavfile.read(input_dir+file_i)
  print(f'======= file #{i+len(files_completed)}, {np.round(data.shape[0]/samplerate,1)} sec (ignore box right below audio)')

  display.display(display.Audio(input_dir+file_i, autoplay=True), widgets.Textarea('')) # Display speech
  print('\n\n')
  # for each label
  for instruction_name in instructions_col_names:
    instruction = instructions_labels.get(instruction_name)
    resp = input(instruction+'\n')
    # accept quit regardless of case, surrounding spaces or quotes, so a
    # sloppy "Quit " is not recorded as a label value
    if resp.strip().strip('\'"').lower() == 'quit':
      # don't save labels for the current file, just all prior ones.
      save_annotation(annotation_current, annotation_past, output_dir, data_dir, annotator_name, instructions_col_names,practice_run=practice_run)
      sys.exit()
    # ToDo: let the annotator re-enter a single label by typing its column name
    annotation_i.append(resp)
  # when all labels have been filled out, append
  annotation_current.append([file_i]+annotation_i)
  # checkpoint after the 1st, 6th, 11th, ... file of this session
  if i%5==0:
    save_annotation(annotation_current, annotation_past, output_dir, data_dir, annotator_name, instructions_col_names,practice_run=practice_run)

save_annotation(annotation_current,annotation_past, output_dir, data_dir, annotator_name, instructions_col_names,practice_run=practice_run)

