<a href="https://colab.research.google.com/github/etgins/Mice_ASD_Detection/blob/main/audio_feature_extraction_REDUCTION_BY_RECORDING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

----------------------------------------------
Written by Itamar Ginsberg & Alon Schreuer, October 2021



# **1. Import data and pre-process**

---

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

## FOR COLAB USE:
## ITAMAR:
dataset = pd.read_excel('/content/drive/MyDrive/Project_A/Project_A_files/total_data UPDATED EM 020821.xlsx')

# in case we need to convert from xls to csv:
#dataset.to_csv (r'/content/drive/MyDrive/Project_A/Project_A_files/total_data UPDATED EM 020821.csv', index = None, header=True)

# extract only the relevant columns / features
X = dataset[["Name", "Start Point (Hz)", "End Point (Hz)", "Duration (s)", "Syllable number", "Recording Number", "Mother Genotype", "Offspring Genotype"]]
# clean NaN values from dataset or X
X = X.dropna(axis = 0, how='any')
# print(X)


# encode binary feature - mother genotype
y = X[["Mother Genotype"]]
# print(y)
# encode labels
l1 = LabelEncoder()
l1.fit(y)
y = l1.transform(y)
X[["Mother Genotype"]] = y
# print(X)

# encode binary feature - offspring genotype
y = X[["Offspring Genotype"]]
# print(y)
# encode labels
l1 = LabelEncoder()
l1.fit(y)
y = l1.transform(y)
X[["Offspring Genotype"]] = y
# print(X)

# convert duration to micro-seconds from 'datetime.time' type to 'timedelta' object
from datetime import datetime
duration_vec = X[["Duration (s)"]]
# print(duration_vec)
duration_vec2 = np.array(duration_vec)
# print(duration_vec2)
duration_vec3 = np.zeros([1, np.size(duration_vec2)])
# print(duration_vec3)
for i in range (len(duration_vec)):
  duration_single = duration_vec2[i,0]
  # print(duration_single)
  duration_vec3[0,i] = int(duration_single.strftime(format = '%f'))
  # print(duration_vec3[0,i])
X[["Duration (s)"]] = np.transpose(duration_vec3)
print(X)




# X.describe()

Mounted at /content/drive
        Name  Start Point (Hz)  ...  Mother Genotype  Offspring Genotype
1     17470O        57648.8278  ...                1                   1
2     17470O        55558.5268  ...                1                   1
3     17470O        54513.3763  ...                1                   1
4     17470O        55976.5870  ...                1                   1
5     17470O        59739.1288  ...                1                   1
...      ...               ...  ...              ...                 ...
3167  08121P        73793.2416  ...                1                   1
3170  08130I        65382.9415  ...                0                   0
3171  08130I        68936.4532  ...                0                   0
3172  08130I        82105.3495  ...                0                   0
3173  08130I        77297.6572  ...                0                   0

[3035 rows x 8 columns]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


# **2. Setup for feature extraction**

---

In [2]:
def take_data_by_name(name_searched, Data, name_list):
  # print(name_list)
  ind = name_list == name_searched
  # print(ind)
  matching_data = Data[ind].reset_index()
  
  # print(matching_data)
  return matching_data


def take_data_by_recording(recording_searched, Data, recording_list):
  # print('recording list: ', recording_list, '\n')
  ind = recording_list == recording_searched
  # print('matching indices: ', ind, '\n')
  matching_data = Data[ind].reset_index()
  
  # print('matching data: ', matching_data, '\n')
  return matching_data



# find set of mouse names
name_column = X["Name"]
unique_name_set = set(name_column)
unique_name_list = list(unique_name_set)
# unique_name_list = unique_name_list[0:-1]   # use if first entry is column name

number_of_mice = len(unique_name_list)
# print(unique_name_list)
# print(number_of_mice, 'mice in data')

# if needed - remove nan from list of names and update
unique_name_list = [x for x in unique_name_list if str(x) != 'nan']
number_of_mice = len(unique_name_list)

# print(unique_name_list)
# print(number_of_mice, 'mice in data')


# SAVED FOR FUTURE REFERENCE - NOT USED (instead, each mouse is split into recordings below)
# # find set of recordings
# recording_column = X["Recording Number"]
# unique_recording_set = set(recording_column)
# unique_recording_list = list(unique_recording_set)
# # unique_name_list = unique_name_list[0:-1]   # first entry is column name

# number_of_recordings = len(unique_recording_list)
# # print(unique_name_list)
# # print(number_of_mice, 'mice in data')

# # remove nan from list of recordings
# unique_recording_list = [x for x in unique_recording_list if str(x) != 'nan']
# number_of_recordings = len(unique_recording_list)

# # print(unique_recording_list)
# # print(number_of_recordings, 'recordings in data')

# **3. Extract features**

---

In [4]:
  # final data is made up of: 
    # 1. avg. start freq. per syllable
    # 2. avg. end freq. per syllable
    # 3. avg. syllable duration per syllable
    # 4. syllable distribution - percentage of each syllable
    # 5. mother genotype
    # ? Bandwidth - TBD

num_of_syllables = 10
final_data_size = 4*num_of_syllables + 1  # last 1 is for feature 5

# Initialize final data variable: mouse_final_data
# Start with one row and add one each iteration
mouse_final_data = np.zeros([1,(final_data_size+1)])

""" 
Work on each mouse and each of its recordings separately and calculate all features.
Place the results in mouse_final_data[row,:]
"""

row = 0   # output data row - each mouse has a row for each of its recordings

for idx in range (0,number_of_mice):
    mouse_split_data = take_data_by_name(unique_name_list[idx], X, name_column)
    print('mouse name:', unique_name_list[idx],', ' 'idx:', idx)
    # print('matching data:', '\n',mouse_split_data)
    # print('headers are: ', list(mouse_split_data.columns))

    # find set of recordings this specific mouse has
    ## -------------------------------------------
    recording_column = mouse_split_data["Recording Number"]
    unique_recording_set = set(recording_column)
    unique_recording_list = list(unique_recording_set)
    # unique_recording_list = unique_recording_list[0:-1]   # use if first entry is column name

    # remove nan from list of recordings
    unique_recording_list = [x for x in unique_recording_list if str(x) != 'nan']
    number_of_recordings = len(unique_recording_list)
    
    print('these are the unique recordings for this mouse: ', unique_recording_list, '\n')
    print('total of: ', number_of_recordings, 'recordings \n')




    for recording in range (number_of_recordings):
      # take data by recording from single mouse we already split
      ## -------------------------------------------
      print('recording number:', recording, '\n')
      recording_split_data = take_data_by_recording(unique_recording_list[recording], mouse_split_data, recording_column)
      # print('this is its data:', '\n', recording_split_data)


      # calculate feature 1 - average start freq for each syllable
      ## -------------------------------------------
      print("\n", "--- CALCULATING FEATURE 1: recording's average start freq for each syllable ---", "\n")
      """
      - take start frequencies grouped by syllable number, and calculate the average
      - fill a 10-long vector with the final result (and add zeros to non-existing syllables)
      - place in mouse_final_data[0:10]
      """

      start_frequencies = np.array(recording_split_data.groupby(by = 'Syllable number', as_index=False)["Start Point (Hz)"].mean())
      # print('start_frequencies :','\n',start_frequencies,'\n')
      
      feature1_vec = np.zeros([1,10])
      for i in range(len(start_frequencies)):
        feature1_vec[0,int(start_frequencies[i,0])-1] = start_frequencies[i,1]
      
      # PRINT RESULT:
      # print('each syllable mean start freq:','\n', feature1_vec, '\n')

      mouse_final_data[row,0:10] = feature1_vec
      # print('mouse_final_data[row] so far:',mouse_final_data[row])




      # calculate feature 2 - average end freq for each syllable
      ## -------------------------------------------
      print("\n", "--- CALCULATING FEATURE 2: recording's average end freq for each syllable ---", "\n")
      

      """
      ITAMAR:
      - take end frequencies grouped by syllable number, and calculate the average
      - fill a 10-long vector with the final result (and add zeros to non-existing syllables)
      - place in mouse_final_data[0:10]
      """

      end_frequencies = np.array(recording_split_data.groupby(by = 'Syllable number', as_index=False)["End Point (Hz)"].mean())
      # print('end_frequencies :','\n',end_frequencies, '\n')
      
      feature2_vec = np.zeros([1,10])
      for i in range(len(end_frequencies)):
        feature2_vec[0,int(end_frequencies[i,0])-1] = end_frequencies[i,1]
      
      # PRINT RESULT:
      # print('each syllable mean end freq:','\n', feature2_vec, '\n')

      mouse_final_data[row,10:20] = feature2_vec

      # print('mouse_final_data[row] so far:',mouse_final_data[row])




      ## -------------------------------------------
      # calculate feature 3 - syllable distribution
      ## -------------------------------------------
      print("\n", "--- CALCULATING FEATURE 3: recording's syllable distribution ---", "\n")

    # take syll numbers
      syllable_num_row = np.transpose(np.array(recording_split_data["Syllable number"]))
      # print(syllable_num_row)

      # create a vector of all-zeros, and change values only for syllables that have been recorded
      distribution = np.zeros([1,10])
      # print(distribution)

    # increment syllable count, using distribution vector (as a histogram)
      for i in range(len(syllable_num_row)):
        # print("iteration", i)
        syll = int(syllable_num_row[i])
        # print("found syllable", syll)
        # print("old distribution:", distribution)
        distribution[0,syll-1] += 1 
        # print("new distribution:", distribution)

    # normalize histogram to distribution
      distribution = distribution / distribution.sum()
      # if all are zeros, dividing by zero will create nan's - turn those into zeros
      distribution[np.isnan(distribution)] = 0  

      # PRINT RESULT:
      # print("final distribution:", "\n", distribution)

    # transfer results to mouse_final_data
      mouse_final_data[row,20:30] = distribution
      # print('mouse_final_data[row] so far:',mouse_final_data[row])




      ## -------------------------------------------
      # calculate feature 4 - average syllable duration for each syllable
      ## -------------------------------------------
      print("\n", "--- CALCULATING FEATURE 4: recording's average syllable duration for each syllable ---", "\n")
      
      # take the mean of the duration column for each syllable
      recording_duration = recording_split_data[["Name", "Syllable number","Duration (s)"]]
      # print('syllable duration: ', recording_duration, '\n')

      a = recording_duration.groupby('Syllable number', as_index=False)["Duration (s)"].mean()
      a = np.array(a)
      
      # PRINT RESULT:
      # print("for this mouse and this recording, each syllable's mean duration is:","\n", a)

      # create a vector of all-zeros, and change values only for syllables that have been recorded
      feature4_vec = np.zeros([1,10])
      # print(np.shape(a))
      for i in range (len(a)):
        # print('iteration ', i)
        # print(int(a[i,0]))
        syllable = int(a[i,0])
        feature4_vec[0,syllable-1] = int(a[i,1])
        # print(" means_vec:", feature4_vec)
      
      mouse_final_data[row,30:40] = feature4_vec
      # print('mouse_final_data[row] so far:',mouse_final_data[row])



      ## -------------------------------------------
      # calculate feature 5 - mother genotype
      ## -------------------------------------------
      print("\n", "--- CALCULATING FEATURE 5: mother genotype ---", "\n")
      # we already split by mouse and recording, so this should be the same for all recording_split_data. Take the first
      mouse_final_data[row,-2] = recording_split_data["Mother Genotype"].iloc[0]




      ## -------------------------------------------
      # calculate feature ? - Bandwidth (???) - TBD
      ## -------------------------------------------
      # print("\n", "--- CALCULATING FEATURE 5: Bandwidth ---", "\n")



      ## -------------------------------------------
      # find mouse's genotype label
      ## -------------------------------------------
      # we already split by mouse and recording, so this should be the same for all recording_split_data. Take the first
      mouse_final_data[row,-1] = recording_split_data["Offspring Genotype"].iloc[0]



      print("\n", "--- FINISHED features, show final data: --- (mouse ", idx, ", recording ", recording, ')\n')
      # print(mouse_final_data[row,:], "\n")

      # prepare for next iteration - add new row
      mouse_final_data = np.append(mouse_final_data, np.zeros([1,final_data_size+1]), axis=0)
      row += 1


# after all iterations, last line is added, but not needed - delete it
mouse_final_data = np.delete(mouse_final_data, -1, 0)

print("\n", "--- FINISHED EXTRACTION FOR ALL MICE --- " ,"\n")
print("\n", "--- FINAL DATA FOR ALL MICE --- ", "\n")
print(mouse_final_data, "\n")

# export data to csv file for further use - COLAB
np.savetxt("processed_data_for_final_classification_REDUCTION_BY_RECORDING.csv", X=mouse_final_data, delimiter=",")
!cp "processed_data_for_final_classification_REDUCTION_BY_RECORDING.csv" /content/drive/MyDrive/Project_A/Project_A_files

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


 --- CALCULATING FEATURE 4: recording's average syllable duration for each syllable --- 


 --- CALCULATING FEATURE 5: mother genotype --- 


 --- FINISHED features, show final data: --- (mouse  25 , recording  4 )

recording number: 5 


 --- CALCULATING FEATURE 1: recording's average start freq for each syllable --- 


 --- CALCULATING FEATURE 2: recording's average end freq for each syllable --- 


 --- CALCULATING FEATURE 3: recording's syllable distribution --- 


 --- CALCULATING FEATURE 4: recording's average syllable duration for each syllable --- 


 --- CALCULATING FEATURE 5: mother genotype --- 


 --- FINISHED features, show final data: --- (mouse  25 , recording  5 )

recording number: 6 


 --- CALCULATING FEATURE 1: recording's average start freq for each syllable --- 


 --- CALCULATING FEATURE 2: recording's average end freq for each syllable --- 


 --- CALCULATING FEATURE 3: recording's syllable distr