# Audio Feature Engineering

This notebook contains all of the code used to extract the speech/audio features for our analysis.


In [1]:
import os


In [2]:
# Functions are saved in a .py file
from speech_feature_extraction import *


## Setup

In [3]:
# File paths (all relative to the current working directory)
m4a_path = './data/m4a_files/' # Input: original audio files from sponsor
wav_path = './data/wav_files/' # Output: full wav files
transcript_path = './data/transcript_files/' # Input: transcripts from temi (downloaded as .txt column format with speaker numbers and timestamps)
teacher_wav_path = './data/teacher_wav_files/' # Output: teacher only wav files



In [4]:
# Create the output directories for the full wav files and the teacher-only wav files.
# exist_ok=True makes this a no-op when a directory already exists and avoids the
# check-then-create race of testing os.path.exists() first.
for output_dir in (wav_path, teacher_wav_path):
    os.makedirs(output_dir, exist_ok=True)


In [5]:
# List all entries in the original audio directory
m4a_dir_entries = os.listdir(m4a_path)
# Keep only names that END in '.m4a' and strip that extension to get the base file name.
# Entries that start with '.' (e.g. ipynb checkpoints, hidden OS files) are skipped.
# os.path.splitext removes only the final extension, so dots inside the base name are preserved.
file_list = [
    os.path.splitext(entry)[0]
    for entry in m4a_dir_entries
    if entry.endswith('.m4a') and not entry.startswith('.')
]

In [6]:
# # Temporarily testing on a small list of files
# file_list = file_list[0:2]

In [7]:
# Report how many audio files were found (this is the expected number of files/participants)
n_files = len(file_list)
print(n_files)

89


In [8]:
# Check that we have a transcript file for every audio file.
# Only names that END in '.txt' count as transcripts, and only that final extension is stripped.
# NOTE: the variable name keeps its original spelling ('transript') so that any other
# cell referring to it continues to work.
transript_file_list = [
    os.path.splitext(entry)[0]
    for entry in os.listdir(transcript_path)
    if entry.endswith('.txt')
]
available_transcripts = set(transript_file_list)  # set gives constant-time membership tests

# Print the base name of every audio file that has no transcript (no output means none are missing)
for file in file_list:
    if file not in available_transcripts:
        print(file)
        

## 1. Convert Audio Files to Wav Files

In [9]:
# Run the m4a -> wav conversion helper once per recording.
# convert_m4a_to_wav is not defined in this notebook; presumably it comes from
# speech_feature_extraction and writes its result under wav_path - confirm there.
for recording_name in file_list:
    convert_m4a_to_wav(recording_name, m4a_path, wav_path)


In [10]:
# Check that we have a wav file for all original files
wav_file_list = [
    os.path.splitext(entry)[0]
    for entry in os.listdir(wav_path)
    if entry.endswith('.wav')
]
available_wavs = set(wav_file_list)

# Print the missing names BEFORE asserting, so a failure shows which files are affected
missing_wav_files = []
for file in file_list:
    if file not in available_wavs:
        print(file)
        missing_wav_files.append(file)

# Assert on the missing files themselves rather than on matching counts: a count
# comparison fails on harmless extra wav files left in the directory and can pass
# when a stale file happens to offset a missing one.
assert not missing_wav_files, f'{len(missing_wav_files)} of {len(file_list)} files have no wav file in {wav_path}'
        

## 2. Extract Only the Teacher's Audio

We use the timestamps from the Temi transcriptions to cut out the avatar voices, producing a .wav file that contains only the teacher's audio.


In [11]:
# Build a lookup from transcript file name to the teacher's speaker number
# NOTE(review): `pd` is not imported explicitly in the visible cells; it presumably
# arrives through the star import from speech_feature_extraction - confirm.
speaker_num_df = pd.read_excel('../speaker_identification.xlsx')
speaker_num_dict = {
    transcript_name: teacher_num
    for transcript_name, teacher_num in zip(speaker_num_df['file_name'], speaker_num_df['teacher'])
}
# Manual fix for one irregular file name: make the '348th_...' key resolve to the
# entry the spreadsheet stores under '348_...'
speaker_num_dict['348th_11.4.21.txt'] = speaker_num_dict['348_11.4.21.txt']


In [12]:
# Produce a teacher-only wav file for each recording.
# The teacher's speaker number is looked up by transcript file name (<name>.txt);
# a name missing from the lookup raises KeyError here.
for recording_name in file_list:
    teacher_speaker_num = speaker_num_dict[recording_name + '.txt']
    extract_teacher_audio(recording_name, wav_path, teacher_wav_path, transcript_path, teacher_speaker_num)


In [13]:
# Check that we have a teacher only file for all original files
teacher_audio_file_list = [
    os.path.splitext(entry)[0]
    for entry in os.listdir(teacher_wav_path)
    if entry.endswith('.wav')
]
available_teacher_wavs = set(teacher_audio_file_list)

# Print the missing names BEFORE asserting, so a failure shows which files are affected
missing_teacher_files = []
for file in file_list:
    if file not in available_teacher_wavs:
        print(file)
        missing_teacher_files.append(file)

# Assert on the missing files themselves rather than on matching counts: a count
# comparison fails on harmless extra wav files left in the directory and can pass
# when a stale file happens to offset a missing one.
assert not missing_teacher_files, f'{len(missing_teacher_files)} of {len(file_list)} files have no teacher wav file in {teacher_wav_path}'

## 3. Extract Features from Transcripts

Exploring some feature creation from the transcript files such as:
- **Duration** including duration in seconds (total, teacher, and student), average duration (total, teacher, and student), and percent of the time that the teacher is the speaker.
- **Word count** including total count (total, teacher, and student), percent of words said by teacher, and word rate (total, teacher, and student).
- **Line count** (i.e., the number of changes in speaker), including the number of speaker changes (total line count) and the number of times the student/teacher speaks (student/teacher line count).


These features could be used to analyze the frequency of student interruptions, speed of speech, and balance of talking between teacher and students.

In [14]:
# Create an empty dataframe to hold the transcript features
# (it is populated by the extraction loop in the next cell)
transcript_features = pd.DataFrame()

In [15]:
# Extract transcript features
# The per-file summaries are collected in a list and concatenated once at the end:
# this avoids re-copying the growing dataframe on every iteration and makes the cell
# idempotent (re-running it rebuilds the table instead of appending duplicate rows).
transcript_summaries = []
for file in file_list:
    try:
        # Extract features (only the summary frame is kept)
        temp_df, temp_df_summary = extract_transcript_features(file, transcript_path, wav_path, speaker_num_dict[file + '.txt'])
        transcript_summaries.append(temp_df_summary)
    except Exception as err:
        # Best effort: keep going, but say WHY the file failed.
        # `except Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f'Problem with {file}: {err!r}')

# Build the final dataframe in one step (empty dataframe if every file failed)
transcript_features = pd.concat(transcript_summaries) if transcript_summaries else pd.DataFrame()
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[-1]['Timestamp_Pairs'] = (df.iloc[-1]['Timestamp_Pairs'][0], duration)


In [16]:
# Display the transcript feature table
transcript_features

Unnamed: 0,ID,Total_Duration,Teacher_Duration,Student_Duration,Percent_Time_Teacher,Average_Speaker_Duration,Average_Teacher_Duration,Average_Student_Duration,Total_Word_Count,Teacher_Word_Count,Student_Word_Count,Teacher_Percent_Words,Total_Word_Rate,Teacher_Word_Rate,Student_Word_Rate,Total_Speaker_Line_Count,Teacher_Line_Count,Student_Line_Count
0,332,308.563991,141.0,167.563991,0.456955,9.075411,8.294118,9.856705,697,305,392,0.437590,2.258851,2.163121,2.339405,34,17,17
0,348,309.289320,86.0,223.289320,0.278057,6.580624,3.739130,9.303722,698,195,503,0.279370,2.256787,2.267442,2.252683,47,23,24
0,335,313.918662,145.0,168.918662,0.461903,6.975970,6.590909,7.344290,1021,567,454,0.555338,3.252435,3.910345,2.687684,45,22,23
0,320,308.307982,180.0,128.307982,0.583832,5.817132,6.923077,4.752147,651,407,244,0.625192,2.111525,2.261111,1.901674,53,26,27
0,328,315.305329,202.0,113.305329,0.640649,7.332682,9.619048,5.150242,856,637,219,0.744159,2.714829,3.153465,1.932831,43,21,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,218,318.203379,194.0,124.203379,0.609673,7.761058,9.238095,6.210169,803,602,201,0.749689,2.523543,3.103093,1.618313,41,21,20
0,223,319.588413,204.0,115.588413,0.638321,7.432289,9.714286,5.254019,868,583,285,0.671659,2.715993,2.857843,2.465645,43,21,22
0,347,306.451995,191.0,115.451995,0.623262,8.282486,10.611111,6.076421,975,684,291,0.701538,3.181575,3.581152,2.520528,37,18,19
0,360,319.870658,190.0,129.870658,0.593990,6.527973,7.916667,5.194826,643,435,208,0.676516,2.010188,2.289474,1.601593,49,24,25


In [17]:
# The number of transcript feature rows must equal the number of files
n_transcript_rows = len(transcript_features)
assert n_transcript_rows == len(file_list)

## 4. Speech Feature Engineering

In [18]:
# Create an empty dataframe to hold the audio features
# (it is populated by the extraction loop in the next cell)
audio_features = pd.DataFrame()

In [19]:
# Extract audio features from the teacher-only wav files
# The per-file summaries are collected in a list and concatenated once at the end:
# this avoids re-copying the growing dataframe on every iteration and makes the cell
# idempotent (re-running it rebuilds the table instead of appending duplicate rows).
audio_summaries = []
for file in file_list:
    try:
        # Extract features
        temp_df_summary = extract_audio_features(file, teacher_wav_path, num_mfccs = 13)
        audio_summaries.append(temp_df_summary)
    except Exception as err:
        # Best effort: keep going, but say WHY the file failed.
        # `except Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f'Problem with {file}: {err!r}')

# Build the final dataframe in one step (empty dataframe if every file failed)
audio_features = pd.concat(audio_summaries) if audio_summaries else pd.DataFrame()

In [20]:
# Display the audio feature table
audio_features

Unnamed: 0,ID,number_ of_syllables,number_of_pauses,rate_of_speech,articulation_rate,speaking_duration,original_duration,balance,f0_mean,f0_std,...,Flatness_Max,Flatness_Std,Zero_Crossing_Rate_Mean,Zero_Crossing_Rate_Min,Zero_Crossing_Rate_Max,Zero_Crossing_Rate_Std,Loudness_Mean,Loudness_Min,Loudness_Max,Loudness_Std
0,332,411,62,3,4,93.5,144.6,0.6,211.67,53.96,...,1.484376e-06,0.101342,0.106655,0.724121,0.0,0.111820,-41.060013,-12.580391,-92.580391,19.036415
0,348,255,39,3,5,50.5,86,0.6,244.32,56.67,...,2.166050e-06,0.168635,0.102524,0.819824,0.0,0.115759,-43.753429,-14.232292,-94.232292,21.297735
0,335,427,87,3,5,87.3,141,0.6,239.45,66.01,...,5.494624e-07,0.140694,0.100872,0.642578,0.0,0.102428,-41.438087,-12.843327,-92.843323,23.901451
0,320,483,87,3,4,108.2,180,0.6,218.64,48.18,...,1.154836e-06,0.134986,0.113569,0.738281,0.0,0.117673,-43.237362,-16.844372,-96.844376,19.977518
0,328,614,105,3,5,130.7,202,0.6,148.77,45.43,...,8.892575e-07,0.107515,0.101777,0.606445,0.0,0.098971,-42.857697,-16.831387,-96.831390,18.386736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,218,314,42,3,5,62.8,90,0.7,258.72,73.09,...,1.595255e-06,0.009985,0.140677,0.608398,0.0,0.099093,-32.689915,-11.363259,-67.924858,13.973388
0,223,780,83,4,5,154.1,204,0.8,224.38,43.21,...,1.655184e-07,0.002108,0.075508,0.522461,0.0,0.061578,-33.415398,-13.524834,-72.490852,13.480139
0,347,693,84,4,5,138.9,191,0.7,239.71,53.91,...,5.416586e-07,0.092835,0.117964,0.691406,0.0,0.119356,-36.221664,-13.817157,-93.817154,18.311182
0,360,437,88,2,5,94.1,190,0.5,151.52,54.59,...,2.598544e-06,0.216559,0.087531,0.575684,0.0,0.089756,-50.064537,-14.589852,-94.589851,24.071793


In [22]:
# The number of audio feature rows must equal the number of files
n_audio_rows = len(audio_features)
assert n_audio_rows == len(file_list)

## 5. Combine Features into Single Dataframe

In [23]:
# Merge the two dataframes with features into one (inner join on the shared 'ID' column)
df = transcript_features.merge(audio_features, on = 'ID')

# An inner join silently drops every ID that appears in only one of the two tables,
# so report those IDs instead of losing rows without notice. The merged result itself
# is unchanged.
transcript_ids = set(transcript_features['ID'])
audio_ids = set(audio_features['ID'])
unmatched_ids = transcript_ids ^ audio_ids
if unmatched_ids:
    # key=str keeps the sort safe if the IDs are a mix of types
    print(f'IDs present in only one feature table (dropped by merge): {sorted(unmatched_ids, key=str)}')

In [24]:
# Display the merged feature table
df

Unnamed: 0,ID,Total_Duration,Teacher_Duration,Student_Duration,Percent_Time_Teacher,Average_Speaker_Duration,Average_Teacher_Duration,Average_Student_Duration,Total_Word_Count,Teacher_Word_Count,...,Flatness_Max,Flatness_Std,Zero_Crossing_Rate_Mean,Zero_Crossing_Rate_Min,Zero_Crossing_Rate_Max,Zero_Crossing_Rate_Std,Loudness_Mean,Loudness_Min,Loudness_Max,Loudness_Std
0,332,308.563991,141.0,167.563991,0.456955,9.075411,8.294118,9.856705,697,305,...,1.484376e-06,0.101342,0.106655,0.724121,0.0,0.111820,-41.060013,-12.580391,-92.580391,19.036415
1,348,309.289320,86.0,223.289320,0.278057,6.580624,3.739130,9.303722,698,195,...,2.166050e-06,0.168635,0.102524,0.819824,0.0,0.115759,-43.753429,-14.232292,-94.232292,21.297735
2,335,313.918662,145.0,168.918662,0.461903,6.975970,6.590909,7.344290,1021,567,...,5.494624e-07,0.140694,0.100872,0.642578,0.0,0.102428,-41.438087,-12.843327,-92.843323,23.901451
3,320,308.307982,180.0,128.307982,0.583832,5.817132,6.923077,4.752147,651,407,...,1.154836e-06,0.134986,0.113569,0.738281,0.0,0.117673,-43.237362,-16.844372,-96.844376,19.977518
4,328,315.305329,202.0,113.305329,0.640649,7.332682,9.619048,5.150242,856,637,...,8.892575e-07,0.107515,0.101777,0.606445,0.0,0.098971,-42.857697,-16.831387,-96.831390,18.386736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,218,318.203379,194.0,124.203379,0.609673,7.761058,9.238095,6.210169,803,602,...,1.595255e-06,0.009985,0.140677,0.608398,0.0,0.099093,-32.689915,-11.363259,-67.924858,13.973388
85,223,319.588413,204.0,115.588413,0.638321,7.432289,9.714286,5.254019,868,583,...,1.655184e-07,0.002108,0.075508,0.522461,0.0,0.061578,-33.415398,-13.524834,-72.490852,13.480139
86,347,306.451995,191.0,115.451995,0.623262,8.282486,10.611111,6.076421,975,684,...,5.416586e-07,0.092835,0.117964,0.691406,0.0,0.119356,-36.221664,-13.817157,-93.817154,18.311182
87,360,319.870658,190.0,129.870658,0.593990,6.527973,7.916667,5.194826,643,435,...,2.598544e-06,0.216559,0.087531,0.575684,0.0,0.089756,-50.064537,-14.589852,-94.589851,24.071793


In [25]:
# Write the merged feature table to a csv file (without the row index) for further analysis
output_csv = 'Teacher_Mindfulness_Audio_Transcript_Features_20230214.csv'
df.to_csv(output_csv, index=False)