# Chunk Files

Since our data sample is relatively small, we want to try chunking the data so that, instead of 1 row (set of features) for each teacher, we actually have 5 (1 row for each minute of the experiment). However, since we're using the transcripts to get the speaker labels needed to extract just the teacher's audio, I need chunks of the transcripts that align with the video chunks. The transcript timestamps are only recorded when there is a change in speaker, so I will use the closest timestamp to determine the start/end time of each chunk; as a result, the chunks will not be perfect 1-minute segments. We will also need to normalize some features by duration (e.g. words per minute instead of a raw word count).




In [1]:
import pandas as pd
import numpy as np
import os
import wave
from datetime import datetime, timedelta


In [3]:
def convert_time_to_seconds(timestamp):
    """Convert a transcript timestamp string to a number of seconds.

    Accepted forms are ``MM:SS`` and ``HH:MM:SS``. Surrounding whitespace is
    ignored, and the leading field is not capped at 59, so ``'75:10'`` is
    read as 75 minutes and 10 seconds.

    timestamp: timestamp string such as ``'04:37'`` or ``'1:02:03'``

    Returns the elapsed time in seconds as a float.

    Raises TypeError if ``timestamp`` is not a string and ValueError if it
    is not made of 2 or 3 colon-separated non-negative integers.
    """
    if not isinstance(timestamp, str):
        raise TypeError(
            f'timestamp must be a string, got {type(timestamp).__name__}'
        )

    fields = timestamp.strip().split(':')
    well_formed = len(fields) in (2, 3) and all(
        field.isascii() and field.isdigit() for field in fields
    )
    if not well_formed:
        raise ValueError(
            f'timestamp {timestamp!r} does not match MM:SS or HH:MM:SS'
        )

    # Fold the fields left to right in base 60: (hours,) minutes, seconds.
    total_seconds = 0
    for field in fields:
        total_seconds = total_seconds * 60 + int(field)

    # Keep returning a float, matching timedelta.total_seconds().
    return float(total_seconds)

In [73]:
# Input directories: the original .wav recordings and their transcript .txt files
wav_path = '../data/wav_files/'
transcript_path = '../data/transcript_files/'

# Output directories: where the per-chunk .wav and transcript files are written
chunked_wav_path = '../data/chunked_wav_files/'
chunked_transcript_path = '../data/chunked_transcript_files/'

# Base name (without extension) of a single recording; the same name is used
# for both the .wav file and the transcript .txt file
file_name = '201_1.24.20_S_SC'

In [82]:
def chunk_wav_transcript_file(file_name, wav_path, transcript_path, chunked_wav_path, chunked_transcript_path, num_chunks = 5):
    '''
    Split one .wav recording and its transcript .txt file into num_chunks
    aligned chunks of approximately 1 minute each.

    Chunk boundaries are taken from the transcript: for every whole minute
    (60s, 120s, ...) the transcript row whose timestamp is closest to that
    minute is used as the boundary, so chunks are only approximately 1 minute
    long. The first chunk always starts at 0 seconds and the last chunk always
    runs to the end of the audio file / the last transcript row.

    Chunk i is written to
        <chunked_wav_path>/<file_name>_Chunk_<i>.wav
        <chunked_transcript_path>/<file_name>_Chunk_<i>.txt
    The audio chunks keep the channel count, sample width and frame rate of
    the source recording. The transcript chunks are tab separated, without
    header or index, and contain the Speaker, Timestamp and Text columns.

    file_name: name of original file without extension (identifies participant id)
    wav_path: directory containing .wav file
    transcript_path: directory containing transcript .txt files
    chunked_wav_path: directory for chunked .wav files
    chunked_transcript_path: directory for chunked transcript files
    num_chunks: number of 1 minute chunks to create (5 because videos are approximately 5 minutes long)

    Returns None; the results are the files written to the chunk directories.
    '''

    chunk_stem = file_name + '_Chunk_'

    with wave.open(os.path.join(wav_path, file_name + '.wav'), 'rb') as wave_file:
        # Audio format of the source; reused for every chunk so that stereo
        # or non 16-bit recordings are not reinterpreted as mono 16-bit
        num_frames = wave_file.getnframes()
        frame_rate = wave_file.getframerate()
        num_channels = wave_file.getnchannels()
        sample_width = wave_file.getsampwidth()
        # Duration of audio file (in seconds)
        duration = num_frames / float(frame_rate)

        # Read in transcript file (columns are separated by a long run of spaces)
        df = pd.read_csv(os.path.join(transcript_path, file_name + '.txt'),
                         engine = 'python', 
                         delimiter = "                                             ",
                         header = None)
        df.columns = ['Speaker', 'Timestamp', 'Text']
        df['Timestamp_Secs'] = df['Timestamp'].apply(convert_time_to_seconds)

        # Row index of the timestamp closest to each whole minute
        row_indices = [abs(df['Timestamp_Secs'] - (i+1)*60).idxmin() for i in range(num_chunks)]
        # First boundary is always the first row ...
        row_indices.insert(0, 0)
        # ... and the last boundary is always the last row
        row_indices[-1] = df.shape[0] - 1

        for i in range(num_chunks):
            is_last_chunk = i == num_chunks - 1
            current_idx = row_indices[i]
            next_idx = row_indices[i+1]

            # First chunk starts at 0s; last chunk runs to the end of the audio
            start_time = 0 if i == 0 else df['Timestamp_Secs'].iloc[current_idx]
            end_time = duration if is_last_chunk else df['Timestamp_Secs'].iloc[next_idx]

            # Clamp to the frames that actually exist: a transcript timestamp
            # past the end of the audio would otherwise make setpos() raise,
            # and out-of-order boundaries would give a negative frame count
            start_frame = min(max(int(start_time * frame_rate), 0), num_frames)
            end_frame = min(max(int(end_time * frame_rate), start_frame), num_frames)
            wave_file.setpos(start_frame)
            segment_frames = wave_file.readframes(end_frame - start_frame)

            # Write this chunk of the audio with the same format as the source;
            # the wave module fills in the frame count in the header on close
            chunk_wav_file = os.path.join(chunked_wav_path, f'{chunk_stem}{i}.wav')
            with wave.open(chunk_wav_file, 'wb') as output_wave_file:
                output_wave_file.setnchannels(num_channels)
                output_wave_file.setsampwidth(sample_width)
                output_wave_file.setframerate(frame_rate)
                output_wave_file.writeframes(segment_frames)

            # Rows of this chunk: [current_idx, next_idx), except that the
            # last chunk also keeps the final row
            if is_last_chunk:
                df_temp = df.iloc[current_idx: ]
            else:
                df_temp = df.iloc[current_idx: next_idx]
            # Remove the helper seconds column before saving
            df_temp = df_temp.drop('Timestamp_Secs', axis = 1)
            chunk_transcript_file = os.path.join(chunked_transcript_path, f'{chunk_stem}{i}.txt')
            df_temp.to_csv(chunk_transcript_file,
                           sep = "\t",
                           index=False, header=False)



In [84]:
# Chunk one recording; the function defined in this notebook is
# chunk_wav_transcript_file (no chunk_file is defined here)
chunk_wav_transcript_file('334_10.29.21', wav_path, transcript_path, chunked_wav_path, chunked_transcript_path, num_chunks = 5)