# Features from Transcript Timestamps

Exploring some feature creation from the transcript files such as:
- How many times are there changes in speakers?
- What percent of the time is the teacher the speaker?

In [1]:
import os
import wave
import numpy as np
import pandas as pd
from datetime import datetime, timedelta


In [2]:
# File paths
transcript_path = './data/transcript_files/'
wav_path = './data/wav_files/'

In [3]:
# File name to test on
file_name = "228_3.4.20_S_SC"
teacher_speaker_num = 2 

In [4]:
def extract_transcript_features(file_name, transcript_path, wav_path, teacher_speaker_num):
    # Read in transcript
    df = pd.read_csv(transcript_path + file_name + '.txt', 
                 engine = 'python', 
                 delimiter = "                                             ",
                 header = None)
    df.columns = ['Speaker', 'Timestamp', 'Text']
    
    # Create a column that converts the timestamps to seconds
    df['Timestamp_Secs'] = df['Timestamp'].apply(convert_time_to_seconds)
    
    # Create a column with pairs of timestamps
    df['Timestamp_Pairs'] = df.apply(create_timestamp_pairs, col = 'Timestamp_Secs', df = df, axis=1)
    
    # Create a column that is the duration (in seconds) of talking for each row
    df['Duration'] = df['Timestamp_Pairs'].apply(lambda x: x[1] - x[0])
    
    # Generate a few features from this data
    # How many changes in speaker are there?
    num_speaker_changes = df.shape[0]
    
    # What percent of the total time is the teacher speaking?
    # First need to update final timestamp's end time
    duration = get_total_duration(wav_path, file_name)
    df.iloc[-1]['Timestamp_Pairs'] = (df.iloc[-1]['Timestamp_Pairs'][0], duration)
    teacher_duration = df[df['Speaker'] == f'Speaker {teacher_speaker_num}:'].Duration.sum()
    percent_teacher = teacher_duration/duration
    
    # Create a dataframe row with these features
    df_summary = pd.DataFrame({'ID': file_name[0:3], 
                               'Num_Speaker_Changes': num_speaker_changes,
                               'Percent_Time_Teacher': percent_teacher
                              }, index = [0])
    
    return df, df_summary

In [5]:
def convert_time_to_seconds(timestamp):
    # Parse the timestamp as a datetime object
    time_obj = datetime.strptime(timestamp, '%M:%S')
    
    # Convert the datetime object to a timedelta object
    time_delta = timedelta(minutes=time_obj.minute, seconds=time_obj.second)
    
    # Convert the timedelta object to seconds
    return time_delta.total_seconds()

In [6]:
def create_timestamp_pairs(row, col, df):
    # Get the index of the current row
    index = row.name
    
    # Get the timestamp for the current row
    current_timestamp = row[col]

    # Check if there is a next row
    if index < len(df) - 1:
        # If there is, get the timestamp for the next row
        next_timestamp = df.loc[index + 1, col]
    else:
        # If there isn't, use the current timestamp value as a placeholder
        next_timestamp = current_timestamp

    # Create the tuple
    return (current_timestamp, next_timestamp)



In [7]:
def get_total_duration(wav_path, file_name):
    with wave.open(wav_path + file_name + '.wav', 'rb') as wave_file:
        # Get the number of frames and the frame rate
        num_frames = wave_file.getnframes()
        frame_rate = wave_file.getframerate()

        # Calculate duration (in seconds)
        duration = num_frames / float(frame_rate)
        
    return duration

In [8]:
df, df_summary = extract_transcript_features(file_name, transcript_path, wav_path, teacher_speaker_num)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[-1]['Timestamp_Pairs'] = (df.iloc[-1]['Timestamp_Pairs'][0], duration)


In [9]:
df.head()

Unnamed: 0,Speaker,Timestamp,Text,Timestamp_Secs,Timestamp_Pairs,Duration
0,Speaker 1:,00:05,We start simulation.,5.0,"(5.0, 9.0)",4.0
1,Speaker 2:,00:09,"Hi everybody, I'm Ms. Murphy.",9.0,"(9.0, 11.0)",2.0
2,Speaker 1:,00:11,Hey.,11.0,"(11.0, 12.0)",1.0
3,Speaker 2:,00:12,Hi. Hi there. So I just wanted to review some ...,12.0,"(12.0, 33.0)",21.0
4,Speaker 1:,00:33,"Oh, it helps. I think everybody, um, maintain ...",33.0,"(33.0, 37.0)",4.0


In [10]:
df_summary

Unnamed: 0,ID,Num_Speaker_Changes,Percent_Time_Teacher
0,228,60,0.576387
