# Features from Transcript Timestamps

Exploring some feature creation from the transcript files such as:
- **Duration** including duration in seconds (total, teacher, and student), average duration (total, teacher, and student), and percent of the time that the teacher is the speaker.
- **Word count** including total count (total, teacher, and student), percent of words said by teacher, and word rate (total, teacher, and student).
- **Line count** (aka number of changes in speakers) including number of speaker changes (total line count) and number of time student/teacher speaks (student/teacher line count).


These features could be used to analyze the frequency of student interruptions, speed of speech, and balance of talking between teacher and students.

In [1]:
import os
import wave
import numpy as np
import pandas as pd
from datetime import datetime, timedelta


In [2]:
# File paths
transcript_path = './data/transcript_files/'
wav_path = './data/wav_files/'

In [3]:
# File name to test on
file_name = "228_3.4.20_S_SC"
teacher_speaker_num = 2 

In [4]:
def extract_transcript_features(file_name, transcript_path, wav_path, teacher_speaker_num):
    # Read in transcript
    df = pd.read_csv(transcript_path + file_name + '.txt', 
                 engine = 'python', 
                 delimiter = "                                             ",
                 header = None)
    df.columns = ['Speaker', 'Timestamp', 'Text']
    
    # Create a column that converts the timestamps to seconds
    df['Timestamp_Secs'] = df['Timestamp'].apply(convert_time_to_seconds)
    
    # Create a column with pairs of timestamps
    df['Timestamp_Pairs'] = df.apply(create_timestamp_pairs, col = 'Timestamp_Secs', df = df, axis=1)
    
    # Create a column that is the duration (in seconds) of talking for each row
    df['Duration'] = df['Timestamp_Pairs'].apply(lambda x: x[1] - x[0])
    
    # Create a word count column
    df['Word_Count'] = df['Text'].str.split().str.len()
    
    # Update final timestamp's end time to total audio duration
    duration = get_total_duration(wav_path, file_name)
    df.iloc[-1]['Timestamp_Pairs'] = (df.iloc[-1]['Timestamp_Pairs'][0], duration)
    
    # Calculate speaking duration (for teacher vs students)
    df_teacher = df[df['Speaker'] == f'Speaker {teacher_speaker_num}:']
    df_student = df[df['Speaker'] != f'Speaker {teacher_speaker_num}:']
    teacher_duration = df_teacher.Duration.sum()
    student_duration = duration - teacher_duration
    
    # Count of lines (times spoken by teacher/students, number of speaker changes)
    total_lines = df.shape[0]
    teacher_lines = df_teacher.shape[0]
    student_lines = df_student.shape[0]
    
    # Word count calculations
    total_word_count = df.Word_Count.sum()
    teacher_word_count = df_teacher.Word_Count.sum()
    student_word_count = df_student.Word_Count.sum()
    
    # Create a dictionary with some features we may want from this data
    summary_dict = {'ID': file_name[0:3], 
                     # Duration realted features
                     'Total_Duration': duration,
                     'Teacher_Duration': teacher_duration,
                     'Student_Duration': student_duration,
                     'Percent_Time_Teacher': teacher_duration / duration,
                     'Average_Speaker_Duration': duration / total_lines,
                     'Average_Teacher_Duration': teacher_duration / teacher_lines,
                     'Average_Student_Duration': student_duration / student_lines,
                     # Word count features
                     'Total_Word_Count': total_word_count,
                     'Teacher_Word_Count': teacher_word_count,
                     'Student_Word_Count': student_word_count,
                     'Teacher_Percent_Words': teacher_word_count / total_word_count, 
                     'Total_Word_Rate': total_word_count / duration, 
                     'Teacher_Word_Rate': teacher_word_count / teacher_duration, 
                     'Student_Word_Rate': student_word_count / student_duration,
                     # Line count (times spoken, changes in speakers)
                     'Total_Speaker_Line_Count': total_lines,
                     'Teacher_Line_Count': teacher_lines,
                     'Student_Line_Count': student_lines,
                    }
    # Create dataframe with summary
    df_summary = pd.DataFrame(summary_dict, index = [0])
    
    return df, df_summary

In [5]:
def convert_time_to_seconds(timestamp):
    # Parse the timestamp as a datetime object
    time_obj = datetime.strptime(timestamp, '%M:%S')
    
    # Convert the datetime object to a timedelta object
    time_delta = timedelta(minutes=time_obj.minute, seconds=time_obj.second)
    
    # Convert the timedelta object to seconds
    return time_delta.total_seconds()

In [6]:
def create_timestamp_pairs(row, col, df):
    # Get the index of the current row
    index = row.name
    
    # Get the timestamp for the current row
    current_timestamp = row[col]

    # Check if there is a next row
    if index < len(df) - 1:
        # If there is, get the timestamp for the next row
        next_timestamp = df.loc[index + 1, col]
    else:
        # If there isn't, use the current timestamp value as a placeholder
        next_timestamp = current_timestamp

    # Create the tuple
    return (current_timestamp, next_timestamp)



In [7]:
def get_total_duration(wav_path, file_name):
    with wave.open(wav_path + file_name + '.wav', 'rb') as wave_file:
        # Get the number of frames and the frame rate
        num_frames = wave_file.getnframes()
        frame_rate = wave_file.getframerate()

        # Calculate duration (in seconds)
        duration = num_frames / float(frame_rate)
        
    return duration

In [8]:
df, df_summary = extract_transcript_features(file_name, transcript_path, wav_path, teacher_speaker_num)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[-1]['Timestamp_Pairs'] = (df.iloc[-1]['Timestamp_Pairs'][0], duration)


In [11]:
df.head(10)

Unnamed: 0,Speaker,Timestamp,Text,Timestamp_Secs,Timestamp_Pairs,Duration,Word_Count
0,Speaker 1:,00:05,We start simulation.,5.0,"(5.0, 9.0)",4.0,3
1,Speaker 2:,00:09,"Hi everybody, I'm Ms. Murphy.",9.0,"(9.0, 11.0)",2.0,5
2,Speaker 1:,00:11,Hey.,11.0,"(11.0, 12.0)",1.0,1
3,Speaker 2:,00:12,Hi. Hi there. So I just wanted to review some ...,12.0,"(12.0, 33.0)",21.0,51
4,Speaker 1:,00:33,"Oh, it helps. I think everybody, um, maintain ...",33.0,"(33.0, 37.0)",4.0,9
5,Speaker 2:,00:37,"Yeah, totally. What else? Well,",37.0,"(37.0, 40.0)",3.0,5
6,Speaker 1:,00:40,"Well, why didn't you raise your hand? We've be...",40.0,"(40.0, 49.0)",9.0,21
7,Speaker 2:,00:49,"Mina, do you have something you're saying over...",49.0,"(49.0, 51.0)",2.0,9
8,Speaker 1:,00:51,"Me alone? Oh, sorry. We're just trying to get ...",51.0,"(51.0, 57.0)",6.0,14
9,Speaker 2:,00:57,"Oh, well that's nice. Nice for you to think of...",57.0,"(57.0, 65.0)",8.0,29


In [10]:
df_summary.T

Unnamed: 0,0
ID,228.0
Total_Duration,314.024943
Teacher_Duration,181.0
Student_Duration,133.024943
Percent_Time_Teacher,0.576387
Average_Speaker_Duration,5.233749
Average_Teacher_Duration,6.033333
Average_Student_Duration,4.434165
Total_Word_Count,830.0
Teacher_Word_Count,523.0
