In [41]:
import pandas as pd
import datetime
import re

In [68]:
## Fix the structure of the `transcripts` datasets like the first ones

# Read the text file
with open('Dataset/transcripts/Episode-5.txt', 'r') as f:
    text = f.read()

# Remove any line breaks within a single sentence
text = re.sub(r'(?<=[a-z]),?-?\n(?=[a-z])', ' ', text, flags=re.IGNORECASE)
print(text)
# Split the text into lines based on the end of a sentence
lines = re.split(r'(?<=[.!?]) +(?=[A-Z])', text)

# Preprocess the transcript data
output_lines = []
for line in lines:
    if re.match(r'^\d+\.\d+', line) and output_lines:
        output_lines.append('\n' + line)
    else:
        output_lines.append(line)

# Update the original text file with the preprocessed data
with open('Dataset/transcripts/Episode-5.txt', 'w') as f:
    f.write('\n'.join(output_lines))

0.0 ANDREW HUBERMAN: Welcome to the Huberman Lab podcast,
2.25 where we discuss science and science-based tools
4.92 for everyday life.
5.7 [MUSIC PLAYING]
9.07 I'm Andrew Huberman,
and I'm a professor
10.93 of neurobiology and ophthalmology
12.94 at Stanford School of Medicine.
14.41 Today we are discussing fertility.
16.57 We will discuss male fertility and female fertility.
19.97 And I should mention that today's discussion is not just
23.14 for people who are seeking to conceive children
25.45 or who want to know how their children were conceived,
27.79 but it's really for everybody.
29.47 And I say that because it is the story of all of us.
32.95 All of us are here because a specialized set
35.8 of cells, called germ cells--
38.2 that is the sperm and the egg.
39.853 And I'll make it very clear why they're called
41.77 germ cells a little bit later.
43.02 It has nothing to do with infection.
44.62 But it's because a sperm cell and an egg cell arrived at one
50.29 another, either i

In [34]:
## Read the first dataset containing the YouTube chapters into a pandas DataFrame:
chapters = []
with open('Dataset/timestamps/Episode-5.txt', 'r') as f:
    for line in f:
        columns = line.strip().split(' ')
        time = columns[0]
        chapter = ' '.join(columns[1:])
        chapters.append([time, chapter])

chapters_df = pd.DataFrame(chapters, columns=['time', 'chapter'])

In [35]:
## Read the second dataset containing the YouTube transcripts into a pandas DataFrame:
transcripts = []
with open('Dataset/transcripts/Episode-5.txt', 'r') as f:
    for line in f:
        columns = line.strip().split(' ')
        second = columns[0]
        text = ' '.join(columns[1:])
        transcripts.append([second, text])

transcripts_df = pd.DataFrame(transcripts, columns=['second', 'text'])

In [23]:
## Aggregate the datasets
dataset = chapters_df

## Add the transcript column 
dataset['transcript'] = ''

for index, row in dataset.iterrows():
    # Set the start time of the chapter | Convert it to seconds
    start_time = row['time']
    start_time_object = datetime.datetime.strptime(start_time, "%H:%M:%S")
    start_time_seconds = start_time_object.hour * 3600 + start_time_object.minute * 60 + start_time_object.second

    # Get the next row
    if index < len(dataset)-1:
        end_time = dataset.iloc[index+1]['time']
    else:
        next_time = None
        next_text = None
        break
        
    # Set the end time of the chapter | Convert it to seconds
    end_time_object = datetime.datetime.strptime(end_time, "%H:%M:%S")
    end_time_seconds = end_time_object.hour * 3600 + end_time_object.minute * 60 + end_time_object.second

    # convert the 'time' column from string to float
    transcripts_df['second'] = transcripts_df['second'].astype(float)

    # select the rows with time between start_time and end_time, and concatenate the text values
    selected_text = transcripts_df.loc[(transcripts_df['second'] >= start_time_seconds) & (transcripts_df['second'] <= end_time_seconds), 'text'].str.cat(sep=' ')
    
    dataset.loc[index, 'transcript'] = str(selected_text)

ValueError: could not convert string to float: 'the'

In [24]:
## Save the aggregated dataset as a csv file
dataset.to_csv('Dataset/Aggregated Dataset/Episode_5.csv', index=False, columns=['time', 'chapter', 'transcript'])