# Text Summarization of Andrew Huberman Podcast Transcripts

## Import Libraries

In [100]:
import pandas as pd
import datetime
import re
import spacy
from time import time
import matplotlib.pyplot as plt

## Preprocess & Aggregate the datasets

In [79]:
## Re-lay out the `transcripts` file so that its structure matches the first ones.
## NOTE: the file is read and then overwritten in place.

transcript_path = 'Dataset/transcripts/Episode-5.txt'

# Load the whole transcript as a single string
with open(transcript_path, 'r') as f:
    text = f.read()

# Join lines that were broken between two letters (optionally after a ',' and/or '-')
text = re.sub(r'(?<=[a-z]),?-?\n(?=[a-z])', ' ', text, flags=re.IGNORECASE)

# Cut the text wherever '.', '!' or '?' is followed by spaces and a capital letter
lines = re.split(r'(?<=[.!?]) +(?=[A-Z])', text)

# Every piece that starts with a decimal number (except the very first piece)
# gets an extra leading newline; all other pieces are kept unchanged.
output_lines = [
    '\n' + line if position and re.match(r'^\d+\.\d+', line) else line
    for position, line in enumerate(lines)
]

# Write the pieces back to the same file, one per line
with open(transcript_path, 'w') as f:
    f.write('\n'.join(output_lines))

In [72]:
## Read the first dataset containing the YouTube chapters into a pandas DataFrame.
## Each non-empty line is split at its first space: the leading token goes to the
## `time` column and the remainder of the line to the `chapter` column.
chapters = []
with open('Dataset/timestamps/Episode-5.txt', 'r') as f:
    for line in f:
        stripped_line = line.strip()
        if not stripped_line:
            # A blank line would become a row with an empty `time`,
            # which cannot be parsed as H:MM:SS later on.
            continue
        # Named `timestamp` rather than `time` so the `time` function imported
        # at the top of the notebook is not overwritten.
        timestamp, _separator, chapter = stripped_line.partition(' ')
        chapters.append([timestamp, chapter])

chapters_df = pd.DataFrame(chapters, columns=['time', 'chapter'])

In [73]:
## Load the second dataset (the YouTube transcript) into a pandas DataFrame.
## The first space-delimited token of each line is stored in `second`,
## everything after it in `text`.
transcripts = []
with open('Dataset/transcripts/Episode-5.txt', 'r') as f:
    for raw_line in f:
        second, *remaining_tokens = raw_line.strip().split(' ')
        text = ' '.join(remaining_tokens)
        transcripts.append([second, text])

transcripts_df = pd.DataFrame(data=transcripts, columns=['second', 'text'])

In [76]:
## Aggregate the datasets: attach to every chapter the transcript text whose
## timestamp lies between the chapter's start and the start of the next chapter.
dataset = chapters_df  # NOTE: same object as chapters_df, so the new column appears there too

## Add the transcript column
dataset['transcript'] = ''


def _timestamp_to_seconds(timestamp):
    """Return the number of seconds represented by an ``H:MM:SS`` timestamp string.

    Raises ValueError when the string does not match the ``%H:%M:%S`` format.
    """
    parsed = datetime.datetime.strptime(timestamp, "%H:%M:%S")
    return parsed.hour * 3600 + parsed.minute * 60 + parsed.second


# Convert the 'second' column from string to float once, before the loop.
# Values that are not numbers become NaN instead of aborting the whole conversion.
transcript_seconds = pd.to_numeric(transcripts_df['second'], errors='coerce').astype(float)
unparsed_rows = transcript_seconds.isna()
if unparsed_rows.any():
    # Report the rows that cannot be placed on the timeline. NaN never satisfies
    # the range test below, so these rows are left out of every chapter.
    print(transcripts_df.loc[unparsed_rows, ['second', 'text']])
    print('----------------------------------------------------------')
else:
    transcripts_df['second'] = transcript_seconds

# Start of every chapter in seconds
chapter_starts = [_timestamp_to_seconds(start_time) for start_time in dataset['time']]
# A chapter ends where the next one starts; the last chapter has no upper bound.
chapter_ends = chapter_starts[1:] + [float('inf')]

for index, start_time_seconds, end_time_seconds in zip(dataset.index, chapter_starts, chapter_ends):
    # Half-open interval [start, end): a row exactly on a chapter boundary
    # belongs to the chapter that starts there, not to both.
    in_chapter = (transcript_seconds >= start_time_seconds) & (transcript_seconds < end_time_seconds)
    # Select the rows of this chapter and concatenate their text values
    dataset.loc[index, 'transcript'] = transcripts_df.loc[in_chapter, 'text'].str.cat(sep=' ')

## Save the aggregated datasets

In [77]:
## Write the aggregated dataset to a csv file (without the row index)
dataset.to_csv(
    'Dataset/Aggregated Dataset/Episode_5.csv',
    columns=['time', 'chapter', 'transcript'],
    index=False,
)

## Build the initial model

In [89]:
## Load one of the aggregated datasets from its csv file
summary_csv_path = 'Dataset/Aggregated Dataset/Episode_1_Summary_Included.csv'
aggregated_dataset = pd.read_csv(summary_csv_path)

In [90]:
# Remove non-alphabetic characters (Data Cleaning)

# Ordered (compiled pattern, replacement) rules applied by `text_strip`.
# The order matters: later rules run on the output of earlier ones.
# All rules run on already lower-cased text.
_TEXT_STRIP_RULES = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        # Tabs, carriage returns and newlines become a space
        (r"\t", " "),
        (r"\r", " "),
        (r"\n", " "),
        # Runs of two or more '_', '-', '~', '+' or '.' become a space
        (r"__+", " "),
        (r"--+", " "),
        (r"~~+", " "),
        (r"\+\++", " "),
        (r"\.\.+", " "),
        # Remove the characters - <>()|&©ø[]"',;?~*!
        (r"[<>()|&©ø\[\]\'\",;?~*!]", " "),
        # Remove mailto:
        (r"mailto:", " "),
        # Remove a literal \x9 followed by a digit
        (r"\\x9\d", " "),
        # Replace INC numbers by a placeholder
        (r"inc\d+", "inc_num"),
        # Replace CM# and CHG# by a placeholder
        (r"cm\d+|chg\d+", "cm_num"),
        # Remove '.', '-' or ':' at the end of a word
        (r"\.\s+", " "),
        (r"-\s+", " "),
        (r":\s+", " "),
        # Replace every url by its own domain name. The rest of the url is
        # matched with \S* so that it may be empty and can never reach past
        # the next whitespace into the following word.
        (r"https?:/*([^/\s]+)\S*", r"\1"),
        # Collapse multiple whitespace characters into one space
        (r"\s+", " "),
        # Remove a single character hanging between two spaces
        (r"\s+.\s+", " "),
    )
]


def text_strip(column):
    """Lazily clean every entry of *column* and yield the cleaned strings.

    Each entry is converted with ``str()`` (so non-string values such as
    ``None`` are cleaned as their string form), lower-cased, and then passed
    through the cleanup rules in ``_TEXT_STRIP_RULES`` in order.

    Args:
        column: any iterable of values, e.g. a pandas Series.

    Yields:
        One cleaned, lower-case string per entry, in input order.
    """
    for row in column:
        row = str(row).lower()
        for pattern, replacement in _TEXT_STRIP_RULES:
            row = pattern.sub(replacement, row)
        yield row

In [91]:
## Preview the first five rows of the loaded dataset
aggregated_dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,time,chapter,transcript,summary
0,1,0:00:00,"Endurance: Benefits, Mechanics & Breathing",welcome to the huberman Lab podcast where we d...,"In this episode of the Huberman Lab podcast, P..."
1,2,0:07:30,Tool: “Exercise Snacks”,huberman Lab podcast is now partnered with mom...,The Huberman Lab podcast has partnered with Mo...
2,3,0:14:21,"Momentous, Levels, LMNT",inflammatory responses so let's talk about the...,The text discusses the different types of head...
3,4,0:18:01,Endurance Categories,splinter into a particular uh you know skin ar...,The text discusses the different causes of inf...
4,5,0:22:16,Fat Loss & Respiration; Carbon Cycles & Storag...,experience pain whether or not it's a pin pric...,The text explains that pain is neural in origi...


In [92]:
## Create the (lazy) cleaning generators for the transcript and summary columns
processed_text, processed_summary = (
    text_strip(aggregated_dataset[column_name])
    for column_name in ('transcript', 'summary')
)

In [102]:
## Feed the cleaned data through spaCy's pipe() method in batches.
## Every resulting Doc is converted with str(), so all texts and summaries are plain strings.

nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# pipe() processes the texts in batches and yields the Doc objects in order
text = list(map(str, nlp.pipe(processed_text, batch_size=5000)))

# Each summary is wrapped in the _START_ / _END_ marker tokens
summary = [
    ' '.join(('_START_', str(parsed_summary), '_END_'))
    for parsed_summary in nlp.pipe(processed_summary, batch_size=5000)
]

In [103]:
## Attach the cleaned text and summary lists to the DataFrame as new columns.

for column_name, cleaned_values in (('cleaned_text', text), ('cleaned_summary', summary)):
    aggregated_dataset[column_name] = pd.Series(cleaned_values)

In [105]:
## Preview the first five rows, now including the cleaned columns
aggregated_dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,time,chapter,transcript,summary,cleaned_text,cleaned_summary
0,1,0:00:00,"Endurance: Benefits, Mechanics & Breathing",welcome to the huberman Lab podcast where we d...,"In this episode of the Huberman Lab podcast, P...",welcome to the huberman lab podcast where we d...,_START_ in this episode of the huberman lab po...
1,2,0:07:30,Tool: “Exercise Snacks”,huberman Lab podcast is now partnered with mom...,The Huberman Lab podcast has partnered with Mo...,huberman lab podcast is now partnered with mom...,_START_ the huberman lab podcast has partnered...
2,3,0:14:21,"Momentous, Levels, LMNT",inflammatory responses so let's talk about the...,The text discusses the different types of head...,inflammatory responses so let talk about the n...,_START_ the text discusses the different types...
3,4,0:18:01,Endurance Categories,splinter into a particular uh you know skin ar...,The text discusses the different causes of inf...,splinter into particular uh you know skin area...,_START_ the text discusses the different cause...
4,5,0:22:16,Fat Loss & Respiration; Carbon Cycles & Storag...,experience pain whether or not it's a pin pric...,The text explains that pain is neural in origi...,experience pain whether or not it a pin prick ...,_START_ the text explains that pain is neural ...


In [None]:
## Plot the distribution of the text and summary lengths (in words),
## i.e., find the length range into which most of the texts and summaries fall.

