In [1]:
import pandas as pd
import re

In [5]:
# Read the podcast transcript CSV into a DataFrame
file_path = '/content/podcastdata_dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [6]:
# Inspect the data: preview the first rows, then summarise columns and dtypes
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

   id            guest                    title  \
0   1      Max Tegmark                 Life 3.0   
1   2    Christof Koch            Consciousness   
2   3    Steven Pinker  AI in the Age of Reason   
3   4    Yoshua Bengio            Deep Learning   
4   5  Vladimir Vapnik     Statistical Learning   

                                                text  
0  As part of MIT course 6S099, Artificial Genera...  
1  As part of MIT course 6S099 on artificial gene...  
2  You've studied the human mind, cognition, lang...  
3  What difference between biological neural netw...  
4  The following is a conversation with Vladimir ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      319 non-null    int64 
 1   guest   319 non-null    object
 2   title   319 non-null    object
 3   text    319 non-null    object
dtypes: int64(1), object(3)
memory usage: 1

In [7]:
# Count the missing entries in each column
print(df.isna().sum())

id       0
guest    0
title    0
text     0
dtype: int64


In [12]:
# Drop rows whose transcript ('text') is missing.
# Rebinding the result instead of using inplace=True keeps the step chainable
# and avoids mutating a frame that an earlier cell already displayed.
df = df.dropna(subset=['text'])

In [13]:
# Normalize text (convert to lowercase, remove special characters, collapse whitespace)
def clean_text(text):
    """Normalize a transcript to lowercase alphanumeric words separated by single spaces.

    Args:
        text: Raw transcript string.

    Returns:
        The lowercased text with every character other than ASCII letters,
        digits and whitespace removed, each run of whitespace collapsed to a
        single space, and leading/trailing whitespace stripped.
    """
    text = text.lower()
    # Remove special characters first: deleting a symbol that sits between two
    # spaces ("a - b") would otherwise leave a double space behind.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Collapse the remaining whitespace runs (spaces, tabs, newlines).
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [14]:
# Run the normalizer over every transcript, element by element
df['text'] = df['text'].map(clean_text)

In [15]:
# Optional: Split transcripts into smaller segments (if necessary)
# This step can help manage long transcripts by breaking them into smaller chunks
def split_transcript(text, max_length=500):
    """Split a transcript into consecutive segments of at most ``max_length`` words.

    Args:
        text: Transcript string; words are delimited by any whitespace.
        max_length: Maximum number of words per segment. Must be at least 1.

    Returns:
        A list of segment strings in their original order, each with its words
        joined by single spaces. Empty or whitespace-only ``text`` yields ``[]``.

    Raises:
        ValueError: If ``max_length`` is less than 1.
    """
    # A zero step makes range() fail with an unrelated message, and a negative
    # step silently produces no segments at all, so reject both up front.
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
    words = text.split()
    return [
        ' '.join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]

In [17]:
# Store each transcript's list of word-limited segments in a new column
df['segments'] = df['text'].map(split_transcript)

In [18]:
# Give every segment its own row, then renumber the rows from zero
df = (
    df
    .explode(column='segments')
    .reset_index(drop=True)
)

In [19]:
# Write the cleaned, segmented frame to CSV without the index column
cleaned_file_path = '/content/cleaned_podcastdata.csv'
df.to_csv(path_or_buf=cleaned_file_path, index=False)
print(f"Cleaned data saved to {cleaned_file_path}")

Cleaned data saved to /content/cleaned_podcastdata.csv


In [None]:
from google.colab import files

# Download the exact file that was saved via cleaned_file_path; a bare relative
# name only resolves when the working directory happens to be /content.
files.download(cleaned_file_path)