In [1]:
import pandas as pd
import re

In [3]:
# Load the dataset
file_path = '/content/podcastdata_dataset.csv'
df = pd.read_csv(file_path)

In [4]:
# Inspect the data
print(df.head())

   id            guest                    title  \
0   1      Max Tegmark                 Life 3.0   
1   2    Christof Koch            Consciousness   
2   3    Steven Pinker  AI in the Age of Reason   
3   4    Yoshua Bengio            Deep Learning   
4   5  Vladimir Vapnik     Statistical Learning   

                                                text  
0  As part of MIT course 6S099, Artificial Genera...  
1  As part of MIT course 6S099 on artificial gene...  
2  You've studied the human mind, cognition, lang...  
3  What difference between biological neural netw...  
4  The following is a conversation with Vladimir ...  


In [5]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319 entries, 0 to 318
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      319 non-null    int64 
 1   guest   319 non-null    object
 2   title   319 non-null    object
 3   text    319 non-null    object
dtypes: int64(1), object(3)
memory usage: 10.1+ KB
None


In [6]:
# Check for missing values
print(df.isnull().sum())

id       0
guest    0
title    0
text     0
dtype: int64


In [7]:
# Fill or drop missing values (this example drops rows with missing transcripts)
df.dropna(subset=['text'], inplace=True)

In [10]:
!pip install nltk



In [26]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [32]:
def preprocess_text(text):
    """
    Cleans and preprocesses the input text by tokenizing, removing common stopwords,
    and preserving numbers and sentence structure.

    Parameters:
    - text (str): The input text to preprocess.

    Returns:
    - str: The cleaned and preprocessed text.
    """
    # Tokenize the text
    words = word_tokenize(text)

    # Load stopwords
    stop_words = set(stopwords.words('english'))

    # Initialize the lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Remove common stopwords while preserving important context words and numbers
    cleaned_words = [lemmatizer.lemmatize(word.lower()) for word in words if word.lower() not in stop_words or word.isnumeric() or word.isalpha()]

    # Join the words back into a single string
    cleaned_text = ' '.join(cleaned_words)

    return cleaned_text

def load_and_clean_csv(input_file_path, output_file_path):
    """
    Reads a CSV file, cleans the text data, and saves the cleaned data to a new CSV file.

    Parameters:
    - input_file_path (str): The file path to the input CSV file.
    - output_file_path (str): The file path to the output CSV file.

    Returns:
    - None
    """
    try:
        # Read the CSV file
        df = pd.read_csv(input_file_path)

        # Check if the DataFrame is empty
        if df.empty:
            raise ValueError("The input CSV file is empty.")

        # Apply the preprocess_text function to each cell in the DataFrame
        df_cleaned = df.applymap(lambda x: preprocess_text(str(x)) if isinstance(x, str) else x)

        # Save the cleaned DataFrame to a new CSV file
        df_cleaned.to_csv(output_file_path, index=False)
        print(f"Cleaned data has been saved to {output_file_path}")

    except FileNotFoundError:
        raise FileNotFoundError(f"The file {input_file_path} does not exist.")
    except pd.errors.EmptyDataError:
        raise pd.errors.EmptyDataError(f"The file {input_file_path} is empty or malformed.")
    except Exception as e:
        print(f"An error occurred: {e}")

input_file_path = "/content/podcastdata_dataset.csv"
output_file_path = "/content/cleaned_podcastdata_dataset.csv"
load_and_clean_csv(input_file_path, output_file_path)

Cleaned data has been saved to /content/cleaned_podcastdata_dataset.csv


In [34]:
# Read the cleaned CSV to verify
df_cleaned = pd.read_csv(output_file_path)
df_cleaned.head()

Unnamed: 0,id,guest,title,text
0,1,max tegmark,life 3.0,"a part of mit course 6s099 , artificial genera..."
1,2,christof koch,consciousness,a part of mit course 6s099 on artificial gener...
2,3,steven pinker,ai in the age of reason,"you 've studied the human mind , cognition , l..."
3,4,yoshua bengio,deep learning,what difference between biological neural netw...
4,5,vladimir vapnik,statistical learning,the following is a conversation with vladimir ...


In [37]:
import zipfile
import os

def compress_csv(input_file_path, output_zip_path):
    with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(input_file_path, os.path.basename(input_file_path))

input_file_path = "cleaned_podcastdata_dataset.csv"
output_zip_path = "cleaned_podcastdata.zip"
compress_csv(input_file_path, output_zip_path)
print(f"Compressed file has been saved to {output_zip_path}")

Compressed file has been saved to cleaned_podcastdata.zip


In [38]:
from google.colab import files
files.download('cleaned_podcastdata.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>