# PODCAST SUMMARIZER USING A PYTHON SCRIPT

Importing libraries


In [1]:
#importing libraries
import pandas as pd
import transformers
import openai
from transformers import BartTokenizer, BartForConditionalGeneration
import os

In [None]:


# Take the API key from the OPENAI_API_KEY environment variable so the secret
# never has to be pasted into the notebook; the placeholder string remains the
# fallback when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

def summarize_podcast(podcast_transcript, max_tokens=4096):
    """Request a summary of a transcript from the OpenAI completions endpoint.

    Args:
        podcast_transcript: Transcript text; it is embedded verbatim in the
            prompt sent to the model.
        max_tokens: Upper bound on the completion length, forwarded to the API.

    Returns:
        The text of the first returned choice with surrounding whitespace
        stripped.
    """
    # NOTE(review): the default max_tokens looks larger than what the model
    # can return once the prompt is counted against the same context window
    # -- confirm against the model's documented limit.
    # NOTE(review): openai.Completion presumably requires a pre-1.0 release of
    # the openai package -- verify the installed version.
    request = {
        "engine": "text-davinci-002",  # GPT-3.5
        "prompt": f"Summarize the following podcast transcript:\n\n{podcast_transcript}",
        "max_tokens": max_tokens,
        "temperature": 0.7,
    }
    completion = openai.Completion.create(**request)

    first_choice = completion.choices[0]
    return first_choice.text.strip()

# Example podcast transcript (replace this placeholder text with the actual
# transcript before running the cell).
podcast_transcript = """
Your long podcast transcript here...
"""

# Send the transcript to summarize_podcast and print the returned summary.
summary = summarize_podcast(podcast_transcript)
print(summary)


In [15]:
# Path to the transcript text file that is passed to summarize_podcast below.
file_path = r"116_diarized_timestamped_transcript.txt"


def summarize_podcast(file_path, max_input_length=1024, max_output_length=500):
    """Summarize a transcript file with the facebook/bart-large-cnn model.

    The file is tokenized with truncation enabled, so only the first
    ``max_input_length`` tokens of the transcript reach the model; text
    beyond that point does not influence the summary.

    Args:
        file_path: Path of a UTF-8 text file containing the transcript.
        max_input_length: Maximum number of input tokens kept by the
            tokenizer.
        max_output_length: ``max_length`` passed to ``generate`` for the
            produced summary.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    checkpoint = "facebook/bart-large-cnn"

    # Read the whole transcript into memory.
    with open(file_path, 'r', encoding='utf-8') as handle:
        transcript_text = handle.read()

    # Encode the text as PyTorch tensors, cutting it off at the input limit.
    bart_tokenizer = BartTokenizer.from_pretrained(checkpoint)
    encoded = bart_tokenizer(
        transcript_text,
        max_length=max_input_length,
        return_tensors="pt",
        truncation=True,
    )

    # Pre-trained summarization weights (loaded on every call).
    bart_model = BartForConditionalGeneration.from_pretrained(checkpoint)

    # Beam search with four beams.
    generation_options = dict(
        max_length=max_output_length,
        num_beams=4,
        length_penalty=2.0,
        early_stopping=True,
    )
    generated = bart_model.generate(encoded.input_ids, **generation_options)

    return bart_tokenizer.decode(generated[0], skip_special_tokens=True)

# Example usage: summarize the transcript at file_path and print the result.
# To use a different transcript, reassign file_path first, e.g.:
# file_path = "path/to/your/podcast_transcript.txt"
summary = summarize_podcast(file_path)
print(summary)


George Regan was a huge underdog in his fight against Richard Changuo. Regan says he's had a vision for so long to be a professional boxer. He says he had a big moment in the sun before the fight, but he wasn't letting it happen yet.


In [1]:
# Path to the diarized, timestamped transcript text file.
file_path = r"116_diarized_timestamped_transcript.txt"

In [3]:
import re

def _find_quotes(transcript):
    """Return the quoted passages in *transcript*, in order of appearance.

    A passage must open and close with the same straight quote character.
    Apostrophes that touch a word character on the relevant side (as in
    "don't" or "it's") are not treated as quote delimiters, and empty
    quotes are ignored.
    """
    pattern = r'"([^"\n]+)"|(?<!\w)\'(.+?)\'(?!\w)'
    return [m.group(1) or m.group(2) for m in re.finditer(pattern, transcript)]


def extract_topics_and_quotes(
    transcript_file,
    output_file='shownotes.md',
    *,
    guest_name="[Guest]",
    industry="[industry]",
    guest_background="[Guest's Background]",
    specific_topics="[specific topics]",
    host_name="Joe Rogan",
    ask=None,
):
    """Build Markdown show notes from a transcript and write them to a file.

    The notes consist of an introduction paragraph, one section per
    ``### `` heading found in the transcript (each with three bullet points
    obtained through ``ask``), and a "Memorable Quotes" section covering the
    first five quoted passages (each with a context obtained through ``ask``).

    Args:
        transcript_file: Path of the UTF-8 transcript to read.
        output_file: Path the show notes are written to.
        guest_name: Guest name used in the introduction and when attributing
            quotes. Defaults to a placeholder.
        industry: Guest's field, used in the introduction.
        guest_background: What the guest is known for, used in the
            introduction.
        specific_topics: Topics named in the introduction.
        host_name: Host named in the introduction; also the fallback speaker
            for quotes.
        ask: Callable that takes a prompt string and returns the answer.
            Defaults to the built-in ``input``, so the function is
            interactive unless a callable is supplied.

    Returns:
        None. The show notes are written to ``output_file``.

    Raises:
        OSError: If the transcript cannot be read or the output cannot be
            written.
    """
    if ask is None:
        ask = input  # looked up at call time so interactive use is unchanged

    with open(transcript_file, 'r', encoding='utf-8') as file:
        transcript = file.read()

    quotes = _find_quotes(transcript)

    # A topic is the text from "### " up to the end of that line.
    topics = re.findall(r'(### .*?)(?=\n|$)', transcript)

    parts = [
        f"### Who is {guest_name}\n- {host_name} welcomes {guest_name}, a renowned {industry} known for {guest_background}. The conversation promises to delve into {specific_topics}, offering listeners a unique perspective from {guest_name}'s wealth of experience and knowledge.\n\n"
    ]

    for topic in topics:
        # Headings are copied as-is; a "(mm:ss)" timestamp in the heading is
        # optional and is not required for the heading to be included.
        parts.append(f"{topic}\n")
        for _ in range(3):
            bullet_point = ask(f"Enter bullet point for {topic}: ")
            parts.append(f"- {bullet_point}\n")

    parts.append("\n### Memorable Quotes\n")
    for quote in quotes[:5]:
        # NOTE(review): a quote is attributed to the guest only when the
        # quote text itself contains the guest's name -- confirm this
        # heuristic is intended.
        speaker = guest_name if guest_name.lower() in quote.lower() else host_name
        context = ask(f"Enter context for the quote '{quote}': ")
        parts.append(f"- '{quote}' - {speaker} ({context})\n")

    with open(output_file, 'w', encoding='utf-8') as notes_file:
        notes_file.write("".join(parts))

if __name__ == "__main__":
    # Transcript file to turn into show notes.
    file_path = r"116_diarized_timestamped_transcript.txt"
    extract_topics_and_quotes(file_path)


In [10]:
import re

def parse_transcript(transcript_file):
    """Split a timestamped transcript file into titled sections.

    A line that starts with a capitalized word followed by whitespace and a
    letter or apostrophe opens a new section, and the stripped line becomes
    its title. A line that starts with ``HH:MM:SS`` is split on whitespace:
    the first two tokens (timestamp and speaker) form the entry label and the
    remaining tokens, joined by single spaces, form the entry text. All other
    lines are ignored.

    Timestamp lines that appear before any title are collected in a section
    whose title is the empty string, and a timestamp line with fewer than two
    tokens is recorded with whatever tokens it has rather than raising.

    Args:
        transcript_file: Path of the UTF-8 transcript to read.

    Returns:
        A list of dicts in file order, each with a ``'title'`` string and a
        ``'timestamps'`` list of ``(label, text)`` tuples.

    Raises:
        OSError: If the file cannot be opened.
    """
    title_pattern = re.compile(r'^[A-Z][a-z]+\s[\'A-Za-z]+')
    timestamp_pattern = re.compile(r'^[0-9]{2}:[0-9]{2}:[0-9]{2}')

    sections = []
    current_section = None

    with open(transcript_file, 'r', encoding='utf-8') as file:
        for line in file:
            if title_pattern.match(line):
                # Register the section immediately; later entries are added
                # to this same dict, so the final list order is unchanged.
                current_section = {'title': line.strip(), 'timestamps': []}
                sections.append(current_section)
            elif timestamp_pattern.match(line):
                if current_section is None:
                    # Spoken lines before the first title must not be lost.
                    current_section = {'title': '', 'timestamps': []}
                    sections.append(current_section)
                tokens = line.split()
                label = ' '.join(tokens[:2])
                text = ' '.join(tokens[2:])
                current_section['timestamps'].append((label, text))

    return sections

def generate_summary(sections):
    """Render parsed sections as plain text.

    Each section contributes its title on one line, then one
    ``- text (label)`` bullet per entry in its ``'timestamps'`` list (if the
    key is present), then a blank line.

    Args:
        sections: Iterable of dicts with a ``'title'`` key and, optionally, a
            ``'timestamps'`` list of ``(label, text)`` pairs.

    Returns:
        The concatenated text; an empty string when there are no sections.
    """
    pieces = []

    for entry in sections:
        pieces.append(f"{entry['title']}\n")

        # Sections without a 'timestamps' key get a title and no bullets.
        stamped = entry['timestamps'] if 'timestamps' in entry else ()
        pieces.extend(f"- {text} ({label})\n" for label, text in stamped)

        pieces.append('\n')

    return "".join(pieces)

if __name__ == "__main__":
    # Placeholder path -- point this at a real transcript file before running.
    transcript_file = "path/to/podcast_transcript.txt"
    podcast_sections = parse_transcript(transcript_file)
    
    # Print each parsed section dict for inspection.
    for section in podcast_sections:
        print(section)

    podcast_summary = generate_summary(podcast_sections)

    # Save the rendered summary text to podcast_summary.txt.
    with open("podcast_summary.txt", 'w', encoding='utf-8') as output_file:
        output_file.write(podcast_summary)
