In [34]:
import streamlit as st

# Page header for the Streamlit app.
st.title("PDF to Podcast Converter")

# Only PDF uploads are accepted; the rest of the notebook checks
# `uploaded_file` against None before doing any work.
uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file is None:
    st.info("Awaiting PDF file upload.")
else:
    st.success("File uploaded successfully!")




In [35]:
import PyPDF2

def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its first argument
            (the notebook passes the Streamlit upload object).

    Returns:
        str: The text of all pages that yielded any text, joined by newlines.
        An empty string when no page produced text.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    page_texts = []
    for page in pdf_reader.pages:
        # Pages without extractable text (e.g. scanned images) may give an
        # empty or None result; normalise so they cannot break the join.
        page_text = page.extract_text() or ""
        if page_text:
            page_texts.append(page_text)
    # A newline between pages stops the last word of one page from being glued
    # to the first word of the next; joining once also avoids repeated string
    # concatenation inside the loop.
    return "\n".join(page_texts)

if uploaded_file is not None:
    raw_text = extract_text_from_pdf(uploaded_file)
    st.subheader("Extracted Text")
    if raw_text.strip():
        st.write(raw_text)
    else:
        # Image-only (scanned) PDFs yield no text; say so instead of
        # rendering an empty section.
        st.warning("No extractable text was found in this PDF.")


In [36]:
import nltk
from nltk.tokenize import sent_tokenize

def split_text_into_chunks(text, max_chunk_size=500):
    """Group the sentences of ``text`` into chunks of bounded length.

    Sentences are found with NLTK's ``sent_tokenize`` and packed greedily, in
    order, into chunks joined by single spaces.

    Args:
        text: The text to split.
        max_chunk_size: Maximum chunk length in characters, counting the
            spaces that join sentences. A single sentence longer than this is
            kept whole as its own chunk rather than being cut.

    Returns:
        list[str]: Non-empty chunks with no leading or trailing whitespace.
        An empty list when ``text`` contains no sentences.
    """
    chunks = []
    current_chunk = ""
    for sentence in sent_tokenize(text):
        sentence = sentence.strip()
        if not sentence:
            continue
        if not current_chunk:
            # Start a chunk without a separator so it has no leading space,
            # and so an over-long first sentence never emits an empty chunk.
            current_chunk = sentence
        elif len(current_chunk) + 1 + len(sentence) <= max_chunk_size:
            # The +1 accounts for the joining space.
            current_chunk += " " + sentence
        else:
            chunks.append(current_chunk)
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks

if uploaded_file is not None:
    # `raw_text` was already extracted from this upload by the preceding
    # cell; reuse it instead of parsing the PDF a second time.
    text_chunks = split_text_into_chunks(raw_text)


In [37]:
from sentence_transformers import SentenceTransformer
import numpy as np

def generate_embeddings(chunks, model_name='all-MiniLM-L6-v2'):
    """Encode text chunks with a Sentence-Transformers model.

    Args:
        chunks: The texts to embed, passed straight to ``model.encode``.
        model_name: Sentence-Transformers model identifier. The default is
            ``all-MiniLM-L6-v2``; the previous hard-coded ``all-MiniLM-L6``
            is not a published model id, so loading it fails.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for ``chunks``
        (presumably one vector per chunk — TODO confirm against the library
        version in use).
    """
    # NOTE(review): the model is loaded on every call; consider caching the
    # loaded instance if this function is called repeatedly.
    model = SentenceTransformer(model_name)
    return model.encode(chunks)



In [38]:
def select_top_n_chunks(chunks, embeddings, n=5):
    """Pick the ``n`` chunks whose embeddings score highest against the centroid.

    Each chunk is scored by the dot product of its embedding with the mean of
    all embeddings.

    Args:
        chunks: Sequence of text chunks.
        embeddings: Array-like with one embedding row per chunk, in the same
            order as ``chunks``.
        n: Maximum number of chunks to return.

    Returns:
        list: Up to ``n`` chunks, in their original document order. Empty
        when ``n <= 0`` or there are no chunks.
    """
    # `[-0:]` would slice the whole array, so n == 0 must be handled
    # explicitly; empty input would otherwise yield a NaN centroid.
    if n <= 0 or len(chunks) == 0:
        return []

    embedding_matrix = np.asarray(embeddings)
    # Calculate the centroid of embeddings
    centroid = embedding_matrix.mean(axis=0)
    # Score each chunk against the centroid.
    # NOTE(review): a raw dot product ranks like cosine similarity only when
    # the embeddings are unit-normalised — confirm for the model in use.
    similarities = embedding_matrix @ centroid
    # Get indices of top N similar chunks
    top_n_indices = np.argsort(similarities)[-n:]
    # Restore document order so text built from these chunks reads in sequence.
    return [chunks[i] for i in sorted(top_n_indices)]

if uploaded_file is not None:
    # The embeddings have to be computed before ranking; without this line
    # `chunk_embeddings` is not yet defined on a fresh top-to-bottom run.
    chunk_embeddings = generate_embeddings(text_chunks)
    selected_chunks = select_top_n_chunks(text_chunks, chunk_embeddings, n=5)


In [39]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def generate_podcast_script_with_embeddings(chunks, model_name='facebook/bart-large-cnn'):
    """Summarise each chunk with a seq2seq model and join the summaries.

    Args:
        chunks: Iterable of text chunks to summarise, in script order.
        model_name: Hugging Face model identifier used for both the tokenizer
            and the seq2seq model.

    Returns:
        str: The per-chunk summaries joined by single spaces, with no trailing
        whitespace. An empty string when there is nothing to summarise.
    """
    # NOTE(review): tokenizer and model are loaded on every call; consider
    # caching them if this function runs more than once per session.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    summaries = []
    for chunk in chunks:
        # Skip blank chunks: with min_length=40 the model would be forced to
        # generate text for an input that has no content.
        if not chunk.strip():
            continue
        inputs = tokenizer.encode(chunk, return_tensors='pt', max_length=1024, truncation=True)
        summary_ids = model.generate(
            inputs,
            max_length=150,
            min_length=40,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
        )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
    # Joining once avoids the trailing space the per-iteration "+ ' '" left behind.
    return " ".join(summaries)

# Turn the selected chunks into a script and show it under its own heading.
if uploaded_file is not None:
    podcast_script = generate_podcast_script_with_embeddings(chunks=selected_chunks)
    st.subheader("Generated Podcast Script")
    st.write(podcast_script)


In [40]:
from gtts import gTTS

def text_to_speech(text, output_path="podcast.mp3"):
    """Synthesise ``text`` to speech with gTTS and save it as an MP3 file.

    Args:
        text: The text to speak.
        output_path: Where to write the audio. Defaults to ``"podcast.mp3"``,
            the file the rest of the notebook reads.

    Returns:
        str: ``output_path``, so callers can pass it on without repeating it.

    Raises:
        ValueError: If ``text`` is empty or only whitespace.
    """
    # Fail early with a clear message rather than an opaque error from the
    # TTS library when there is nothing to speak.
    if not text or not text.strip():
        raise ValueError("Cannot synthesise speech from empty text.")
    tts = gTTS(text)
    tts.save(output_path)
    return output_path

if uploaded_file is not None:
    text_to_speech(podcast_script)
    # Read the generated MP3 inside a context manager so the file handle is
    # closed as soon as the bytes are in memory.
    with open("podcast.mp3", "rb") as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/mp3')


In [41]:
def download_audio(file_path="podcast.mp3"):
    """Render a Streamlit download button for the generated podcast audio.

    Args:
        file_path: Path of the MP3 file to offer. Defaults to
            ``"podcast.mp3"``, the file written by ``text_to_speech``.

    Returns:
        The value returned by ``st.download_button``.
    """
    # Load the bytes up front so the file handle is closed before the widget
    # is created and the widget holds plain data rather than an open file.
    with open(file_path, "rb") as file:
        audio_bytes = file.read()
    return st.download_button(
        label="Download Podcast",
        data=audio_bytes,
        file_name="podcast.mp3",
        mime="audio/mpeg"
    )


In [42]:
# End-to-end pipeline: PDF -> text -> chunks -> embeddings -> top chunks
# -> summarised script -> MP3 -> player and download button.
if uploaded_file is not None:
    raw_text = extract_text_from_pdf(uploaded_file)

    if not raw_text.strip():
        # Image-only (scanned) PDFs yield no text; stop here rather than
        # feeding empty input into the embedding and text-to-speech steps.
        st.warning("No extractable text was found in this PDF, so no podcast can be generated.")
    else:
        st.subheader("Extracted Text")
        st.write(raw_text)

        text_chunks = split_text_into_chunks(raw_text)
        chunk_embeddings = generate_embeddings(text_chunks)
        selected_chunks = select_top_n_chunks(text_chunks, chunk_embeddings, n=5)

        podcast_script = generate_podcast_script_with_embeddings(selected_chunks)
        st.subheader("Generated Podcast Script")
        st.write(podcast_script)

        text_to_speech(podcast_script)
        # Context manager closes the MP3 handle once its bytes are read.
        with open("podcast.mp3", "rb") as audio_file:
            audio_bytes = audio_file.read()
        st.audio(audio_bytes, format='audio/mp3')

        download_audio()
else:
    st.info("Please upload a PDF file to proceed.")


