In [1]:
!pip install python-magic python-pptx PyPDF2 SpeechRecognition pydub


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-magic
  Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Collecting python-pptx
  Downloading python-pptx-0.6.21.tar.gz (10.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m76.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting SpeechRecognition
  Downloading SpeechRecognition-3.10.0-py2.py3-none-any.whl (32.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.8/32.8 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading Xl

In [17]:
import os
import tempfile

import magic
import PyPDF2
import speech_recognition as sr
from pptx import Presentation
from pydub import AudioSegment
from pydub.silence import split_on_silence

def identify_file_type(file_path):
    """Classify a file as 'pptx', 'pdf' or 'video' from its detected MIME type.

    The MIME type is obtained with ``magic.from_file(..., mime=True)`` and is
    printed before classification.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        'pptx' for the PowerPoint (presentationml) MIME type, 'pdf' for
        'application/pdf', 'video' for any MIME type starting with 'video/',
        and None for everything else.
    """
    # MIME types that must match exactly.
    exact_matches = {
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'application/pdf': 'pdf',
    }

    detected = magic.from_file(file_path, mime=True)
    print(detected)

    if detected in exact_matches:
        return exact_matches[detected]
    # Any video container/codec family is treated the same way.
    return 'video' if detected.startswith('video/') else None

def extract_text_from_ppt(ppt_path, output_path):
    """Dump the text of every text-bearing shape in a presentation to a file.

    Slides are visited in order, and within each slide every shape exposing a
    ``text`` attribute contributes its text followed by a newline.

    Args:
        ppt_path: Path of the presentation passed to ``Presentation``.
        output_path: Path of the UTF-8 text file to write (overwritten).
    """
    deck = Presentation(ppt_path)

    lines = []
    for slide in deck.slides:
        # Shapes without a ``text`` attribute are skipped.
        lines.extend(
            shape.text + "\n"
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
    combined = "".join(lines)

    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.write(combined)

def extract_text_from_pdf(pdf_path, output_path):
    """Write the extracted text of every page of a PDF to a text file.

    Page texts are concatenated in page order with no separator between pages.

    Args:
        pdf_path: Path of the PDF, opened in binary mode for ``PyPDF2.PdfReader``.
        output_path: Path of the UTF-8 text file to write (overwritten).
    """
    with open(pdf_path, 'rb') as pdf_stream:
        document = PyPDF2.PdfReader(pdf_stream)
        # Extraction must happen while the underlying stream is still open.
        page_texts = [page.extract_text() for page in document.pages]
        combined = "".join(page_texts)

    with open(output_path, 'w', encoding='utf-8') as sink:
        sink.write(combined)

def extract_text_from_video(video_path, output_path, min_silence_len=500, silence_thresh=-40):
    """Transcribe the audio track of a media file into a text file.

    The audio is decoded with pydub, converted to mono at 16000 Hz, split on
    silence, and each chunk is sent to ``recognize_google``. Recognised text
    and the placeholder for unintelligible chunks are each followed by a
    single space in the output.

    Per-chunk WAV files are written to a private temporary directory that is
    removed when the function exits, including when recognition raises.
    Nothing is written to the current working directory.

    Args:
        video_path: Path of the media file passed to ``AudioSegment.from_file``.
        output_path: Path of the UTF-8 text file to write (overwritten).
        min_silence_len: Forwarded to ``split_on_silence``; defaults to 500.
        silence_thresh: Forwarded to ``split_on_silence``; defaults to -40.

    Raises:
        Any exception raised by decoding or by the recognizer other than
        ``sr.UnknownValueError`` propagates to the caller; in that case the
        output file is not written.
    """
    # Decode and normalise in memory; no intermediate WAV round trip is needed
    # before splitting.
    audio = AudioSegment.from_file(video_path).set_channels(1).set_frame_rate(16000)
    chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
    )

    recognizer = sr.Recognizer()
    pieces = []

    # sr.AudioFile reads from a path, so each chunk is staged on disk inside a
    # directory that is cleaned up automatically, even on error.
    with tempfile.TemporaryDirectory() as work_dir:
        for i, chunk in enumerate(chunks):
            chunk_path = os.path.join(work_dir, f'temp_chunk_{i}.wav')
            # export() returns the open output file; close it so the data is
            # flushed and the file can be deleted afterwards.
            chunk.export(chunk_path, format='wav').close()
            with sr.AudioFile(chunk_path) as audio_file:
                audio_data = recognizer.record(audio_file)
                try:
                    pieces.append(recognizer.recognize_google(audio_data))
                except sr.UnknownValueError:
                    # Keep a visible marker where speech could not be understood.
                    pieces.append("[Unrecognized Speech]")

    extracted_text = "".join(piece + " " for piece in pieces)

    with open(output_path, 'w', encoding='utf-8') as output_file:
        output_file.write(extracted_text)


video/x-m4v


In [None]:
# Example usage
file_path = '/content/videofile.m4v'
output_file_path = '/content/video_to_text.txt'

# One extractor per value that identify_file_type can return.
extractors = {
    'pptx': extract_text_from_ppt,
    'pdf': extract_text_from_pdf,
    'video': extract_text_from_video,
}

file_type = identify_file_type(file_path)
if file_type not in extractors:
    # identify_file_type returns None for anything it does not recognise;
    # fail loudly instead of silently producing no output file.
    raise ValueError(f"Unsupported file type for {file_path!r}")
extractors[file_type](file_path, output_file_path)