In [16]:
!pip install requests PyPDF2 python-pptx python-docx -q


In [19]:
# Install required libraries
!pip install requests PyPDF2 python-pptx python-docx -q

# Import required libraries
from google.colab import files
import io
import requests
from PyPDF2 import PdfReader
from pptx import Presentation
from docx import Document
from getpass import getpass

# Hugging Face API Configuration
# Inference endpoint that summarize_text() POSTs to (Mistral 7B Instruct v0.1).
API_URL = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.1"  # Mistral 7B Model
# Ask for the token interactively (input is not echoed) so it is never
# hard-coded in the notebook source.
API_KEY = getpass("Enter your Hugging Face API key: ")

# Define API Headers
# Bearer-token authorization header attached to every API request.
headers = {"Authorization": f"Bearer {API_KEY}"}

def process_uploaded_file(uploaded_file):
    """Extract plain text from the first file of a Colab upload.

    Args:
        uploaded_file: Mapping of file name to raw file bytes, as returned
            by ``files.upload()``. Only the first entry is processed.

    Returns:
        The text extracted from the file as a string.

    Raises:
        ValueError: If the mapping is empty or the file extension is not
            one of the supported formats.
    """
    if not uploaded_file:
        # Fail with a clear message instead of leaking StopIteration.
        raise ValueError("❌ No file provided. Please upload PDF, TXT, PPT, or DOCX.")

    file_name = next(iter(uploaded_file))
    content = uploaded_file[file_name]
    # Compare extensions case-insensitively so "REPORT.PDF" is accepted too.
    lowered_name = file_name.lower()

    if lowered_name.endswith('.pdf'):
        return process_pdf(io.BytesIO(content))
    elif lowered_name.endswith('.txt'):
        return content.decode('utf-8')
    elif lowered_name.endswith(('.ppt', '.pptx')):
        # NOTE(review): legacy binary .ppt is routed to the same parser as
        # .pptx; confirm the parser actually accepts that format.
        return process_pptx(io.BytesIO(content))
    elif lowered_name.endswith('.docx'):
        return process_docx(io.BytesIO(content))
    else:
        raise ValueError("❌ Unsupported file format. Please upload PDF, TXT, PPT, or DOCX.")

def process_pdf(file_stream):
    """Extract text from a PDF.

    Args:
        file_stream: Binary file-like object containing the PDF data.

    Returns:
        The text of all pages that yielded any text, joined by newlines.
    """
    reader = PdfReader(file_stream)
    # Extract each page exactly once; text extraction is the expensive step,
    # so avoid calling it both for the filter and for the value.
    page_texts = (page.extract_text() for page in reader.pages)
    return '\n'.join(text for text in page_texts if text)

def process_pptx(file_stream):
    """Collect the text of every text-bearing shape in a presentation.

    Slides are walked in order; shapes without a ``text`` attribute or with
    only whitespace are skipped. The stripped snippets are joined by newlines.
    """
    deck = Presentation(file_stream)
    snippets = [
        shape.text.strip()
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text") and shape.text.strip()
    ]
    return '\n'.join(snippets)

def process_docx(file_stream):
    """Return the non-blank paragraphs of a DOCX file, one per line.

    Each paragraph's text is stripped of surrounding whitespace; paragraphs
    that are empty after stripping are omitted.
    """
    lines = []
    for paragraph in Document(file_stream).paragraphs:
        stripped = paragraph.text.strip()
        if stripped:
            lines.append(stripped)
    return '\n'.join(lines)

def clean_content(content, max_chars=6000):
    """Normalize whitespace and truncate text before it is sent to the model.

    Args:
        content: Raw extracted text. ``None`` or an empty string yields ``""``.
        max_chars: Maximum number of characters to keep (default 6000, the
            previously hard-coded limit). Pass ``None`` to disable truncation.

    Returns:
        The text with every run of whitespace (spaces, tabs, newlines)
        collapsed to a single space, cut to at most ``max_chars`` characters.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars is not None and max_chars < 0:
        # A negative slice bound would silently drop characters from the end.
        raise ValueError("max_chars must be non-negative or None")
    if not content:
        return ""
    cleaned_text = ' '.join(content.split())  # Collapse all whitespace runs
    return cleaned_text[:max_chars]

def summarize_text(content):
    """Summarize a client document via the Hugging Face Inference API.

    Builds a task-specific prompt around ``content``, POSTs it to ``API_URL``
    with the module-level ``headers`` and returns the generated text.

    Args:
        content: Cleaned document text to embed in the prompt.

    Returns:
        The generated summary on success. On any failure (network error,
        non-200 status, unparsable or unexpected response body) a
        human-readable error string is returned instead; no exception is
        raised for these cases.
    """
    # Optimized and task-specific prompt
    prompt = f"""
    You are tasked with analyzing a client-provided document.
    Please extract and summarize the following:
    - Key requirements specified by the client
    - Deliverables, tasks, and expected outcomes
    - Deadlines or any timeline expectations
    - Constraints, special instructions, or guidelines
    \nDocument Content:\n{content}
    """

    # API Payload
    payload = {
        "inputs": prompt,
        "parameters": {
            # Text-generation endpoints bound the output with max_new_tokens.
            "max_new_tokens": 1000,
            "temperature": 0.3,  # Lower temp for more factual results
            # Return only the completion, not the prompt + document echoed back.
            "return_full_text": False,
        },
    }

    # Send request to Hugging Face API; a timeout keeps the notebook from
    # hanging forever if the endpoint never answers.
    try:
        response = requests.post(API_URL, json=payload, headers=headers, timeout=120)
    except requests.RequestException as exc:
        return f"❌ API request failed: {exc}"

    # Handle API errors first so the success path stays flat
    if response.status_code == 503:
        return "⏳ Error: Model is overloaded. Please wait and try again."
    if response.status_code == 401:
        return "❌ Error: Invalid API key. Please check your credentials."
    if response.status_code != 200:
        return f"❌ API request failed with status code {response.status_code}: {response.text}"

    try:
        result = response.json()
    except ValueError:
        # Body was not valid JSON
        return "⚠️ Error: Unexpected API response format."

    # Handle possible response formats
    if isinstance(result, list) and result:
        first = result[0]
        # Guard against non-dict elements, where `in` would be a substring test
        if isinstance(first, dict) and "generated_text" in first:
            return first["generated_text"]
    elif isinstance(result, dict):
        if "generated_text" in result:
            return result["generated_text"]
        if "text" in result:
            return result["text"]
    return "⚠️ Error: Unexpected API response format."

# File Upload Interface
print("📂 Please upload your file (PDF, TXT, PPT, or DOCX):")
uploaded = files.upload()

# Process Uploaded File
if uploaded:
    # Extract the text, then collapse whitespace and truncate it
    file_content = clean_content(process_uploaded_file(uploaded))

    if not file_content:
        # e.g. a scanned PDF or an empty document: nothing to summarize,
        # so do not spend an API call on an empty prompt.
        print("⚠️ No readable text could be extracted from the uploaded file.")
    else:
        print("✅ File uploaded and processed successfully!")

        # Get summary
        summary = summarize_text(file_content)

        # Save and Download Summary. The encoding is explicit because the
        # text may contain emoji or other non-ASCII characters.
        with open('summary.txt', 'w', encoding='utf-8') as f:
            f.write(summary.strip())  # Trim any extra spaces or newlines

        files.download('summary.txt')
        print("🎉 Summary generated and ready for download!")
else:
    print("⚠️ No file uploaded! Please try again.")


Enter your Hugging Face API key: ··········
📂 Please upload your file (PDF, TXT, PPT, or DOCX):


Saving test.pdf to test (13).pdf
✅ File uploaded and processed successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

🎉 Summary generated and ready for download!
