# In [52]:  (Jupyter notebook cell prompt)
import tkinter as tk
from tkinter import filedialog, simpledialog, messagebox
import docx
import pdfplumber
from transformers import pipeline, T5Tokenizer
import threading
import torch
import textwrap

def read_document(file_path):
    """Read the text of a Word (.docx) or PDF (.pdf) document.

    Args:
        file_path: Path of the document. The format is chosen from the
            file extension, compared case-insensitively.

    Returns:
        The document text, with a newline appended after every Word
        paragraph or PDF page, or None when the extension is neither
        .docx nor .pdf.
    """
    lowered_path = file_path.lower()

    if lowered_path.endswith(".docx"):
        document = docx.Document(file_path)
        return "".join(paragraph.text + "\n" for paragraph in document.paragraphs)

    if lowered_path.endswith(".pdf"):
        with pdfplumber.open(file_path) as pdf:
            # extract_text() can give None for a page without extractable
            # text (e.g. a scanned image); treat such a page as empty
            # instead of failing on None + "\n".
            return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)

    return None

def split_text_into_chunks(text, max_chunk_length):
    """Group the lines of a text into chunks of bounded length.

    Lines (separated by "\n") are accumulated into a chunk while the
    accumulated length stays below max_chunk_length; the line that would
    reach the limit starts a new chunk. A single line longer than the
    limit is kept whole, so such a chunk can exceed max_chunk_length.

    Args:
        text: The text to split.
        max_chunk_length: Length limit (in characters) for a chunk.

    Returns:
        A list of non-empty chunks with surrounding whitespace stripped.
        Empty or whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = ""

    for paragraph in text.split("\n"):
        if len(current_chunk) + len(paragraph) < max_chunk_length:
            current_chunk += paragraph + "\n"
            continue

        # The limit is reached: close the running chunk. It is empty when
        # the very first line is already over the limit, so skip it then.
        finished = current_chunk.strip()
        if finished:
            chunks.append(finished)
        current_chunk = paragraph + "\n"

    # Flush whatever is left, again ignoring whitespace-only remainders.
    finished = current_chunk.strip()
    if finished:
        chunks.append(finished)

    return chunks

def summarize_document():
    """Start summarizing the uploaded document, or ask the user to upload one.

    The summarization itself runs in a separate thread via
    background_summarize; this function only launches it and updates the
    progress label.
    """
    if not uploaded_file_path:
        progress_label.config(text="Please upload a Word or PDF document.", fg="red")
        return

    worker = threading.Thread(target=background_summarize, args=(uploaded_file_path,))
    worker.start()
    progress_label.config(text="Summarizing...", fg="blue")

def background_summarize(file_path):
    """Summarize the document at file_path and show the result in the UI.

    Started in a worker thread by summarize_document. The document text is
    split into chunks, each chunk is summarized with the t5-base
    summarization pipeline, and the joined summaries replace the content of
    the result text box.

    Args:
        file_path: Path of the document to summarize.

    Raises:
        Exception: Any error raised while reading or summarizing is
            re-raised after the progress label has been set to a failure
            message, so the UI does not stay on "Summarizing...".
    """
    # NOTE(review): widgets are updated directly from this worker thread;
    # confirm that is safe for the Tcl/Tk build in use.
    try:
        document_text = read_document(file_path)

        if not document_text:
            progress_label.config(text="Unsupported document type.", fg="red")
            return

        summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base")

        # Split the text into chunks
        text_chunks = split_text_into_chunks(document_text, max_chunk_length=1500)

        # Manually set max_length to a lower value
        max_len = 115  # Adjust as needed
        summarized_chunks = []

        for chunk in text_chunks:
            # Nothing to summarize in a blank chunk; do not send it to the model.
            if not chunk.strip():
                continue
            # truncation=True keeps an over-long chunk (a single very long
            # line is never split) within the model's input limit.
            summary = summarizer(
                chunk,
                max_length=max_len,
                min_length=100,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                truncation=True,
            )
            summarized_chunks.append(summary[0]['summary_text'])

        # Join the summarized chunks
        summarized_text = "\n".join(summarized_chunks)

        # Display the summary in the tkinter interface
        result_text.delete(1.0, tk.END)
        result_text.insert(tk.END, summarized_text)

        progress_label.config(text="Summarize", fg="black")
        show_checkmark()
    except Exception:
        # Thread boundary: report the failure in the UI, then re-raise so
        # the traceback is still reported for the thread.
        progress_label.config(text="Summarization failed.", fg="red")
        raise




def max_length(input_length):
    """Return 30% of the length of the given input, rounded down.

    Args:
        input_length: The input text (any object supporting len()); despite
            the parameter name, this is the text itself, not its length.

    Returns:
        int: The maximum summary length, i.e. int(len(input_length) * 0.3).
    """
    size = len(input_length)
    return int(size * 0.3)

def upload_word_document():
    """Let the user pick a Word or PDF document and remember its path.

    Opens a file selection dialog. When a file is chosen, its path is
    stored in the module-level uploaded_file_path and the green checkmark
    is shown. When the dialog is cancelled (a falsy result), the
    previously selected path is kept, so the stored path stays consistent
    with a checkmark that is already displayed.
    """
    global uploaded_file_path
    selected_path = filedialog.askopenfilename(
        filetypes=[("Word Document", "*.docx"), ("PDF Document", "*.pdf")]
    )

    if not selected_path:
        return

    uploaded_file_path = selected_path
    show_checkmark()  # Displays a green checkmark

def ask_question():
    """Ask the user for a question about the uploaded document.

    Shows a warning in the progress label when no document has been
    uploaded. Otherwise opens a dialog for the question and, if one was
    entered, starts background_answer_question in a separate thread.
    """
    # Both Word and PDF files can be uploaded, so the warning names both
    # (same wording as the one used by summarize_document).
    if not uploaded_file_path:
        progress_label.config(text="Please upload a Word or PDF document.", fg="red")
        return

    # Opens a dialog to ask the user a question about the uploaded text
    question = simpledialog.askstring("Ask a Question!", "Ask a question about the uploaded text:")
    if question:
        # Initiates the question-answering process in a separate thread
        threading.Thread(target=background_answer_question, args=(uploaded_file_path, question)).start()

def background_answer_question(file_path, question):
    """Answer a question about a document and show the result in the UI.

    Started in a worker thread by ask_question. Updates the progress label
    while working, writes the question and its answer into the result text
    box, and shows the checkmark when done.

    Args:
        file_path: Path of the document to query.
        question: The question entered by the user.

    Raises:
        Exception: Any error raised while answering is re-raised after the
            progress label has been set to a failure message, so the UI
            does not stay on "Answering...".
    """
    # NOTE(review): widgets are updated directly from this worker thread;
    # confirm that is safe for the Tcl/Tk build in use.
    progress_label.config(text="Answering...", fg="blue")

    try:
        # Retrieves the answer to the question
        answer = answer_question(file_path, question)
    except Exception:
        # Thread boundary: report the failure in the UI, then re-raise so
        # the traceback is still reported for the thread.
        progress_label.config(text="Could not answer the question.", fg="red")
        raise

    # Displays the answer on the screen
    result_text.delete(1.0, tk.END)  # Clears the previous content
    result_text.insert(tk.END, "Question: " + question + "\nAnswer: " + answer)

    # Updates the progress label and shows a green checkmark when the process is complete
    progress_label.config(text="Answered", fg="black")
    show_checkmark()

def answer_question(file_path, question):
    """Answer a question using the text of the document at file_path.

    Args:
        file_path: Path of the document used as context.
        question: The question to answer.

    Returns:
        The "answer" entry of the question-answering pipeline result.

    Raises:
        ValueError: If no text could be read from the document (unsupported
            file type or a document without text).
    """
    # Retrieves the text from the file path
    document_text = read_document(file_path)

    # Fail early with a clear message instead of loading the model and
    # handing it a missing or blank context.
    if not document_text or not document_text.strip():
        raise ValueError("No text could be read from the document: " + str(file_path))

    question_answerer = pipeline("question-answering", model='distilbert-base-cased-distilled-squad')

    # Uses the model to find the answer to the question
    answer = question_answerer(context=document_text, question=question)

    return answer["answer"]

def show_checkmark():
    """Show a large green checkmark in the checkmark label."""
    checkmark_style = {
        "text": "✓",
        "fg": "green",
        "font": ("Arial", 24),
    }
    checkmark_label.config(**checkmark_style)

# Build the main application window. The widgets below are packed in
# creation order, so the order of these statements determines the layout.
root = tk.Tk()
root.title("Word Document Summarizer")

# Upload button: opens the file dialog (upload_word_document); packed on the left
upload_file_button = tk.Button(root, text="Upload Document", command=upload_word_document)
upload_file_button.pack(side=tk.LEFT, padx=10)

# Summarize button: starts summarization (summarize_document); packed on the left
summarize_button = tk.Button(root, text="Summarize Document", command=summarize_document)
summarize_button.pack(side=tk.LEFT, padx=10)

# Path of the uploaded document; None until upload_word_document stores a path
uploaded_file_path = None

# Question button: asks a question about the document (ask_question); packed at the bottom
question_button = tk.Button(root, text="Ask a Question!", command=ask_question, bg="red", fg="white", font=("Arial", 11))
question_button.pack(side=tk.BOTTOM, padx=5, pady=5)

# Text box that displays the summary or the question/answer result
result_text = tk.Text(root, wrap=tk.WORD, width=40, height=10)
result_text.pack(padx=10, pady=10)

# Label that displays progress and warning messages during processing
progress_label = tk.Label(root, text="", fg="black")
progress_label.pack()

# Label that displays the green checkmark (empty until show_checkmark is called)
checkmark_label = tk.Label(root, text="", fg="green")
checkmark_label.pack()

# Start the tkinter event loop
root.mainloop()


# Notebook cell output (Hugging Face tokenizer warning; its beginning is not included here):
# For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
# - Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
# - If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
