In [None]:
import os
from typing import List

def reformat_txt_files(folders: List[str]) -> None:
    """
    Reformats .txt files in the specified folders to give them a clean, professional look.

    Args:
        folders (List[str]): List of folder paths containing the .txt files to reformat.

    Returns:
        None
    """
    for folder in folders:
        for filename in os.listdir(folder):
            if filename.endswith(".txt"):
                file_path = os.path.join(folder, filename)
                with open(file_path, "r") as file:
                    content = file.read()

                # Clean up the content
                content = content.strip()  # Remove leading/trailing whitespace
                content = " ".join(content.split())  # Remove extra whitespace between words

                # Add blank line between paragraphs
                content = content.replace("\n", "\n\n")

                # Add horizontal line between sections
                content = content.replace("---", "\n\n" + "-" * 40 + "\n\n")

                # Add bullet points
                content = content.replace("- ", "\n- ")

                # Add numbered lists
                content = content.replace("1. ", "\n1. ")

                # Add bold formatting
                content = content.replace("**", "**")

                # Add italic formatting
                content = content.replace("*", "*")

                # Add underline formatting
                content = content.replace("__", "__")

                # Add strikethrough formatting
                content = content.replace("~~", "~~")

                # Add code formatting
                content = content.replace("`", "`")

                # Add blockquote formatting
                content = content.replace("> ", "\n> ")

                # Add table formatting
                content = content.replace("|", " | ")

                # Add header formatting
                content = content.replace("# ", "\n# ")
                content = content.replace("## ", "\n## ")
                content = content.replace("### ", "\n### ")

                # Add horizontal rule
                content = content.replace("***", "\n\n" + "*" * 40 + "\n\n")

                # Add task list
                content = content.replace("- [ ] ", "\n- [ ] ")
                content = content.replace("- [x] ", "\n- [x] ")

                # Add definition list
                content = content.replace(": ", "\n: ")

                # Add footnote
                content = content.replace("[^", "\n[^")

                # Add superscript
                content = content.replace("^", "^")

                # Add subscript
                content = content.replace("~", "~")

                # Add emoji
                content = content.replace(":)", "ðŸ˜Š")
                content = content.replace(":(", "ðŸ˜¢")

                # Add mathematical equations
                content = content.replace("$", "$")

                # Add code blocks
                content = content.replace("```", "\n```\n")

                # Add links
                content = content.replace("[", "[")
                content = content.replace("](", "](")

                # Add images
                content = content.replace("![", "![")

                # Add page numbers if the document is multiple pages
                lines = content.split("\n")
                if len(lines) > 50:
                    content = ""
                    for i, line in enumerate(lines, start=1):
                        content += line + "\n"
                        if i % 50 == 0:
                            content += f"\nPage {i // 50}\n\n"

                # Save the reformatted content
                new_filename = "_".join(filename.split())  # Replace spaces with underscores
                new_file_path = os.path.join(folder, new_filename)
                with open(new_file_path, "w") as file:
                    file.write(content)

                print(f"Reformatted: {new_file_path}")

# Example usage
folders_to_process = [
    "/path/to/folder1",
    "/path/to/folder2",
    "/path/to/folder3"
]
reformat_txt_files(folders_to_process)


In [1]:
import os
import csv
from typing import Dict, List
from collections import defaultdict
def get_files_with_extensions(dir_path: str) -> Dict[str, List[str]]:
    ext_files = defaultdict(list)
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            file_path = os.path.join(root, file).replace("\\", "/")
            _, ext = os.path.splitext(file)
            ext_files[ext].append(file_path)
    return ext_files


def write_to_csv(file_path: str, ext_files: Dict[str, List[str]]) -> None:
    with open(file_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        max_len = max(len(v) for v in ext_files.values())
        writer.writerow(ext_files.keys())
        for i in range(max_len):
            row = [ext_files[k][i] if i < len(ext_files[k]) else '' for k in ext_files.keys()]
            writer.writerow(row)

dir_path = "E:/LLMS/hemanth/"  # replace with your directory path
csv_file_path = 'csvfile.csv'  # replace with your CSV file path
ext_files = get_files_with_extensions(dir_path)
write_to_csv(csv_file_path, ext_files)


In [3]:
!pip install PyMuPDF


Collecting PyMuPDF
  Obtaining dependency information for PyMuPDF from https://files.pythonhosted.org/packages/11/c3/487544356045e9b4b67c54a1268ecc868808ff405736824b647b16cd06aa/PyMuPDF-1.23.26-cp310-none-win_amd64.whl.metadata
  Downloading PyMuPDF-1.23.26-cp310-none-win_amd64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.23.22 (from PyMuPDF)
  Obtaining dependency information for PyMuPDFb==1.23.22 from https://files.pythonhosted.org/packages/a7/79/2822a5c60909fdacaa1bc455c91e2b2dec9fc79537860b538f09ccad229d/PyMuPDFb-1.23.22-py3-none-win_amd64.whl.metadata
  Downloading PyMuPDFb-1.23.22-py3-none-win_amd64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.23.26-cp310-none-win_amd64.whl (3.4 MB)
   ---------------------------------------- 0.0/3.4 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.4 MB 660.6 kB/s eta 0:00:06
   - -------------------------------------- 0.1/3.4 MB 1.7 MB/s eta 0:00:02
   --- ------------------------------------ 0.3/3.4 MB 2.2 MB/s eta 0:00:02
   -


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import os
from pathlib import Path
import pandas as pd
import fitz  # PyMuPDF
from typing import Optional

def clean_text(text: str) -> str:
    """
    Clean the extracted text from the PDF.
    This function can be customized based on the cleaning requirements.

    Parameters:
    - text (str): The text extracted from the PDF.

    Returns:
    - str: The cleaned text.
    """
    cleaned_text = ' '.join(text.split())  # Removing extra whitespaces
    # Add more cleaning rules as needed.
    return cleaned_text

def split_and_save_text(cleaned_text: str, base_output_path: Path, max_size_bytes: int = 50 * 1024 * 1024) -> None:
    """
    Split the cleaned text into multiple files, each smaller than the specified max size, and save them.

    Parameters:
    - cleaned_text (str): The cleaned text to be split and saved.
    - base_output_path (Path): The base path where the text files will be saved.
    - max_size_bytes (int): Maximum size of the text file in bytes.
    """
    part_num = 1
    text_part = ""
    for line in cleaned_text.split('\n'):
        if len(text_part.encode('utf-8')) + len(line.encode('utf-8')) < max_size_bytes:
            text_part += line + '\n'
        else:
            # Save the current part and start a new one
            output_path = base_output_path.with_suffix(f'.part{part_num}.txt')
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(text_part)
            part_num += 1
            text_part = line + '\n' # Start new part with the current line
    
    # Save the last part
    if text_part:
        output_path = base_output_path.with_suffix(f'.part{part_num}.txt')
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(text_part)

def convert_pdf_to_text(pdf_path: str, output_folder: str) -> None:
    """
    Convert a PDF file to text files, splitting contents to ensure each resulting file is less than 50 MB.
    
    Parameters:
    - pdf_path (str): Path to the PDF file.
    - output_folder (str): Path to the folder where the text files will be saved.
    """
    # Ensure the output folder exists, create it if it does not
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(parents=True, exist_ok=True)
    
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        
        cleaned_text = clean_text(text)
        
        base_output_path = output_folder_path / Path(pdf_path).stem
        split_and_save_text(cleaned_text, base_output_path)
    except Exception as e:
        print(f"An error occurred while converting {pdf_path}: {str(e)}")

def process_pdfs_from_csv(csv_path: str, output_folder: str) -> None:
    """
    Process PDFs listed in a CSV file, converting them to text files and ensuring each part is less than 50 MB.

    Parameters:
    - csv_path (str): Path to the CSV file containing paths to PDF files.
    - output_folder (str): Path to the folder where text files will be stored.
    """
    pdf_paths = pd.read_csv(csv_path, encoding='latin1')
    # pdf_paths = pd.read_csv(csv_path)
    for pdf_path in pdf_paths['.pdf']:
        convert_pdf_to_text(pdf_path, output_folder)
        
        
process_pdfs_from_csv(csv_path='csvfile.csv', output_folder='E:/LLMS/hemanth/output')


  pdf_paths = pd.read_csv(csv_path, encoding='latin1')


An error occurred while converting E:/LLMS/hemanth/Hemanth/Deep_learning/berkeley_deep learning/project_assignment.pdf: cannot open broken document
An error occurred while converting E:/LLMS/hemanth/Hemanth/Deep_learning/MLSP/11_785_HW2P2_S23_v2.pdf: cannot open broken document
An error occurred while converting E:/LLMS/hemanth/Hemanth/Deep_learning/MLSP/IDL_S23_Recitation_8__RNN_Basics.pdf: cannot open broken document
An error occurred while converting E:/LLMS/hemanth/Hemanth/Deep_learning/MLSP/Recitation_10_s23.pdf: cannot open broken document
An error occurred while converting E:/LLMS/hemanth/Hemanth/Deep_learning/MLSP/S23_Bootcamp 1_HW1P2.pdf: cannot open broken document
An error occurred while converting E:/LLMS/hemanth/Hemanth/Deep_learning/MLSP/s23_hw1_hackathon.pdf: cannot open broken document
An error occurred while converting E:/LLMS/hemanth/Hemanth/Deep_learning/MLSP/s23_hw1_hackathon2.pdf: cannot open broken document
An error occurred while converting E:/LLMS/hemanth/Hemant

In [None]:
import os
import shutil
from typing import List

def write_to_file(content: str, output_dir: str, file_counter: int) -> int:
    filename = os.path.join(output_dir, f'output_{file_counter}.txt')
    with open(filename, 'w') as f:
        f.write(content)
    return file_counter + 1

def process_files(input_dir: str, output_dir: str) -> None:
    file_counter = 1
    content = ''
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r',encoding='utf-8') as f:
                    file_content = f.read()
                    if len(content + file_content) > 50 * 1024 * 1024:  # 50MB limit
                        file_counter = write_to_file(content, output_dir, file_counter)
                        content = file_content
                    else:
                        content += file_content
            except Exception as e:
                print(f"Error reading file {file_path}: {str(e)}")
    if content:
        write_to_file(content, output_dir, file_counter)

def main(input_dir: str, output_dir: str) -> None:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    process_files(input_dir, output_dir)

if __name__ == "__main__":
    input_dir = 'C:/Users/heman/Desktop/deeplearning'
    output_dir = 'C:/Users/heman/Desktop/deeplearning/data1'
    main(input_dir, output_dir)


In [13]:
import os
from typing import List

def is_text_file(file_path: str) -> bool:
    try:
        with open(file_path, 'tr') as f:
            f.read()
        return True
    except:
        return False

def write_to_file(content: str, output_dir: str, file_counter: int) -> int:
    filename = os.path.join(output_dir, f'output_{file_counter}.txt')
    with open(filename, 'w') as f:
        f.write(content)
    return file_counter + 1

def process_files(input_dir: str, output_dir: str) -> None:
    file_counter = 1
    content = ''
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            file_path = os.path.join(root, file)
            if is_text_file(file_path):
                try:
                    with open(file_path, 'r') as f:
                        file_content = f.read()
                        if len(content + file_content) > 50 * 1024 * 1024:  # 50MB limit
                            file_counter = write_to_file(content, output_dir, file_counter)
                            content = file_content
                        else:
                            content += file_content
                except Exception as e:
                    print(f"Error reading file {file_path}: {str(e)}")
    if content:
        write_to_file(content, output_dir, file_counter)

def main(input_dir: str, output_dir: str) -> None:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    process_files(input_dir, output_dir)

if __name__ == "__main__":
    input_dir = 'C:/Users/heman/Desktop/deeplearning/file_operations-/'
    output_dir = 'C:/Users/heman/Desktop/deeplearning/data1'
    main(input_dir, output_dir)


In [None]:
import os
import re
import argparse
from typing import List, Tuple

def get_file_paths(folder_path: str) -> List[str]:
    file_paths = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            file_paths.append(file_path)
    return file_paths

def extract_text_content(file_path: str) -> str:
    try:
        with open(file_path, 'r', errors='ignore') as file:
            content = file.read()
            cleaned_content = re.sub(r'[^\x00-\x7F]+', ' ', content)
            return cleaned_content.strip()
    except IOError as e:
        print(f"Error reading file: {file_path}. Error: {str(e)}")
        return ""

def write_to_txt_files(content_list: List[str], output_folder: str, max_file_size: int = 50000000 ) -> None:
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    file_counter = 1
    current_file_size = 0
    current_file_content = []

    for content in content_list:
        current_file_size += len(content.encode('utf-8'))
        current_file_content.append(content)

        if current_file_size >= max_file_size:
            output_file_path = os.path.join(output_folder, f"output_{file_counter}.txt")
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write('\n'.join(current_file_content))
            file_counter += 1
            current_file_size = 0
            current_file_content = []

    if current_file_content:
        output_file_path = os.path.join(output_folder, f"output_{file_counter}.txt")
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write('\n'.join(current_file_content))

def main(input_folder: str, output_folder: str, max_file_size: int) -> None:
    file_paths = get_file_paths(input_folder)
    content_list = [extract_text_content(file_path) for file_path in file_paths]
    write_to_txt_files(content_list, output_folder, max_file_size)
    print("Text extraction and writing completed.")


main('C:/Users/heman/Desktop/deeplearning', 'C:/Users/heman/Desktop/deeplearning/data1', 50000000)


In [None]:
def convert_pdf_to_text(pdf_path: str, output_folder: str) -> None:
    """
    Convert a PDF file to text files, splitting contents to ensure each resulting file is less than 50 MB.
    
    Parameters:
    - pdf_path (str): Path to the PDF file.
    - output_folder (str): Path to the folder where the text files will be saved.
    """
    # Ensure the output folder exists, create it if it does not
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(parents=True, exist_ok=True)
    
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        
        cleaned_text = clean_text(text)
        
        base_output_path = output_folder_path / Path(pdf_path).stem
        split_and_save_text(cleaned_text, base_output_path)
    except Exception as e:
        print(f"An error occurred while converting {pdf_path}: {str(e)}")
