In [None]:
import os
import csv
from typing import Dict, List ,Optional
from collections import defaultdict
from pathlib import Path
import pandas as pd
import fitz  # PyMuPDF
import json
import yaml
import re
from pptx import Presentation
from io import StringIO
from datasets import load_dataset
EXTENSION_READERS = {
    '.md': lambda f: f.read(),
    '.py': lambda f: f.read(),
    '.csv': lambda f: pd.read_csv(f),
    '.json': lambda f: json.load(f),
    '.yaml': lambda f: yaml.safe_load(f),
    '.txt': lambda f: f.read(),
    '.xml': lambda f: f.read(),
    '.html': lambda f: f.read(),
    '.css': lambda f: f.read(),
    '.js': lambda f: f.read(),
    '.java': lambda f: f.read(),
    '.cpp': lambda f: f.read(),
    '.h': lambda f: f.read(),
    '.php': lambda f: f.read(),
    '.rb': lambda f: f.read(),
    '.sql': lambda f: f.read(),
    '.xls': lambda f: pd.read_excel(f),
    '.xlsx': lambda f: pd.read_excel(f),
    '.ppt': lambda f: read_pptx(f),
    '.pptx': lambda f: read_pptx(f)
}

def read_pptx(file):
    """Custom function to read .pptx files with python-pptx"""
    prs = Presentation(file)
    text = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text.append(shape.text)
    return "\n".join(text)

# Utilize regular expressions to match any of the file extensions
EXTENSION_PATTERN = r".*\.(md|py|csv|json|yaml|txt|xml|html|css|js|java|cpp|h|php|rb|sql|xls|xlsx|ppt|pptx)$"


def get_files_with_extensions(dir_path: str) -> Dict[str, List[str]]:
    ext_files = defaultdict(list)
    for root, dirs, files in os.walk(dir_path):
        for file in files:
            file_path = os.path.join(root, file).replace("\\", "/")
            _, ext = os.path.splitext(file)
            ext_files[ext].append(file_path)
    return ext_files


def write_to_csv(file_path: str, ext_files: Dict[str, List[str]]) -> None:
    with open(file_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        max_len = max(len(v) for v in ext_files.values())
        writer.writerow(ext_files.keys())
        for i in range(max_len):
            row = [ext_files[k][i] if i < len(ext_files[k]) else '' for k in ext_files.keys()]
            writer.writerow(row)


def clean_text(text: str) -> str:
    """
    Clean the extracted text from the PDF.
    This function can be customized based on the cleaning requirements.

    Parameters:
    - text (str): The text extracted from the PDF.

    Returns:
    - str: The cleaned text.
    """
    cleaned_text = ' '.join(text.split())  # Removing extra whitespaces
    # Add more cleaning rules as needed.
    return cleaned_text

def split_and_save_text(cleaned_text: str, base_output_path: Path, max_size_bytes: int = 50 * 1024 * 1024) -> None:
    """
    Split the cleaned text into multiple files, each smaller than the specified max size, and save them.

    Parameters:
    - cleaned_text (str): The cleaned text to be split and saved.
    - base_output_path (Path): The base path where the text files will be saved.
    - max_size_bytes (int): Maximum size of the text file in bytes.
    """
    part_num = 1
    text_part = ""
    for line in cleaned_text.split('\n'):
        if len(text_part.encode('utf-8')) + len(line.encode('utf-8')) < max_size_bytes:
            text_part += line + '\n'
        else:
            # Save the current part and start a new one
            output_path = base_output_path.with_suffix(f'.part{part_num}.txt')
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(text_part)
            part_num += 1
            text_part = line + '\n' # Start new part with the current line
    
    # Save the last part
    if text_part:
        output_path = base_output_path.with_suffix(f'.part{part_num}.txt')
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(text_part)

def convert_pdf_to_text(pdf_path: str, output_folder: str) -> None:
    """
    Convert a PDF file to text files, splitting contents to ensure each resulting file is less than 50 MB.
    
    Parameters:
    - pdf_path (str): Path to the PDF file.
    - output_folder (str): Path to the folder where the text files will be saved.
    """
    # Ensure the output folder exists, create it if it does not
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(parents=True, exist_ok=True)
    
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        
        cleaned_text = clean_text(text)
        
        base_output_path = output_folder_path / Path(pdf_path).stem
        split_and_save_text(cleaned_text, base_output_path)
    except Exception as e:
        print(f"An error occurred while converting {pdf_path}: {str(e)}")

def process_pdfs_from_csv(csv_path: str, output_folder: str) -> None:
    """
    Process PDFs listed in a CSV file, converting them to text files and ensuring each part is less than 50 MB.

    Parameters:
    - csv_path (str): Path to the CSV file containing paths to PDF files.
    - output_folder (str): Path to the folder where text files will be stored.
    """
    pdf_paths = pd.read_csv(csv_path, encoding='latin1')
    # pdf_paths = pd.read_csv(csv_path)
    for pdf_path in pdf_paths['.pdf']:
        convert_pdf_to_text(pdf_path, output_folder)
        



def list_files_with_extensions(directory_path):
    try:
        files = os.listdir(directory_path)
        return [file for file in files if re.match(EXTENSION_PATTERN, file)]
    except FileNotFoundError:
        print(f"The directory {directory_path} was not found.")
        return None

def read_file_content(directory_path, filename):
    try:
        extension = os.path.splitext(filename)[1]
        with open(os.path.join(directory_path, filename), 'r') as file:
            file_reader = EXTENSION_READERS.get(extension)
            return file_reader(file) if file_reader else None
    except Exception as e:
        print(f"An error occurred while reading the file {filename}: {e}")



def process_files_txtfile(directory_path: str, user_folder: str) -> Optional[str]:
    """
    This function processes all files in a given directory and writes their content to a user-specific text file.
    Each text file is ensured to be less than 50 MB in size.

    Args:
    directory_path (str): The path to the directory containing the files to be processed.
    user_folder (str): The name of the user-specific folder where the text files will be written.

    Returns:
    str: The path to the user-specific folder, or None if an error occurred.
    """
    # Create the user-specific folder if it doesn't exist
    user_folder_path = os.path.join(directory_path, user_folder)
    os.makedirs(user_folder_path, exist_ok=True)

    # Get a list of all files in the directory
    files = list_files_with_extensions(directory_path)

    if files is None:
        return

    # Initialize the text file counter and size
    txt_file_counter = 1
    txt_file_size = 0

    for filename in files:
        content = read_file_content(directory_path, filename)
        if content is not None:
            # Create a new text file if the size is over 50 MB
            if txt_file_size >= 50 * 1024 * 1024:
                txt_file_counter += 1
                txt_file_size = 0

            # Open the text file in append mode
            txt_file_path = os.path.join(user_folder_path, f"{user_folder}_{txt_file_counter}.txt")
            with open(txt_file_path, "a") as f:
                if isinstance(content, pd.DataFrame):
                    # Convert DataFrame to CSV string without index and write to file
                    content_csv = content.to_csv(index=False)
                    f.write(content_csv)
                    txt_file_size += len(content_csv)
                elif isinstance(content, dict):
                    # Convert dict to pretty-printed string using json.dumps and write to file
                    content_str = json.dumps(content, indent=4)
                    f.write(content_str)
                    txt_file_size += len(content_str)
                else:
                    # If it's not a DataFrame or dict, convert it to a string
                    content_str = str(content)
                    f.write(content_str)
                    txt_file_size += len(content_str)

            print(f"--- File: {filename} ---")
            print(f"Content written to {txt_file_path}")
            print("-------------------------------\n")

    return user_folder_path
def loading_folder_using_datasets(folder_path:str):
    
    
    dataset = load_dataset('text', data_files=folder_path+'/*.txt')
    return dataset
    
##=========================||    Extraction  OF DATA         ||==================================
dir_path = "E:/LLMS/hemanth/"  # replace with your directory path
csv_file_path = 'csvfile.csv'  # replace with your CSV file path
ext_files = get_files_with_extensions(dir_path)
write_to_csv(csv_file_path, ext_files)
process_pdfs_from_csv(csv_path='csvfile.csv', output_folder='E:/LLMS/hemanth/output')
process_files_txtfile(dir_path,  "E:/LLMS/hemanth/output")
dataset=loading_folder_using_datasets(folder_path='E:/LLMS/hemanth/output')


In [3]:
!pip install gradio

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting gradio
  Downloading gradio-4.21.0-py3-none-any.whl.metadata (15 kB)
Collecting altair<6.0,>=4.2.0 (from gradio)
  Using cached altair-5.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting gradio-client==0.12.0 (from gradio)
  Downloading gradio_client-0.12.0-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.9.15-cp311-none-win_amd64.whl.metadata (50 kB)
     ---------------------------------------- 0.0/50.7 kB ? eta -:--:--
     -------------------------------- ------- 41.0/50.7 kB 1.9 MB/s eta 0:00:01
     -------------------------------------- 50.7/50.7 kB 860.7 kB/s eta 0:00:00
Collecting pydantic>=2.0 (from gradio)
  Downloading pydantic-2.6.4-py3-none-any.whl.metadata (85 

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.4.6 requires fastapi<0.100.0,>=0.95.2, but you have fastapi 0.110.0 which is incompatible.
chromadb 0.4.6 requires pydantic<2.0,>=1.9, but you have pydantic 2.6.4 which is incompatible.
privategpt 0.0.26 requires fastapi==0.96.0, but you have fastapi 0.110.0 which is incompatible.
privategpt 0.0.26 requires urllib3>=2.0.2, but you have urllib3 1.26.16 which is incompatible.
promptinject 0.1.0 requires openai<0.26.0,>=0.25.0, but you have openai 1.11.0 which is incompatible.
ragatouille 0.0.6b4 requires ruff<0.2.0,>=0.1.9, but you have ruff 0.3.2 which is incompatible.
spacy-transformers 1.2.5 requires transformers<4.31.0,>=3.4.0, but you have transformers 4.37.2 which is incompatible.

[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run

In [4]:
import gradio as gr



demo = gr.Interface(fn=get_answer, inputs="text", outputs="text")
demo.launch()  

ImportError: cannot import name 'RootModel' from 'pydantic' (c:\Users\heman\AppData\Local\Programs\Python\Python311\Lib\site-packages\pydantic\__init__.cp311-win_amd64.pyd)

In [None]:

folderpath='E:/LLMS/hemanth/Hemanth/file_operations-/File_and_Operations/Coding_from_colab'
# Loading all or specific extension like .pdf .py .csv .json .txt .md
from langchain_community.document_loaders import DirectoryLoader
print("============================* all files  *==============================")
ALL_DOC = DirectoryLoader(folderpath, glob= "**/[!.]*",show_progress=True, use_multithreading=True,silent_errors=True) #,txt ,.py, .csv ,.pdf,.md,.csv,.json "**/[!.]*"
Documents = ALL_DOC.load()


In [None]:

folderpath='E:/LLMS/hemanth/Hemanth/file_operations-/File_and_Operations/Coding_from_colab'
# Loading all or specific extension like .pdf .py .csv .json .txt .md
from langchain_community.document_loaders import DirectoryLoader
print("============================* all files  *==============================")
ALL_DOC = DirectoryLoader(folderpath, glob= "**/[!.]*",show_progress=True, use_multithreading=True,silent_errors=True) #,txt ,.py, .csv ,.pdf,.md,.csv,.json "**/[!.]*"
Documents = ALL_DOC.load()


In [None]:
for i in range(10):
   print(Documents[i].page_content)


In [19]:
from pathlib import Path
from typing import Dict, Any, List,Union,Optional
from datasets import (load_dataset, 
                      DatasetDict,
                      concatenate_datasets
                      )


#Load the datset
def load_and_prepare_dataset(
    input_source: Union[str, Path, Dict[str, List[Union[str, Path]]]],
    split_ratios: tuple = (0.8, 0.1, 0.1),
    seed: int = 42,
    streaming: bool = False
) -> DatasetDict:
    """
    Load a dataset from various input sources and prepare it by splitting into train, test, and eval sets.

    :param input_source: A dataset name, path to a folder, a single file, multiple files, or a dictionary specifying train, test, and eval files.
    :param split_ratios: A tuple containing the ratios for train, test, and eval splits (default is (0.8, 0.1, 0.1)).
    :param seed: A random seed for reproducibility of the split (default is 42).
    :param streaming: Whether to use streaming to handle large files (default is False).
    :return: A DatasetDict containing the split datasets.
    
    Example:
    # Example usage with streaming for large files:
    # dataset_dict = load_and_prepare_dataset({
    #     'train': ['train_file_1.csv', 'train_file_2.csv'],
    #     'test': ['test_file.csv'],
    #     'eval': ['eval_file.csv']
    # }, streaming=True)
    # print(dataset_dict)
    OUTPUT1:
    DatasetDict({
    train: DatasetDict({
        train: Dataset({
            features: ['act', 'prompt'],
            num_rows: 459
        })
    })
    test: DatasetDict({
        train: Dataset({
            features: ['act', 'prompt'],
            num_rows: 459
        })
    })
    eval: DatasetDict({
        train: Dataset({
            features: ['act', 'prompt'],
            num_rows: 153
        })
    })
    })
    EXAMPLE2:
    dataset=load_and_prepare_dataset('fka/awesome-chatgpt-prompts')
    DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 122
    })
    test: Dataset({
        features: ['act', 'prompt'],
        num_rows: 15
    })
    eval: Dataset({
        features: ['act', 'prompt'],
        num_rows: 16
    })
    })
    EXAMPLE3:
    datset_path=load_and_prepare_dataset('/content/awesome-chatgpt-prompts')
DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 122
    })
    test: Dataset({
        features: ['act', 'prompt'],
        num_rows: 15
    })
    eval: Dataset({
        features: ['act', 'prompt'],
        num_rows: 16
    })
    })

    """
    # Load dataset from different types of input sources
    if isinstance(input_source, (str, Path)):
        # Dataset name, single file or path to folder
        dataset = load_dataset(input_source, streaming=streaming)
        dataset = DatasetDict(dataset)
    elif isinstance(input_source, dict):
        # Dictionary with specified train, test, and eval files
        formats = ['csv', 'json', 'jsonl', 'parquet', 'txt']
        datasets = {}
        for split, files in input_source.items():
            format_detected = None
            for fmt in formats:
                if any(str(file).endswith(fmt) for file in files):
                    format_detected = fmt
                    break
            if format_detected is None:
                raise ValueError(f"No supported file format detected for files: {files}")
            datasets[split] = load_dataset(format_detected, data_files=files, streaming=streaming)
        dataset = DatasetDict(datasets)
    else:
        raise ValueError("Input source should be a dataset name, path to a folder, a single file, multiple files, or a dictionary.")

    # Perform the split if needed and if not in streaming mode
    if not streaming:
        train_size, test_size, eval_size = split_ratios
        assert 0.0 < train_size < 1.0 and 0.0 < test_size < 1.0 and 0.0 < eval_size < 1.0 and (train_size + test_size + eval_size) == 1.0, \
            "Split ratios must be between 0 and 1 and sum up to 1."

        if "train" not in dataset or "test" not in dataset or "eval" not in dataset:
            # Assuming all splits are to be derived from the 'train' dataset
            full_dataset = concatenate_datasets(list(dataset.values())) if isinstance(dataset, dict) else dataset
            split_dataset = full_dataset.train_test_split(train_size=train_size, seed=seed)
            test_eval_split = split_dataset['test'].train_test_split(test_size=test_size / (test_size + eval_size), seed=seed)

            dataset = DatasetDict({
                "train": split_dataset["train"],
                "test": test_eval_split["train"],
                "eval": test_eval_split["test"]
            })

    return dataset


In [21]:
from datasets import load_dataset

# Specify the path to the text files
path_to_text_files = 'E:/LLMS/hemanth/output'

# Load the dataset
dataset = load_dataset('text', data_files=path_to_text_files+'/*.txt')





Resolving data files:   0%|          | 0/852 [00:00<?, ?it/s]

In [1]:
!rm E:/LLMS/hemanth/output


'rm' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
import os
from pathlib import Path
import pandas as pd
import fitz  # PyMuPDF
from typing import Optional

def clean_text(text: str) -> str:
    """
    Clean the extracted text from the PDF.
    This function can be customized based on the cleaning requirements.

    Parameters:
    - text (str): The text extracted from the PDF.

    Returns:
    - str: The cleaned text.
    """
    cleaned_text = ' '.join(text.split())  # Removing extra whitespaces
    # Add more cleaning rules as needed.
    return cleaned_text

def split_and_save_text(cleaned_text: str, base_output_path: Path, max_size_bytes: int = 50 * 1024 * 1024) -> None:
    """
    Split the cleaned text into multiple files, each smaller than the specified max size, and save them.

    Parameters:
    - cleaned_text (str): The cleaned text to be split and saved.
    - base_output_path (Path): The base path where the text files will be saved.
    - max_size_bytes (int): Maximum size of the text file in bytes.
    """
    part_num = 1
    text_part = ""
    for line in cleaned_text.split('\n'):
        if len(text_part.encode('utf-8')) + len(line.encode('utf-8')) < max_size_bytes:
            text_part += line + '\n'
        else:
            # Save the current part and start a new one
            output_path = base_output_path.with_suffix(f'.part{part_num}.txt')
            with open(output_path, 'w', encoding='utf-8') as file:
                file.write(text_part)
            part_num += 1
            text_part = line + '\n' # Start new part with the current line
    
    # Save the last part
    if text_part:
        output_path = base_output_path.with_suffix(f'.part{part_num}.txt')
        with open(output_path, 'w', encoding='utf-8') as file:
            file.write(text_part)

def convert_pdf_to_text(pdf_path: str, output_folder: str) -> None:
    """
    Convert a PDF file to text files, splitting contents to ensure each resulting file is less than 50 MB.
    
    Parameters:
    - pdf_path (str): Path to the PDF file.
    - output_folder (str): Path to the folder where the text files will be saved.
    """
    # Ensure the output folder exists, create it if it does not
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(parents=True, exist_ok=True)
    
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        doc.close()
        
        cleaned_text = clean_text(text)
        
        base_output_path = output_folder_path / Path(pdf_path).stem
        split_and_save_text(cleaned_text, base_output_path)
    except Exception as e:
        print(f"An error occurred while converting {pdf_path}: {str(e)}")

def process_pdfs_from_csv(csv_path: str, output_folder: str) -> None:
    """
    Process PDFs listed in a CSV file, converting them to text files and ensuring each part is less than 50 MB.

    Parameters:
    - csv_path (str): Path to the CSV file containing paths to PDF files.
    - output_folder (str): Path to the folder where text files will be stored.
    """
    pdf_paths = pd.read_csv(csv_path, encoding='latin1')
    # pdf_paths = pd.read_csv(csv_path)
    for pdf_path in pdf_paths['.pdf']:
        convert_pdf_to_text(pdf_path, output_folder)
        
        
process_pdfs_from_csv(csv_path='csvfile.csv', output_folder='E:/LLMS/hemanth/output')


In [None]:
import os
import shutil
from typing import List

def write_to_file(content: str, output_dir: str, file_counter: int) -> int:
    filename = os.path.join(output_dir, f'output_{file_counter}.txt')
    with open(filename, 'w') as f:
        f.write(content)
    return file_counter + 1

def process_files(input_dir: str, output_dir: str) -> None:
    file_counter = 1
    content = ''
    for root, dirs, files in os.walk(input_dir):
        for file in files:
            file_path = os.path.join(root, file)
            try:
                with open(file_path, 'r',encoding='utf-8') as f:
                    file_content = f.read()
                    if len(content + file_content) > 50 * 1024 * 1024:  # 50MB limit
                        file_counter = write_to_file(content, output_dir, file_counter)
                        content = file_content
                    else:
                        content += file_content
            except Exception as e:
                print(f"Error reading file {file_path}: {str(e)}")
    if content:
        write_to_file(content, output_dir, file_counter)

def main(input_dir: str, output_dir: str) -> None:
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    process_files(input_dir, output_dir)

if __name__ == "__main__":
    input_dir = 'C:/Users/heman/Desktop/deeplearning'
    output_dir = 'C:/Users/heman/Desktop/deeplearning/data1'
    main(input_dir, output_dir)


In [None]:
import os

def print_directory_structure(path, indent='', last=True):
    """
    Print directory structure in a tree-like format
    """

    if not os.path.exists(path):
        print("Error: Path not found.")
        return

    if os.path.isfile(path):
        print(indent + '├─ ' + os.path.basename(path))
        return

    files = sorted(os.listdir(path))

    for i, entry in enumerate(files):
        full_path = os.path.join(path, entry)
        is_last = i == len(files) - 1
        if os.path.isdir(full_path):
            if is_last:
                print(indent + '└─ ' + entry)
            else:
                print(indent + '├─ ' + entry)
            print_directory_structure(full_path, indent + '   ', is_last)
        else:
            if is_last:
                print(indent + '└─ ' + entry)
            else:
                print(indent + '├─ ' + entry)

if __name__ == "__main__":
    directory_path = "E:/LLMS/hemanth/Hemanth/file_operations-"
    print("Directory Structure:")
    print_directory_structure(directory_path)


In [8]:
import os
import json
import yaml
import re
import pandas as pd
from pptx import Presentation
from io import StringIO

from typing import Optional
EXTENSION_READERS = {
    '.md': lambda f: f.read(),
    '.py': lambda f: f.read(),
    '.csv': lambda f: pd.read_csv(f),
    '.json': lambda f: json.load(f),
    '.yaml': lambda f: yaml.safe_load(f),
    '.txt': lambda f: f.read(),
    '.xml': lambda f: f.read(),
    '.html': lambda f: f.read(),
    '.css': lambda f: f.read(),
    '.js': lambda f: f.read(),
    '.java': lambda f: f.read(),
    '.cpp': lambda f: f.read(),
    '.h': lambda f: f.read(),
    '.php': lambda f: f.read(),
    '.rb': lambda f: f.read(),
    '.sql': lambda f: f.read(),
    '.xls': lambda f: pd.read_excel(f),
    '.xlsx': lambda f: pd.read_excel(f),
    '.ppt': lambda f: read_pptx(f),
    '.pptx': lambda f: read_pptx(f)
}

def read_pptx(file):
    """Custom function to read .pptx files with python-pptx"""
    prs = Presentation(file)
    text = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text.append(shape.text)
    return "\n".join(text)

# Utilize regular expressions to match any of the file extensions
EXTENSION_PATTERN = r".*\.(md|py|csv|json|yaml|txt|xml|html|css|js|java|cpp|h|php|rb|sql|xls|xlsx|ppt|pptx)$"

def list_files_with_extensions(directory_path):
    try:
        files = os.listdir(directory_path)
        return [file for file in files if re.match(EXTENSION_PATTERN, file)]
    except FileNotFoundError:
        print(f"The directory {directory_path} was not found.")
        return None

def read_file_content(directory_path, filename):
    try:
        extension = os.path.splitext(filename)[1]
        with open(os.path.join(directory_path, filename), 'r') as file:
            file_reader = EXTENSION_READERS.get(extension)
            return file_reader(file) if file_reader else None
    except Exception as e:
        print(f"An error occurred while reading the file {filename}: {e}")



def process_files_txtfile(directory_path: str, user_folder: str) -> Optional[str]:
    """
    This function processes all files in a given directory and writes their content to a user-specific text file.
    Each text file is ensured to be less than 50 MB in size.

    Args:
    directory_path (str): The path to the directory containing the files to be processed.
    user_folder (str): The name of the user-specific folder where the text files will be written.

    Returns:
    str: The path to the user-specific folder, or None if an error occurred.
    """
    # Create the user-specific folder if it doesn't exist
    user_folder_path = os.path.join(directory_path, user_folder)
    os.makedirs(user_folder_path, exist_ok=True)

    # Get a list of all files in the directory
    files = list_files_with_extensions(directory_path)

    if files is None:
        return

    # Initialize the text file counter and size
    txt_file_counter = 1
    txt_file_size = 0

    for filename in files:
        content = read_file_content(directory_path, filename)
        if content is not None:
            # Create a new text file if the size is over 50 MB
            if txt_file_size >= 50 * 1024 * 1024:
                txt_file_counter += 1
                txt_file_size = 0

            # Open the text file in append mode
            txt_file_path = os.path.join(user_folder_path, f"{user_folder}_{txt_file_counter}.txt")
            with open(txt_file_path, "a") as f:
                if isinstance(content, pd.DataFrame):
                    # Convert DataFrame to CSV string without index and write to file
                    content_csv = content.to_csv(index=False)
                    f.write(content_csv)
                    txt_file_size += len(content_csv)
                elif isinstance(content, dict):
                    # Convert dict to pretty-printed string using json.dumps and write to file
                    content_str = json.dumps(content, indent=4)
                    f.write(content_str)
                    txt_file_size += len(content_str)
                else:
                    # If it's not a DataFrame or dict, convert it to a string
                    content_str = str(content)
                    f.write(content_str)
                    txt_file_size += len(content_str)

            print(f"--- File: {filename} ---")
            print(f"Content written to {txt_file_path}")
            print("-------------------------------\n")

    return user_folder_path
directory_path = "E:/LLMS/hemanth/"
user_folder = "E:/LLMS/hemanth/output"
process_files_txtfile(directory_path, user_folder)




--- File: download.py ---
Content written to E:/LLMS/hemanth/output_1.txt
-------------------------------

--- File: filetest.csv ---
Content written to E:/LLMS/hemanth/output_1.txt
-------------------------------

An error occurred while reading the file huggingface_repos.md: 'charmap' codec can't decode byte 0x9d in position 2671: character maps to <undefined>
An error occurred while reading the file llama.cpp: [Errno 13] Permission denied: 'E:/LLMS/hemanth/llama.cpp'
--- File: metting.txt ---
Content written to E:/LLMS/hemanth/output_1.txt
-------------------------------

An error occurred while reading the file output2.csv: 'charmap' codec can't decode byte 0x9d in position 21408: character maps to <undefined>
--- File: output_1.txt ---
Content written to E:/LLMS/hemanth/output_1.txt
-------------------------------

--- File: output_2.txt ---
Content written to E:/LLMS/hemanth/output_1.txt
-------------------------------

--- File: output_3.txt ---
Content written to E:/LLMS/hemant

'E:/LLMS/hemanth/output'

In [6]:


def process_files_txtfile(directory_path: str, user_folder: str) -> Optional[str]:
    """
    This function processes all files in a given directory and writes their content to a user-specific text file.
    Each text file is ensured to be less than 50 MB in size.

    Args:
    directory_path (str): The path to the directory containing the files to be processed.
    user_folder (str): The name of the user-specific folder where the text files will be written.

    Returns:
    str: The path to the user-specific folder, or None if an error occurred.
    """
    # Create the user-specific folder if it doesn't exist
    user_folder_path = os.path.join(directory_path, user_folder)
    os.makedirs(user_folder_path, exist_ok=True)

    # Get a list of all files in the directory
    files = list_files_with_extensions(directory_path)

    if files is None:
        return

    # Initialize the text file counter and size
    txt_file_counter = 1
    txt_file_size = 0

    for filename in files:
        content = read_file_content(directory_path, filename)
        if content is not None:
            # Create a new text file if the size is over 50 MB
            if txt_file_size >= 50 * 1024 * 1024:
                txt_file_counter += 1
                txt_file_size = 0

            # Open the text file in append mode
            txt_file_path = os.path.join(user_folder_path, f"{user_folder}_{txt_file_counter}.txt")
            with open(txt_file_path, "a") as f:
                if isinstance(content, pd.DataFrame):
                    # Convert DataFrame to CSV string without index and write to file
                    content_csv = content.to_csv(index=False)
                    f.write(content_csv)
                    txt_file_size += len(content_csv)
                elif isinstance(content, dict):
                    # Convert dict to pretty-printed string using json.dumps and write to file
                    content_str = json.dumps(content, indent=4)
                    f.write(content_str)
                    txt_file_size += len(content_str)
                else:
                    # If it's not a DataFrame or dict, it should already be a string
                    f.write(content)
                    txt_file_size += len(content)

            print(f"--- File: {filename} ---")
            print(f"Content written to {txt_file_path}")
            print("-------------------------------\n")

    return user_folder_path

directory_path = "E:/LLMS/hemanth/Hemanth/file_operations-/"
user_folder = "E:/LLMS/hemanth/Hemanth/hemanth1/"
process_files_txtfile(directory_path, user_folder)


An error occurred while reading the file DeepSpeed.csv: 'charmap' codec can't decode byte 0x8f in position 60753: character maps to <undefined>
--- File: downloading.py ---
Content written to E:/LLMS/hemanth/Hemanth/hemanth1/_1.txt
-------------------------------

--- File: duckduckgo.py ---
Content written to E:/LLMS/hemanth/Hemanth/hemanth1/_1.txt
-------------------------------

--- File: extractall.py ---
Content written to E:/LLMS/hemanth/Hemanth/hemanth1/_1.txt
-------------------------------

--- File: functions1.csv ---
Content written to E:/LLMS/hemanth/Hemanth/hemanth1/_1.txt
-------------------------------

--- File: paramiko.json ---
Content written to E:/LLMS/hemanth/Hemanth/hemanth1/_1.txt
-------------------------------

--- File: Pre_processing.py ---
Content written to E:/LLMS/hemanth/Hemanth/hemanth1/_1.txt
-------------------------------

--- File: requirements.txt ---
Content written to E:/LLMS/hemanth/Hemanth/hemanth1/_1.txt
-------------------------------

--- Fil

'E:/LLMS/hemanth/Hemanth/hemanth1/'

In [None]:
import csv
import json
import yaml
import os
def csv_to_json(csv_file_path:str, json_file_path:str):
  """ Converts a CSV file with variable X columns and Y rows into a JSON file.

  Args:
    csv_file_path (str): Path to the input CSV file.
    json_file_path (str): Path to the desired output JSON file.
  """
  try:
    # Open the CSV file for reading
    with open(csv_file_path, 'r', encoding='utf-8') as csv_file:
      # Create a CSV reader object
      csv_reader = csv.DictReader(csv_file)

      # Convert CSV rows into a list of dictionaries
      data = list(csv_reader)

    # Open the JSON file for writing
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
      # Dump the list of dictionaries to the JSON file
      json.dump(data, json_file, ensure_ascii=False, indent=4)

    print(f'Successfully converted {csv_file_path} to {json_file_path}.')

  except FileNotFoundError:
    print(f'Error: CSV file {csv_file_path} not found.')

  except Exception as e:
    print(f'An error occurred: {e}')


def json_to_csv(json_file_path:str, csv_file_path:str):
  """ Converts a JSON file into a CSV file.

  Args:
    json_file_path (str): Path to the input JSON file.
    csv_file_path (str): Path to the desired output CSV file.
  """
  try:
    # Open the JSON file for reading
    with open(json_file_path, 'r', encoding='utf-8') as json_file:
      # Load the JSON data
      data = json.load(json_file)

    if not (isinstance(data, list) and all(isinstance(item, dict) for item in data)):
      raise ValueError('JSON data must be a list of dictionaries')

    # Get the keys of the first dictionary to use as CSV headers
    headers = set()
    for item in data:
      headers.update(item.keys())
    headers = list(headers)

    # Open the CSV file for writing
    with open(csv_file_path, 'w', encoding='utf-8', newline='') as csv_file:
      # Create a CSV writer object
      csv_writer = csv.DictWriter(csv_file, fieldnames=headers)

      # Write the headers to the CSV file
      csv_writer.writeheader()

      # Write each dictionary in the list to the CSV file
      for row in data:
        # Check if the row contains all the required keys
        if not set(headers).issubset(row.keys()):
          raise ValueError(f'Missing keys in row: {row}')

        csv_writer.writerow(row)

    print(f'Successfully converted {json_file_path} to {csv_file_path}.')

  except FileNotFoundError:
    print(f'Error: JSON file {json_file_path} not found.')
  except ValueError as e:
    print(f'Error: {e}')
  except Exception as e:
    print(f'An error occurred: {e}')


def csv_to_yaml(csv_file_path:str, yaml_file_path:str):
  """ Converts a CSV file with variable X columns and Y rows into a YAML file.

  Args:
    csv_file_path (str): Path to the input CSV file.
    yaml_file_path (str): Path to the desired output YAML file.
  """
  try:
    # Open the CSV file for reading
    with open(csv_file_path, 'r', encoding='utf-8') as csv_file:
      # Create a CSV reader object
      csv_reader = csv.DictReader(csv_file)

      # Convert CSV rows into a list of dictionaries
      data = list(csv_reader)

    # Open the YAML file for writing
    with open(yaml_file_path, 'w', encoding='utf-8') as yaml_file:
      # Dump the list of dictionaries to the YAML file
      yaml.dump(data, yaml_file, allow_unicode=True)

    print(f'Successfully converted {csv_file_path} to {yaml_file_path}.')

  except FileNotFoundError:
    print(f'Error: CSV file {csv_file_path} not found.')

  except Exception as e:
    print(f'An error occurred: {e}')



csv_to_json(csv_file_path='E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/hemanth.csv',json_file_path='E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/hemanth1.json')


Successfully converted E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/hemanth.csv to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/hemanth1.json.


KeyboardInterrupt: 

In [None]:
json_to_csv('E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/hemanth1.json','E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/hemanth2.csv')


In [None]:
csv_to_yaml('E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/hemanth2.csv','output.yml')


In [None]:
import csv
import os
from typing import List
from langchain_community.document_loaders import DirectoryLoader,PyPDFLoader
def write_to_csv(file_path: str, data: dict, write_header: bool) -> None:
    """
    Function to append data into a CSV file.
    
    Args:
    file_path (str): The path to the CSV file.
    data (dict): The data to be appended into the CSV file.
    write_header (bool): Whether to write the header.
    """
    mode = 'a' if os.path.exists(file_path) else 'w'
    with open(file_path, mode, newline='', encoding='UTF-8', errors='ignore') as file:
        writer = csv.DictWriter(file, fieldnames=["content", "documents", "metasource"])
        if write_header:
            writer.writeheader()
        try:
            writer.writerow({k: data[k] for k in ["content", "documents", "metasource"]})
        except UnicodeEncodeError:
            print(f"Warning: UnicodeEncodeError encountered for file {data['documents']}. Skipping this file.")

def read_pdfs_from_folder(folder_path: str, csv_file_path: str) -> None:
    """
    Function to recursively read PDF files from a folder and its subfolders and extract their content.
    
    Args:
    folder_path (str): The path to the folder containing the PDF files.
    csv_file_path (str): The path to the CSV file.
    
    Example:
    ```
    folder_path = "path/to/pdf/folder"
    csv_file_path = "output.csv"
    read_pdfs_from_folder(folder_path, csv_file_path)
    ```
    
    Output:
    The content of the PDF files in the specified folder and its subfolders is extracted and written to the CSV file.
    """
    for root, dirs, files in os.walk(folder_path):
        for file_name in files:
            if file_name.endswith(".pdf"):
                full_file_path = os.path.join(root, file_name)
                loader = PyPDFLoader(full_file_path)
                pages = loader.load_and_split()
                for page in pages:
                    data = {
                        "content": page.page_content,
                        "documents": file_name,
                        "metasource": full_file_path
                    }
                    write_to_csv(csv_file_path, data, file_name == os.listdir(root)[0])

# Usage
folder_path = "E:/LLMS/hemanth/Hemanth/amazon/"
csv_file_path = "output.csv"
read_pdfs_from_folder(folder_path, csv_file_path)

# Usage



In [None]:
import os
import csv
from typing import List


def get_image_paths(folder: str) -> List[str]:
    """
    Recursively get a list of paths for all .png images in the given folder.
    Handles Windows-style paths properly.

    Args:
        folder: The folder path to search.

    Returns:
        A list of paths for all .png images found.
    """
    
    image_paths = []

    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.endswith('.png'):
                path = os.path.join(root, file)
                path = path.replace('\\', '/') # convert Windows paths
                image_paths.append(path)

    return image_paths


def save_to_csv(image_paths: List[str], csv_file: str) -> None:
    """
    Save a list of image paths to a CSV file.

    Args:
        image_paths: A list of image path strings.
        csv_file: Path to the CSV file to save.
    """
    with open(csv_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for path in image_paths:
            writer.writerow([path])


if __name__ == '__main__':
    folder = 'E:/LLMS/hemanth/Hemanth/'
    csv_file = 'image_paths.csv'

    image_paths = get_image_paths(folder)
    save_to_csv(image_paths, csv_file)


In [None]:
import os
import json
import csv
from typing import List, Dict, Any
from pathlib import Path

def save_to_json(data: List[Dict[str, Any]], filename: str) -> None:
    """
    Saves the provided data to a JSON file with the given filename.
    
    :param data: The data to be saved in JSON format.
    :param filename: The name of the file to save the JSON data to.
    """
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=4)

def save_to_csv(data: List[Dict[str, Any]], filename: str) -> None:
    """
    Saves the provided data to a CSV file with the given filename.
    
    :param data: The data to be saved in CSV format.
    :param filename: The name of the file to save the CSV data to.
    """
    with open(filename, 'w', newline='') as csv_file:
        fieldnames = ['folder', 'extensions', 'files']
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for folder_info in data:
            writer.writerow(folder_info)

def get_extensions_and_paths(directory: str) -> List[Dict[str, Any]]:
    """
    Traverses the directory tree starting at the given directory and compiles a list of 
    dictionaries, each containing folder names, their file extensions, and file paths.
    
    :param directory: The root directory from which to start the folder traversal.
    :return: A list of dictionaries with folder names, extensions, and file paths.
    """
    folder_structure = []
    root_path = Path(directory)
    for root, dirs, files in os.walk(root_path):
        folder_name = os.path.basename(root)
        folder_info = {
            "folder": folder_name,
            "extensions": set(),
            "files": {}
        }
        for file in files:
            file_path = Path(root) / file
            extension = file_path.suffix
            if extension:
                folder_info["extensions"].add(extension)
                # Use as_posix() to convert the path to a string with forward slashes
                folder_info["files"].setdefault(extension, []).append(file_path.as_posix())
        
        # Convert the set of extensions to a sorted list
        folder_info["extensions"] = sorted(list(folder_info["extensions"]))
        folder_structure.append(folder_info)

    return folder_structure

if __name__ == "__main__":
    try:
        directory_to_scan = "E:/LLMS/hemanth/Hemanth/file_operations-"
        folder_data = get_extensions_and_paths(directory_to_scan)
        json_filename = "folder_structure.json"
        csv_filename = "folder_structure.csv"
        save_to_json(folder_data, json_filename)
        save_to_csv(folder_data, csv_filename)
        print(f"Data saved to {json_filename} and {csv_filename}.")
    except Exception as e:
        print(f"An error occurred: {e}")


Data saved to folder_structure.json and folder_structure.csv.


In [None]:
import csv
from pathlib import Path
from typing import List

def get_file_paths(folder: str, extensions: List[str]) -> List[List[str]]:
    """
    Get all file paths in the folder with the given extensions.

    Args:
        folder: A path to the folder to search in.
        extensions: A list of file extensions to include.

    Returns:
        A list of lists, where each sublist contains file paths with the same extension.
    """
    paths = [[] for _ in extensions]
    folder_path = Path(folder)

    for i, ext in enumerate(extensions):
        for file_path in folder_path.rglob('*' + ext):
            paths[i].append(str(file_path))

    return paths

def save_to_csv(file_paths: List[List[str]], csv_file: str) -> None:
    """
    Save lists of file paths to a CSV file.

    Args:
        file_paths: A list of lists where each sublist contains file paths of a certain type.
        csv_file: Path to the CSV file to save.
    """
    with open(csv_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for row in zip(*file_paths):
            writer.writerow(row)

# Usage example:
if __name__ == '__main__':
    folder = 'E:/LLMS/hemanth/Hemanth/' 
    extensions = ['.py','.pdf',]
    csv_file = 'file_paths.csv'

    file_paths = get_file_paths(folder, extensions)
    save_to_csv(file_paths, csv_file)
    
    print('Done!')
    
    


In [None]:

from typing import List
import csv
from pathlib import Path
from typing import List

from typing import List
import csv
from pathlib import Path

def get_file_paths(folder: str, extensions: List[str]) -> List[List[str]]:
    """
    Get all file paths in the folder with the given extensions.
    ...
    """
    paths = [[] for _ in extensions]
    folder_path = Path(folder)

    for i, ext in enumerate(extensions):
        for file_path in folder_path.rglob('*' + ext):
            # Use as_posix() to get the file_path with forward slashes
            paths[i].append(file_path.as_posix())

    return paths
def save_to_csv(file_paths: List[List[str]], csv_file: str) -> None:
    """
    Save lists of file paths to a CSV file in columns based on file types.

    Args:
        file_paths: A list containing a list of paths for each file type.
        csv_file: Path to the CSV file to save.
    """

    max_rows = max(len(paths) for paths in file_paths)
    
    with open(csv_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        
        # Write header row with file types
        header = [ext.upper()[1:] for ext in extensions]
        writer.writerow(header)
        
        for i in range(max_rows):
            row = []
            for paths in file_paths:
                if i < len(paths):
                    row.append(paths[i])
                else:
                    row.append('')
            writer.writerow(row)

if __name__ == '__main__':
    folder = 'E:/LLMS/hemanth/Hemanth/'
    extensions = ['.png', '.jpg', '.pdf']
    csv_file = 'file_paths.csv'

    file_paths = get_file_paths(folder, extensions)
    save_to_csv(file_paths, csv_file)


In [None]:
import pandas as pd
import json
import os

MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB

def check_create_folder(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

def next_file_index(folder_path, base_filename):
    files = os.listdir(folder_path)
    existing_indices = [int(f.replace(base_filename, '').replace('.txt', '')) for f in files if f.startswith(base_filename) and f.endswith('.txt')]
    return max(existing_indices + [0]) + 1

def get_file_path(folder_path, base_filename, index):
    return os.path.join(folder_path, f"{base_filename}{index}.txt")

def should_create_new_file(file_path):
    return os.path.exists(file_path) and os.path.getsize(file_path) >= MAX_FILE_SIZE

def adjust_content_and_write(content, file_path):
    if isinstance(content, pd.DataFrame):
        content = content.to_csv(index=False)
    elif isinstance(content, dict):
        content = json.dumps(content)
    elif isinstance(content, list):
        content = '\n'.join(content)
    elif not isinstance(content, str):
        raise ValueError("Unsupported content type for writing to file")

    with open(file_path, "a") as f:
        f.write(content)

def read_file_content(directory_path, filename):
    # Implement your file reading logic here, adjusted for different file formats
    # This is a placeholder function; the actual implementation depends on your specific file types and processing needs
    return "Placeholder for actual content"

def process_files_into_folders(directory_path, folder_base_path, base_filename):
    files = os.listdir(directory_path)
    check_create_folder(folder_base_path)
    
    index = next_file_index(folder_base_path, base_filename)
    file_path = get_file_path(folder_base_path, base_filename, index)
    
    for filename in files:
        file_full_path = os.path.join(directory_path, filename)
        content = read_file_content(directory_path, filename)  # Assume this returns content correctly based on file type
        
        if content is not None:
            if should_create_new_file(file_path):
                index += 1
                file_path = get_file_path(folder_base_path, base_filename, index)

        adjust_content_and_write(content, file_path)
        print(f"--- File: {filename} ---\nContent written to {file_path}\n-------------------------------")

# Correct paths and base_filename as per your directory structure and needs
directory_path = "E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/"
folder_base_path = "E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/"
base_filename = 'outputFile'
process_files_into_folders(directory_path, folder_base_path, base_filename)


--- File: Datasetsturture.py ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile2.txt
-------------------------------
--- File: filepre_processing.py ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile2.txt
-------------------------------
--- File: filetxtmd.py ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile2.txt
-------------------------------
--- File: File_conversion.py ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile2.txt
-------------------------------
--- File: File_operation_and_conversion.ipynb ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile2.txt
-------------------------------
--- File: file_paths.csv ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile2.txt
--------------

In [None]:
import os

# Constants
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB in bytes

def check_create_folder(folder_path):
    """Ensure the folder exists, create if not."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

def next_file_index(folder_path, base_filename):
    """Find the next file index that can be used."""
    files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith('.txt')]
    if not files:
        return 1
    highest_index = max([int(f.replace(base_filename, '').replace('.txt', '')) for f in files])
    return highest_index + 1

def get_file_path(folder_base_path, base_filename, index):
    """Construct a file path with the given index."""
    return os.path.join(folder_base_path, f"{base_filename}{index}.txt")

def should_create_new_file(file_path):
    """Determine if a new file needs to be created based on the size."""
    return os.path.exists(file_path) and os.path.getsize(file_path) >= MAX_FILE_SIZE

def adjust_content_and_write(content, file_path, add_separator=False):
    """Write content to the file, optionally adding a separator if the file already exists."""
    mode = "a" if os.path.exists(file_path) else "w"
    with open(file_path, mode, encoding='utf-8') as f:
        if add_separator and os.path.getsize(file_path) > 0:
            # Simulate a 'separator' with a newline
            f.write("\n\n" + ("-" * 80) + "\n\n")
        f.write(content)

def process_files_into_folders(user_given_directory_path, target_folder_base_path, base_filename):
    check_create_folder(target_folder_base_path)
    index = next_file_index(target_folder_base_path, base_filename)
    file_path = get_file_path(target_folder_base_path, base_filename, index)
    
    for filename in os.listdir(user_given_directory_path):
        if filename.endswith('.txt'):  # Assuming we are only interested in .txt files
            with open(os.path.join(user_given_directory_path, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                
                if should_create_new_file(file_path):
                    index += 1
                    file_path = get_file_path(target_folder_base_path, base_filename, index)
                
                adjust_content_and_write(content, file_path, add_separator=True)
                
                print(f"--- File: {filename} ---")
                print(f"Content written to {file_path}")
                print("-------------------------------\n")

# Example usage - adjust the paths and base_filename as necessary
directory_path = "E:/LLMS/hemanth/"
folder_base_path = "E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/"
base_filename = 'ou'
process_files_into_folders(directory_path, folder_base_path, base_filename)


ValueError: invalid literal for int() with base 10: 'tputFile1'

In [None]:
import os

# Constants
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB in bytes

def check_create_folder(folder_path):
    """Ensure the folder exists, create if not."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

def next_file_index(folder_path, base_filename):
    """Find the next file index that can be used."""
    files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith('.txt')]
    if not files:
        return 1
    highest_index = max([int(f.replace(base_filename, '').replace('.txt', '')) for f in files])
    return highest_index + 1

def get_file_path(folder_base_path, base_filename, index):
    """Construct a file path with the given index."""
    return os.path.join(folder_base_path, f"{base_filename}{index}.txt")

def should_create_new_file(file_path):
    """Determine if a new file needs to be created based on the size."""
    return os.path.exists(file_path) and os.path.getsize(file_path) >= MAX_FILE_SIZE

def adjust_content_and_write(content, file_path, add_separator=False):
    """Write content to the file, optionally adding a separator if the file already exists."""
    mode = "a" if os.path.exists(file_path) else "w"
    with open(file_path, mode, encoding='utf-8') as f:
        if add_separator and os.path.getsize(file_path) > 0:
            # Simulate a 'separator' with a newline
            f.write("\n\n" + ("-" * 80) + "\n\n")
        f.write(content)

def process_file(full_file_path):
    global index, file_path
    
    try:
        with open(full_file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            
            if should_create_new_file(file_path):
                index += 1
                file_path = get_file_path(target_folder_base_path, base_filename, index)
            
            adjust_content_and_write(content, file_path, add_separator=True)
    except UnicodeDecodeError:
        print(f"Skipping file: {full_file_path} (UnicodeDecodeError)")

def recursive_file_search(directory_path, callback):
    for root, _, files in os.walk(directory_path):
        for filename in files:
            callback(os.path.join(root, filename))

def process_files_into_folders(user_given_directory_path, target_folder_base_path, base_filename):
    global index, file_path
    check_create_folder(target_folder_base_path)
    index = next_file_index(target_folder_base_path, base_filename)
    file_path = get_file_path(target_folder_base_path, base_filename, index)
    recursive_file_search(user_given_directory_path, process_file)

# Example usage
directory_path = "E:/LLMS/hemanth/"
folder_base_path = "E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/"
base_filename = 'caing'
process_files_into_folders(directory_path, folder_base_path, base_filename)


Skipping file: E:/LLMS/hemanth/Applied-Deep-Learning\00 - Training.pdf (UnicodeDecodeError)
Skipping file: E:/LLMS/hemanth/Applied-Deep-Learning\03 - Multimodal Learning.pdf (UnicodeDecodeError)
Skipping file: E:/LLMS/hemanth/Applied-Deep-Learning\08 - Graph Neural Networks.pdf (UnicodeDecodeError)
Skipping file: E:/LLMS/hemanth/Applied-Deep-Learning\09 - Recommender Systems.pdf (UnicodeDecodeError)
Skipping file: E:/LLMS/hemanth/Applied-Deep-Learning\10 - Computational Biology.pdf (UnicodeDecodeError)
Skipping file: E:/LLMS/hemanth/Applied-Deep-Learning\.git\index (UnicodeDecodeError)
Skipping file: E:/LLMS/hemanth/Applied-Deep-Learning\.git\objects\pack\pack-2400e8f4e05af351bd2d7191f5d84ab8f68884d7.idx (UnicodeDecodeError)
Skipping file: E:/LLMS/hemanth/Applied-Deep-Learning\.git\objects\pack\pack-2400e8f4e05af351bd2d7191f5d84ab8f68884d7.pack (UnicodeDecodeError)
Skipping file: E:/LLMS/hemanth/Applied-Deep-Learning\.git\objects\pack\pack-2400e8f4e05af351bd2d7191f5d84ab8f68884d7.rev (

NameError: name 'target_folder_base_path' is not defined

In [6]:
import os

# Constants
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB in bytes

def check_create_folder(folder_path):
    """Ensure the folder exists, create if not."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

def next_file_index(folder_path, base_filename):
    """Find the next file index that can be used."""
    files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith('.txt')]
    if not files:
        return 1
    highest_index = max([int(f.split(base_filename)[-1].split('.txt')[0]) for f in files])
    return highest_index + 1

def get_file_path(folder_base_path, base_filename, index):
    """Construct a file path with the given index."""
    return os.path.join(folder_base_path, f"{base_filename}{index}.txt")

def should_create_new_file(file_path):
    """Determine if a new file needs to be created based on the size."""
    return os.path.exists(file_path) and os.path.getsize(file_path) >= MAX_FILE_SIZE

def adjust_content_and_write(content, file_path, add_separator=False):
    """Append content to the file, optionally adding a separator if the file already exists."""
    mode = "a" if os.path.exists(file_path) else "w"
    with open(file_path, mode, encoding='utf-8') as f:
        if add_separator and os.path.exists(file_path) and os.path.getsize(file_path) > 0:
            f.write("\n\n" + ("-" * 80) + "\n\n")
        f.write(content)
   
def recursive_file_search(directory_path, callback):
    """Recursively searches for files within the directory_path and processes them using callback."""
    for root, _, files in os.walk(directory_path):
        for filename in files:
            callback(os.path.join(root, filename))

def process_files_into_folders(user_given_directory_path, target_folder_base_path, base_filename):
    check_create_folder(target_folder_base_path)
    index = next_file_index(target_folder_base_path, base_filename)
    file_path = get_file_path(target_folder_base_path, base_filename, index)
    
    def process_file(full_file_path):
        nonlocal index, file_path
        # Attempt to open and read the file, replacing characters that cannot be decoded
        with open(full_file_path, 'r', encoding='utf-8', errors='replace') as file:
             content = file.read()
                
        # Check if a new file is needed before writing content
        if should_create_new_file(file_path):
            index += 1  # Increment to use a new file
            file_path = get_file_path(target_folder_base_path, base_filename, index)
        
        adjust_content_and_write(content, file_path, add_separator=True)
                    

    
    recursive_file_search(user_given_directory_path, process_file)

# Example usage
directory_path = "E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing"
folder_base_path = "E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/"
base_filename = 'output1test1'
process_files_into_folders(directory_path, folder_base_path, base_filename)


In [None]:
import os

# Constants
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB in bytes

def check_create_folder(folder_path):
    """Ensure the folder exists, create if not."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

def next_file_index(folder_path, base_filename):
    """Find the next file index that can be used."""
    files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith('.txt')]
    if not files:
        return 1
    highest_index = max([int(f.replace(base_filename, '').replace('.txt', '')) for f in files])
    return highest_index + 1

def get_file_path(folder_base_path, base_filename, index):
    """Construct a file path with the given index."""
    return os.path.join(folder_base_path, f"{base_filename}{index}.txt")

def should_create_new_file(file_path):
    """Determine if a new file needs to be created based on the size."""
    return os.path.exists(file_path) and os.path.getsize(file_path) >= MAX_FILE_SIZE

def adjust_content_and_write(content, file_path, add_separator=False):
    """Append content to the file, optionally adding a separator if the file already exists."""
    mode = "a" if os.path.exists(file_path) else "w"
    with open(file_path, mode, encoding='utf-8') as f:
        if add_separator and os.path.getsize(file_path) > 0:
            f.write("\n\n" + ("-" * 80) + "\n\n")
        f.write(content)
        
def process_files_into_folders(directory_path, folder_base_path, base_filename):
    check_create_folder(folder_base_path)
    index = next_file_index(folder_base_path, base_filename)
    file_path = get_file_path(folder_base_path, base_filename, index)
    
    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith('.py'):  # Assuming we're only interested in .txt files
            with open(os.path.join(directory_path, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                
                # Check if a new file is needed before writing content
                if should_create_new_file(file_path):
                    index += 1  # Increment to use a new file
                    file_path = get_file_path(folder_base_path, base_filename, index)
                
                adjust_content_and_write(content, file_path, add_separator=True)

# Example usage
directory_path = "E:/LLMS/hemanth/"
folder_base_path = "E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/"
base_filename = 'caseing'
process_files_into_folders(directory_path, folder_base_path, base_filename)


In [None]:
import os
import pandas as pd

# Constants
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB in bytes

def check_create_folder(folder_path):
    """Ensure the folder exists, create if not."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

def next_file_index(folder_path, base_filename):
    """Find the next file index that can be used."""
    files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith('.txt')]
    if not files:
        return 1
    highest_index = max([int(f.replace(base_filename, '').replace('.txt', '')) for f in files])
    return highest_index + 1

def get_file_path(folder_base_path, base_filename, index):
    """Construct a file path with the given index."""
    return os.path.join(folder_base_path, f"{base_filename}{index}.txt")

def should_create_new_file(file_path):
    """Determine if a new file needs to be created based on the size."""
    return os.path.exists(file_path) and os.path.getsize(file_path) >= MAX_FILE_SIZE

def adjust_content_and_write(content, file_path, add_separator=False):
    """Write content to the file, optionally adding a separator if the file already exists."""
    mode = "a" if os.path.exists(file_path) else "w"
    with open(file_path, mode, encoding='utf-8') as f:
        if add_separator and os.path.getsize(file_path) > 0:
            # Simulate a 'separator' with a newline
            f.write("\n\n" + ("-" * 80) + "\n\n")
        f.write(content)

def process_files_into_folders(user_given_directory_path, target_folder_base_path, base_filename):
    check_create_folder(target_folder_base_path)
    index = next_file_index(target_folder_base_path, base_filename)
    file_path = get_file_path(target_folder_base_path, base_filename, index)
    
    for filename in os.listdir(user_given_directory_path):
        if filename.endswith('.txt'):  # Assuming we are only interested in .txt files
            with open(os.path.join(user_given_directory_path, filename), 'r', encoding='utf-8') as file:
                content = file.read()
                
                if should_create_new_file(file_path):
                    index += 1
                    file_path = get_file_path(target_folder_base_path, base_filename, index)
                
                adjust_content_and_write(content, file_path, add_separator=True)

# Example usage
# Example usage - adjust the paths and base_filename as necessary
directory_path = "E:/LLMS/hemanth/"
folder_base_path = "E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/"
base_filename = 'outputFile'
process_files_into_folders(directory_path, folder_base_path, base_filename)


In [None]:
import os
import json
import pandas as pd
from pptx import Presentation
import re

# Constants
MAX_FILE_SIZE = 50 * 1024 * 1024  # 50 MB in bytes

def check_create_folder(folder_path):
    """Ensure the folder exists, create if not."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
def next_file_index(folder_path, base_filename):
    """Find the next file index that can be used."""
    files = [f for f in os.listdir(folder_path) if f.startswith(base_filename) and f.endswith('.txt')]
    if not files:
        return 1
    highest_index = max([int(f.replace(base_filename, '').replace('.txt', '')) for f in files])
    return highest_index + 1
def get_file_path(folder_path, base_filename, index):
    """Construct a file path with the given index."""
    return os.path.join(folder_path, f"{base_filename}{index}")

def should_create_new_file(file_path):
    """Determine if a new file needs to be created based on the size."""
    return os.path.exists(file_path) and os.path.getsize(file_path) >= MAX_FILE_SIZE

def adjust_content_and_write(content, file_path, add_page_break=False):
    """Write content to the file, optionally adding a page break (separator)."""
    with open(file_path, "a", encoding='utf-8') as f:
        if add_page_break and os.path.getsize(file_path) > 0:
            # Simulate a 'page break' with a separator
            f.write("\n\n" + ("-" * 80) + "\n\n")
        
        if isinstance(content, pd.DataFrame):
            content.to_csv(f, index=False, header=not os.path.exists(file_path))
        elif isinstance(content, list):
            f.write('\n'.join(content))
        elif isinstance(content, str):
            f.write(content)
        else:
            f.write(str(content))
def process_files_into_folders(directory_path, folder_base_path, base_filename):
    files = list_files_with_extensions(directory_path)
    
    if files is None:
        return
    
    # Initialize file management variables
    index = next_file_index(folder_base_path, base_filename)
    file_path = get_file_path(folder_base_path, base_filename, index)
    check_create_folder(folder_base_path)
    
    for filename in files:
        content = read_file_content(directory_path, filename)
        
        if content is not None:
            if should_create_new_file(file_path):
                index += 1
                file_path = get_file_path(folder_base_path, base_filename, index)

            adjust_content_and_write(content, file_path)
            
            print(f"--- File: {filename} ---")
            print(f"Content written to {file_path}")
            print("-------------------------------\n")

# Example usage - adjust the paths and base_filename as necessary
directory_path = "E:/LLMS/hemanth/Hemanth/file_operations-/"
folder_base_path = "E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/"
base_filename = 'outputFile.txt'
process_files_into_folders(directory_path, folder_base_path, base_filename)



--- File: DeepSpeed.csv ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile.txt1
-------------------------------

--- File: downloading.py ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile.txt1
-------------------------------

--- File: duckduckgo.py ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile.txt1
-------------------------------

--- File: extractall.py ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile.txt1
-------------------------------

--- File: functions1.csv ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile.txt1
-------------------------------

--- File: paramiko.json ---
Content written to E:/LLMS/hemanth/Hemanth/file_operations-/dataset_processing/Hemanth/outputFile.txt1
-------------------------------

--- File: Pre_

In [7]:
!pip install -U gradio gtts


Collecting gradio
  Obtaining dependency information for gradio from https://files.pythonhosted.org/packages/68/6c/28d4a841651b32b2e8b4bdc55cfe96785e5cce63ce2c07d039df2c0abfd3/gradio-4.21.0-py3-none-any.whl.metadata
  Downloading gradio-4.21.0-py3-none-any.whl.metadata (15 kB)
Collecting gradio-client==0.12.0 (from gradio)
  Obtaining dependency information for gradio-client==0.12.0 from https://files.pythonhosted.org/packages/cd/4d/5b430cc0fbb19b20368e9cd791700270c9551dab7234e6501b1587c414de/gradio_client-0.12.0-py3-none-any.whl.metadata
  Downloading gradio_client-0.12.0-py3-none-any.whl.metadata (7.1 kB)
Collecting python-multipart>=0.0.9 (from gradio)
  Obtaining dependency information for python-multipart>=0.0.9 from https://files.pythonhosted.org/packages/3d/47/444768600d9e0ebc82f8e347775d24aef8f6348cf00e9fa0e81910814e6d/python_multipart-0.0.9-py3-none-any.whl.metadata
  Downloading python_multipart-0.0.9-py3-none-any.whl.metadata (2.5 kB)
Collecting ruff>=0.2.2 (from gradio)
  O

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ragatouille 0.0.7.post2 requires ruff<0.2.0,>=0.1.9, but you have ruff 0.3.2 which is incompatible.

[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import gradio as gr


In [None]:
import gradio as gr
from IPython.display import Javascript
from google.colab import output
from base64 import b64decode
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from gtts import gTTS

# JavaScript code to record audio from the user's microphone for a specified amount of time
RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec=60):
  """
  This function records audio from the user's microphone for a specified amount of time.
  The audio is saved as a .wav file.

  Input:
  sec: int, the number of seconds to record

  Output:
  'audio.wav': str, the filename of the recorded audio
  """
  print('Recording Your Voice')
  display(Javascript(RECORD))
  s = output.eval_js('record(%d)' % (sec*1000))
  b = b64decode(s.split(',')[1])
  with open('audio.wav','wb') as f:
    f.write(b)
  print('Stopped Recording')
  return 'audio.wav'

def speech_to_text(audio_data):
  """
  This function converts speech in an audio file to text using a pre-trained model from Hugging Face.

  Input:
  audio_data: str, the filename of the audio file

  Output:
  result["text"]: str, the transcribed text
  """
  device = "cpu"
  torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

  model_id = "openai/whisper-large-v3"

  model = AutoModelForSpeechSeq2Seq.from_pretrained(
      model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=False, use_safetensors=True
  )
  model.to(device)

  processor = AutoProcessor.from_pretrained(model_id)

  pipe = pipeline(
      "automatic-speech-recognition",
      model=model,
      tokenizer=processor.tokenizer,
      feature_extractor=processor.feature_extractor,
      max_new_tokens=128,
      chunk_length_s=30,
      batch_size=16,
      return_timestamps=True,
      torch_dtype=torch_dtype,
      device=device,
  )

  result = pipe(audio_data)
  return result["text"]

def text_to_speech(text):
  """
  This function converts text to speech using the gTTS library.
  The speech is saved as an .mp3 file.

  Input:
  text: str, the text to convert to speech

  Output:
  filename: str, the filename of the speech audio
  """
  tts = gTTS(text=text, lang='en')
  filename = "speech.mp3"
  tts.save(filename)
  return filename

def get_answer(user_query):
  """
  This function takes a user query in the form of text or audio and returns an answer in the form of text or audio.

  Input:
  user_query: str or audio file, the user's query

  Output:
  answer or audio_answer: str or audio file, the answer to the user's query
  """
  if isinstance(user_query, str):
    # Text input
    retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=5)
    print(user_query)
  else:
    # Audio input
    audio_file = record()
    user_query = speech_to_text(audio_file)
    retrieved_docs = KNOWLEDGE_VECTOR_DATABASE.similarity_search(query=user_query, k=5)

  RAG_PROMPT_TEMPLATE = tokenizer.apply_chat_template(
    prompt_in_chat_format, tokenize=False, add_generation_prompt=True
  )
  retrieved_docs_text = [doc.page_content for doc in retrieved_docs]
  context = "\nExtracted documents:\n"
  context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(retrieved_docs_text)])
  final_prompt = RAG_PROMPT_TEMPLATE.format(question=user_query, context=context)

  answer = READER_LLM(final_prompt)[0]["generated_text"]

  output_format = gr.Radio(["Text", "Audio"], label="Output Format", value="Text")
  if output_format == "Audio":
    audio_answer = text_to_speech(answer)
    return audio_answer
  else:
    return answer

demo = gr.Interface(
    fn=get_answer,
    inputs=[
        gr.Textbox(lines=7, label="Enter your text query"),
        gr.Audio(label="Record your audio query")
    ],
    outputs=[
        gr.Textbox(label="Answer"),
        gr.Audio(label="Audio Answer")
    ]
)

demo.launch()
