In [None]:
!pip install -q PyMuPDF  datasets python-pptx langchain sentence-transformers  accelerate bitsandbytes faiss-gpu openpyxl  pacmap


In [None]:
!pip install -q arxiv


In [None]:
# Standard Library imports
import os
import re
from typing import List
from urllib.parse import quote_plus
import urllib.request
import re
from datetime import date, timedelta

# Third-party imports
import requests
import arxiv
# Define constants and types
# Search phrase shared by the web search and the arXiv search below.
# (Spelling fixed: "ADVERSARICAL" -> "ADVERSARIAL", otherwise both searches
# look for a word that does not exist.)
QUERY: str = "ADVERSARIAL PAPERS ON COMPUTER VISION AND LLMS"
# Upper bound on files fetched by each download step.
MAX_FILES: int = 200
# Results page that is scraped for direct PDF links.
SEARCH_ENGINE_URL: str = "https://www.google.com/search"
# Folder that receives the arXiv PDFs.
DOWNLOAD_FOLDER: str = "downloaded_files"

# Functions
def create_directory(directory_name: str) -> None:
    """Ensure ``directory_name`` exists, creating missing parent folders too."""
    if os.path.isdir(directory_name):
        # Already present: nothing to do.
        return
    os.makedirs(directory_name, exist_ok=True)

def download_files(urls: List[str], folder: str) -> None:
    """Download a list of files to a specified folder.

    At most ``MAX_FILES`` URLs are processed. A failure on one URL (network
    error, HTTP error status, malformed URL, or a file-system error while
    writing) is reported and the loop moves on to the next URL.

    Parameters:
    urls (List[str]): The URLs to fetch.
    folder (str): Existing directory that receives the downloaded files.
    """
    # Local import: only this function needs these helpers.
    from urllib.parse import unquote, urlsplit

    for i, url in enumerate(urls):
        if i >= MAX_FILES:
            break

        try:
            # Use only the URL *path* for the name so query strings and
            # fragments ("paper.pdf?download=1#page=2") do not leak into it.
            file_name = os.path.basename(unquote(urlsplit(url).path))
            if not file_name:
                # URL ended with "/" (or had no path): invent a stable name
                # instead of trying to open the folder itself for writing.
                file_name = f"download_{i + 1}.pdf"
            file_path = os.path.join(folder, file_name)

            with requests.get(url, timeout=30, stream=True) as response:
                response.raise_for_status()
                with open(file_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

            print(f"Downloaded {i + 1} of {MAX_FILES} files: {file_name}")
        except (requests.RequestException, OSError, ValueError) as e:
            # OSError: unwritable or invalid target path.
            # ValueError: urlsplit rejected a malformed URL.
            print(f"Failed to download {url}: {e}")

def search_and_download_pdfs(query: str) -> None:
    """Search for PDFs related to a query and download them.

    The results page at ``SEARCH_ENGINE_URL`` is fetched for ``query`` and
    scanned for absolute ``http(s)`` links ending in ``.pdf``. The links are
    downloaded into a folder named after the query (spaces replaced by
    underscores).

    Parameters:
    query (str): Free-text search phrase.

    Raises:
    requests.RequestException: If the results page cannot be fetched or
        returns an HTTP error status.
    """
    encoded_query = quote_plus(query)
    search_url = f"{SEARCH_ENGINE_URL}?q={encoded_query}"
    # Without a timeout a stalled connection would hang the notebook cell.
    response = requests.get(search_url, timeout=30)
    response.raise_for_status()
    html = response.text

    # Stop at quotes and angle brackets so a match cannot run from one HTML
    # attribute into the next, and match lazily up to the first ".pdf".
    pdf_pattern = r"https?://[^\s\"'<>]+?\.pdf\b"
    # dict.fromkeys removes repeated links while keeping first-seen order.
    pdf_urls = list(dict.fromkeys(re.findall(pdf_pattern, html)))

    folder_name = query.replace(" ", "_")
    create_directory(folder_name)
    download_files(pdf_urls, folder_name)
def download_papers(query, max_results, save_dir):
    """
    This function downloads papers from arXiv based on the provided query.

    Results are requested sorted by submission date. Each paper's title is
    printed before its PDF is fetched. A download that fails with an
    I/O or network error is reported and skipped, so the remaining papers
    are still attempted.

    Parameters:
    query (str): The search query for the papers.
    max_results (int): The maximum number of results to return.
    save_dir (str): The directory where the papers will be saved.
    """
    # Construct the default API client
    client = arxiv.Client()

    # Create a search object
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    # Get the results as a list
    results = list(client.results(search))

    # Create the target folder if needed. exist_ok avoids the race in a
    # separate exists-check and matches create_directory().
    os.makedirs(save_dir, exist_ok=True)

    # Loop through the results
    for result in results:
        # Print the title of the paper
        print(result.title)
        try:
            # Download the paper as a PDF file and save it to the directory
            result.download_pdf(save_dir)
        except OSError as e:
            # NOTE(review): assumes download failures surface as OSError
            # (urllib's URLError/HTTPError are subclasses) - confirm against
            # the installed arxiv version.
            print(f"Failed to download '{result.title}': {e}")



# Scrape the search-engine results page for QUERY and download any linked PDFs
# into a folder named after the query.
search_and_download_pdfs(query=QUERY)

# Fetch up to MAX_FILES arXiv papers matching QUERY into DOWNLOAD_FOLDER.
download_papers(query=QUERY,max_results=MAX_FILES,save_dir=DOWNLOAD_FOLDER)


In [None]:
import os
import csv
from typing import Dict, List ,Optional
from collections import defaultdict
from pathlib import Path
import pandas as pd
import fitz  # PyMuPDF
import json
import yaml
import re
from pptx import Presentation
from io import StringIO
from datasets import load_dataset,Dataset
from tqdm.notebook import tqdm
import torch
import pandas as pd
from typing import Optional, List, Tuple

from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer
from langchain.vectorstores.faiss import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from langchain.docstore.document import Document as LangchainDocument

def _read_text(handle):
    """Return the complete contents of an already-open file handle."""
    return handle.read()


# Maps a file extension (with leading dot) to a callable that takes an open
# file handle and returns the parsed content. The third-party readers stay
# wrapped in lambdas so their names are only looked up when a reader is called.
EXTENSION_READERS = {
    '.ipynb': json.load,
    **dict.fromkeys(('.md', '.py'), _read_text),
    '.csv': lambda handle: pd.read_csv(handle),
    '.json': json.load,
    '.yaml': lambda handle: yaml.safe_load(handle),
    **dict.fromkeys(
        (
            '.txt', '.xml', '.html', '.css', '.js', '.java',
            '.cpp', '.h', '.php', '.rb', '.sql',
        ),
        _read_text,
    ),
    **dict.fromkeys(('.xls', '.xlsx'), lambda handle: pd.read_excel(handle)),
    **dict.fromkeys(('.ppt', '.pptx'), lambda handle: read_pptx(handle)),
}




def read_pptx(file):
    """Return the text of every shape in a presentation, one shape per line.

    ``file`` is passed straight to ``Presentation``. Shapes that expose no
    ``text`` attribute are skipped.
    """
    presentation = Presentation(file)
    shape_texts = [
        shape.text
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    ]
    return "\n".join(shape_texts)

# Regular expression matching file names that end in one of the extensions
# handled by EXTENSION_READERS. It is applied with re.match in
# list_files_with_extensions, and the match is case-sensitive.
EXTENSION_PATTERN = r".*\.(md|py|csv|json|yaml|txt|xml|html|css|js|java|cpp|h|php|rb|sql|xls|xlsx|ppt|pptx|ipynb)$"


def get_files_with_extensions(dir_path: str) -> Dict[str, List[str]]:
    """Walk ``dir_path`` recursively and group the files found by extension.

    Returns a ``defaultdict`` mapping each extension (including the leading
    dot, or ``""`` for files without one) to the list of file paths, with
    backslashes normalised to forward slashes.
    """
    grouped = defaultdict(list)
    for current_dir, _subdirs, file_names in os.walk(dir_path):
        for name in file_names:
            extension = os.path.splitext(name)[1]
            normalised_path = os.path.join(current_dir, name).replace("\\", "/")
            grouped[extension].append(normalised_path)
    return grouped


def write_to_csv(file_path: str, ext_files: Dict[str, List[str]]) -> None:
    """Write the extension -> paths mapping as a column-per-extension CSV.

    The header row holds the extensions. Each later row holds the i-th path
    of every extension, padded with ``''`` where a column has fewer entries.
    An empty mapping produces a file containing only the (empty) header row
    instead of raising.

    Parameters:
    - file_path (str): Destination CSV path (overwritten if it exists).
    - ext_files (Dict[str, List[str]]): Mapping of extension to file paths.
    """
    # Local import: only this function needs it.
    from itertools import zip_longest

    with open(file_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(ext_files.keys())
        # zip_longest transposes the columns into rows and pads the short
        # ones. With no columns it yields nothing, so no max() on an empty
        # sequence is needed.
        writer.writerows(zip_longest(*ext_files.values(), fillvalue=''))


def clean_text(text: str) -> str:
    """
    Normalise the whitespace of text extracted from a PDF.

    Every run of whitespace (spaces, tabs, line breaks) becomes a single
    space, and leading/trailing whitespace is dropped. Further cleaning
    rules can be added here.

    Parameters:
    - text (str): The text extracted from the PDF.

    Returns:
    - str: The cleaned text.
    """
    words = text.split()  # split() with no argument discards all whitespace runs
    return ' '.join(words)

def split_and_save_text(cleaned_text: str, base_output_path: Path, max_size_bytes: int = 50 * 1024 * 1024) -> None:
    """
    Split the cleaned text into multiple files on line boundaries and save them.

    Parts are written next to ``base_output_path`` as
    ``<name>.part1.txt``, ``<name>.part2.txt``, ... A new part is started
    when adding the next line would reach ``max_size_bytes``. A single line
    that is itself larger than the limit is kept whole in its own part, so
    that part can exceed the limit.

    Parameters:
    - cleaned_text (str): The cleaned text to be split and saved.
    - base_output_path (Path): The base path where the text files will be saved.
    - max_size_bytes (int): Maximum size of the text file in bytes.
    """
    def _write_part(lines, part_num):
        # Append to the full name rather than using with_suffix(): stems that
        # contain a dot (e.g. "2401.01234v1.Some_Title") would otherwise lose
        # everything after their last dot.
        output_path = base_output_path.with_name(
            f'{base_output_path.name}.part{part_num}.txt'
        )
        with open(output_path, 'w', encoding='utf-8') as file:
            file.writelines(lines)

    part_num = 1
    pending_lines = []
    pending_bytes = 0  # running UTF-8 size; avoids re-encoding the whole part per line
    for line in cleaned_text.split('\n'):
        line_bytes = len(line.encode('utf-8'))
        # Only flush when there is something to flush, so an oversized first
        # line no longer produces an empty part file.
        if pending_lines and pending_bytes + line_bytes >= max_size_bytes:
            _write_part(pending_lines, part_num)
            part_num += 1
            pending_lines = []
            pending_bytes = 0
        pending_lines.append(line + '\n')
        pending_bytes += line_bytes + 1  # +1 for the newline

    # Save the last part
    if pending_lines:
        _write_part(pending_lines, part_num)

def convert_pdf_to_text(pdf_path: str, output_folder: str) -> None:
    """
    Convert a PDF file to one or more text files in ``output_folder``.

    The text of all pages is concatenated, passed through ``clean_text`` and
    handed to ``split_and_save_text``, which names the outputs after the PDF's
    file stem. Any error is reported on stdout and swallowed so that a batch
    of PDFs keeps going.

    Parameters:
    - pdf_path (str): Path to the PDF file.
    - output_folder (str): Path to the folder where the text files will be saved.
    """
    # Ensure the output folder exists, create it if it does not
    output_folder_path = Path(output_folder)
    output_folder_path.mkdir(parents=True, exist_ok=True)

    try:
        # The context manager closes the document even when a page fails to
        # extract. join() avoids repeated string concatenation.
        with fitz.open(pdf_path) as doc:
            text = "".join(page.get_text() for page in doc)

        cleaned_text = clean_text(text)

        base_output_path = output_folder_path / Path(pdf_path).stem
        split_and_save_text(cleaned_text, base_output_path)
    except Exception as e:
        # Deliberately broad: one unreadable PDF must not stop the batch.
        print(f"An error occurred while converting {pdf_path}: {str(e)}")

def process_pdfs_from_csv(csv_path: str, output_folder: str) -> None:
    """
    Convert every PDF listed in the ``.pdf`` column of a CSV file to text.

    The CSV is expected to have one column per file extension. Columns
    shorter than the longest one are padded with empty cells, which pandas
    reads back as NaN, so those are skipped. If the CSV is empty or has no
    ``.pdf`` column, a message is printed and nothing is converted.

    Parameters:
    - csv_path (str): Path to the CSV file containing paths to PDF files.
    - output_folder (str): Path to the folder where text files will be stored.
    """
    try:
        pdf_paths = pd.read_csv(csv_path, encoding='latin1')
    except pd.errors.EmptyDataError:
        # Happens when the scanned directory contained no files at all.
        print(f"No file listing found in {csv_path}; nothing to convert.")
        return

    if '.pdf' not in pdf_paths.columns:
        print(f"No '.pdf' column found in {csv_path}; nothing to convert.")
        return

    # dropna(): padding cells are not file paths.
    for pdf_path in pdf_paths['.pdf'].dropna():
        convert_pdf_to_text(pdf_path, output_folder)




def list_files_with_extensions(directory_path):
    """Return the entries of ``directory_path`` whose names match EXTENSION_PATTERN.

    Only the directory itself is listed (no recursion). If the directory does
    not exist, a message is printed and ``None`` is returned.
    """
    try:
        entries = os.listdir(directory_path)
    except FileNotFoundError:
        print(f"The directory {directory_path} was not found.")
        return None

    matching = []
    for entry in entries:
        if re.match(EXTENSION_PATTERN, entry):
            matching.append(entry)
    return matching

def read_file_content(directory_path, filename):
    """Read ``filename`` from ``directory_path`` with the reader for its extension.

    Returns whatever the matching ``EXTENSION_READERS`` entry produces, or
    ``None`` when the extension has no reader or reading fails (the error is
    printed).
    """
    # Excel and PowerPoint files are binary containers. Opening them in UTF-8
    # text mode makes their readers fail, so they get a binary handle.
    binary_extensions = {'.xls', '.xlsx', '.ppt', '.pptx'}

    extension = os.path.splitext(filename)[1]
    file_reader = EXTENSION_READERS.get(extension)
    if file_reader is None:
        # Unsupported type: do not bother opening the file.
        return None

    file_path = os.path.join(directory_path, filename)
    try:
        if extension in binary_extensions:
            with open(file_path, 'rb') as file:
                return file_reader(file)
        with open(file_path, 'r', encoding='utf-8') as file:
            return file_reader(file)
    except Exception as e:
        # Deliberately broad: a single unreadable file must not stop the batch.
        print(f"An error occurred while reading the file {filename}: {e}")
        return None



def _content_to_text(content) -> str:
    """Render a value returned by an EXTENSION_READERS reader as plain text."""
    if isinstance(content, pd.DataFrame):
        # CSV without the index column.
        return content.to_csv(index=False)
    if isinstance(content, dict):
        # default=str: YAML can yield values (e.g. dates) that json cannot
        # serialise natively. Stringify them instead of aborting the dump.
        return json.dumps(content, indent=4, default=str)
    return str(content)


def process_files_txtfile(directory_path: str, user_folder: str, txt_file_counter: int = 1, txt_file_size: int = 0, max_size_bytes: int = 50 * 1024 * 1024) -> Tuple[int, int]:
    """
    Recursively process all supported files in a directory tree and append
    their content to numbered text files (``1.txt``, ``2.txt``, ...).

    The output folder is ``os.path.join(directory_path, user_folder)``, so an
    absolute ``user_folder`` gives one shared output folder, while a relative
    one creates an output folder inside every visited directory. A new
    numbered file is started once the current one has reached
    ``max_size_bytes``. The check happens before each write, so a file can
    exceed the limit by the size of the last content appended to it. The
    output folder itself is never descended into.

    Args:
    directory_path (str): The path to the directory containing the files to be processed.
    user_folder (str): The name of the user-specific folder where the text files will be written.
    txt_file_counter (int): The counter for the current text file.
    txt_file_size (int): The current size of the text file, in bytes.
    max_size_bytes (int): Size at which a new text file is started.

    Returns:
    Tuple[int, int]: The updated ``(txt_file_counter, txt_file_size)``, also
    when the directory could not be listed, so recursive callers can always
    unpack the result.
    """
    # Create the user-specific folder if it doesn't exist
    user_folder_path = os.path.join(directory_path, user_folder)
    os.makedirs(user_folder_path, exist_ok=True)

    # Get a list of all files in the directory
    files = list_files_with_extensions(directory_path)

    if files is None:
        return txt_file_counter, txt_file_size

    for filename in files:
        content = read_file_content(directory_path, filename)
        if content is None:
            continue

        # Roll over to a new text file once the size limit has been reached
        if txt_file_size >= max_size_bytes:
            txt_file_counter += 1
            txt_file_size = 0

        text = _content_to_text(content)

        # Explicit encoding: the platform default may not be able to encode
        # every character that was read as UTF-8.
        txt_file_path = os.path.join(user_folder_path, f"{txt_file_counter}.txt")
        with open(txt_file_path, "a", encoding="utf-8") as f:
            f.write(text)
        # Track bytes, not characters, since the limit is expressed in bytes.
        txt_file_size += len(text.encode("utf-8"))

        print(f"--- File: {filename} ---")
        print(f"Content written to {txt_file_path}")
        print("-------------------------------\n")

    # Recursively process subdirectories, skipping the output folder.
    # Otherwise the generated N.txt files would be re-ingested and the
    # recursion would never terminate.
    output_real_path = os.path.realpath(user_folder_path)
    for subdir in os.listdir(directory_path):
        subdir_path = os.path.join(directory_path, subdir)
        if not os.path.isdir(subdir_path):
            continue
        if os.path.realpath(subdir_path) == output_real_path:
            continue
        txt_file_counter, txt_file_size = process_files_txtfile(
            subdir_path,
            user_folder,
            txt_file_counter,
            txt_file_size,
            max_size_bytes=max_size_bytes,
        )

    return txt_file_counter, txt_file_size
def loading_folder_using_datasets(folder_path:str):
    """Load the files matching ``<folder_path>/*.txt`` with the ``text`` dataset builder."""
    txt_glob = folder_path + '/*.txt'
    return load_dataset('text', data_files=txt_glob)

##=========================||    Extraction  OF DATA         ||==================================
# Source folder: where download_papers() stored the arXiv PDFs.
dir_path =f"/content/{DOWNLOAD_FOLDER}"
dir_output='/content/output' # folder that receives the extracted .txt files; replace with your directory path
csv_file_path = '/content/csvfile.csv'  # index of discovered files, one column per extension; replace with your CSV file path
# 1. Index every file under dir_path by extension and persist the index as CSV.
ext_files = get_files_with_extensions(dir_path)
write_to_csv(csv_file_path, ext_files)
# 2. Convert the PDFs listed in the CSV's '.pdf' column to text files in dir_output.
process_pdfs_from_csv(csv_path=csv_file_path, output_folder=dir_output)
# 3. Append the content of the other supported file types to numbered .txt files.
#    dir_output is absolute, so os.path.join inside the function resolves to dir_output itself.
process_files_txtfile(dir_path,  dir_output)
# 4. Load the .txt files in dir_output as a `text` dataset.
dataset=loading_folder_using_datasets(folder_path=dir_output)
dataset


In [None]:
import pandas as pd

# Show the "train" split as a pandas DataFrame.
pd.DataFrame(dataset["train"])


In [None]:
from typing import Dict, List, Union, Callable
import pandas as pd
from datasets import load_dataset, Dataset

def create_dataframe(
    dataset: "DatasetDict",
    subset: str = "train",
    columns: Optional[List[str]] = None,
    transformations: Optional[Dict[str, Callable]] = None,
    max_rows: int = 100,
) -> pd.DataFrame:
    """
    Create a DataFrame from the first rows of a dataset split, with plotting metadata.

    Each output row holds the selected columns of the source row (after any
    transformation), plus the constant plotting fields ``z``, ``source``,
    ``extract``, ``type``, ``symbol`` and ``size_col``. If a selected column
    shares a name with one of those fields, the plotting field wins.

    Args:
        dataset: Mapping of split name to a split object that exposes
            ``column_names``, ``num_rows`` and integer row indexing
            (e.g. a loaded ``DatasetDict``).
        subset (str): The split to use (e.g. "train", "test", "validation"). Default is "train".
        columns (List[str]): Columns to include. If None, all columns of the split are included.
        transformations (Dict[str, Callable]): Column name -> function applied to that column's
            value. Entries for columns not in ``columns`` are ignored. None means no transformation.
        max_rows (int): Maximum number of rows to take from the start of the split.
            Splits with fewer rows are used in full.

    Returns:
        pd.DataFrame: The created DataFrame (empty if the split has no rows).

    Raises:
        ValueError: If ``subset`` is not a key of ``dataset``.
    """
    if subset not in dataset:
        raise ValueError(f"Subset '{subset}' not found in the dataset.")

    data = dataset[subset]
    print(data)

    if columns is None:
        columns = data.column_names
    if transformations is None:
        transformations = {}

    # Never index past the end of a split smaller than max_rows.
    row_count = max(0, min(max_rows, data.num_rows))

    records = []
    for i in range(row_count):
        row = data[i]  # fetch the row once instead of once per column
        record = {column: row[column] for column in columns}
        record.update(
            {
                "z": i,
                "source": f"dataset.column_names: {columns}",
                "extract": f"dataset.num_rows: {i + 1}/{data.num_rows}",
                "type": "scatter",
                "symbol": "circle",
                "size_col": 4,
            }
        )
        # Transformed values replace the raw column values set above.
        record.update(
            {
                column: transform(row[column])
                for column, transform in transformations.items()
                if column in columns
            }
        )
        records.append(record)

    return pd.DataFrame(records)

# Build a preview DataFrame from the "train" split of the text dataset loaded above.
dataset_name =dataset
subset = "train"
columns_to_include =['text']
# NOTE(review): the key "text" appears twice in this literal. Python keeps only
# the last entry, so the upper-casing lambda is dead and only the 500-character
# truncation is applied. Confirm which behaviour is intended.
transformations = {
    "text": lambda x: x.upper(),
    "text": lambda x: x[:500] + "...",
}

dataframe = create_dataframe(dataset_name, subset, columns_to_include, transformations)


In [None]:
# Display the DataFrame produced by create_dataframe.
dataframe


In [None]:
import plotly.express as px

# Plot one histogram per selected column of `dataframe`.
columns = columns_to_include

for column in columns:

    fig = px.histogram(dataframe, x=column, title=f'Histogram of {column}')

    # fig.show()
    # fig = px.scatter(dataframe, x=column, )

    fig.show()


In [None]:
# Reload the extracted text files and wrap every row's "text" field in a LangChain document.
dataset=loading_folder_using_datasets(folder_path=dir_output)
langchain_docs = [
    LangchainDocument(page_content=doc["text"])
    for doc in tqdm(dataset['train'])
]
# Starts empty; it is reassigned with the result of split_documents further down.
docs_processed=[]
# Model whose tokenizer split_documents uses by default.
EMBEDDING_MODEL_NAME = "thenlper/gte-small"
# Separators handed to the text splitter in split_documents.
# NOTE(review): presumably tried in order from coarsest (markdown headings)
# to finest (empty string) - confirm against the splitter's documentation.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

def split_documents(
    chunk_size: int,
    knowledge_base,
    tokenizer_name: Optional[str] = EMBEDDING_MODEL_NAME,
) -> List:
    """
    Split the documents in ``knowledge_base`` into chunks of at most
    ``chunk_size`` tokens and return the chunks with duplicates removed.

    Chunk length is measured with the tokenizer named ``tokenizer_name``.
    Consecutive chunks overlap by ``chunk_size / 10`` tokens. Two chunks
    count as duplicates when their ``page_content`` is identical, in which
    case the first occurrence is kept.

    Parameters:
    chunk_size (int): Maximum size of each chunk
    knowledge_base (List): List of documents to be processed
    tokenizer_name (str, optional): Name of the tokenizer. Defaults to EMBEDDING_MODEL_NAME.

    Returns:
    List: List of processed documents
    """
    text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        AutoTokenizer.from_pretrained(tokenizer_name),
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
    )

    # Split the documents that were passed in. Previously this argument was
    # ignored and the module-level `langchain_docs` was read instead.
    docs_processed = text_splitter.split_documents(list(knowledge_base))
    print("****completed*** all files files")

    # Remove duplicates, keeping the first occurrence of each text
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique


docs_processed = split_documents(
    512,  # We choose a chunk size adapted to our model
    langchain_docs,  # the documents to split (previously the empty `docs_processed` list was passed)
    tokenizer_name=EMBEDDING_MODEL_NAME,
)


In [None]:
# Inspect the first ten chunks.
docs_processed[0:10]


In [None]:
from typing import Any, List, Dict, Union
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_data_structures(*data_structures: Any) -> None:
    """
    Draw each argument in its own cell of a two-column Plotly subplot grid.

    Dicts give one bar per scalar value and one marker per element of a list
    value. Lists give one bar per scalar item and one point per nested list
    (plotted at the nested list's length). Anything else becomes a single bar.

    Args:
        *data_structures: Variable number of data structures of any type.

    Returns:
        None
    """
    panel_count = len(data_structures)
    num_cols = 2
    num_rows = (panel_count + 1) // 2

    # Create subplots
    fig = make_subplots(
        rows=num_rows,
        cols=num_cols,
        subplot_titles=[f"Data {k}" for k in range(1, panel_count + 1)],
    )

    def hover_text(item):
        # Nested lists are flattened into a comma-separated label.
        if isinstance(item, list):
            return ", ".join(str(x) for x in item)
        return str(item)

    def dict_traces(mapping):
        for key, value in mapping.items():
            if not isinstance(value, list):
                # Scalar value: one bar labelled with the key.
                yield go.Bar(
                    x=[key],
                    y=[value],
                    text=str(value),
                    textposition="auto",
                    name=key,
                )
                continue
            # List value: one marker per element, positioned by its 1-based index.
            for position, item in enumerate(value, start=1):
                yield go.Scatter(
                    x=[position],
                    y=[key],
                    mode="markers",
                    marker={"size": 10},
                    text=hover_text(item),
                    hoverinfo="text",
                    name=key,
                )

    def list_traces(sequence):
        for position, item in enumerate(sequence, start=1):
            if isinstance(item, list):
                # Nested list: a point whose height is the nested list's length.
                yield go.Scatter(
                    x=[position],
                    y=[len(item)],
                    mode="lines+markers",
                    text=hover_text(item),
                    hoverinfo="text",
                    name=f"List {position}",
                )
            else:
                yield go.Bar(
                    x=[position],
                    y=[item],
                    text=str(item),
                    textposition="auto",
                    name=f"Item {position}",
                )

    def scalar_traces(value):
        yield go.Bar(
            x=["Value"],
            y=[value],
            text=str(value),
            textposition="auto",
            name="Value",
        )

    for index, data in enumerate(data_structures):
        # Fill the grid left-to-right, top-to-bottom (Plotly cells are 1-based).
        grid_row, grid_col = divmod(index, num_cols)
        if isinstance(data, dict):
            traces = dict_traces(data)
        elif isinstance(data, list):
            traces = list_traces(data)
        else:
            traces = scalar_traces(data)
        for trace in traces:
            fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

    # Customize layout
    fig.update_layout(
        title="Statistical Analysis of Data Structures",
        height=num_rows * 400,
        showlegend=False,
    )

    # Display the plot
    fig.show()


# Summarise the first chunk (first 10 characters of its text and its start offset) and plot it.
# NOTE(review): strip("") removes nothing because the set of characters to strip
# is empty. strip() was presumably intended - confirm.
doc_json = {
        "attribute1":docs_processed[0].page_content.strip("")[:10],
        "attribute2":docs_processed[0].metadata['start_index'],
        # Add more attributes as needed
    }
# docs_processed_json.append(doc_json)

plot_data_structures(doc_json)


# # User can provide any combination and structure of data
# plot_data_structures(docs_processed)


In [None]:
# 'start_index' metadata of the first chunk (the splitter was built with add_start_index=True).
docs_processed[0].metadata['start_index']


In [None]:
import pandas as pd
from datasets import load_dataset
import plotly.graph_objects as go

# Load the dataset
dataset = load_dataset("fka/awesome-chatgpt-prompts")

# Convert the dataset to a Pandas DataFrame
df = pd.DataFrame(dataset["train"])

# Extract the values from each column
# NOTE(review): all three lists are copies of the same "act" column, so the
# scatter below plots "act" against itself - confirm whether another column
# (e.g. "prompt") was intended.
act = df["act"].tolist()
act1 = df["act"].tolist()
act2 = df["act"].tolist()

# Create the scatter plot
fig = go.Figure(data=go.Scatter(
    x=act,
    y=act1,
    mode="markers",
    marker=dict(
        size=8,
        # color=act2,
        colorscale="Viridis",
        showscale=True
    ),
    # One hover label per point, listing the three values.
    text=[f"Act: {a}<br>Act1: {a1}<br>Act2: {a2}" for a, a1, a2 in zip(act, act1, act2)],
    hoverinfo="text"
))

# Customize the layout
fig.update_layout(
    title="Advanced Visualization",
    xaxis_title="Act",
    yaxis_title="Act1",
    plot_bgcolor="white",
    hoverlabel=dict(
        font_size=14,
        font_family="Arial"
    )
)

# Display the plot
fig.show()


In [None]:
# Load the ASL alphabet dataset and print a summary of it.
dataset = load_dataset("Marxulia/asl_sign_languages_alphabets_v02")
print(dataset)
# Convert the "train" split to a Pandas DataFrame
df = pd.DataFrame(dataset["train"])

# Display the DataFrame.
df


In [None]:
import pandas as pd
# Load the dataset
dataset = load_dataset("fka/awesome-chatgpt-prompts")

# Remember which splits exist, then work on the "train" split only.
split_datset=dataset.keys()
dataset=dataset["train"]

# print(dats)
columns=dataset.column_names
total_rows=dataset.num_rows


print(dataset)

# Read each column once. Every record then gets the value of *its own* row;
# previously each record stored the complete "act" and "prompt" columns.
acts = dataset['act']
prompts = dataset['prompt']

pd.DataFrame.from_dict(
   [
       {
            "x": act,
            "y": prompt,
            'z':  i,
            "source":f"dataset.column_names{columns}" ,
            "extract":f"dataset.num_rows:{i}/{total_rows}",
            "type": "scatter",
            "symbol": "circle",
            "size_col": 4,
        }
       for i, (act, prompt) in enumerate(zip(acts, prompts))

   ]

)


In [None]:
from pathlib import Path
from typing import Dict, Any, List,Union,Optional
from datasets import (load_dataset,
                      DatasetDict,
                      concatenate_datasets
                      )


# Load the dataset
def load_and_prepare_dataset(
    input_source: Union[str, Path, Dict[str, List[Union[str, Path]]]],
    split_ratios: tuple = (0.8, 0.1, 0.1),
    seed: int = 42,
    streaming: bool = False
    ) -> DatasetDict:
    """
    Load a dataset from various input sources and prepare it by splitting into train, test, and eval sets.

    If the loaded data does not already contain all three of the splits
    "train", "test" and "eval", every available split is concatenated and
    re-split according to ``split_ratios``. No re-splitting happens in
    streaming mode.

    :param input_source: A dataset name, a path to a folder or file, or a dictionary mapping
        split names to lists of data files (csv, json, jsonl, parquet or txt).
    :param split_ratios: A tuple containing the ratios for train, test, and eval splits (default is (0.8, 0.1, 0.1)).
    :param seed: A random seed for reproducibility of the split (default is 42).
    :param streaming: Whether to use streaming to handle large files (default is False).
    :return: A DatasetDict containing the split datasets.
    :raises ValueError: If the split ratios are invalid, the input source type is unsupported,
        or no supported file format is found among the files of a split.

    Example:
    # Explicit files per split (streaming keeps the splits exactly as given):
    # dataset_dict = load_and_prepare_dataset({
    #     'train': ['train_file_1.csv', 'train_file_2.csv'],
    #     'test': ['test_file.csv'],
    #     'eval': ['eval_file.csv']
    # }, streaming=True)
    #
    # Hub dataset that only ships a "train" split; it is re-split 80/10/10:
    # dataset = load_and_prepare_dataset('fka/awesome-chatgpt-prompts')
    # DatasetDict({
    #     train: Dataset({features: ['act', 'prompt'], num_rows: ...})
    #     test: Dataset({features: ['act', 'prompt'], num_rows: ...})
    #     eval: Dataset({features: ['act', 'prompt'], num_rows: ...})
    # })
    """
    # Local import: only this function needs it.
    import math

    # Validate the ratios before any (potentially slow) download happens.
    if not streaming:
        train_size, test_size, eval_size = split_ratios
        ratios_in_range = all(0.0 < ratio < 1.0 for ratio in split_ratios)
        # isclose: sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0 in floating point.
        if not ratios_in_range or not math.isclose(train_size + test_size + eval_size, 1.0):
            raise ValueError("Split ratios must be between 0 and 1 and sum up to 1.")

    # Load dataset from different types of input sources
    if isinstance(input_source, (str, Path)):
        # Dataset name, single file or path to folder
        dataset = load_dataset(str(input_source), streaming=streaming)
        dataset = DatasetDict(dataset)
    elif isinstance(input_source, dict):
        # Dictionary with specified train, test, and eval files.
        # Map file suffixes to builder names: ".jsonl" is read by the "json"
        # builder and ".txt" by the "text" builder.
        builders_by_suffix = {
            '.csv': 'csv',
            '.json': 'json',
            '.jsonl': 'json',
            '.parquet': 'parquet',
            '.txt': 'text',
        }
        datasets = {}
        for split, files in input_source.items():
            if isinstance(files, (str, Path)):
                files = [files]
            file_names = [str(file) for file in files]
            builder = next(
                (
                    name
                    for suffix, name in builders_by_suffix.items()
                    if any(file_name.endswith(suffix) for file_name in file_names)
                ),
                None,
            )
            if builder is None:
                raise ValueError(f"No supported file format detected for files: {files}")
            # split="train" returns the dataset itself rather than a
            # DatasetDict wrapping a single "train" split, so the result is
            # not a DatasetDict of DatasetDicts.
            datasets[split] = load_dataset(
                builder, data_files=file_names, split="train", streaming=streaming
            )
        dataset = DatasetDict(datasets)
    else:
        raise ValueError("Input source should be a dataset name, path to a folder, a single file, multiple files, or a dictionary.")

    # Perform the split if needed and if not in streaming mode
    if not streaming:
        if "train" not in dataset or "test" not in dataset or "eval" not in dataset:
            # Pool every available split, then carve out train / test / eval.
            full_dataset = concatenate_datasets(list(dataset.values()))
            split_dataset = full_dataset.train_test_split(train_size=train_size, seed=seed)
            # Second split of the held-out part: its "test" side receives the
            # test_size share, its "train" side the remaining eval_size share.
            test_eval_split = split_dataset['test'].train_test_split(test_size=test_size / (test_size + eval_size), seed=seed)

            dataset = DatasetDict({
                "train": split_dataset["train"],
                "test": test_eval_split["test"],
                "eval": test_eval_split["train"]
            })

    return dataset



In [None]:

def print_dataset_details(dataset):
    """Print the features, size and descriptive metadata of ``dataset['train']``.

    The last line printed is the class names of the ``label`` feature.
    ``None`` is printed instead when the split has no ``label`` column or
    that feature has no ``names`` attribute, rather than raising.
    """
    train = dataset['train']

    # Print features of the training set
    print("Features of the training set:")
    print(train.features)

    # (printed label, attribute path relative to the train split), in output order.
    # "Dataset info description" is listed twice to keep the output unchanged.
    fields = [
        ("Number of rows in the training set:", "num_rows"),
        ("Column names of the training set:", "column_names"),
        ("Homepage of the training set:", "homepage"),
        ("Citation of the training set:", "citation"),
        ("Config name of the training set:", "config_name"),
        ("Description of the training set:", "description"),
        ("Dataset info description:", "info.description"),
        ("Dataset info download checksums:", "info.download_checksums"),
        ("Dataset info features:", "info.features"),
        ("Dataset info homepage:", "info.homepage"),
        ("Dataset info citation:", "info.citation"),
        ("Dataset info config name:", "info.config_name"),
        ("Dataset info description:", "info.description"),
    ]
    for label, attribute_path in fields:
        value = train
        for attribute in attribute_path.split("."):
            value = getattr(value, attribute)
        print(label, value)

    # Not every dataset has a "label" column with class names.
    features = train.features
    label_feature = features["label"] if "label" in features else None
    print("Label name:", getattr(label_feature, "names", None))

# Load the ASL alphabet dataset and split it into train / test / eval
dataset = load_and_prepare_dataset("Marxulia/asl_sign_languages_alphabets_v02")

# Print the features and metadata of its "train" split
print_dataset_details(dataset)


In [None]:
# Reload the dataset and touch the same attributes that print_dataset_details prints.
# NOTE(review): these are bare expressions. In a notebook cell presumably only
# the value of the last one is displayed - wrap the others in print() if
# their values are wanted.
dataset=load_and_prepare_dataset("Marxulia/asl_sign_languages_alphabets_v02")

dataset['train'].features
dataset['train'].num_rows
dataset['train'].column_names
dataset['train'].homepage
dataset['train'].citation
dataset['train'].config_name
dataset['train'].description
dataset['train'].info.description
dataset['train'].info.download_checksums
dataset['train'].info.features
dataset['train'].info.homepage
dataset['train'].info.citation
dataset['train'].info.config_name
dataset['train'].info.description
dataset['train'].features["label"].names


In [None]:
from typing import Dict, List, Union, Callable
import pandas as pd
from datasets import load_dataset, Dataset

def create_dataframe(
    dataset: Union[str, Dataset],
    subset: str = "train",
    columns: List[str] = None,
    transformations: Dict[str, Callable] = None,
    num_rows: int = 100,
) -> pd.DataFrame:
    """
    Create a DataFrame from the leading rows of a dataset subset, with extra metadata columns.

    Every output row contains the requested columns followed by the keys "z" (the row index),
    "source", "extract", "type", "symbol" and "size_col". A metadata key overwrites a requested
    column of the same name; transformed values then overwrite the raw column values.

    Args:
        dataset (Union[str, Dataset]): The dataset to load. It can be either a string representing the dataset name
                                       (passed to `load_dataset`) or a pre-loaded object that is indexed by subset
                                       name. NOTE(review): the code does `dataset[subset]`, so a split-keyed
                                       container is presumably expected rather than a single split — confirm.
        subset (str): The subset of the dataset to use (e.g., "train", "test", "validation"). Default is "train".
        columns (List[str]): The list of columns to include in the DataFrame. If None, all columns will be included.
        transformations (Dict[str, Callable]): A dictionary specifying the transformations to apply to each column.
                                               The keys are the column names, and the values are the transformation
                                               functions. Keys not present in `columns` are ignored. If None, no
                                               transformation is applied.
        num_rows (int): Maximum number of leading rows to include. Default is 100. The value is capped at the
                        number of rows available in the subset.

    Returns:
        pd.DataFrame: The created DataFrame.

    Raises:
        ValueError: If `subset` is not found in the dataset.
    """
    if isinstance(dataset, str):
        dataset = load_dataset(dataset)

    if subset not in dataset:
        raise ValueError(f"Subset '{subset}' not found in the dataset.")

    data = dataset[subset]
    # Print the selected subset so its summary shows up in the cell output.
    print(data)

    if columns is None:
        columns = data.column_names

    # A missing mapping means "no transformations"; iterating over None would raise TypeError.
    # Resolve once which transformations target a requested column.
    active_transformations = {
        column: func
        for column, func in (transformations or {}).items()
        if column in columns
    }

    # Never index past the end of a subset that has fewer rows than requested.
    row_count = min(num_rows, data.num_rows)

    rows = []
    for i in range(row_count):
        # Fetch the row once instead of once per column.
        record = data[i]

        row = {column: record[column] for column in columns}
        row.update(
            {
                "z": i,
                "source": f"dataset.column_names: {columns}",
                "extract": f"dataset.num_rows: {i + 1}/{data.num_rows}",
                "type": "scatter",
                "symbol": "circle",
                "size_col": 4,
            }
        )
        # Transformations are applied to the raw value and win over everything set so far.
        row.update(
            {column: func(record[column]) for column, func in active_transformations.items()}
        )
        rows.append(row)

    return pd.DataFrame(rows)

# Example run: sample the "b-mc2/sql-create-context" train subset, upper-casing
# each question and cutting each context down to its first 500 characters.
dataset_name = "b-mc2/sql-create-context"
subset = "train"
columns_to_include = ["context", "answer", "question"]
transformations = {
    "question": lambda text: text.upper(),
    "context": lambda text: text[:500] + "...",
}

dataframe = create_dataframe(
    dataset=dataset_name,
    subset=subset,
    columns=columns_to_include,
    transformations=transformations,
)


In [None]:
# Show the DataFrame produced by the previous cell.
dataframe


In [None]:
from typing import Dict, List, Union, Callable, Optional
import pandas as pd
from datasets import load_dataset, Dataset
from tqdm import tqdm

def create_dataframe(
    dataset: Union[str, Dataset],
    subset: str = "train",
    columns: Optional[List[str]] = None,
    transformations: Optional[Dict[str, Callable]] = None,
    num_rows: Optional[int] = None,
    progress_bar: bool = True,
) -> pd.DataFrame:
    """
    Create a DataFrame from a given dataset with specified columns and transformations.

    Every output row contains the requested columns followed by the keys "z" (the row index),
    "source", "extract", "type", "symbol" and "size_col". A metadata key overwrites a requested
    column of the same name; transformed values then overwrite the raw column values.

    Args:
        dataset (Union[str, Dataset]): The dataset to load. It can be either a string representing the dataset name
                                       (passed to `load_dataset`) or a pre-loaded object that is indexed by subset
                                       name. NOTE(review): the code does `dataset[subset]`, so a split-keyed
                                       container is presumably expected rather than a single split — confirm.
        subset (str): The subset of the dataset to use (e.g., "train", "test", "validation"). Default is "train".
        columns (Optional[List[str]]): The list of columns to include in the DataFrame. If None, all columns will be included.
        transformations (Optional[Dict[str, Callable]]): A dictionary specifying the transformations to apply to each column.
                                                         The keys are the column names, and the values are the transformation functions.
                                                         Keys not present in `columns` are ignored.
        num_rows (Optional[int]): The number of leading rows to include in the DataFrame. If None, all rows will be
                                  included. Values larger than the subset are capped at the subset's row count.
        progress_bar (bool): Whether to display a progress bar during the DataFrame creation process. Default is True.

    Returns:
        pd.DataFrame: The created DataFrame.

    Raises:
        ValueError: If `subset` is not found in the dataset.
    """
    if isinstance(dataset, str):
        dataset = load_dataset(dataset)

    if subset not in dataset:
        raise ValueError(f"Subset '{subset}' not found in the dataset.")

    data = dataset[subset]

    if columns is None:
        columns = data.column_names

    total_rows = data.num_rows
    if num_rows is None:
        num_rows = total_rows
    else:
        # Cap the request so the loop cannot index past the end of the subset.
        num_rows = min(num_rows, total_rows)

    # Resolve once which transformations target a requested column.
    active_transformations = {
        column: func
        for column, func in (transformations or {}).items()
        if column in columns
    }

    rows = []
    for i in tqdm(range(num_rows), disable=not progress_bar, desc=f"Creating DataFrame from {subset} subset"):
        # Fetch the row once instead of once per column and once more per transformation.
        record = data[i]

        row = {column: record[column] for column in columns}
        row.update(
            {
                "z": i,
                "source": f"dataset.column_names: {columns}",
                "extract": f"dataset.num_rows: {i + 1}/{total_rows}",
                "type": "scatter",
                "symbol": "circle",
                "size_col": 4,
            }
        )
        # Transformations are applied to the raw value and win over everything set so far.
        row.update(
            {column: func(record[column]) for column, func in active_transformations.items()}
        )

        rows.append(row)

    return pd.DataFrame(rows)

# Example run: take the first 100 rows of the "b-mc2/sql-create-context" train
# subset, upper-casing each question and cutting each context to 500 characters.
dataset_name = "b-mc2/sql-create-context"
subset = "train"
columns_to_include = ["context", "answer", "question"]
transformations = {
    "question": lambda text: text.upper(),
    "context": lambda text: text[:500] + "...",
}

dataframe = create_dataframe(
    dataset=dataset_name,
    subset=subset,
    columns=columns_to_include,
    transformations=transformations,
    num_rows=100,
    progress_bar=True,
)


In [None]:
# Show the DataFrame produced by the previous cell.
dataframe


In [None]:
import plotly.express as px

# Plot the same columns that were selected when the DataFrame was built.
columns = columns_to_include

# Build and show one histogram figure per selected column.
for column in columns:

    fig = px.histogram(dataframe, x=column, title=f'Histogram of {column}')

    fig.show()



In [None]:
import plotly.express as px

# Scatter plot of the 'context' column of 'dataframe'. Only x is passed.
# NOTE(review): presumably Plotly falls back to the DataFrame index for y — confirm.
fig = px.scatter(dataframe, x='context', )

fig.show()


In [None]:
from typing import Dict, List, Union, Callable
import pandas as pd
from datasets import load_dataset, Dataset

def create_dataframe(
    dataset: Union[str, Dataset],
    subset: str = "train",
    columns: List[str] = None,
    transformations: Dict[str, Callable] = None,
) -> pd.DataFrame:
    """
    Create a DataFrame from a given dataset with specified columns and transformations.

    All rows of the chosen subset are included; each output row holds only the requested
    columns, with a column's transformation (if any) applied to its raw value.

    Args:
        dataset (Union[str, Dataset]): The dataset to load. It can be either a string representing the dataset name
                                       (passed to `load_dataset`) or a pre-loaded object that is indexed by subset
                                       name. NOTE(review): the code does `dataset[subset]`, so a split-keyed
                                       container is presumably expected rather than a single split — confirm.
        subset (str): The subset of the dataset to use (e.g., "train", "test", "validation"). Default is "train".
        columns (List[str]): The list of columns to include in the DataFrame. If None, all columns will be included.
        transformations (Dict[str, Callable]): A dictionary specifying the transformations to apply to each column.
                                               The keys are the column names, and the values are the transformation
                                               functions. Keys not present in `columns` are ignored. If None or
                                               empty, values are copied unchanged.

    Returns:
        pd.DataFrame: The created DataFrame.

    Raises:
        ValueError: If `subset` is not found in the dataset.
    """
    if isinstance(dataset, str):
        dataset = load_dataset(dataset)

    if subset not in dataset:
        raise ValueError(f"Subset '{subset}' not found in the dataset.")

    data = dataset[subset]

    if columns is None:
        columns = data.column_names

    # Resolve once which requested columns have a transformation, instead of
    # re-checking the mapping for every cell.
    if transformations:
        applicable = {
            column: transformations[column]
            for column in columns
            if column in transformations
        }
    else:
        applicable = {}

    dataframe_data = []
    for i in range(data.num_rows):
        # Fetch the row once; indexing the subset again for each column repeats the work.
        record = data[i]

        row_data = {}
        for column in columns:
            value = record[column]
            if column in applicable:
                value = applicable[column](value)
            row_data[column] = value
        dataframe_data.append(row_data)

    return pd.DataFrame(dataframe_data)

# Example usage: load the "fka/awesome-chatgpt-prompts" train subset, upper-case
# the "act" column, cut each "prompt" to 50 characters, and preview the result.
dataset_name = "fka/awesome-chatgpt-prompts"
subset = "train"
columns_to_include = ["act", "prompt"]
transformations = {
    "act": lambda text: text.upper(),
    "prompt": lambda text: text[:50] + "...",
}

dataframe = create_dataframe(
    dataset=dataset_name,
    subset=subset,
    columns=columns_to_include,
    transformations=transformations,
)
print(dataframe.head())
