Certainly! Here's a detailed example of how to use the `add_faiss_index` function for a dataset, which includes steps from preprocessing text data to embedding it, creating a FAISS index for quick retrieval, and then querying that index. This example uses the `datasets` library from Hugging Face, `transformers` for embedding generation, and `faiss` for indexing.

### Requirements
First, make sure to install the required libraries if you haven't already:
```bash
pip install datasets transformers faiss-cpu
```

### Example Code
```python
from datasets import load_dataset
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModel
import torch

# Function to embed text using a pre-trained transformer model
def embed(texts):
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    model = AutoModel.from_pretrained("distilbert-base-uncased")
    with torch.no_grad():
        inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # Use the [CLS] token's embeddings
    return embeddings

# Load a text dataset
ds = load_dataset('crime_and_punish', split='train')

# Embed all text data in the dataset
ds = ds.map(lambda example: {'embeddings': embed([example['line']])[0]}, batched=False)

# Function to add a Faiss index to the dataset
def add_faiss_index(dataset, column='embeddings', index_name=None, device=-1, string_factory=None, metric_type=faiss.METRIC_L2, custom_index=None, batch_size=1000, train_size=None, faiss_verbose=False, dtype=np.float32):
    dim = dataset[column][0].shape[0]
    if string_factory is None:
        index = faiss.IndexFlatL2(dim)
    else:
        index = faiss.index_factory(dim, string_factory)

    if device >= 0:
        faiss_res = faiss.StandardGpuResources()  # Use default GPU resources
        index = faiss.index_cpu_to_gpu(faiss_res, device, index)

    # Adding data to the index
    xb = np.vstack(dataset[column]).astype(dtype)
    if custom_index is not None:
        index = custom_index
    if train_size is not None and hasattr(index, 'train'):
        index.train(xb[:train_size])
    index.add(xb)
    faiss.write_index(index, f'{index_name}.faiss' if index_name else 'faiss_index.faiss')

    dataset.add_faiss_index = index  # Adding index as a property for easy access
    return dataset

# Add FAISS index to the dataset
ds = add_faiss_index(ds, 'embeddings')

# Save the index
ds.add_faiss_index.save('my_index.faiss')

# To query the index, first load it if necessary
index = faiss.read_index('my_index.faiss')

# Define a query
query = embed(["Raskolnikov spoke again"])
query_vector = np.array(query).astype(np.float32)

# Search the FAISS index
k = 10  # Number of nearest neighbors to retrieve
D, I = index.search(query_vector, k)

# Retrieve and print the results
for i, idx in enumerate(I[0]):
    print(f"Rank {i+1}: {ds['line'][idx]} (Distance: {D[0][i]})")
```

### Explanation
1. **Embedding the Text**: We use a pre-trained DistilBERT model from Hugging Face to convert text lines to embeddings. Each line in the dataset is processed to extract its embedding.
2. **Adding a FAISS Index**: A FAISS index (L2 distance metric in this case) is created and populated with the embeddings. It is then saved to disk.
3. **Querying the Index**: To find the most similar entries to a given query, the query text is first embedded, and then the nearest neighbors in the FAISS index are found.

This example demonstrates how to effectively integrate text embeddings with FAISS for efficient similarity searches in a dataset.

In [2]:
!pip install -q datasets plotly


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

Here are some examples of how to use snippets of code for audio, text, video, and image data in the Datasets library:

Text Data:
```python
from datasets import load_dataset

# Load a text dataset
ds = load_dataset("rotten_tomatoes", split="validation")

# Add a new column with additional text data
more_text = ds["text"]
ds = ds.add_column(name="text_2", column=more_text)

# Add an ElasticSearch index for fast text retrieval
ds.add_elasticsearch_index(column='text', host='localhost', port=9200)

# Query the ElasticSearch index
scores, retrieved_examples = ds.get_nearest_examples('text', 'great movie', k=10)
```

Audio Data:
```python
from datasets import load_dataset

# Load an audio dataset
ds = load_dataset("superb", "asr_dummy", split="train")

# Add a new column with audio embeddings
ds = ds.map(lambda example: {'audio_embeddings': embed_audio(example['audio'])})

# Add a Faiss index for fast audio retrieval 
ds.add_faiss_index(column='audio_embeddings', metric_type=faiss.METRIC_INNER_PRODUCT)

# Query the Faiss index
scores, retrieved_examples = ds.get_nearest_examples('audio_embeddings', query_embedding, k=10)
```

Image Data:
```python
from datasets import load_dataset

# Load an image dataset
ds = load_dataset("cifar10", split="train")

# Add a new column with image embeddings
ds = ds.map(lambda example: {'image_embeddings': embed_image(example['img'])})

# Add a Faiss index for fast image retrieval
ds.add_faiss_index(column='image_embeddings', metric_type=faiss.METRIC_L2)

# Query the Faiss index 
scores, retrieved_examples = ds.get_nearest_examples('image_embeddings', query_embedding, k=10)
```

Video Data:
```python
from datasets import load_dataset

# Load a video dataset 
ds = load_dataset("ucf101", split="train")

# Add a new column with video embeddings
ds = ds.map(lambda example: {'video_embeddings': embed_video(example['video'])})

# Add a Faiss index for fast video retrieval
ds.add_faiss_index(column='video_embeddings', metric_type=faiss.METRIC_INNER_PRODUCT)

# Query the Faiss index
scores, retrieved_examples = ds.get_nearest_examples('video_embeddings', query_embedding, k=10)
```

The key steps are:

1. Load a dataset containing the desired data type (text, audio, image, video)
2. Optionally add a new column with embeddings/vectors representing the data 
3. Add an efficient index (e.g. ElasticSearch for text, Faiss for dense vectors) to enable fast retrieval
4. Query the index to retrieve the most similar examples to a given query

This allows you to efficiently search for nearest neighbors across large datasets of unstructured data like text, audio, images and videos. The `add_elasticsearch_index` and `add_faiss_index` methods make it simple to create the indexes, while `get_nearest_examples` provides an easy way to query them.

Certainly! Here's a detailed example of how to use the `add_faiss_index` functionality with a dataset:

```python
from datasets import load_dataset
import faiss
import numpy as np

# Load a dataset
ds = load_dataset('crime_and_punish', split='train')

# Define a function to generate embeddings for each example
def embed(text):
    # In this example, we'll use a simple bag-of-words embedding
    # You can replace this with any embedding function (e.g., using a pre-trained model)
    words = text.split()
    embedding = np.zeros(len(set(words)))
    for word in words:
        embedding[hash(word) % len(embedding)] += 1
    return embedding.astype(np.float32)

# Add an 'embeddings' column to the dataset
ds_with_embeddings = ds.map(lambda example: {'embeddings': embed(example['line'])})

# Add a Faiss index for the 'embeddings' column
ds_with_embeddings.add_faiss_index(
    column='embeddings',
    index_name='my_index',
    string_factory='Flat',
    metric_type=faiss.METRIC_L2,
    device=-1,  # Use -1 for CPU, or specify the GPU index if using GPU
    train_size=1000,  # Use a subset of examples for training the index
    faiss_verbose=True
)

# Query the Faiss index to find the nearest examples
query_text = 'The punishment should fit the crime.'
query_embedding = embed(query_text)
scores, retrieved_examples = ds_with_embeddings.get_nearest_examples('embeddings', query_embedding, k=10)

# Print the retrieved examples
for i, example in enumerate(retrieved_examples):
    print(f"Top {i+1} retrieved example:")
    print(example['line'])
    print(f"Similarity score: {scores[i]}")
    print()

# Save the Faiss index to disk
ds_with_embeddings.save_faiss_index('embeddings', 'my_index.faiss')

# Load the saved Faiss index from disk
ds_loaded = load_dataset('crime_and_punish', split='train')
ds_loaded.load_faiss_index('embeddings', 'my_index.faiss')

# Query the loaded Faiss index
scores, retrieved_examples = ds_loaded.get_nearest_examples('embeddings', query_embedding, k=10)
```

In this example:

1. We load the 'crime_and_punish' dataset using `load_dataset()`.

2. We define a custom `embed()` function to generate embeddings for each example in the dataset. In this case, we use a simple bag-of-words embedding as an example, but you can replace it with any embedding function of your choice.

3. We add an 'embeddings' column to the dataset using `map()` and apply the `embed()` function to each example.

4. We add a Faiss index for the 'embeddings' column using `add_faiss_index()`. We specify the index name, string factory (index type), metric type, device (CPU or GPU), train size, and verbosity.

5. We query the Faiss index to find the nearest examples to a given query embedding using `get_nearest_examples()`. We provide the query embedding and the number of examples to retrieve (k).

6. We print the retrieved examples along with their similarity scores.

7. We save the Faiss index to disk using `save_faiss_index()`.

8. We load the saved Faiss index from disk using `load_faiss_index()` on a new dataset object.

9. We query the loaded Faiss index to verify that it works as expected.

This example demonstrates how to use the `add_faiss_index` functionality to create an efficient index for nearest neighbor search over embeddings in a dataset, and how to query the index to retrieve similar examples.

In [None]:
import pandas as pd
from datasets import load_dataset
import plotly.graph_objects as go

# Load the dataset
dataset = load_dataset("fka/awesome-chatgpt-prompts")

dataset.map


In [11]:
import pandas as pd
from datasets import load_dataset
import plotly.graph_objects as go

# Load the dataset
dataset = load_dataset("fka/awesome-chatgpt-prompts")

# Convert the dataset to a Pandas DataFrame
df = pd.DataFrame(dataset["train"])

# Extract the values from each column
act = df["act"].tolist()
act1 = df["act"].tolist()
act2 = df["act"].tolist()

# Create the scatter plot
fig = go.Figure(data=go.Scatter(
    x=act,
    y=act1,
    mode="markers",
    marker=dict(
        size=8,
        # color=act2,
        colorscale="Viridis",
        showscale=True
    ),
    text=[f"Act: {a}<br>Act1: {a1}<br>Act2: {a2}" for a, a1, a2 in zip(act, act1, act2)],
    hoverinfo="text"
))

# Customize the layout
fig.update_layout(
    title="Advanced Visualization",
    xaxis_title="Act",
    yaxis_title="Act1",
    plot_bgcolor="white",
    hoverlabel=dict(
        font_size=14,
        font_family="Arial"
    )
)

# Display the plot
fig.show()


In [None]:
import pandas as pd
from datasets import load_dataset

# Load dataset
dataset = load_dataset('HuggingFaceH4/ultrachat_200k')
dataset = dataset['train_sft']

dataset.a
# Convert dataset to DataFrame
columns_list = dataset.column_names 
data_dict = {col: dataset[col] for col in columns_list}
df = pd.DataFrame(data_dict)

# Save DataFrame to a CSV file with escapechar
df.to_csv('dataset.csv', index=False, escapechar='\\')


In [None]:
df.to_json("")


In [24]:
import pandas as pd
# Load the dataset
dataset = load_dataset("fka/awesome-chatgpt-prompts")

split_datset=dataset.keys()
dataset=dataset["train"]
columns=dataset.column_names
total_rows=dataset.num_rows


print(dataset)
pd.DataFrame.from_dict(
   [
       {
            "x": dataset['act'],
            "y": dataset['prompt'],
            'z':  i,
            "source":f"dataset.column_names{columns}" ,
            "extract":f"dataset.num_rows:{i}/{total_rows}",
            "type": "scatter",
            "symbol": "circle",
            "size_col": 4,
        }
       for i in range(dataset.num_rows)

   ]

)


Dataset({
    features: ['act', 'prompt'],
    num_rows: 153
})


Unnamed: 0,x,y,z,source,extract,type,symbol,size_col
0,"[Linux Terminal, English Translator and Improv...",[I want you to act as a linux terminal. I will...,0,"dataset.column_names['act', 'prompt']",dataset.num_rows:0/153,scatter,circle,4
1,"[Linux Terminal, English Translator and Improv...",[I want you to act as a linux terminal. I will...,1,"dataset.column_names['act', 'prompt']",dataset.num_rows:1/153,scatter,circle,4
2,"[Linux Terminal, English Translator and Improv...",[I want you to act as a linux terminal. I will...,2,"dataset.column_names['act', 'prompt']",dataset.num_rows:2/153,scatter,circle,4
3,"[Linux Terminal, English Translator and Improv...",[I want you to act as a linux terminal. I will...,3,"dataset.column_names['act', 'prompt']",dataset.num_rows:3/153,scatter,circle,4
4,"[Linux Terminal, English Translator and Improv...",[I want you to act as a linux terminal. I will...,4,"dataset.column_names['act', 'prompt']",dataset.num_rows:4/153,scatter,circle,4
...,...,...,...,...,...,...,...,...
148,"[Linux Terminal, English Translator and Improv...",[I want you to act as a linux terminal. I will...,148,"dataset.column_names['act', 'prompt']",dataset.num_rows:148/153,scatter,circle,4
149,"[Linux Terminal, English Translator and Improv...",[I want you to act as a linux terminal. I will...,149,"dataset.column_names['act', 'prompt']",dataset.num_rows:149/153,scatter,circle,4
150,"[Linux Terminal, English Translator and Improv...",[I want you to act as a linux terminal. I will...,150,"dataset.column_names['act', 'prompt']",dataset.num_rows:150/153,scatter,circle,4
151,"[Linux Terminal, English Translator and Improv...",[I want you to act as a linux terminal. I will...,151,"dataset.column_names['act', 'prompt']",dataset.num_rows:151/153,scatter,circle,4


In [6]:
from pathlib import Path
from typing import Dict, Any, List,Union,Optional
from datasets import (load_dataset,
                      DatasetDict,
                      concatenate_datasets
                      )


#Load the datset
def load_and_prepare_dataset(
    input_source: Union[str, Path, Dict[str, List[Union[str, Path]]]],
    split_ratios: tuple = (0.8, 0.1, 0.1),
    seed: int = 42,
    streaming: bool = False
    ) -> DatasetDict:
    """
    Load a dataset from various input sources and prepare it by splitting into train, test, and eval sets.

    :param input_source: A dataset name, path to a folder, a single file, multiple files, or a dictionary specifying train, test, and eval files.
    :param split_ratios: A tuple containing the ratios for train, test, and eval splits (default is (0.8, 0.1, 0.1)).
    :param seed: A random seed for reproducibility of the split (default is 42).
    :param streaming: Whether to use streaming to handle large files (default is False).
    :return: A DatasetDict containing the split datasets.

    Example:
    # Example usage with streaming for large files:
    # dataset_dict = load_and_prepare_dataset({
    #     'train': ['train_file_1.csv', 'train_file_2.csv'],
    #     'test': ['test_file.csv'],
    #     'eval': ['eval_file.csv']
    # }, streaming=True)
    # print(dataset_dict)
    OUTPUT1:
    DatasetDict({
    train: DatasetDict({
        train: Dataset({
            features: ['act', 'prompt'],
            num_rows: 459
        })
    })
    test: DatasetDict({
        train: Dataset({
            features: ['act', 'prompt'],
            num_rows: 459
        })
    })
    eval: DatasetDict({
        train: Dataset({
            features: ['act', 'prompt'],
            num_rows: 153
        })
    })
    })
    EXAMPLE2:
    dataset=load_and_prepare_dataset('fka/awesome-chatgpt-prompts')
    DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 122
    })
    test: Dataset({
        features: ['act', 'prompt'],
        num_rows: 15
    })
    eval: Dataset({
        features: ['act', 'prompt'],
        num_rows: 16
    })
    })
    EXAMPLE3:
    datset_path=load_and_prepare_dataset('/content/awesome-chatgpt-prompts')
DatasetDict({
    train: Dataset({
        features: ['act', 'prompt'],
        num_rows: 122
    })
    test: Dataset({
        features: ['act', 'prompt'],
        num_rows: 15
    })
    eval: Dataset({
        features: ['act', 'prompt'],
        num_rows: 16
    })
    })

    """
    # Load dataset from different types of input sources
    if isinstance(input_source, (str, Path)):
        # Dataset name, single file or path to folder
        dataset = load_dataset(input_source, streaming=streaming)
        dataset = DatasetDict(dataset)
    elif isinstance(input_source, dict):
        # Dictionary with specified train, test, and eval files
        formats = ['csv', 'json', 'jsonl', 'parquet', 'txt']
        datasets = {}
        for split, files in input_source.items():
            format_detected = None
            for fmt in formats:
                if any(str(file).endswith(fmt) for file in files):
                    format_detected = fmt
                    break
            if format_detected is None:
                raise ValueError(f"No supported file format detected for files: {files}")
            datasets[split] = load_dataset(format_detected, data_files=files, streaming=streaming)
        dataset = DatasetDict(datasets)
    else:
        raise ValueError("Input source should be a dataset name, path to a folder, a single file, multiple files, or a dictionary.")

    # Perform the split if needed and if not in streaming mode
    if not streaming:
        train_size, test_size, eval_size = split_ratios
        assert 0.0 < train_size < 1.0 and 0.0 < test_size < 1.0 and 0.0 < eval_size < 1.0 and (train_size + test_size + eval_size) == 1.0, \
            "Split ratios must be between 0 and 1 and sum up to 1."

        if "train" not in dataset or "test" not in dataset or "eval" not in dataset:
            # Assuming all splits are to be derived from the 'train' dataset
            full_dataset = concatenate_datasets(list(dataset.values())) if isinstance(dataset, dict) else dataset
            split_dataset = full_dataset.train_test_split(train_size=train_size, seed=seed)
            test_eval_split = split_dataset['test'].train_test_split(test_size=test_size / (test_size + eval_size), seed=seed)

            dataset = DatasetDict({
                "train": split_dataset["train"],
                "test": test_eval_split["train"],
                "eval": test_eval_split["test"]
            })

    return dataset



In [7]:
dataset=load_and_prepare_dataset("zwn22/NC_Crime")


DatasetDict({
    train: Dataset({
        features: ['year', 'city', 'crime_major_category', 'crime_detail', 'latitude', 'longitude', 'occurance_time', 'clear_status', 'incident_address', 'notes', 'crime_severity'],
        num_rows: 468708
    })
    test: Dataset({
        features: ['year', 'city', 'crime_major_category', 'crime_detail', 'latitude', 'longitude', 'occurance_time', 'clear_status', 'incident_address', 'notes', 'crime_severity'],
        num_rows: 58589
    })
    eval: Dataset({
        features: ['year', 'city', 'crime_major_category', 'crime_detail', 'latitude', 'longitude', 'occurance_time', 'clear_status', 'incident_address', 'notes', 'crime_severity'],
        num_rows: 58589
    })
})

In [46]:
from typing import Dict, List, Union, Callable
import pandas as pd
from datasets import load_dataset, Dataset

def create_dataframe(
    dataset: Union[str, Dataset],
    subset: str = "train",
    columns: List[str] = None,
    transformations: Dict[str, Callable] = None,
) -> pd.DataFrame:
    """
    Create a DataFrame from a given dataset with specified columns and transformations.

    Args:
        dataset (Union[str, Dataset]): The dataset to load. It can be either a string representing the dataset name
                                       or a pre-loaded Dataset object.
        subset (str): The subset of the dataset to use (e.g., "train", "test", "validation"). Default is "train".
        columns (List[str]): The list of columns to include in the DataFrame. If None, all columns will be included.
        transformations (Dict[str, Callable]): A dictionary specifying the transformations to apply to each column.
                                               The keys are the column names, and the values are the transformation functions.

    Returns:
        pd.DataFrame: The created DataFrame.
    """
    if isinstance(dataset, str):
        dataset = load_dataset(dataset)

    if subset not in dataset:
        raise ValueError(f"Subset '{subset}' not found in the dataset.")

    data = dataset[subset]
    print(data)

    if columns is None:
        columns = data.column_names

    return pd.DataFrame.from_dict(
        [
            {
                **{column: data[i][column] for column in columns},
                **{
                    "z": i,
                    "source": f"dataset.column_names: {columns}",
                    "extract": f"dataset.num_rows: {i + 1}/{data.num_rows}",
                    "type": "scatter",
                    "symbol": "circle",
                    "size_col": 4,
                },
                **{
                    column: transformations[column](data[i][column])
                    for column in transformations
                    if column in columns
                },
            }
            for i in range(100)
        ]
    )

dataset_name = "b-mc2/sql-create-context"
subset = "train"
columns_to_include =['context', 'answer', 'question']
transformations = {
    "question": lambda x: x.upper(),
    "context": lambda x: x[:500] + "...",
}

dataframe = create_dataframe(dataset_name, subset, columns_to_include, transformations)


Dataset({
    features: ['context', 'answer', 'question'],
    num_rows: 78577
})


In [48]:
dataframe


Unnamed: 0,context,answer,question,z,source,extract,type,symbol,size_col
0,CREATE TABLE head (age INTEGER)...,SELECT COUNT(*) FROM head WHERE age > 56,HOW MANY HEADS OF THE DEPARTMENTS ARE OLDER TH...,0,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 1/78577,scatter,circle,4
1,"CREATE TABLE head (name VARCHAR, born_state VA...","SELECT name, born_state, age FROM head ORDER B...","LIST THE NAME, BORN STATE AND AGE OF THE HEADS...",1,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 2/78577,scatter,circle,4
2,"CREATE TABLE department (creation VARCHAR, nam...","SELECT creation, name, budget_in_billions FROM...","LIST THE CREATION YEAR, NAME AND BUDGET OF EAC...",2,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 3/78577,scatter,circle,4
3,CREATE TABLE department (budget_in_billions IN...,"SELECT MAX(budget_in_billions), MIN(budget_in_...",WHAT ARE THE MAXIMUM AND MINIMUM BUDGET OF THE...,3,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 4/78577,scatter,circle,4
4,CREATE TABLE department (num_employees INTEGER...,SELECT AVG(num_employees) FROM department WHER...,WHAT IS THE AVERAGE NUMBER OF EMPLOYEES OF THE...,4,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 5/78577,scatter,circle,4
...,...,...,...,...,...,...,...,...,...
95,"CREATE TABLE station (name VARCHAR, lat VARCHA...","SELECT name, lat, city FROM station ORDER BY l...","WHAT ARE THE NAME, LATITUDE, AND CITY OF THE S...",95,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 96/78577,scatter,circle,4
96,"CREATE TABLE weather (date VARCHAR, mean_tempe...","SELECT date, mean_temperature_f, mean_humidity...","WHAT ARE THE DATE, MEAN TEMPERATURE AND MEAN H...",96,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 97/78577,scatter,circle,4
97,CREATE TABLE station (city VARCHAR)...,"SELECT city, COUNT(*) FROM station GROUP BY ci...",LIST THE NAME AND THE NUMBER OF STATIONS FOR A...,97,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 98/78577,scatter,circle,4
98,"CREATE TABLE trip (start_station_id VARCHAR, s...","SELECT start_station_id, start_station_name FR...",FIND THE IDS AND NAMES OF STATIONS FROM WHICH ...,98,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 99/78577,scatter,circle,4


In [49]:
from typing import Dict, List, Union, Callable, Optional
import pandas as pd
from datasets import load_dataset, Dataset
from tqdm import tqdm

def create_dataframe(
    dataset: Union[str, Dataset],
    subset: str = "train",
    columns: Optional[List[str]] = None,
    transformations: Optional[Dict[str, Callable]] = None,
    num_rows: Optional[int] = None,
    progress_bar: bool = True,
) -> pd.DataFrame:
    """
    Create a DataFrame from a given dataset with specified columns and transformations.

    Args:
        dataset (Union[str, Dataset]): The dataset to load. It can be either a string representing the dataset name
                                       or a pre-loaded Dataset object.
        subset (str): The subset of the dataset to use (e.g., "train", "test", "validation"). Default is "train".
        columns (Optional[List[str]]): The list of columns to include in the DataFrame. If None, all columns will be included.
        transformations (Optional[Dict[str, Callable]]): A dictionary specifying the transformations to apply to each column.
                                                         The keys are the column names, and the values are the transformation functions.
        num_rows (Optional[int]): The number of rows to include in the DataFrame. If None, all rows will be included.
        progress_bar (bool): Whether to display a progress bar during the DataFrame creation process. Default is True.

    Returns:
        pd.DataFrame: The created DataFrame.
    """
    if isinstance(dataset, str):
        dataset = load_dataset(dataset)

    if subset not in dataset:
        raise ValueError(f"Subset '{subset}' not found in the dataset.")

    data = dataset[subset]

    if columns is None:
        columns = data.column_names

    if num_rows is None:
        num_rows = data.num_rows

    rows = []
    for i in tqdm(range(num_rows), disable=not progress_bar, desc=f"Creating DataFrame from {subset} subset"):
        row = {
            **{column: data[i][column] for column in columns},
            **{
                "z": i,
                "source": f"dataset.column_names: {columns}",
                "extract": f"dataset.num_rows: {i + 1}/{data.num_rows}",
                "type": "scatter",
                "symbol": "circle",
                "size_col": 4,
            },
        }

        if transformations is not None:
            row.update(
                {
                    column: transformations[column](data[i][column])
                    for column in transformations
                    if column in columns
                }
            )

        rows.append(row)

    return pd.DataFrame(rows)

dataset_name = "b-mc2/sql-create-context"
subset = "train"
columns_to_include = ['context', 'answer', 'question']
transformations = {
    "question": lambda x: x.upper(),
    "context": lambda x: x[:500] + "...",
}

dataframe = create_dataframe(dataset_name, subset, columns_to_include, transformations, num_rows=100, progress_bar=True)


Creating DataFrame from train subset: 100%|██████████| 100/100 [00:00<00:00, 2419.13it/s]


In [50]:
dataframe


Unnamed: 0,context,answer,question,z,source,extract,type,symbol,size_col
0,CREATE TABLE head (age INTEGER)...,SELECT COUNT(*) FROM head WHERE age > 56,HOW MANY HEADS OF THE DEPARTMENTS ARE OLDER TH...,0,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 1/78577,scatter,circle,4
1,"CREATE TABLE head (name VARCHAR, born_state VA...","SELECT name, born_state, age FROM head ORDER B...","LIST THE NAME, BORN STATE AND AGE OF THE HEADS...",1,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 2/78577,scatter,circle,4
2,"CREATE TABLE department (creation VARCHAR, nam...","SELECT creation, name, budget_in_billions FROM...","LIST THE CREATION YEAR, NAME AND BUDGET OF EAC...",2,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 3/78577,scatter,circle,4
3,CREATE TABLE department (budget_in_billions IN...,"SELECT MAX(budget_in_billions), MIN(budget_in_...",WHAT ARE THE MAXIMUM AND MINIMUM BUDGET OF THE...,3,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 4/78577,scatter,circle,4
4,CREATE TABLE department (num_employees INTEGER...,SELECT AVG(num_employees) FROM department WHER...,WHAT IS THE AVERAGE NUMBER OF EMPLOYEES OF THE...,4,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 5/78577,scatter,circle,4
...,...,...,...,...,...,...,...,...,...
95,"CREATE TABLE station (name VARCHAR, lat VARCHA...","SELECT name, lat, city FROM station ORDER BY l...","WHAT ARE THE NAME, LATITUDE, AND CITY OF THE S...",95,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 96/78577,scatter,circle,4
96,"CREATE TABLE weather (date VARCHAR, mean_tempe...","SELECT date, mean_temperature_f, mean_humidity...","WHAT ARE THE DATE, MEAN TEMPERATURE AND MEAN H...",96,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 97/78577,scatter,circle,4
97,CREATE TABLE station (city VARCHAR)...,"SELECT city, COUNT(*) FROM station GROUP BY ci...",LIST THE NAME AND THE NUMBER OF STATIONS FOR A...,97,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 98/78577,scatter,circle,4
98,"CREATE TABLE trip (start_station_id VARCHAR, s...","SELECT start_station_id, start_station_name FR...",FIND THE IDS AND NAMES OF STATIONS FROM WHICH ...,98,"dataset.column_names: ['context', 'answer', 'q...",dataset.num_rows: 99/78577,scatter,circle,4


In [69]:
import plotly.express as px

columns = columns_to_include

for column in columns:

    fig = px.histogram(dataframe, x=column, title=f'Histogram of {column}')

    # fig.show()
    # fig = px.scatter(dataframe, x=column, )

    fig.show()



In [None]:
import plotly.express as px

# Assuming 'dataframe' is your DataFrame and 'x' and 'y' are columns in your DataFrame
fig = px.scatter(dataframe, x='context', )

fig.show()


In [22]:
from typing import Dict, List, Union, Callable
import pandas as pd
from datasets import load_dataset, Dataset

def create_dataframe(
    dataset: Union[str, Dataset],
    subset: str = "train",
    columns: List[str] = None,
    transformations: Dict[str, Callable] = None,
) -> pd.DataFrame:
    """
    Create a DataFrame from a given dataset with specified columns and transformations.

    Args:
        dataset (Union[str, Dataset]): The dataset to load. It can be either a string representing the dataset name
                                       or a pre-loaded Dataset object.
        subset (str): The subset of the dataset to use (e.g., "train", "test", "validation"). Default is "train".
        columns (List[str]): The list of columns to include in the DataFrame. If None, all columns will be included.
        transformations (Dict[str, Callable]): A dictionary specifying the transformations to apply to each column.
                                               The keys are the column names, and the values are the transformation functions.

    Returns:
        pd.DataFrame: The created DataFrame.
    """
    if isinstance(dataset, str):
        dataset = load_dataset(dataset)

    if subset not in dataset:
        raise ValueError(f"Subset '{subset}' not found in the dataset.")

    data = dataset[subset]

    if columns is None:
        columns = data.column_names

    dataframe_data = []
    for i in range(data.num_rows):
        row_data = {}
        for column in columns:
            value = data[i][column]
            if transformations and column in transformations:
                value = transformations[column](value)
            row_data[column] = value
        dataframe_data.append(row_data)

    return pd.DataFrame(dataframe_data)

# Example usage
dataset_name = "fka/awesome-chatgpt-prompts"
subset = "train"
columns_to_include = ["act", "prompt"]
transformations = {
    "act": lambda x: x.upper(),
    "prompt": lambda x: x[:50] + "...",
}

dataframe = create_dataframe(dataset_name, subset, columns_to_include, transformations)
print(dataframe.head())


                                   act  \
0                       LINUX TERMINAL   
1      ENGLISH TRANSLATOR AND IMPROVER   
2               `POSITION` INTERVIEWER   
3                   JAVASCRIPT CONSOLE   
4                          EXCEL SHEET   
5         ENGLISH PRONUNCIATION HELPER   
6  SPOKEN ENGLISH TEACHER AND IMPROVER   
7                         TRAVEL GUIDE   
8                   PLAGIARISM CHECKER   
9   CHARACTER FROM MOVIE/BOOK/ANYTHING   

                                              prompt  
0  I want you to act as a linux terminal. I will ...  
1  I want you to act as an English translator, sp...  
2  I want you to act as an interviewer. I will be...  
3  I want you to act as a javascript console. I w...  
4  I want you to act as a text based excel. you'l...  
5  I want you to act as an English pronunciation ...  
6  I want you to act as a spoken English teacher ...  
7  I want you to act as a travel guide. I will wr...  
8  I want you to act as a plagiarism check

In [1]:
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def calculate_sum(a, b):
    logging.info(f"Calculating the sum of {a} and {b}")
    result = a + b
    logging.info(f"The sum is: {result}")
    return result

# Example usage
result = calculate_sum(5, 3)
result


2024-03-25 14:05:04,678 - INFO - Calculating the sum of 5 and 3
2024-03-25 14:05:04,678 - INFO - The sum is: 8


8

In [3]:
import logging
import time

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def process_data():
    logging.info("Starting data processing")
    # Simulating a long-running process
    time.sleep(5.0)
    logging.info("Data processing completed")

# Example usage
process_data()


2024-03-25 14:06:21,963 - INFO - Starting data processing
2024-03-25 14:06:26,972 - INFO - Data processing completed


In [4]:
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def divide_numbers(a, b):
    try:
        result = a / b
        logging.info(f"The division result is: {result}")
        return result
    except ZeroDivisionError as e:
        logging.error(f"Error occurred: {str(e)}")
        raise

# Example usage
try:
    divide_numbers(10, 0)
except ZeroDivisionError:
    logging.info("Caught ZeroDivisionError")


2024-03-25 14:06:57,799 - ERROR - Error occurred: division by zero
2024-03-25 14:06:57,801 - INFO - Caught ZeroDivisionError


In [6]:
import logging
from flask import Flask

app = Flask(__name__)

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

@app.route('/')
def home():
    logging.info("Received request for home page")
    return "Welcome to the home page"

# Example usage
if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
2024-03-25 14:08:49,524 - INFO - [33mPress CTRL+C to quit[0m


In [7]:
import logging

# Configure logging to write to a file
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', filename='app.log')

def perform_operation():
    logging.info("Starting the operation")
    # Perform some operation
    logging.info("Operation completed")

# Example usage
perform_operation()


2024-03-25 14:09:19,925 - INFO - Starting the operation
2024-03-25 14:09:19,926 - INFO - Operation completed


In [8]:
import logging

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Example usage
logging.debug("This is a debug message")
logging.info("This is an informative message")
logging.warning("This is a warning message")
logging.error("This is an error message")
logging.critical("This is a critical message")


2024-03-25 14:11:16,867 - INFO - This is an informative message
2024-03-25 14:11:16,869 - ERROR - This is an error message
2024-03-25 14:11:16,870 - CRITICAL - This is a critical message


In [9]:

import logging

# Create a custom logger
logger = logging.getLogger(__name__)

# Create handlers
console_handler = logging.StreamHandler()
file_handler = logging.FileHandler('app.log')

# Set the level and format for each handler
console_handler.setLevel(logging.INFO)
file_handler.setLevel(logging.DEBUG)

console_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

console_handler.setFormatter(console_format)
file_handler.setFormatter(file_format)

# Add the handlers to the logger
logger.addHandler(console_handler)
logger.addHandler(file_handler)

# Example usage
logger.debug("This is a debug message")
logger.info("This is an informative message")
logger.warning("This is a warning message")
logger.error("This is an error message")
logger.critical("This is a critical message")


2024-03-25 14:12:26,429 - INFO - This is an informative message
2024-03-25 14:12:26,429 - INFO - This is an informative message
2024-03-25 14:12:26,431 - ERROR - This is an error message
2024-03-25 14:12:26,431 - ERROR - This is an error message
2024-03-25 14:12:26,434 - CRITICAL - This is a critical message
2024-03-25 14:12:26,434 - CRITICAL - This is a critical message
