<a href="https://colab.research.google.com/github/fmbento/Jupyter-Notebooks_ad-hoc-solutions/blob/main/Datasets_Sample_Maker_10_50_100_500_or_1K_records.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Generate a Sample dataset with 10, 50, 100, 500 or 1K Records.
### Ideal for users to evaluate if the main dataset file has the desired data.
### Smaller, easier to download. Also good for your server and network.
### Uses the native pandas `DataFrame.sample` method.






1. Download from a URL?
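> If yes, run the cell below, paste the URL into the text box that appears, and then run the cell in step 2. To upload a file instead, leave the text box empty and just run the cell in step 2.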

In [None]:
# @title
import ipywidgets as widgets
from IPython.display import display

# Show a ready-made demo source so first-time users have a URL to paste.
print(
    "Demo, UK Doctoral Thesis Metadata from EThOS (https://bl.iro.bl.uk/concern/datasets/10cc13f9-797d-41f2-a7e2-d29f4306133e?locale=en):",
    "Download URL: https://bl.iro.bl.uk/downloads/05b31c0e-da22-4b9f-a17c-35880aa111f4?locale=en",
    "",
    sep="\n",
)

# Free-text box for the dataset URL; the loading cell reads `url_text.value`
# and falls back to a file upload when it is empty.
url_text = widgets.Text(description='Enter URL:', disabled=False)
display(url_text)

Demo, UK Doctoral Thesis Metadata from EThOS (https://bl.iro.bl.uk/concern/datasets/10cc13f9-797d-41f2-a7e2-d29f4306133e?locale=en):
Download URL: https://bl.iro.bl.uk/downloads/05b31c0e-da22-4b9f-a17c-35880aa111f4?locale=en



Text(value='', description='Enter URL:')

2. Download or upload file:

In [None]:
# @title
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import io
import warnings
from google.colab import files
import requests # Import the requests library
import os # Import os module for file handling

# Ignore FutureWarning messages
warnings.filterwarnings('ignore', category=FutureWarning)

print("Libraries imported successfully. Ready for data analysis!")

# Encodings and delimiters tried, in order, when parsing the CSV.
# 'latin1' can decode any byte sequence, so it acts as the final fallback
# ('ISO-8859-1' is the same codec and would only repeat the attempt).
CSV_ENCODINGS = ('utf-8', 'latin1')
CSV_DELIMITERS = (',', ';')
DELIMITER_NAMES = {',': 'comma', ';': 'semicolon'}
DOWNLOAD_TIMEOUT_SECONDS = 60  # connect/read timeout handed to requests


def _filename_from_url(url, default='downloaded_file.csv'):
    """Return the last path component of `url`, or `default` if it has no extension.

    The query string and fragment are ignored, so '.../data.csv?locale=en'
    yields 'data.csv' rather than a name containing '?'.
    """
    from urllib.parse import urlsplit  # local import: only needed here

    name = os.path.basename(urlsplit(url).path)
    return name if '.' in name else default


def _download_file(url, destination):
    """Stream `url` to the local file `destination`.

    Raises requests.RequestException on connection problems or a non-2xx
    HTTP status, so a failed download is never mistaken for a valid file.
    """
    with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT_SECONDS) as response:
        response.raise_for_status()
        with open(destination, 'wb') as handle:
            # Stream in 1 MiB chunks so large datasets are not held in memory.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                handle.write(chunk)


def _read_csv_with_fallbacks(source):
    """Parse a CSV, trying every encoding/delimiter combination in order.

    Parameters:
        source: a file path (str) or the raw file content (bytes).

    Returns:
        (DataFrame, delimiter, encoding) for the first combination that
        yields more than one column. If every successful parse has a single
        column, the first such parse is returned (the file really may have
        only one column).

    Raises:
        ValueError: if no combination could be parsed.
    """
    first_single_column = None
    for encoding in CSV_ENCODINGS:
        for delimiter in CSV_DELIMITERS:
            # Bytes need a fresh buffer per attempt; a path can be reused as is.
            buffer = io.BytesIO(source) if isinstance(source, (bytes, bytearray)) else source
            try:
                frame = pd.read_csv(buffer, sep=delimiter, encoding=encoding, quotechar='"')
            except (UnicodeDecodeError, pd.errors.ParserError) as e:
                print(f"Failed to read with delimiter '{delimiter}', encoding '{encoding}', and quotechar '\"': {e}")
                continue
            if frame.shape[1] > 1:
                return frame, delimiter, encoding
            # A single column often means the wrong delimiter was used (a
            # ';'-separated file parses "successfully" with ','), so remember
            # this result but keep trying the remaining delimiters.
            if first_single_column is None:
                first_single_column = (frame, delimiter, encoding)
        if first_single_column is not None:
            break  # this encoding decoded fine; other encodings cannot add columns
    if first_single_column is not None:
        return first_single_column
    raise ValueError("Could not read the CSV file with any of the tested delimiter/encoding/quotechar combinations.")


# Check if a URL is provided in the url_text widget (created in a previous cell).
# If that cell was never run, or the box is empty, fall back to a file upload.
if 'url_text' in globals() and url_text.value.strip():
    source_url = url_text.value.strip()
    print(f"Attempting to download from URL: {source_url}")

    filename = _filename_from_url(source_url)
    try:
        _download_file(source_url, filename)
    except (requests.RequestException, OSError) as e:
        raise RuntimeError(f"Error downloading file from URL {source_url}: {e}") from e

    df, delimiter, encoding = _read_csv_with_fallbacks(filename)
    print(f"\nDataset loaded successfully from URL with delimiter '{delimiter}', encoding '{encoding}', and quotechar '\"'!")

else:
    # Fallback to the file upload functionality if no URL is provided
    print("No URL provided. Falling back to file upload.")
    print("Please upload your dataset (CSV file).")
    uploaded = files.upload()

    # Check if a file was uploaded
    if not uploaded:
        raise FileNotFoundError("No file was uploaded. Please upload a CSV file to proceed.")

    # Only the first uploaded file is used
    filename = list(uploaded.keys())[0]
    print(f'User uploaded file "{filename}" with length {len(uploaded[filename])} bytes')

    # Same encoding/delimiter fallbacks as the URL branch, applied to the raw bytes
    df, delimiter, encoding = _read_csv_with_fallbacks(uploaded[filename])
    print(f"\nDataset loaded successfully with {DELIMITER_NAMES[delimiter]} delimiter!")


# Quick overview of the loaded dataset: first rows, schema, then summary statistics.
# Each heading is printed before its section is computed, and DataFrame.info()
# writes its own output, so it is called directly rather than wrapped in print().
for heading, render in (
    ("\n--- DataFrame Head ---", lambda: print(df.head())),
    ("\n--- DataFrame Info ---", lambda: df.info()),
    ("\n--- DataFrame Description ---", lambda: print(df.describe())),
):
    print(heading)
    render()

3. Perform sampling:

In [None]:
# @title
import ipywidgets as widgets
from IPython.display import display
from google.colab import files
import pandas as pd
import os

# Dropdown offering the supported sample sizes (50 is preselected)
sample_size = widgets.Dropdown(
    description='Sample Size:',
    options=[10, 50, 100, 500, 1000],
    value=50,
    disabled=False,
)

# Button that triggers the sampling.
# button_style accepts 'success', 'info', 'warning', 'danger' or '';
# icon takes a FontAwesome name (https://fontawesome.com/icons?d=gallery&c=spinners&m=free)
go_button = widgets.Button(
    description='Go',
    tooltip='Click to create sample',
    icon='check',
    button_style='',
    disabled=False,
)

# Render both controls below the cell
display(sample_size, go_button)

# Define the function to be executed when the button is clicked
def _original_dataset_filename():
    """Return the name of the file the current dataset was loaded from.

    Prefers the `filename` global set by the loading cell, since that is the
    name actually used at load time. Otherwise the name is re-derived from the
    URL widget or the upload dictionary, and finally defaults to 'dataset.csv'.
    """
    scope = globals()

    loaded_name = scope.get('filename')
    if isinstance(loaded_name, str) and loaded_name:
        return loaded_name

    url_widget = scope.get('url_text')
    if url_widget is not None and url_widget.value:
        candidate = url_widget.value.split('/')[-1]
        return candidate if '.' in candidate else 'downloaded_file.csv'

    uploads = scope.get('uploaded')
    if uploads:
        return list(uploads.keys())[0]

    return 'dataset.csv'


def _sample_filename(original_filename, n_rows):
    """Build '<stem>_sample_<n_rows>.csv' from the original file name.

    The original extension (and anything after it) is dropped and '.csv' is
    always appended, so the result can never equal the original name and
    overwrite the source file.
    """
    stem, _extension = os.path.splitext(original_filename)
    return f"{stem or original_filename}_sample_{n_rows}.csv"


def on_button_clicked(b):
    """Create a random sample of `df`, save it as CSV and start its download.

    Parameters:
        b: the button instance passed by ipywidgets (unused).

    The sample size comes from the `sample_size` dropdown but is capped at the
    number of rows in `df`, because DataFrame.sample raises ValueError when
    asked for more rows than exist (sampling is without replacement).
    """
    print("Creating sample...")

    requested = sample_size.value
    n_rows = min(requested, len(df))
    if n_rows < requested:
        print(f"Dataset has only {len(df)} rows; sampling all of them instead of {requested}.")

    # Fixed random_state keeps the sample reproducible between clicks
    df_sample = df.sample(n=n_rows, random_state=42)

    new_filename = _sample_filename(_original_dataset_filename(), n_rows)

    # Export the sample DataFrame to a CSV file
    df_sample.to_csv(new_filename, index=False)

    print(f"\nSample dataset saved successfully as '{new_filename}'")

    # Automatically download the generated sample file; a failure here is
    # reported but not raised because the file has already been saved.
    try:
        files.download(new_filename)
        print(f"'{new_filename}' is being downloaded.")
    except Exception as e:
        print(f"Error initiating download: {e}")


# Register the handler: each click on the "Go" button calls on_button_clicked
go_button.on_click(on_button_clicked)

Dropdown(description='Sample Size:', index=1, options=(10, 50, 100, 500, 1000), value=50)

Button(description='Go', icon='check', style=ButtonStyle(), tooltip='Click to create sample')

Creating sample...

Sample dataset saved successfully as 'downloaded_file_sample_1000.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

'downloaded_file_sample_1000.csv' is being downloaded.
