# Imports

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import norm, nbinom
from sklearn import datasets
from sklearn.utils import shuffle
import plotly.express as px

# Auxiliary Functions

In [71]:
def load_data() -> pd.DataFrame:

    """Returns data frame with iris dataset.

    The frame has the four measurement columns ``sepal_length``,
    ``sepal_width``, ``petal_length`` and ``petal_width`` plus a ``species``
    column holding the class name ('Setosa', 'Versicolour' or 'Virginica').

    Returns:
        pd.DataFrame: one row per flower, measurements plus species name."""

    iris = datasets.load_iris()

    feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    # Position i holds the display name for integer class code i.
    species_names = np.array(['Setosa', 'Versicolour', 'Virginica'])

    dataset = pd.DataFrame(iris.data, columns=feature_columns)

    # Translate the class codes for the species column only. A frame-wide
    # ``replace({0: ..., 1: ..., 2: ...})`` would also rewrite every
    # measurement that happens to equal 0, 1 or 2 (e.g. a petal width of 1.0)
    # into a species name.
    dataset['species'] = species_names[iris.target]

    return dataset

def binary_search_percentile(random_number: float, distribution: str,
                                lower_bound: float, upper_bound: float, tol: float) -> float:

    """Receives a random number generated from a specific distribution and uses binary search to
    find the percentile to which it corresponds.

    The search bisects ``[lower_bound, upper_bound]`` and compares the
    distribution's quantile (``ppf``) at the midpoint with ``random_number``.
    It stops as soon as the two differ by less than ``tol``. If the interval
    can no longer be halved (unreachable tolerance, value outside the bracket,
    NaN input) the closest percentile found is returned instead of searching
    forever.

    Args:
        random_number (float): random number generated from a specific distribution.
        distribution (str): name of the used distribution. "normal" selects the
            standard normal; any other value selects the negative binomial
            with n=1, p=0.1.
        lower_bound (float): lower bound of the percentile search.
        upper_bound (float): upper bound of the percentile search.
        tol (float): tol of the percentile search.

    Returns:
        float: percentile of the random number."""

    while True:
        middle_percentile = (lower_bound + upper_bound)/2

        if distribution == "normal":
            percentile_number = norm.ppf(middle_percentile, loc=0, scale=1)
        else:
            percentile_number = nbinom.ppf(middle_percentile, n=1, p=0.1)

        if abs(random_number - percentile_number) < tol:
            return middle_percentile

        # Once the midpoint coincides with a bound, floating point cannot
        # split the interval any further. Continuing would repeat the same
        # step forever, so return the best percentile available.
        if middle_percentile <= lower_bound or middle_percentile >= upper_bound:
            return middle_percentile

        if random_number > percentile_number:
            lower_bound = middle_percentile
        else:
            upper_bound = middle_percentile

def sample_data(dataset: pd.DataFrame, distribution: str,
                shuffle_data: bool = False) -> "tuple[pd.DataFrame, pd.DataFrame]":

    """Samples the data according to certain probability distribution.

    Rows are drawn one at a time without replacement until 60% of the rows
    (rounded up) have been taken. For each draw a random number is generated,
    converted to a percentile in [0, 1) and scaled to a row position in the
    pool of rows that are still available.

    Args:
        dataset (pd.DataFrame): dataset to sample from. It is not modified.
        distribution (str): distribution to use to generate the random numbers:
            'normal', 'negative_binomial', or anything else for uniform.
        shuffle_data (bool): specifies if shuffling the data in each iteration.

    Returns:
        tuple: ``(train_dataset, test_dataset)`` where ``train_dataset`` holds
        the sampled rows and ``test_dataset`` the rows that were never drawn."""

    sampling_dataset = dataset.copy()
    # Rows are picked by position (iloc) but removed by label (drop), so the
    # labels must equal the positions from the very first draw. Without this
    # a frame with a non-default index (e.g. a shuffled one) loses the wrong
    # row and the sampled row stays in the pool.
    sampling_dataset.reset_index(inplace=True, drop=True)

    n_iter = int(np.ceil(dataset.shape[0]*0.6))
    random_samples = []

    for _ in range(n_iter):

        # NOTE: the tolerance literal 10e-3 equals 0.01.
        if distribution == 'normal':
            random_number = np.random.normal(0, 1)
            percentile_of_distribution = binary_search_percentile(random_number, distribution, 0., 1., 10e-3)
        elif distribution == 'negative_binomial':
            random_number = np.random.negative_binomial(1, 0.1)
            percentile_of_distribution = binary_search_percentile(random_number, distribution, 0., 1., 10e-3)
        else:
            random_number = np.random.uniform(0, 1)
            percentile_of_distribution = random_number

        pool_size = sampling_dataset.shape[0]
        # Clamp so a percentile that rounds up to 1.0 still addresses the last row.
        sample_index = min(int(percentile_of_distribution*pool_size), pool_size - 1)

        sample = sampling_dataset.iloc[sample_index, :]
        sampling_dataset.drop(index=sample_index, inplace=True)

        if shuffle_data:
            sampling_dataset = shuffle(sampling_dataset)

        # Re-align labels with positions for the next draw.
        sampling_dataset.reset_index(inplace=True, drop=True)

        random_samples.append(sample)

    if not random_samples:
        # Nothing was drawn (empty input): pd.concat would raise on an empty list.
        return sampling_dataset.iloc[0:0], sampling_dataset

    train_dataset = pd.concat(random_samples, axis=1).transpose()
    test_dataset = sampling_dataset

    return train_dataset, test_dataset

def plot_dataset(dataset: pd.DataFrame) -> None:

    """Shows a plotly 3D scatter plot of the dataset.

    Sepal length, sepal width and petal width are used as the x, y and z
    axes and the points are coloured by the ``species`` column.

    Args:
        dataset (pd.DataFrame): frame containing the columns named above."""

    axis_columns = {'x': 'sepal_length', 'y': 'sepal_width', 'z': 'petal_width'}
    no_margin = {'l': 0, 'r': 0, 'b': 0, 't': 0}

    figure = px.scatter_3d(dataset, color='species', size_max=18, opacity=0.7, **axis_columns)
    figure.update_layout(margin=no_margin)
    figure.show()

def target_variable_distribution(dataset: pd.DataFrame) -> None:

    """Counts how many times certain species appear in a given dataset.

    Prints one line per species in the order Setosa, Virginica, Versicolour.
    A species that does not occur in ``dataset`` is reported with a count of
    0 instead of raising ``KeyError``.

    Args:
        dataset (pd.DataFrame): frame with a ``species`` column."""

    species_appearance = dataset.species.value_counts()
    out = 'Setosa: {0}\nVirginica: {1}\nVersicolour: {2}'
    # value_counts only lists species that are present, so fall back to 0.
    counts = [species_appearance.get(name, 0) for name in ('Setosa', 'Virginica', 'Versicolour')]
    print(out.format(*counts))

# Regular Dataset Results

In [67]:
# Load the full iris data set, then print its class counts and plot it.
dataset = load_data()

target_variable_distribution(dataset)
plot_dataset(dataset)

Setosa: 50
Virginica: 50
Versicolour: 50


In [68]:
# sample_data returns a (sampled rows, remaining rows) pair; only the sampled
# rows are inspected here, so the pair has to be unpacked first.
uniform_sampled_dataset, uniform_remaining_dataset = sample_data(dataset, 'uniform')

target_variable_distribution(uniform_sampled_dataset)
plot_dataset(uniform_sampled_dataset)

Setosa: 28
Virginica: 30
Versicolour: 32


In [69]:
# sample_data returns a (sampled rows, remaining rows) pair; only the sampled
# rows are inspected here, so the pair has to be unpacked first.
normal_sampled_dataset, normal_remaining_dataset = sample_data(dataset, 'normal')

target_variable_distribution(normal_sampled_dataset)
plot_dataset(normal_sampled_dataset)

Setosa: 27
Virginica: 26
Versicolour: 37


In [72]:
# sample_data returns a (sampled rows, remaining rows) pair; only the sampled
# rows are inspected here, so the pair has to be unpacked first.
negative_binomial_sampled_dataset, negative_binomial_remaining_dataset = sample_data(dataset, 'negative_binomial')

target_variable_distribution(negative_binomial_sampled_dataset)
plot_dataset(negative_binomial_sampled_dataset)

Setosa: 30
Virginica: 26
Versicolour: 34


# Shuffled Dataset Results

In [75]:
# Shuffle the rows once for all "shuffled" experiments. The index is reset so
# that row labels match row positions again after the shuffle.
shuffled_dataset = shuffle(dataset).reset_index(drop=True)

# sample_data returns a (sampled rows, remaining rows) pair; only the sampled
# rows are inspected here, so the pair has to be unpacked first.
shuffled_uniform_sampled_dataset, shuffled_uniform_remaining_dataset = sample_data(shuffled_dataset, 'uniform')

target_variable_distribution(shuffled_uniform_sampled_dataset)
plot_dataset(shuffled_uniform_sampled_dataset)

Setosa: 22
Virginica: 33
Versicolour: 35


In [76]:
# sample_data returns a (sampled rows, remaining rows) pair; only the sampled
# rows are inspected here, so the pair has to be unpacked first.
shuffled_normal_sampled_dataset, shuffled_normal_remaining_dataset = sample_data(shuffled_dataset, 'normal')

target_variable_distribution(shuffled_normal_sampled_dataset)
plot_dataset(shuffled_normal_sampled_dataset)

Setosa: 30
Virginica: 27
Versicolour: 33


In [77]:
# sample_data returns a (sampled rows, remaining rows) pair; only the sampled
# rows are inspected here, so the pair has to be unpacked first.
shuffled_negative_binomial_sampled_dataset, shuffled_negative_binomial_remaining_dataset = sample_data(shuffled_dataset, 'negative_binomial')

target_variable_distribution(shuffled_negative_binomial_sampled_dataset)
plot_dataset(shuffled_negative_binomial_sampled_dataset)

Setosa: 30
Virginica: 33
Versicolour: 27
