# Importing the package

In [None]:
from pytoolbox.dataframe import DataFrame

DataFrame class documentation

In [None]:
# Show Python's built-in help page (generated from the docstrings) for the DataFrame class.
help(DataFrame)

# Examples of Usage

## C++ Bound Methods

In [None]:
# Create an empty DataFrame, then load the Iris CSV into it.
iris = DataFrame()

# The path is relative, so it is resolved against the current working directory.
iris.read_csv('../datasets/iris.csv')


In [None]:
# Presumably previews the first rows of the data; help(iris.head) in a later cell shows the exact contract.
iris.head()

In [None]:
# Inspect the NaN situation first, then remove the affected rows.
# NOTE(review): the return value of drop_row_nan() is not used, so it presumably
# modifies `iris` in place — confirm against the DataFrame documentation.
iris.table_nan()
iris.drop_row_nan()


In [None]:
# Call table_nan() again so its output can be compared with the one produced before drop_row_nan().
iris.table_nan()

## New Python Methods

In [None]:
# print() displays the string conversion of the object, i.e. str(iris).
print(iris)

In [None]:
# Show the built-in help for the head method as exposed on this instance.
help(iris.head)

In [None]:
# Use dynamically added methods
# to_pandas() is called on the project DataFrame; head() is then called on
# whatever it returns (presumably a pandas.DataFrame — TODO confirm).
pandas_df = iris.to_pandas()
pandas_df.head()


In [None]:
# Call plot_histogram for the "SepalWidthCm" column, passing bins=5.
iris.plot_histogram("SepalWidthCm", bins=5)

In [None]:
# Print whatever get_header() returns (presumably the column names — TODO confirm).
print(iris.get_header())

In [None]:
# Remember that the correlation cannot be computed while NaNs are present,
# so rows containing NaNs are dropped before plotting.
# (drop_row_nan() was already called in an earlier cell; it is repeated here.)

iris.drop_row_nan()

# Plot the correlation matrix for the three listed columns only.
iris.plot_correlation_matrix(["SepalLengthCm", "SepalWidthCm", "PetalLengthCm"])

In [None]:
# Scatter plot of the two sepal measurements.
# NOTE(review): which column goes on which axis depends on scatter_plot's parameter order, which is not visible here.
iris.scatter_plot("SepalLengthCm", "SepalWidthCm")

In [None]:
# Compute the advanced statistics for a single column and print them.
# The column name is bound once and reused in the message so the printed
# label cannot drift from the column that is actually analysed.
column = "SepalWidthCm"
stats = iris.advanced_stat(column)
print(f"Advanced Stats for '{column}': {stats}")


In [None]:
# Convert the "SepalLengthCm" column with to_np_array (presumably to a NumPy array — TODO confirm).
iris.to_np_array("SepalLengthCm")

Access a column using `__getitem__`

In [None]:
# Subscripting with a column name goes through DataFrame.__getitem__;
# the returned object is iterated and each value is printed on its own line.
for value in iris['Species']:
    print(value)

Possible use cases of `row_iterator`

In [None]:
# Iterating over the DataFrame object itself uses its iterator protocol;
# each yielded item (a row, going by the heading above) is printed.
for row in iris:
    print(row)

In [None]:
# Get an iterator for the iris DataFrame
iris_iterator = iter(iris)

# Iterate over the first 5 rows and print them.
# next() is called without a default, so StopIteration is raised if the
# iterator is exhausted before 5 items have been produced.
for _ in range(5):
    row = next(iris_iterator)
    print(row)

`__len__` method

In [None]:
# The built-in len() dispatches to DataFrame.__len__.
len(iris)

# Testing Callbacks

In [None]:
import time
import numpy as np
import pandas as pd

The `DataFrame::calculate_correlation_with_callback` method allows us to calculate the correlations between columns in a DataFrame while leveraging a user-defined callback function. 

In this example, we define a Python callback function, `process_row`, which simply prints each row of the correlation matrix as it is computed. We then pass this function as an argument to the `calculate_correlation_with_callback` method. The C++ implementation ensures that the Python callback is invoked for each row of the correlation matrix, making it easy to handle the results dynamically.

In [None]:
# Define a Python callback function
def process_row(row):
    """Callback for calculate_correlation_with_callback: print the row it receives."""
    message = f"Processing row: {row}"
    print(message)

# Use the callback

# Work on a fresh DataFrame (iris2) so the `iris` object used in earlier cells is not touched.
iris2 = DataFrame()
iris2.read_csv('../datasets/iris.csv')
# NaN rows are removed first: as noted in an earlier cell, the correlation
# cannot be computed while NaNs are present.
iris2.drop_row_nan()
# NOTE(review): "Species" is dropped before the correlation, presumably because it is not numeric — confirm.
iris2.drop_col("Species")
# process_row is passed as the callback; per the description above, it is
# invoked for each row of the correlation matrix.
iris2.calculate_correlation_with_callback(process_row)

Let's now measure how much time is required to do the same computation, on a synthetic large dataset, using:

1. Pandas method
2. C++ method with callback
3. C++ method without callback

In [None]:
# Benchmark input: an (n_rows x n_cols) matrix of random floats
n_rows, n_cols = 1000, 100
data = np.random.rand(n_rows, n_cols)

# C++-backed DataFrame, filled one column at a time from plain Python lists
df_cpp = DataFrame()
for i, column_values in enumerate(data.T):
    df_cpp.add_column(f'col_{i}', column_values.tolist())

# pandas DataFrame built from the same matrix
df_pd = pd.DataFrame(data)

In [None]:

# Convert to Pandas DataFrame
# NOTE(review): this repeats the `df_pd = pd.DataFrame(data)` assignment made at the
# end of the previous cell; re-running it simply rebuilds the same frame from `data`.
df_pd = pd.DataFrame(data)

In [None]:
# Time the same correlation computation three ways.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock intended for measuring short intervals, whereas the
# wall clock can jump (e.g. NTP adjustments) and may have coarse resolution.

# Using Pandas
start = time.perf_counter()
correlation_pd = df_pd.corr()
print("Pandas Time:", time.perf_counter() - start)

# Using C++ with callbacks
start = time.perf_counter()
df_cpp.calculate_correlation_with_callback(lambda row: None)  # No-op callback
print("C++ with callbacks Time:", time.perf_counter() - start)

# Using C++ without callbacks
start = time.perf_counter()
df_cpp.calculate_correlation()
print("C++ without callbacks Time:", time.perf_counter() - start)
