# Example usage

Here we describe how to use `pyclean` to clean a DataFrame and compare the result with the original:

# Imports

In [1]:
from pyclean import pyclean
from pyclean.compare import compare_data
import pandas as pd
import numpy as np

# Create a new DataFrame

In [2]:
def generate_random_dataframe(num_rows: int, num_columns: int, seed: "int | None" = None) -> pd.DataFrame:
    """Build a DataFrame of random integers for demonstration purposes.

    Columns are named ``Column_0`` ... ``Column_{num_columns - 1}`` and each
    one holds ``num_rows`` integers drawn from the half-open range ``[0, 100)``.

    Parameters
    ----------
    num_rows : int
        Number of rows in the resulting frame.
    num_columns : int
        Number of columns in the resulting frame.
    seed : int or None, optional
        When given, values are drawn from a private ``np.random.RandomState``
        seeded with this value, so repeated calls return identical frames.
        When ``None`` (the default), values come from NumPy's global random
        state, exactly as before this parameter existed.

    Returns
    -------
    pd.DataFrame
        A frame of shape ``(num_rows, num_columns)`` with integer values.
    """
    # A private RandomState keeps seeded calls reproducible without touching
    # the global generator; unseeded calls keep using the global one.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Columns are generated in order, one draw of `num_rows` values each.
    data = {
        f'Column_{col}': rng.randint(0, 100, size=num_rows)
        for col in range(num_columns)
    }

    return pd.DataFrame(data)

# Build a 100-row, 5-column demo frame and preview its first five rows.
random_df = generate_random_dataframe(num_rows=100, num_columns=5)
print(random_df.head(5))

   Column_0  Column_1  Column_2  Column_3  Column_4
0         4        31         4        22        39
1        97        46        33        91        94
2         3        41         0        81        81
3        77        38         1         8        93
4        13        14        16        80        74


# Clean DataFrame <br>
Here we clean the DataFrame by removing missing values and any duplicates.

In [3]:
# Stage 1: run the missing-value removal on the raw frame and preview it.
no_missing_df = pyclean.remove_missing_values(random_df)
print(no_missing_df.head())

# Stage 2: run duplicate removal on the stage-1 frame; `cleaned_df` is the
# final cleaned result used by the later cells.
cleaned_df = pyclean.remove_duplicates(no_missing_df)
print(cleaned_df.head())

   Column_0  Column_1  Column_2  Column_3  Column_4
0         4        31         4        22        39
1        97        46        33        91        94
2         3        41         0        81        81
3        77        38         1         8        93
4        13        14        16        80        74
   Column_0  Column_1  Column_2  Column_3  Column_4
0         4        31         4        22        39
1        97        46        33        91        94
2         3        41         0        81        81
3        77        38         1         8        93
4        13        14        16        80        74


# Check for changes

In [4]:
# Compare the original frame with the cleaned one. In the recorded output
# below, the result is a dict with 'missing_values' and 'unique_values'
# entries, each a per-column integer Series (all zeros for this run).
# NOTE(review): presumably each value is the change between the two frames;
# confirm against the `compare_data` documentation.
changes = compare_data(random_df, cleaned_df)
# Bare last expression so the notebook displays the result.
changes

{'missing_values': Column_0    0
 Column_1    0
 Column_2    0
 Column_3    0
 Column_4    0
 dtype: int64,
 'unique_values': Column_0    0
 Column_1    0
 Column_2    0
 Column_3    0
 Column_4    0
 dtype: int64}