In [3]:
import pandas as pd
import janitor

In [4]:
# Messy sample records used throughout the notebook: one list per column,
# with None standing in for the values that are missing.
dirty_dataset = {
    "name": ["felipe", "carlos", "blue", "safira"],
    "id": [1, 2, 3, 4],
    "age": [25, 20, 21, None],  # last record has no age
    "payment #ID": [213234, 423423, 534543, None],  # awkward label, one missing id
    "value": [85.0, 86, 120.0, 200],  # ints and floats mixed together
    "due_date": ["01/09/2020", "12/12/2020", None, "21/09/2021"],  # dates as strings
}

In [40]:
# Build the raw frame from the column lists; dtypes are left to pandas' inference.
df = pd.DataFrame(data=dirty_dataset)

In [41]:
# Preview the raw frame (at most the first 10 rows).
df.head(n=10)

Unnamed: 0,name,id,age,payment #ID,value,due_date
0,felipe,1,25.0,213234.0,85.0,01/09/2020
1,carlos,2,20.0,423423.0,86.0,12/12/2020
2,blue,3,21.0,534543.0,120.0,
3,safira,4,,,200.0,21/09/2021


In [42]:
# Inspect the dtype pandas assigned to each column of the raw frame.
df.dtypes

name            object
id               int64
age            float64
payment #ID    float64
value          float64
due_date        object
dtype: object

In [43]:
# Display the whole raw frame; it only has four rows.
df

Unnamed: 0,name,id,age,payment #ID,value,due_date
0,felipe,1,25.0,213234.0,85.0,01/09/2020
1,carlos,2,20.0,423423.0,86.0,12/12/2020
2,blue,3,21.0,534543.0,120.0,
3,safira,4,,,200.0,21/09/2021


Como dá para ver, temos alguns problemas no dataset. Pense por uns dois minutos no que você **acha** problemático nele.



Os principais que eu consigo enxergar são:
* `payment #ID` é um nome de coluna horrível para quase qualquer sistema.
* `payment #ID` está listado como float64 quando é fácil ver que é do tipo inteiro, assim como `age`.
* `due_date` está listado como tipo "object" quando é óbvio que seu tipo é Date/datetime.
* Há dados faltando (missing data) nas colunas `age`, `payment #ID` e `due_date`.

Então eu vou mostrar como eu faria essas limpezas utilizando somente pandas e, depois, a diferença ao utilizar o pyjanitor.


# Limpando utilizando pandas

In [45]:
# Target dtype for every column that needs an explicit cast.
desired_dtypes = {
    "age": "int32",
    "payment_id": "int64",
    # A unit is required: pandas >= 2.0 rejects the unit-less "datetime64".
    "due_date": "datetime64[ns]",
    "value": "float64",
}

# NOTE(review): the raw due_date strings look day-first ("21/09/2021" appears
# in the data), yet this cast reads "01/09/2020" as 9 January 2020. If
# day-first is intended, parse with pd.to_datetime(..., dayfirst=True) - confirm.

# Every step returns a new frame, so `df` is never modified and re-running this
# cell always produces the same result. (The previous chained
# `df["age"].fillna(..., inplace=True)` warns on recent pandas and silently
# does nothing once Copy-on-Write is enabled.)
cleaned_by_pandas_df = (
    df
    .rename(columns={"payment #ID": "payment_id"})
    # Fill missing ages with the median of the known ages; this has to happen
    # before the integer cast, because int32 cannot represent NaN.
    .assign(age=lambda frame: frame["age"].fillna(frame["age"].median()))
    # Rows without a payment id are dropped rather than filled.
    .dropna(subset=["payment_id"])
    .astype(desired_dtypes)
)

In [46]:
# Preview the frame cleaned with plain pandas (at most the first 10 rows).
cleaned_by_pandas_df.head(n=10)

Unnamed: 0,name,id,age,payment_id,value,due_date
0,felipe,1,25,213234,85.0,2020-01-09
1,carlos,2,20,423423,86.0,2020-12-12
2,blue,3,21,534543,120.0,NaT


# Limpando dados utilizando pyjanitor

In [47]:
# The same cleaning steps as the pandas section, written as a single pyjanitor
# method chain starting from the raw frame.
# NOTE(review): the raw due_date strings look day-first ("21/09/2021" appears
# in the data), yet the datetime cast reads "01/09/2020" as 9 January 2020 -
# confirm which interpretation is intended.
cleaned_by_janitor_df = (
    df
    .rename_column("payment #ID", "payment_id")
    # Fill missing ages *before* the integer cast: int32 cannot hold NaN, so
    # casting first would fail for any row that lacks an age but has a
    # payment id. The fill value is the median of the known ages.
    .fill_empty("age", df["age"].median())
    # Rows without a payment id are dropped rather than filled.
    .dropna(subset=["payment_id"])
    .change_type("age", "int32")
    .change_type("payment_id", "int64")
    # pandas >= 2.0 rejects the unit-less "datetime64"; spell out the unit.
    .change_type("due_date", "datetime64[ns]")
)

In [48]:
# Preview the frame cleaned with pyjanitor (at most the first 10 rows).
cleaned_by_janitor_df.head(n=10)

Unnamed: 0,name,id,age,payment_id,value,due_date
0,felipe,1,25,213234,85.0,2020-01-09
1,carlos,2,20,423423,86.0,2020-12-12
2,blue,3,21,534543,120.0,NaT


In [49]:
# Re-display the raw frame after the cleaning cells; the output shows it still
# has its original column names and missing values.
df.head(10)

Unnamed: 0,name,id,age,payment #ID,value,due_date
0,felipe,1,25.0,213234.0,85.0,01/09/2020
1,carlos,2,20.0,423423.0,86.0,12/12/2020
2,blue,3,21.0,534543.0,120.0,
3,safira,4,,,200.0,21/09/2021


# Expandir pyjanitor

In [50]:
import pandas_flavor as pf


@pf.register_dataframe_method
def remove_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Return a copy of ``df`` without the column ``column_name``.

    Registered through pandas_flavor, so it can also be called as a DataFrame
    method, e.g. ``some_frame.remove_column("col")``, inside a method chain.

    Args:
        df: DataFrame to take the column from; it is left unmodified.
        column_name: Label of the column to drop.

    Returns:
        A new DataFrame with every column of ``df`` except ``column_name``.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    # `drop` builds a new frame instead of deleting from the caller's object
    # (the previous `del df[column_name]` modified the input in place), so
    # frames shown or reused elsewhere keep their columns.
    return df.drop(columns=column_name)

In [52]:
# Use the custom method like any built-in one: chain two calls to drop the
# "age" and "name" columns.
# Note: this cell overwrites its own input, so it is meant to be run once per
# kernel session.
cleaned_by_janitor_df = cleaned_by_janitor_df.remove_column("age").remove_column("name")

In [53]:
# Preview the result after removing the two columns (at most the first 10 rows).
cleaned_by_janitor_df.head(n=10)

Unnamed: 0,id,payment_id,value,due_date
0,1,213234,85.0,2020-01-09
1,2,423423,86.0,2020-12-12
2,3,534543,120.0,NaT
