In [76]:
import pandas as pd
from datetime import datetime

In [77]:
data = {
    'pais': ['Argentina', 'Chile', 'Brasil', 'Paraguay', 'Uruguay', 'Bolivia', 'Chile', 'Argentina', 'Chile', 'Brasil', 'Paraguay', 'Uruguay', 'Bolivia'],
    'conversion_dolar': ['0.0043', '0.0013', '0.20', '0.00014', '0.026', '0.14','0.0013', '0.0046', '0.0012', '0.201', '0.000143', '0.0261', '0.15'], 
    'moneda': ['ARS', 'CLP', 'BRL R', 'PYG', 'UYU', 'BOB Bs', 'CLP', 'ARS', 'CLP', 'BRL R', 'PYG', 'UYU', 'BOB Bs'],
    'last_update': ['2023-05-23 09:00:00', '2023-05-23 09:00:00', '2023-05-23 09:00:00', '2023-05-23 09:00:00', '2023-05-23 09:00:00', '2023-05-23 09:00:00', '2023-05-23 09:00:00', '2023-05-22 09:00:00', '2023-05-22 09:00:00', '2023-05-22 09:00:00', '2023-05-22 09:00:00', '2023-05-22 09:00:00', '2023-05-22 09:00:00']
    }

In [78]:
df = pd.DataFrame(data)

In [79]:
df

Unnamed: 0,pais,conversion_dolar,moneda,last_update
0,Argentina,0.0043,ARS,2023-05-23 09:00:00
1,Chile,0.0013,CLP,2023-05-23 09:00:00
2,Brasil,0.2,BRL R,2023-05-23 09:00:00
3,Paraguay,0.00014,PYG,2023-05-23 09:00:00
4,Uruguay,0.026,UYU,2023-05-23 09:00:00
5,Bolivia,0.14,BOB Bs,2023-05-23 09:00:00
6,Chile,0.0013,CLP,2023-05-23 09:00:00
7,Argentina,0.0046,ARS,2023-05-22 09:00:00
8,Chile,0.0012,CLP,2023-05-22 09:00:00
9,Brasil,0.201,BRL R,2023-05-22 09:00:00


In [80]:
df.count()

pais                13
conversion_dolar    13
moneda              13
last_update         13
dtype: int64

### Se trabaja en funciones como buena práctica por si se quiere replicar lo mismo para varios df, o se podrian agrupar las funciones en una Clase LimpiezaDeDatos con el mismo fin. Quitar duplicados, transformaciones específicas, etc

#### Drop Duplicates

In [81]:
def quitar_duplicados(df):
    df.drop_duplicates(inplace=True)
    df.reset_index(drop=True, inplace=True) # buena practica para que el index no pegue saltos con los datos que borro
    return df

In [82]:
quitar_duplicados(df)   # como tiene el inplace=True no hace falta asignarlo a un nuevo df
#df_sin_duplicados
df

Unnamed: 0,pais,conversion_dolar,moneda,last_update
0,Argentina,0.0043,ARS,2023-05-23 09:00:00
1,Chile,0.0013,CLP,2023-05-23 09:00:00
2,Brasil,0.2,BRL R,2023-05-23 09:00:00
3,Paraguay,0.00014,PYG,2023-05-23 09:00:00
4,Uruguay,0.026,UYU,2023-05-23 09:00:00
5,Bolivia,0.14,BOB Bs,2023-05-23 09:00:00
6,Argentina,0.0046,ARS,2023-05-22 09:00:00
7,Chile,0.0012,CLP,2023-05-22 09:00:00
8,Brasil,0.201,BRL R,2023-05-22 09:00:00
9,Paraguay,0.000143,PYG,2023-05-22 09:00:00


In [83]:
df.dtypes

pais                object
conversion_dolar    object
moneda              object
last_update         object
dtype: object

#### String to Date / Transformaciones de tipos de datos en general

Forma 1

In [84]:
def string_to_date(df, column):
    df[column] = pd.to_datetime(df[column])
    return df

In [85]:
df_fecha = string_to_date(df, column='last_update')
df_fecha

Unnamed: 0,pais,conversion_dolar,moneda,last_update
0,Argentina,0.0043,ARS,2023-05-23 09:00:00
1,Chile,0.0013,CLP,2023-05-23 09:00:00
2,Brasil,0.2,BRL R,2023-05-23 09:00:00
3,Paraguay,0.00014,PYG,2023-05-23 09:00:00
4,Uruguay,0.026,UYU,2023-05-23 09:00:00
5,Bolivia,0.14,BOB Bs,2023-05-23 09:00:00
6,Argentina,0.0046,ARS,2023-05-22 09:00:00
7,Chile,0.0012,CLP,2023-05-22 09:00:00
8,Brasil,0.201,BRL R,2023-05-22 09:00:00
9,Paraguay,0.000143,PYG,2023-05-22 09:00:00


In [86]:
df_fecha.dtypes

pais                        object
conversion_dolar            object
moneda                      object
last_update         datetime64[ns]
dtype: object

Forma 2

In [87]:
def string_to_timestamp(df, column, formato):
    df[column] = pd.to_datetime(df[column].dt.strftime(formato))
    return df

In [88]:
df_formato_date = string_to_timestamp(df, column='last_update', formato='%Y-%m-%d')
df_formato_date

Unnamed: 0,pais,conversion_dolar,moneda,last_update
0,Argentina,0.0043,ARS,2023-05-23
1,Chile,0.0013,CLP,2023-05-23
2,Brasil,0.2,BRL R,2023-05-23
3,Paraguay,0.00014,PYG,2023-05-23
4,Uruguay,0.026,UYU,2023-05-23
5,Bolivia,0.14,BOB Bs,2023-05-23
6,Argentina,0.0046,ARS,2023-05-22
7,Chile,0.0012,CLP,2023-05-22
8,Brasil,0.201,BRL R,2023-05-22
9,Paraguay,0.000143,PYG,2023-05-22


In [89]:
df_formato_date.dtypes

pais                        object
conversion_dolar            object
moneda                      object
last_update         datetime64[ns]
dtype: object

Forma 3

In [90]:
def dtype_convert(df, detail_dict):
    for column, new_type in detail_dict.items():
        if new_type == 'datetime':  # porque suele romper cuando son fechas
            df[column] = pd.to_datetime(df[column])
        else:
            df[column] = df[column].astype(new_type)
    return df

In [91]:
detail = {  # puede ser int, float, str/object, datetime, category
    'pais': 'category',
    'conversion_dolar': 'float',
    'moneda': 'str',
    'last_update': 'datetime'
    }

In [92]:
df_types = dtype_convert(df, detail)
df_types.dtypes

pais                      category
conversion_dolar           float64
moneda                      object
last_update         datetime64[ns]
dtype: object

#### Categorías

In [93]:
df_types['pais'].cat.categories

Index(['Argentina', 'Bolivia', 'Brasil', 'Chile', 'Paraguay', 'Uruguay'], dtype='object')

In [94]:
df_filtrado = df_types[df_types['pais'] == 'Argentina']
select_columns = ['moneda', 'conversion_dolar', 'last_update']

df_filtrado[select_columns]

Unnamed: 0,moneda,conversion_dolar,last_update
0,ARS,0.0043,2023-05-23
6,ARS,0.0046,2023-05-22


Ordenar por orden customizado

In [95]:
df_types['pais'] = df_types['pais'].cat.reorder_categories(['Uruguay', 'Argentina', 'Bolivia', 'Brasil', 'Chile', 'Paraguay'])
df_ordenado = df_types.sort_values(by='pais')
df_ordenado

Unnamed: 0,pais,conversion_dolar,moneda,last_update
4,Uruguay,0.026,UYU,2023-05-23
10,Uruguay,0.0261,UYU,2023-05-22
0,Argentina,0.0043,ARS,2023-05-23
6,Argentina,0.0046,ARS,2023-05-22
5,Bolivia,0.14,BOB Bs,2023-05-23
11,Bolivia,0.15,BOB Bs,2023-05-22
2,Brasil,0.2,BRL R,2023-05-23
8,Brasil,0.201,BRL R,2023-05-22
1,Chile,0.0013,CLP,2023-05-23
7,Chile,0.0012,CLP,2023-05-22


#### Metricas

##### Hallar variación de cotización

In [96]:
selected_columns = ['pais', 'moneda']
df_sin_duplicados = quitar_duplicados(df_ordenado[selected_columns])
df_sin_duplicados

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)


Unnamed: 0,pais,moneda
0,Uruguay,UYU
1,Argentina,ARS
2,Bolivia,BOB Bs
3,Brasil,BRL R
4,Chile,CLP
5,Paraguay,PYG


In [102]:
#df_ordenado['variación_dolar'] = df['conversion_dolar'].pct_change()
df_ordenado = df_types.sort_values(by=['pais', 'last_update'])
df_ordenado['variacion_dolar'] = df_ordenado['conversion_dolar'].pct_change()
df_ordenado.groupby() # ta mal

TypeError: You have to supply one of 'by' and 'level'