In [1]:
import pandas as pd
import os
from tqdm import tqdm
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and parsing warnings
# raised by the pandas calls in later cells are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
# Names of every entry in the data folder. os.listdir() returns them in
# arbitrary, filesystem-dependent order, so they are sorted to make the row
# order of the concatenated frame the same on every run.
list_data = sorted(os.listdir('../data/'))

In [3]:
# Read every CSV in ../data/, normalise its columns, and stack all files into
# a single frame `df`.
# The per-file frames are collected in a list and concatenated once at the
# end: calling pd.concat inside the loop re-copies all rows accumulated so far
# on every iteration.
column_names = ['mercado', 'fecha', 'cod_dpto', 'cod_mun',
                'nom_dpto', 'nom_mun', 'grupo', 'alimento',
                'cantidad']
frames = []

for file in tqdm(list_data):

    # Only CSV files are loaded; any other entry in the folder is skipped.
    if not file.endswith('.csv'):
        continue

    file_path = os.path.join('../data/', file)
    dataset = pd.read_csv(file_path, sep=';', encoding='latin-1')

    # Drop columns whose name starts with 'Unnamed', then any column whose
    # name contains 'Codigo CPC'.
    dataset = dataset.loc[:, ~dataset.columns.str.contains('^Unnamed')]
    dataset = dataset.loc[:, ~dataset.columns.str.contains('Codigo CPC')]

    # Columns are renamed by position: pandas raises ValueError if a file does
    # not have exactly nine columns left at this point.
    # NOTE(review): this assumes every file lists its columns in this order —
    # confirm against the raw files.
    dataset.columns = column_names

    frames.append(dataset)

# Original row labels of each file are kept (no ignore_index), as before.
# With no CSV present the result is an empty frame, matching the old behaviour.
df = pd.concat(frames) if frames else pd.DataFrame()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:15<00:00,  1.02it/s]


In [4]:
# Upper-case every product name stored in 'alimento'.
alimento_upper = df['alimento'].str.upper()
df['alimento'] = alimento_upper

In [5]:
# Parse 'fecha' into datetimes, then expose its year, month and day as the
# separate columns 'anio', 'mes' and 'dia' (created in that order).
# NOTE(review): no format/dayfirst is passed, so pandas infers the date layout
# — confirm that day and month are not swapped for ambiguous dates.
df['fecha'] = pd.to_datetime(df['fecha'])
fecha_parts = df['fecha'].dt
for column, part in (('anio', 'year'), ('mes', 'month'), ('dia', 'day')):
    df[column] = getattr(fecha_parts, part)

In [6]:
# Convert 'cantidad' from text with a decimal comma (e.g. '12,5') to float.
# The values are cast to text first: the .str accessor returns NaN for
# non-string entries of a mixed column (which a later dropna would silently
# discard) and raises on an already-numeric column, e.g. when this cell is
# re-run.
cantidad_raw = df['cantidad']
cantidad_text = cantidad_raw.astype(str).str.replace(',', '.', regex=False)
# astype(str) turns missing values into the text 'nan'; put real NaN back so
# they stay missing, then convert. Unparsable text still raises ValueError.
df['cantidad'] = cantidad_text.where(cantidad_raw.notna()).astype('float')

In [7]:
# Keep only the first run of digits found in each department / municipality
# code; values without any digit become NaN. The results remain strings.
# Raw string literals are used because '\d' in a normal literal is an invalid
# escape sequence that Python warns about; the regex itself is unchanged.
df['cod_dpto'] = df['cod_dpto'].str.extract(r'(\d+)', expand=False)
df['cod_mun'] = df['cod_mun'].str.extract(r'(\d+)', expand=False)

In [8]:
# Append the department name in parentheses to each municipality name,
# producing 'MUNICIPALITY (DEPARTMENT)'.
# The cell overwrites 'nom_mun' with itself plus the suffix, so running it a
# second time appends the department again.
dpto_suffix = ' (' + df['nom_dpto'] + ')'
df['nom_mun'] = df['nom_mun'] + dpto_suffix

In [9]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [10]:
# Display the cleaned frame as the cell output to inspect the result of the
# steps above.
df

Unnamed: 0,mercado,fecha,cod_dpto,cod_mun,nom_dpto,nom_mun,grupo,alimento,cantidad,anio,mes,dia
0,"Barranquilla, Barranquillita",2019-01-07,76,76400,VALLE DEL CAUCA,LA UNIÓN (VALLE DEL CAUCA),FRUTAS,UVA NACIONAL,162.0,2019.0,1.0,7.0
1,"Barranquilla, Barranquillita",2019-01-07,76,76400,VALLE DEL CAUCA,LA UNIÓN (VALLE DEL CAUCA),FRUTAS,MARACUYÁ,2200.0,2019.0,1.0,7.0
2,"Barranquilla, Barranquillita",2019-01-07,76,76400,VALLE DEL CAUCA,LA UNIÓN (VALLE DEL CAUCA),FRUTAS,AGUACATE PAPELILLO,200.0,2019.0,1.0,7.0
3,"Barranquilla, Barranquillita",2019-01-07,76,76400,VALLE DEL CAUCA,LA UNIÓN (VALLE DEL CAUCA),FRUTAS,GUAYABA PERA,2400.0,2019.0,1.0,7.0
4,"Barranquilla, Barranquillita",2019-01-07,76,76400,VALLE DEL CAUCA,LA UNIÓN (VALLE DEL CAUCA),VERDURAS Y HORTALIZAS,PIMENTÓN,1944.0,2019.0,1.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...
923318,"Villavicencio, CAV",2022-06-30,25,25178,CUNDINAMARCA,CHIPAQUE (CUNDINAMARCA),"TUBERCULOS, RAICES Y PLATANOS",PAPA SUPERIOR,4000.0,2022.0,6.0,30.0
923319,"Villavicencio, CAV",2022-06-30,25,25178,CUNDINAMARCA,CHIPAQUE (CUNDINAMARCA),VERDURAS Y HORTALIZAS,CILANTRO,1000.0,2022.0,6.0,30.0
923320,"Villavicencio, CAV",2022-06-30,25,25178,CUNDINAMARCA,CHIPAQUE (CUNDINAMARCA),"TUBERCULOS, RAICES Y PLATANOS",PAPA CRIOLLA,2500.0,2022.0,6.0,30.0
923321,"Villavicencio, CAV",2022-06-30,25,25269,CUNDINAMARCA,FACATATIVÁ (CUNDINAMARCA),FRUTAS,FRESA,1040.0,2022.0,6.0,30.0


In [11]:
# Replace the row labels with a fresh 0..n-1 index (the old labels are
# discarded) and persist the cleaned frame next to the raw CSV files.
# to_feather requires a default index, hence the reset beforehand.
df = df.reset_index(drop=True)
df.to_feather('../data/data.feather')