# Process the files

In [1]:
import numpy as np
import pandas as pd
import os
from graph_traffic.config import data_path

# Root folder of the raw traffic inputs; the cells below read the monthly
# history CSVs (history/) and the locations file (ubs.csv) from here.
raw_traffic_path = f"{data_path}/01-raw/traffic"

#### Historical traffic data

In [2]:
# Collect every CSV in the history folder. os.listdir() returns entries in
# arbitrary order, so sort the names to make the processing order (and the
# progress log printed further down) reproducible across runs and filesystems.
files = sorted(
    name
    for name in os.listdir(f"{raw_traffic_path}/history")
    if name.endswith('.csv')
)
n_files = len(files)
n_files

36

#### Locations data

In [3]:
# Read the `id` column of the locations file and keep the ids in ascending
# order; the count and the first ten ids are shown as a quick sanity check.
location_ids_raw = pd.read_csv(f'{raw_traffic_path}/ubs.csv')['id'].values
locations = sorted(location_ids_raw)
n_locations = len(locations)
print(n_locations)
locations[:10]

4609


[1001, 1002, 1003, 1006, 1009, 1010, 1011, 1012, 1013, 1014]

## 1. Files by month --> Files by location and month

In [4]:
# Split each monthly history file into one CSV per (location, year, month),
# written as "<location_id>_<year>_<month>.csv" and indexed by 'fecha'.
measure_cols = ['intensidad', 'ocupacion', 'carga', 'vmed']
by_month_dir = f"{data_path}/02-by-location-month/traffic"
# Create the output folder up front so to_csv() does not fail when the
# intermediate data tree does not exist yet.
os.makedirs(by_month_dir, exist_ok=True)

for j, file in enumerate(files):
    print(f"file {j+1}/{n_files}     {file}")
    df = pd.read_csv(f"{raw_traffic_path}/history/{file}", sep=';', parse_dates=['fecha'])
    # File names look like "MM-YYYY.csv": month before the dash, year after it.
    name_parts = file.split("-")
    month = name_parts[0]
    year = name_parts[1].split(".")[0]
    for i, location_id in enumerate(locations):
        # NOTE(review): only location 1001 is processed; every other id is
        # skipped. Looks like a development filter -- confirm before a full run.
        if location_id != 1001:
            continue
        print(f"location {i+1}/{n_locations}     {location_id}", end="\r")
        df_i = df[df.id == location_id].copy()
        # Measurements are replaced by NaN on rows whose 'error' field is missing.
        # NOTE(review): confirm that a missing 'error' value (rather than a
        # specific error code) is the intended condition for discarding a row.
        error_missing = df_i['error'].isna()
        for c in measure_cols:
            df_i.loc[:, c] = np.where(error_missing, np.nan, df_i[c])
        df_i = df_i[['id', 'fecha'] + measure_cols].set_index('fecha')
        df_i.to_csv(f"{by_month_dir}/{location_id}_{year}_{month}.csv")

file 1/36     01-2019.csv
file 2/36     01-2020.csv
file 3/36     01-2021.csv
file 4/36     02-2019.csv
file 5/36     02-2020.csv
file 6/36     02-2021.csv
file 7/36     03-2019.csv
file 8/36     03-2020.csv
file 9/36     03-2021.csv
file 10/36     04-2019.csv
file 11/36     04-2020.csv
file 12/36     04-2021.csv
file 13/36     05-2019.csv
file 14/36     05-2020.csv
file 15/36     05-2021.csv
file 16/36     06-2019.csv
file 17/36     06-2020.csv
file 18/36     06-2021.csv
file 19/36     07-2019.csv
file 20/36     07-2020.csv
file 21/36     07-2021.csv
file 22/36     08-2019.csv
file 23/36     08-2020.csv
file 24/36     08-2021.csv
file 25/36     09-2019.csv
file 26/36     09-2020.csv
file 27/36     09-2021.csv
file 28/36     10-2019.csv
file 29/36     10-2020.csv
file 30/36     10-2021.csv
file 31/36     11-2019.csv
file 32/36     11-2020.csv
file 33/36     11-2021.csv
file 34/36     12-2019.csv
file 35/36     12-2020.csv
file 36/36     12-2021.csv
location 1/4609     1001

## 2. Concatenate all the files for the same location

In [5]:
# Merge the per-month files of each location into a single CSV per location.
by_month_dir = f"{data_path}/02-by-location-month/traffic"
by_location_dir = f"{data_path}/03-by-location/traffic"
# Make sure the destination exists so to_csv() does not fail on a fresh tree.
os.makedirs(by_location_dir, exist_ok=True)

all_loc_files = os.listdir(by_month_dir)
for i, location in enumerate(locations):
    # NOTE(review): only location 1001 is processed; every other id is
    # skipped. Looks like a development filter -- confirm before a full run.
    if location != 1001:
        continue
    print(f"location {i+1}/{n_locations}     {location}", end="\r")
    # The trailing underscore keeps e.g. "1001_" from matching "10010_...".
    loc_files = [x for x in all_loc_files if x.startswith(f"{location}_")]
    # Names are "<id>_<year>_<month>.csv" with a two-digit month, so a plain
    # lexicographic sort gives chronological order.
    monthly_frames = [
        pd.read_csv(f'{by_month_dir}/{file_name}')
        for file_name in sorted(loc_files)
    ]
    # Concatenate once instead of growing the frame inside the loop (which
    # re-copies all accumulated rows on every iteration). pd.concat() raises on
    # an empty list, so fall back to an empty frame when there are no files.
    df = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame([])
    df.to_csv(f'{by_location_dir}/{location}.csv', index=False)

location 1/4609     1001