# Imports and paths

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import json
import gzip
import os
from collections import defaultdict
from tqdm import tqdm
from numpy.linalg import norm
from tqdm.notebook import tqdm
from scipy import spatial
sns.set_palette('colorblind')
sns.set_theme()
tqdm.pandas()
# RNG
rng = np.random.default_rng(seed=42)

In [2]:
# File containing the activities as extracted from MATSIM
_EXTRACTED_ACTIVITIES_PATH_ = "../data/abm/vaud/extracted/vaud_activities.csv.gz"
# Files to where the agents' visits and facilities' visitors should be saved
_AGENTS_VISITS_DEST_FILE_ = "../data/abm/vaud/prepared/vaud_agents_visits.json.gz"
_FACILITIES_VISITORS_DEST_FILE_ = "../data/abm/vaud/prepared/vaud_facilities_visitors.json.gz"
# Files to where the ID to index translations should be saved
_AGENTS_ID_TRANSLATIONS_FILE_ = "../data/abm/vaud/prepared/vaud_agents_id_translations.csv.gz"
_FACILITIES_ID_TRANSLATIONS_FILE_ = "../data/abm/vaud/prepared/vaud_facilities_id_translations.csv.gz"
# Population attributes dataset
_POPULATION_ATTRIBUTES_PATH_ = "../data/abm/vaud/extracted/vaud_population.csv.gz"
# Directory into which the activities per period files are saved
_PERIOD_ACTIVITIES_REP_ = "../data/abm/vaud/prepared/period_activities/"

# Preparing contacts data
In order to run the Agent-Based Model, we'll need to be able to access the data in two ways:
* Given an agent $i$, fetch all activities performed by $i$ during the simulation;
* Given a facility $f$, fetch all individuals who visited $f$ during the simulation.

The first task will be a translation: as extracted from MATSIM, the agents' IDs and facility names are unordered (facility names aren't even all numerical, such as "home4076"). However it will be much easier to work with a total index (from 1 to the number of agents or facilities). Hence we'll need to perform that translation while treating the data.  
Once this is done, we can create two lists:
* For a given agent $i$, $A(i)$ will yield the the list of facilities visited by $i$;
* For a given facility $f$, $F(f)$ will yield the list of agents who visited $f$.



## Loading the activities into memory
Let's first load the activities CSV into memory:

In [3]:
activities = pd.read_csv(_EXTRACTED_ACTIVITIES_PATH_, index_col=0)
print(activities.info())
activities.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4455076 entries, 423464 to 36710599
Data columns (total 8 columns):
 #   Column      Dtype  
---  ------      -----  
 0   id          int64  
 1   type        object 
 2   facility    object 
 3   link        object 
 4   x           float64
 5   y           float64
 6   start_time  object 
 7   end_time    object 
dtypes: float64(2), int64(1), object(5)
memory usage: 305.9+ MB
None


Unnamed: 0,id,type,facility,link,x,y,start_time,end_time
423464,1069770,home,home480932,399007,2569239.0,1190194.0,,09:23:32
423465,1069770,shop,168569,244909,2571696.0,1189601.0,09:25:32,09:33:32
423466,1069770,home,home480932,399007,2569239.0,1190194.0,09:35:32,09:53:32
423467,1069770,leisure,399230,399013,2569430.0,1189314.0,09:56:32,10:23:32
423468,1069770,home,home480932,399007,2569239.0,1190194.0,10:26:32,


## Looking at public transport stations
It seems that public transport stations do not have an associated facility name:

In [4]:
null_facility = activities[activities['facility'].isna()]
null_facility.head()

Unnamed: 0,id,type,facility,link,x,y,start_time,end_time
587570,1098688,pt interaction,,692184,2551212.0,1166853.0,,
587571,1098688,pt interaction,,869871,2551020.0,1168574.0,,
587590,1098691,pt interaction,,613162,2553918.0,1154785.0,,
587591,1098691,pt interaction,,120829,2537875.0,1152042.0,,
587592,1098691,pt interaction,,729548,2537875.0,1152042.0,,


In [5]:
null_facility['type'].unique()

array(['pt interaction'], dtype=object)

Let's count how many unique stations there are, based on location:

In [6]:
null_facility.drop_duplicates(subset=['x', 'y',]).shape

(6953, 8)

Since there's far less unique stations, based on location, than entries in the null facilities dataframe, we can assign a name
to each station:

In [7]:
def assign_station_name(row):
    """
    Based on a row of the null facility dataframe, creates a name
    for the transport station.
    """
    return 'station' + str(row['x']) + str(row['y'])

In [8]:
activities.loc[:, 'facility'] = activities['facility'].fillna(null_facility.progress_apply(assign_station_name, axis=1))

  0%|          | 0/1046370 [00:00<?, ?it/s]

Let's check that there are no longer any missing facilities:

In [9]:
activities[activities['facility'].isna()]

Unnamed: 0,id,type,facility,link,x,y,start_time,end_time


## Building the lists

We'll now build the aforementioned lists:

In [10]:
# An "index" is the integer associated with an element (as opposed to the original
# agent ID or facility name).
# list agent index --> facility indexes
agents_visits = []
# list facility index --> agent indexes
facilities_visitors = []
# Hashmap facility name --> facility index
# This will be used to translate the facility names
hashmap_fi = dict()
# This will count the number of facilities that have already
# been translated, and thus will be the translation for the next
# facility.
# We use a list of 1 element instead of an integer, because we need it
# to be a mutable type in order to use pandas.apply thereafter (that's some
# dark python properties, ngl).
facility_counter = [0]
# Same for the agents
hashmap_ai = dict()
agents_counter = [0]

We can actually perform the translation while filling the lists, thus only browsing the activities dataset once:

In [11]:
def load_into_hashmaps(row):
    """
    Loads a row of the activities DataFrame into the hashmaps.
    """
    agent_id, facility_name = row['id'], row['facility']
    # If the facility name doesn't have an integer index yet,
    # create one
    if facility_name not in hashmap_fi:
        hashmap_fi[facility_name] = facility_counter[0]
        # facilities_visitors[facility_counter] is the list of visitors of the current
        # facility. For now it will be an empty list, waiting to be filled.
        facilities_visitors.append([])
        facility_index = facility_counter[0]
        facility_counter[0] += 1
    else:
        facility_index = hashmap_fi[facility_name]
    # If the agent ID doesn't have an integer index yet,
    # create one
    if agent_id not in hashmap_ai:
        hashmap_ai[agent_id] = agents_counter[0]
        agents_visits.append([])
        agent_index = agents_counter[0]
        agents_counter[0] += 1
    else:
        agent_index = hashmap_ai[agent_id]
    # Add the facility to the agent's visits
    agents_visits[agent_index].append(facility_index)
    facilities_visitors[facility_index].append(agent_index)

In [12]:
activities.progress_apply(load_into_hashmaps, axis=1)

  0%|          | 0/4455076 [00:00<?, ?it/s]

423464      None
423465      None
423466      None
423467      None
423468      None
            ... 
36710595    None
36710596    None
36710597    None
36710598    None
36710599    None
Length: 4455076, dtype: object

Let's check the results:

In [13]:
print("Number of agents: ", len(agents_visits))
print("Number of facil.: ", len(facilities_visitors))

Number of agents:  825576
Number of facil.:  452141


## Saving the ID -> index translation hashmaps

In [14]:
agents_id_translations = pd.DataFrame.from_dict(hashmap_ai, orient="index", columns=['agent_index'])
agents_id_translations.to_csv(_AGENTS_ID_TRANSLATIONS_FILE_)

In [15]:
facilities_id_translations = pd.DataFrame.from_dict(hashmap_fi, orient="index", columns=['facility_index'])
facilities_id_translations.to_csv(_FACILITIES_ID_TRANSLATIONS_FILE_)

## Saving the lists

In [16]:
# The following writes the list into the .json.gz format
# taken from https://stackoverflow.com/questions/39450065/python-3-read-write-compressed-json-objects-from-to-gzip-file
with gzip.open(_AGENTS_VISITS_DEST_FILE_, "w") as dest_file:
    json_str = json.dumps(agents_visits)
    json_bytes = json_str.encode('utf-8')
    dest_file.write(json_bytes)

In [17]:
with gzip.open(_FACILITIES_VISITORS_DEST_FILE_, "w") as dest_file:
    json_str = json.dumps(facilities_visitors)
    json_bytes = json_str.encode('utf-8')
    dest_file.write(json_bytes)

# Translating the agents' IDs and facility names into indexes in datasets
Now that we've have associations ```facility name``` --> ```facility_index``` and ```agent ID``` --> ```agent index```, we can translate the agents'IDs and facility names in the datasets.  
If you haven't run the previous sections, you can load the translation maps in the following cells:

In [18]:
agents_translations = pd.read_csv(_AGENTS_ID_TRANSLATIONS_FILE_, index_col=0)
facilities_translations = pd.read_csv(_FACILITIES_ID_TRANSLATIONS_FILE_, index_col=0)
agents_translations.head()

Unnamed: 0,agent_index
1069770,0
1071953,1
1097112,2
1098681,3
1098682,4


## Population attributes dataset

In [19]:
pop_attributes = pd.read_csv(_POPULATION_ATTRIBUTES_PATH_, index_col="id")
pop_attributes.head()

Unnamed: 0_level_0,age,bikeAvailability,carAvail,employed,hasLicense,home_x,home_y,householdIncome,isCarPassenger,municipalityType,...,sex,spRegion,postcode,canton,local,municipality,muni_index,wgs84_e,wgs84_n,agent_index
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1069770,85,FOR_NONE,never,False,no,2569239.0,1190194.0,,False,,...,f,3,1580,VD,Avenches,Avenches,5451,7.040607,46.880248,
1103868,85,FOR_NONE,never,False,no,2569239.0,1190194.0,,False,,...,f,3,1580,VD,Avenches,Avenches,5451,7.040607,46.880248,
1110921,76,FOR_ALL,always,True,yes,2569193.0,1189751.0,,False,,...,m,3,1580,VD,Avenches,Avenches,5451,7.040607,46.880248,
1110922,70,FOR_ALL,never,False,yes,2569193.0,1189751.0,,False,,...,f,3,1580,VD,Avenches,Avenches,5451,7.040607,46.880248,
1111181,61,FOR_ALL,always,False,yes,2569340.0,1189900.0,,False,,...,f,3,1580,VD,Avenches,Avenches,5451,7.040607,46.880248,


In [20]:
# If the dataset has already been translated, we want our code to still run, and update the result.
if 'agent_index' in pop_attributes.columns:
    pop_attributes = pop_attributes.drop('agent_index', axis=1)
pop_attributes = pop_attributes.join(agents_translations)
pop_attributes['agent_index'].head()

id
1069770     0
1103868    13
1110921    18
1110922    19
1111181    20
Name: agent_index, dtype: int64

In [21]:
pop_attributes.to_csv(_POPULATION_ATTRIBUTES_PATH_, index=True)

## Activities per period
While running the model, we'll most likely need the activities per period. 

In [22]:
# Loads the activities per period as a single dataframe (might be very large !)
period_activities = pd.read_csv('../data/abm/vaud/prepared/vaud_period_activities.csv.gz')
period_activities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21355565 entries, 0 to 21355564
Data columns (total 5 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        int64 
 1   type      object
 2   facility  object
 3   age       int64 
 4   period    object
dtypes: int64(2), object(3)
memory usage: 814.7+ MB


We'll prcocess the periods successively: for each period, we'll isolate the activities that occurred during that time; then translate the facility and agent; and finally save the result to a specific file.  

In [23]:
# Computes the unique periods in the dataset
unique_periods = period_activities['period'].unique()

In [25]:
for period_index, period in enumerate(unique_periods):
    print("Processing period ", period)
    # Isolate the activities that occurred during that period
    sub_activ = period_activities[period_activities['period'] == period]
    # Translate the agent ids to agent index
    sub_activ = sub_activ.merge(agents_translations, left_on="id", right_index=True)
    # Translate the facility names to indexes
    sub_activ = sub_activ.merge(facilities_translations, left_on="facility", right_index=True)
    # Only keep the relevant info
    sub_activ = sub_activ.drop(['age', 'period', 'id', 'facility'], axis=1)
    # Save the sub dataset
    sub_activ.to_csv(os.path.join(_PERIOD_ACTIVITIES_REP_, f"{str(period_index)}.csv.gz"),
                    index=False)

Processing period  1900-01-01 00:00:00
Processing period  1900-01-01 01:00:00
Processing period  1900-01-01 02:00:00
Processing period  1900-01-01 03:00:00
Processing period  1900-01-01 04:00:00
Processing period  1900-01-01 05:00:00
Processing period  1900-01-01 06:00:00
Processing period  1900-01-01 07:00:00
Processing period  1900-01-01 08:00:00
Processing period  1900-01-01 09:00:00
Processing period  1900-01-01 10:00:00
Processing period  1900-01-01 11:00:00
Processing period  1900-01-01 12:00:00
Processing period  1900-01-01 13:00:00
Processing period  1900-01-01 14:00:00
Processing period  1900-01-01 15:00:00
Processing period  1900-01-01 16:00:00
Processing period  1900-01-01 17:00:00
Processing period  1900-01-01 18:00:00
Processing period  1900-01-01 19:00:00
Processing period  1900-01-01 20:00:00
Processing period  1900-01-01 21:00:00
Processing period  1900-01-01 22:00:00
Processing period  1900-01-01 23:00:00
