In [1]:
import sys
import pandas as pd
from pathlib import Path

sys.path.append("../")
from utils import geotag, write_meta

In [2]:
# All inputs and outputs live next to the notebook directory, one level up.
project_root = Path.cwd().parent

data_path = project_root/"data"
source_path = data_path/'sources'
mi_path = data_path/"mi_intermediate"
info_path = project_root/"metadata"

# Resampling period passed to pandas `resample` for the iperf and ping sources
sample_period = "1s"

## Collect and preprocess data sources

We preload the port map from the measurement plan so that the records logged on the server can be assigned to their device via the port number.

In [3]:
portmap = pd.read_csv(info_path/"portmap.csv")

# Build a port -> device lookup: every column whose name contains 'pc' holds
# the port numbers of that device, so each becomes a frame indexed by port
# with a constant `device` column.
device_columns = [name for name in portmap.columns if 'pc' in name]
pc_map = pd.concat(
    [portmap[[name]].set_index(name).assign(device=name) for name in device_columns]
)
print("Head of map port->device:")
print(pc_map.head())

# The last letter of the scenario name encodes the traffic direction (U/D).
direction_by_letter = {'U': 'uplink', 'D': 'downlink'}
portmap['direction'] = portmap.scenario.str[-1].map(direction_by_letter)
portmap = portmap.set_index('scenario')
portmap

Head of map port->device:
     device
5213    pc4
5214    pc4
5215    pc4
5216    pc4
5205    pc2


Unnamed: 0_level_0,datarate,qos_measurement,pc4,pc2,pc3,pc1,direction
scenario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A2U,400000,delay,5213,5205,5209,5201,uplink
A2D,400000,delay,5214,5206,5210,5202,downlink
A3U,75000000,datarate,5215,5207,5211,5203,uplink
A3D,350000000,datarate,5216,5208,5212,5204,downlink


In [4]:
iperf_cols = ["datarate", "jitter"]
ping_cols = ["ping_ms"]

source_dirs = ("gps", "iperf", "ping")

# Sources that are averaged onto the common time grid, with the columns kept
resampled_cols = {"iperf": iperf_cols, "ping": ping_cols}

# Collected frames, keyed by source ('gps', 'iperf', 'ping' or 'server')
df_dict = {}
for parquet_file in source_path.rglob("*.parquet"):
    # Layout is <source>/<device>.parquet; the server log is named 'server'
    src, pc = parquet_file.parent.stem, parquet_file.stem
    if src not in source_dirs:
        continue

    print(parquet_file.relative_to(source_path))
    df = pd.read_parquet(parquet_file)
    from_server = pc == 'server'

    if from_server:
        # Mapping of port to device e.g. 5201 -> pc1; grouping by device makes
        # the resampling below run separately for each device
        df['device'] = df['port_local'].astype(int).map(pc_map.device)
        df = df.groupby('device')

    if src in resampled_cols:
        df = df[resampled_cols[src]].resample(sample_period).mean().dropna(how='all')
    else:
        # Only 'gps' is left here: tag each position with its area
        df["area"] = geotag(df.Latitude, df.Longitude)

    if from_server:
        # Flatten the (device, timestamp) index produced by the grouped
        # resample back to a plain, sorted timestamp index
        df = df.reset_index().set_index('timestamp').sort_index()
        src = 'server'
    else:
        df['device'] = pc

    df_dict.setdefault(src, []).append(df)

gps\pc1.parquet
gps\pc2.parquet
gps\pc3.parquet
gps\pc4.parquet
iperf\pc1.parquet
iperf\pc2.parquet
iperf\pc3.parquet
iperf\pc4.parquet
iperf\server.parquet
ping\pc1.parquet
ping\pc2.parquet
ping\pc3.parquet
ping\pc4.parquet


## Merge data

In [5]:
# One concatenated frame per source, stacking the per-device frames
merged_dict = {}
for source_name, frames in df_dict.items():
    merged_dict[source_name] = pd.concat(frames)

In [6]:
# Intermediate frame read from the mi_intermediate folder; it is merged
# on (timestamp, device) with the other sources below
mi_file = mi_path/"mobile_insight.parquet"
mi_df = pd.read_parquet(mi_file)

In [7]:
# Kept separately; it is used later to substitute the uplink iperf values
server_df = merged_dict['server']

# 'timestamp' is the index level name, 'device' a regular column
merge_keys = ['timestamp', 'device']

# Outer joins keep every (timestamp, device) pair seen by any source
merged_df = (
    merged_dict['ping'][['device', 'ping_ms']]
    .merge(merged_dict['iperf'], how='outer', on=merge_keys)
    .merge(merged_dict['gps'], how='outer', on=merge_keys)
    .merge(mi_df, how='outer', on=merge_keys)
    .sort_index()
)
merged_df

Unnamed: 0_level_0,device,ping_ms,datarate,jitter,ts_gps,Latitude,Longitude,Altitude,speed_kmh,COG,...,SCell_Downlink_bandwidth_MHz,SCell_Uplink_bandwidth_MHz,SCell_Cell_Identity,SCell_TAC,SCell_Band_Indicator,SCell_MCC,SCell_MNC_Digit,SCell_MNC,SCell_Allowed_Access,SCell_freq_MHz
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-21 10:28:00+02:00,pc1,,,,NaT,,,,,,...,,,,,,,,,,
2021-06-21 10:28:01+02:00,pc1,,,,NaT,,,,,,...,,,,,,,,,,
2021-06-21 10:33:09+02:00,pc1,,,,NaT,,,,,,...,,,,,,,,,,
2021-06-21 10:33:10+02:00,pc1,,,,NaT,,,,,,...,,,,,,,,,,
2021-06-21 10:33:11+02:00,pc1,,,,NaT,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-24 19:09:12+02:00,pc1,,,,NaT,,,,,,...,,,,,,,,,,
2021-06-24 19:09:13+02:00,pc1,,,,NaT,,,,,,...,,,,,,,,,,
2021-06-24 19:09:14+02:00,pc1,,,,NaT,,,,,,...,,,,,,,,,,
2021-06-24 19:09:15+02:00,pc1,,,,NaT,,,,,,...,,,,,,,,,,


## Berlin protocol

In [8]:
# Layout of the "Start Timestamp" / "Stop Timestamp" columns, e.g. "22.06.2021 09:49".
# It is passed explicitly because pandas' default parsing is month-first: a
# date whose day is <= 12 (e.g. 01.07.2021) could otherwise be read with day
# and month swapped.
protocol_ts_format = "%d.%m.%Y %H:%M"

protocol = pd.read_csv(info_path/'berlin_protocol.csv')

# Parse the naive timestamps and localize them to Europe/Berlin so they can
# be compared with the timezone-aware index of the measurement data.
protocol['start_ts'] = pd.to_datetime(protocol['Start Timestamp'], format=protocol_ts_format).dt.tz_localize("Europe/Berlin")
protocol['stop_ts'] = pd.to_datetime(protocol['Stop Timestamp'], format=protocol_ts_format).dt.tz_localize("Europe/Berlin")

# Drop the underscore from the per-device columns so their names equal the
# device names used in the data ("pc1" ... "pc4").
protocol = protocol.rename(columns={c: c.replace("_","") for c in protocol.columns if "pc_" in c})
protocol

Unnamed: 0,subfolder,meas-id,Day,Date,Planned Time,Planned ID,Start Timestamp,Stop Timestamp,Drive Type,pc4,pc2,pc3,pc1,sl_4,sl_2,sl_3,sl_1,start_ts,stop_ts
0,meas-00,0,Tuesday,22.06.2021,09:00,0,22.06.2021 09:49,22.06.2021 11:08,Platoon,A3DT,A3DV,A3DV,A3DT,S1,S1,S1,S1,2021-06-22 09:49:00+02:00,2021-06-22 11:08:00+02:00
1,meas-01,1,Tuesday,22.06.2021,10:00,1,22.06.2021 11:26,22.06.2021 12:24,Platoon,A3DT,A2DV,A2DV,A3DT,S1,S1,S1,S1,2021-06-22 11:26:00+02:00,2021-06-22 12:24:00+02:00
2,meas-02,2,Tuesday,22.06.2021,13:00,3,22.06.2021 14:33,22.06.2021 15:45,Platoon,A3DT,A2DV,A2DV,A3DT,S2,S2,S2,S2,2021-06-22 14:33:00+02:00,2021-06-22 15:45:00+02:00
3,meas-03,3,Tuesday,22.06.2021,14:00,4,22.06.2021 15:56,22.06.2021 17:17,Platoon,A2DT,A3DV,A3DV,A2DT,S2,S2,S2,S2,2021-06-22 15:56:00+02:00,2021-06-22 17:17:00+02:00
4,meas-04,4,Tuesday,22.06.2021,15:00,5,22.06.2021 17:24,22.06.2021 18:14,Platoon,A2DT,A3DV,A3DV,A2DT,S1,S1,S1,S1,2021-06-22 17:24:00+02:00,2021-06-22 18:14:00+02:00
5,meas-05,5,Wednesday,23.06.2021,09:00,6,23.06.2021 09:29,23.06.2021 10:22,Platoon,A3UT,A3UV,A3UV,A3UT,S1,S1,S1,S1,2021-06-23 09:29:00+02:00,2021-06-23 10:22:00+02:00
6,meas-06,6,Wednesday,23.06.2021,10:00,7,23.06.2021 10:34,23.06.2021 11:28,Platoon,A3UT,A3UV,A3UV,A3UT,S1,S1,S1,S1,2021-06-23 10:34:00+02:00,2021-06-23 11:28:00+02:00
7,meas-07,7,Wednesday,23.06.2021,11:00,8,23.06.2021 11:30,23.06.2021 12:16,Platoon,A3UT,A3UV,A3UV,A3UT,S1,S1,S1,S1,2021-06-23 11:30:00+02:00,2021-06-23 12:16:00+02:00
8,meas-08,8,Wednesday,23.06.2021,13:00,9,23.06.2021 13:50,23.06.2021 14:49,Platoon,A2UT,A2UV,A2UV,A2UT,S2,S2,S2,S2,2021-06-23 13:50:00+02:00,2021-06-23 14:49:00+02:00
9,meas-09,9,Wednesday,23.06.2021,14:00,10,23.06.2021 14:54,23.06.2021 15:55,Platoon,A2UT,A2UV,A2UV,A2UT,S2,S2,S2,S2,2021-06-23 14:54:00+02:00,2021-06-23 15:55:00+02:00


Assign measurement and scenario based on timestamps and device

In [9]:
# Work on a copy of the merged frame; the protocol columns start out empty
# and are filled per measurement run below.
cell_protocol = merged_df.assign(measurement=None, scenario=None, drive_mode=None)
cell_index = cell_protocol.index
device_of_row = cell_protocol.device

for _, run in protocol.iterrows():
    # Rows recorded inside the half-open interval [start_ts, stop_ts) of this run
    in_run = (cell_index >= run["start_ts"]) & (cell_index < run["stop_ts"])

    cell_protocol.loc[in_run, 'measurement'] = run["meas-id"]
    cell_protocol.loc[in_run, 'drive_mode'] = run["Drive Type"].lower()
    # Every device has its own scenario within a run (A3DT, A2UV ...)
    for pc in ("pc1", "pc2", "pc3", "pc4"):
        cell_protocol.loc[in_run & (device_of_row == pc), 'scenario'] = run[pc]

# Rows without a scenario lie outside every protocol run and are discarded
cell_protocol.dropna(subset=['scenario'], inplace=True)
cell_protocol

Unnamed: 0_level_0,device,ping_ms,datarate,jitter,ts_gps,Latitude,Longitude,Altitude,speed_kmh,COG,...,SCell_Cell_Identity,SCell_TAC,SCell_Band_Indicator,SCell_MCC,SCell_MNC_Digit,SCell_MNC,SCell_Allowed_Access,SCell_freq_MHz,scenario,drive_mode
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-22 09:49:00+02:00,pc4,,,,NaT,,,,,,...,33802248.0,1494.0,7.0,262.0,2.0,1.0,0.0,2600.0,A3DT,platoon
2021-06-22 09:49:00+02:00,pc3,,,,NaT,,,,,,...,51842568.0,49101.0,3.0,262.0,2.0,2.0,0.0,1800.0,A3DV,platoon
2021-06-22 09:49:00+02:00,pc1,,,,NaT,,,,,,...,33802248.0,1494.0,7.0,262.0,2.0,1.0,0.0,2600.0,A3DT,platoon
2021-06-22 09:49:00+02:00,pc2,,,,NaT,,,,,,...,51842565.0,49101.0,7.0,262.0,2.0,2.0,0.0,2600.0,A3DV,platoon
2021-06-22 09:49:01+02:00,pc2,,,,NaT,,,,,,...,51842565.0,49101.0,7.0,262.0,2.0,2.0,0.0,2600.0,A3DV,platoon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-24 18:59:55+02:00,pc1,,178000000.0,0.000123,2021-06-24 18:59:55,52.513883,13.335023,28.0,2.7780,81.6,...,33802248.0,1494.0,7.0,262.0,2.0,1.0,0.0,2600.0,A3DT,1x2
2021-06-24 18:59:56+02:00,pc1,,178000000.0,0.000091,2021-06-24 18:59:56,52.513885,13.335027,28.0,0.7408,81.6,...,33802248.0,1494.0,7.0,262.0,2.0,1.0,0.0,2600.0,A3DT,1x2
2021-06-24 18:59:57+02:00,pc1,,165000000.0,0.000076,2021-06-24 18:59:57,52.513887,13.335030,28.1,0.0000,81.6,...,33802248.0,1494.0,7.0,262.0,2.0,1.0,0.0,2600.0,A3DT,1x2
2021-06-24 18:59:58+02:00,pc1,540.0,177000000.0,0.000040,2021-06-24 18:59:58,52.513890,13.335032,28.1,0.0000,81.6,...,33802248.0,1494.0,7.0,262.0,2.0,1.0,0.0,2600.0,A3DT,1x2


Add metadata based on scenario

In [10]:
operator_map = {"T": 1, "V": 2}

# A protocol entry such as "A3DT" is split into the scenario key ("A3D",
# first three characters) and the trailing operator letter ("T"/"V").
full_scenario = cell_protocol.scenario
scenario_operator = full_scenario.str[3:]
cell_protocol["scenario"] = full_scenario.str[:3]

# Look up the per-scenario settings in the portmap (indexed by scenario)
scenario_lookups = (
    ("target_datarate", portmap.datarate),
    ("direction", portmap.direction),
    ("measured_qos", portmap.qos_measurement),
)
for column, lookup in scenario_lookups:
    cell_protocol[column] = cell_protocol.scenario.map(lookup)

cell_protocol["operator"] = scenario_operator.map(operator_map)
cell_protocol = cell_protocol.astype({"measurement": int})

Now that we have the metadata, we replace the iperf values of the uplink rows with the values measured at the server.

In [11]:
# Bring the server-side iperf values into the row layout of cell_protocol:
# the left join keeps one row per (timestamp, device) of cell_protocol.
server_iperf = server_df[iperf_cols + ['device']]
iperf_uplink = cell_protocol[['device']].merge(server_iperf, how='left', on=['timestamp', 'device'])

# Downlink rows keep the device-side values; all other rows take the server's.
is_downlink = cell_protocol.direction == "downlink"
cell_protocol[iperf_cols] = cell_protocol[iperf_cols].where(is_downlink, iperf_uplink)

In [12]:
# Output locations: a CSV in the metadata folder and the data as parquet
meta_file = info_path/"cellular_info.csv"
data_file = data_path/"cellular_dataframe.parquet"

# write_meta is the project helper imported from utils
# NOTE(review): presumably it writes a description of the frame's columns - confirm
write_meta(cell_protocol, meta_file)
cell_protocol.to_parquet(data_file, compression="gzip")

In [13]:
# Display the final frame that was exported above (the notebook shows a truncated view)
cell_protocol

Unnamed: 0_level_0,device,ping_ms,datarate,jitter,ts_gps,Latitude,Longitude,Altitude,speed_kmh,COG,...,SCell_MNC_Digit,SCell_MNC,SCell_Allowed_Access,SCell_freq_MHz,scenario,drive_mode,target_datarate,direction,measured_qos,operator
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-06-22 09:49:00+02:00,pc4,,,,NaT,,,,,,...,2.0,1.0,0.0,2600.0,A3D,platoon,350000000,downlink,datarate,1
2021-06-22 09:49:00+02:00,pc3,,,,NaT,,,,,,...,2.0,2.0,0.0,1800.0,A3D,platoon,350000000,downlink,datarate,2
2021-06-22 09:49:00+02:00,pc1,,,,NaT,,,,,,...,2.0,1.0,0.0,2600.0,A3D,platoon,350000000,downlink,datarate,1
2021-06-22 09:49:00+02:00,pc2,,,,NaT,,,,,,...,2.0,2.0,0.0,2600.0,A3D,platoon,350000000,downlink,datarate,2
2021-06-22 09:49:01+02:00,pc2,,,,NaT,,,,,,...,2.0,2.0,0.0,2600.0,A3D,platoon,350000000,downlink,datarate,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-24 18:59:55+02:00,pc1,,178000000.0,0.000123,2021-06-24 18:59:55,52.513883,13.335023,28.0,2.7780,81.6,...,2.0,1.0,0.0,2600.0,A3D,1x2,350000000,downlink,datarate,1
2021-06-24 18:59:56+02:00,pc1,,178000000.0,0.000091,2021-06-24 18:59:56,52.513885,13.335027,28.0,0.7408,81.6,...,2.0,1.0,0.0,2600.0,A3D,1x2,350000000,downlink,datarate,1
2021-06-24 18:59:57+02:00,pc1,,165000000.0,0.000076,2021-06-24 18:59:57,52.513887,13.335030,28.1,0.0000,81.6,...,2.0,1.0,0.0,2600.0,A3D,1x2,350000000,downlink,datarate,1
2021-06-24 18:59:58+02:00,pc1,540.0,177000000.0,0.000040,2021-06-24 18:59:58,52.513890,13.335032,28.1,0.0000,81.6,...,2.0,1.0,0.0,2600.0,A3D,1x2,350000000,downlink,datarate,1
