# Merge RRC Serving Cell data

## Preliminaries: imports, data loading, and definitions

In [1]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import numpy as np

In [2]:
# All inputs and outputs live below a "data" directory that sits next to the
# current working directory.
data_path = Path.cwd().parent / "data"
mi_path = data_path / "sources" / "mobile_insight"  # parquet inputs are read from here
out_path = data_path / "mi_intermediate"  # results of this notebook are written here

# NOTE(review): presumably a pandas offset alias for a resampling interval;
# it is not used in the cells shown here.
res_ival = "1s"

In [3]:
# Load every per-device LTE_RRC_Serv_Cell_Info parquet file and stack them into one frame.
# Expected layout: <in_path>/<device>/...LTE_RRC_Serv_Cell_Info...parquet
in_path = mi_path
local_tz = "Europe/Berlin"
rrc_list = []

# rglob yields files in filesystem order; sorting keeps the row order reproducible.
for p in sorted(in_path.rglob("*/*LTE_RRC_Serv_Cell_Info*.parquet")):
    rel = p.relative_to(in_path)
    print(rel)

    # The first path component below in_path names the device. rglob also matches
    # files in deeper sub-directories, so do not assume exactly two components.
    pc = rel.parts[0]

    df = pd.read_parquet(p)
    if not isinstance(df.index, pd.DatetimeIndex):
        print("Adding index as tz-ed datetime")
        df = df.set_index("timestamp")
    # Decide on the index's time zone instead of an exact dtype string, so an index
    # with another datetime resolution is not mistaken for a missing index.
    if df.index.tz is None:
        df.index = df.index.tz_localize(local_tz)
    elif str(df.index.tz) != local_tz:
        df.index = df.index.tz_convert(local_tz)
    df = df.rename(columns={"Serving Cell Index": 'mi2log_count'})
    df['device'] = pc
    rrc_list.append(df)

# pd.concat raises an unspecific ValueError on an empty list; fail with a clear message.
if not rrc_list:
    raise FileNotFoundError(f"No LTE_RRC_Serv_Cell_Info parquet files found below {in_path}")
rrc_df = pd.concat(rrc_list)

pc1\LTE_RRC_Serv_Cell_Info.parquet
pc2\LTE_RRC_Serv_Cell_Info.parquet
pc3\LTE_RRC_Serv_Cell_Info.parquet
pc4\LTE_RRC_Serv_Cell_Info.parquet


## Step 1 - Preprocess RRC

1. Round the timestamps down to the minute so that, within each one-minute period, the mapping from Cell ID to Cell Identity is unique
2. Separate the Cell IDs that map to a single Cell Identity (per device) from the ambiguous ones that map to several

In [4]:
# Display the concatenated records of all devices (pandas truncates long frames).
rrc_df

Unnamed: 0_level_0,log_msg_len,type_id,Version,Cell ID,Downlink frequency,Uplink frequency,Downlink bandwidth,Uplink bandwidth,Cell Identity,TAC,Band Indicator,MCC,MNC Digit,MNC,Allowed Access,file,device
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2021-06-21 10:33:42.186928+02:00,41,LTE_RRC_Serv_Cell_Info,3,115,1300,19300,20 MHz,20 MHz,26367490,1494,3,262,2,1,0,monitor-20210621-103309,pc1
2021-06-21 10:34:10.860116+02:00,41,LTE_RRC_Serv_Cell_Info,3,115,1300,19300,20 MHz,20 MHz,26367490,1494,3,262,2,1,0,monitor-20210621-103309,pc1
2021-06-21 10:35:20.460883+02:00,41,LTE_RRC_Serv_Cell_Info,3,115,1300,19300,20 MHz,20 MHz,26367490,1494,3,262,2,1,0,monitor-20210621-103501,pc1
2021-06-21 10:35:37.606931+02:00,41,LTE_RRC_Serv_Cell_Info,3,115,1300,19300,20 MHz,20 MHz,26367490,1494,3,262,2,1,0,monitor-20210621-103501,pc1
2021-06-21 10:36:09.571953+02:00,41,LTE_RRC_Serv_Cell_Info,3,115,1300,19300,20 MHz,20 MHz,26367490,1494,3,262,2,1,0,monitor-20210621-103501,pc1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-24 10:10:46.467735+02:00,41,LTE_RRC_Serv_Cell_Info,3,116,1300,19300,20 MHz,20 MHz,26367489,1494,3,262,2,1,0,monitor-20210624-082805,pc4
2021-06-24 10:11:59.669551+02:00,41,LTE_RRC_Serv_Cell_Info,3,246,1300,19300,20 MHz,20 MHz,33802242,1494,3,262,2,1,0,monitor-20210624-082805,pc4
2021-06-24 10:12:48.647748+02:00,41,LTE_RRC_Serv_Cell_Info,3,248,1300,19300,20 MHz,20 MHz,33802241,1494,3,262,2,1,0,monitor-20210624-082805,pc4
2021-06-24 10:13:27.430625+02:00,41,LTE_RRC_Serv_Cell_Info,3,407,1300,19300,20 MHz,20 MHz,30829824,1495,3,262,2,1,0,monitor-20210624-082805,pc4


In [5]:
# Tag every record with the start of the minute its timestamp falls into.
rrc_df = rrc_df.assign(period=rrc_df.index.floor("min"))

In [6]:
# Columns that are not needed for the Cell ID -> Cell Identity mapping.
# "timestamp" is the former index, turned into a column by reset_index().
unneeded_cols = ["timestamp", "file", "log_msg_len", "type_id", "Version"]
rrc_flat = rrc_df.reset_index()
rrc_dropped = rrc_flat.drop(columns=unneeded_cols)

In [7]:
# Collect, per device, the Cell IDs that map to exactly one Cell Identity over the
# whole data set, together with their remaining (constant) columns.
cell_cols = rrc_dropped.drop(columns="period")
rrc_unique_dict = {}
for name, group in cell_cols.groupby(["device", "Cell ID"]):
    if group["Cell Identity"].nunique() > 1: # Discard CIDs with multiple assigned Cell Identities
        continue

    row = {}
    for k in group.columns:
        gk_unique = group[k].unique()
        # Every column must be constant within a (device, Cell ID) group; otherwise
        # the group cannot be collapsed into a single row.
        if len(gk_unique) > 1:
            raise ValueError(f"{k} is not unique for {name}")
        row[k] = gk_unique[0]
    rrc_unique_dict[name] = row

# Build the frame from row records rather than transposing a dict of dicts:
# transposing leaves every column with object dtype, while records keep the
# inferred dtypes. The explicit column list keeps "device" and "Cell ID"
# available even when no Cell ID qualified.
rrc_unique = pd.DataFrame(list(rrc_unique_dict.values()), columns=cell_cols.columns)
unique_cids = rrc_unique[["device", "Cell ID"]]

In [8]:
# Build a per-minute lookup table for the ambiguous Cell IDs, i.e. every
# (device, Cell ID) pair that is not listed in unique_cids.

# The set of unique pairs does not change inside the loop, so build it once;
# a tuple lookup replaces a full scan of unique_cids for every group.
unique_keys = set(zip(unique_cids["device"], unique_cids["Cell ID"]))

rrc_count = {}
rrc_lut = {}
for name, group in rrc_dropped.groupby(["device", "period", "Cell ID"]):
    dev_id, cid = name[0], name[2]
    if (dev_id, cid) in unique_keys:
        continue # Exclude unique CIDs from LUT
    # Number of distinct values per column within this (device, period, Cell ID) group.
    rrc_count[name] = group.nunique()
    row = {}
    for k in group.columns:
        gk_unique = group[k].unique()
        # Within one minute every column must be constant, otherwise the
        # lookup for this key would be ambiguous.
        if len(gk_unique) > 1:
            raise ValueError(f"{k} is not unique for {name}")
        row[k] = gk_unique[0]
    rrc_lut[name] = row

In [9]:
# rrc_lut maps (device, period, Cell ID) -> row dict. Transposing turns each key
# into one row; the key index is dropped because the rows carry those columns.
lut_rows = pd.DataFrame(rrc_lut).T
lut_rows = lut_rows.reset_index(drop=True)
# Index the table by period and order it chronologically.
rrc_lut_df = lut_rows.set_index("period").sort_index()

In [10]:
# Write both tables as parquet files into out_path. Create the directory first so
# that a fresh checkout without the output folder does not fail on the write.
out_path.mkdir(parents=True, exist_ok=True)
rrc_unique.to_parquet(out_path/"rrc_unique.parquet")
rrc_lut_df.to_parquet(out_path/"rrc_lut.parquet")

In [11]:
# Display the lookup table of ambiguous Cell IDs, indexed and sorted by period.
rrc_lut_df

Unnamed: 0_level_0,Cell ID,Downlink frequency,Uplink frequency,Downlink bandwidth,Uplink bandwidth,Cell Identity,TAC,Band Indicator,MCC,MNC Digit,MNC,Allowed Access,device
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-06-22 09:22:00+02:00,246,1300,19300,20 MHz,20 MHz,33802242,1494,3,262,2,1,0,pc4
2021-06-22 09:52:00+02:00,246,1300,19300,20 MHz,20 MHz,33802242,1494,3,262,2,1,0,pc4
2021-06-22 09:53:00+02:00,457,2850,20850,20 MHz,20 MHz,51447566,49101,7,262,2,2,0,pc3
2021-06-22 09:53:00+02:00,447,125,18125,15 MHz,15 MHz,3538707,49101,1,262,2,2,0,pc2
2021-06-22 09:55:00+02:00,294,2850,20850,20 MHz,20 MHz,4000015,49100,7,262,2,2,0,pc2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-24 18:47:00+02:00,346,1300,19300,20 MHz,20 MHz,30625794,1493,3,262,2,1,0,pc1
2021-06-24 18:51:00+02:00,414,1300,19300,20 MHz,20 MHz,26365186,1493,3,262,2,1,0,pc1
2021-06-24 18:51:00+02:00,65,125,18125,15 MHz,15 MHz,3282963,49300,1,262,2,2,0,pc3
2021-06-24 18:54:00+02:00,278,1300,19300,20 MHz,20 MHz,29127682,1494,3,262,2,1,0,pc1
