In [44]:
# import package
import pandas as pd
import numpy as np

In [45]:
# import data
cabinet_raw = pd.read_csv("signal_data/signals_cabinet_9_26.csv")
signal_raw = pd.read_csv("signal_data/signals_9_26.csv")

### 0.data cleaning

In [46]:
# pick useful columns from two datasets, change all columns to strings
signal = signal_raw[["SIGNAL_ID", "PRIMARY_SIGNAL_ID", "CONTROL", "LOCATION_NAME", "CONTROLLER_IP"]].astype(str)
cabinet = cabinet_raw[["SIGNAL_ID", "CABINET_ID"]].astype(str)

In [47]:
signal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117 entries, 0 to 1116
Data columns (total 5 columns):
SIGNAL_ID            1117 non-null object
PRIMARY_SIGNAL_ID    1117 non-null object
CONTROL              1117 non-null object
LOCATION_NAME        1117 non-null object
CONTROLLER_IP        1117 non-null object
dtypes: object(5)
memory usage: 43.7+ KB


In [48]:
cabinet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 2 columns):
SIGNAL_ID     935 non-null object
CABINET_ID    935 non-null object
dtypes: object(2)
memory usage: 14.7+ KB


In [49]:
signal.head(5)

Unnamed: 0,SIGNAL_ID,PRIMARY_SIGNAL_ID,CONTROL,LOCATION_NAME,CONTROLLER_IP
0,2,,PRIMARY,GUADALUPE ST / LAMAR BLVD,172.16.74.52
1,3,,PRIMARY,LAMAR BLVD / 51ST ST,172.16.74.48
2,4,,PRIMARY,LAMAR BLVD / NORTH LOOP BLVD,172.16.74.44
3,5,,PRIMARY,KOENIG LN / LAMAR BLVD,172.16.74.40
4,6,,PRIMARY,LAMAR BLVD / DENSON DR,172.16.74.36


In [50]:
cabinet.head(5)

Unnamed: 0,SIGNAL_ID,CABINET_ID
0,332,1
1,742,2
2,959,3
3,288,4
4,326,5


In [51]:
# filter out secondary signals
secondary = signal.loc[signal["CONTROL"] == "SECONDARY"].copy()

# filter out primary signals
primary = signal.loc[signal["CONTROL"] == "PRIMARY"].copy()

### 1. Fill cabinet IDs for secondary signals. 

In [52]:
# slice out trailing 0
secondary["PRIMARY_SIGNAL_ID"] = secondary["PRIMARY_SIGNAL_ID"].astype(str)
secondary["PRIMARY_SIGNAL_ID"] = secondary["PRIMARY_SIGNAL_ID"].str.split('.', expand = True)[0]

In [53]:
# match cabinet ID to secondary signals using the primary 
# signal IDs of secondary signals
secondary_cab = secondary.merge(cabinet, 
                                left_on = "PRIMARY_SIGNAL_ID", 
                                right_on = "SIGNAL_ID", 
                                how = "left").copy()

In [54]:
secondary_cab.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 0 to 135
Data columns (total 7 columns):
SIGNAL_ID_x          136 non-null object
PRIMARY_SIGNAL_ID    136 non-null object
CONTROL              136 non-null object
LOCATION_NAME        136 non-null object
CONTROLLER_IP        136 non-null object
SIGNAL_ID_y          121 non-null object
CABINET_ID           121 non-null object
dtypes: object(7)
memory usage: 8.5+ KB


In [55]:
secondary_cab["SIGNAL_ID_x"].value_counts().head(5)

672    2
810    1
981    1
697    1
597    1
Name: SIGNAL_ID_x, dtype: int64

In [56]:
secondary_cab = secondary_cab.rename(columns={'SIGNAL_ID_x': 'SIGNAL_ID'})

### 2. map cabinet IDs to primary signals


In [57]:
primary_cab = primary.merge(cabinet, 
                                left_on = "SIGNAL_ID", 
                                right_on = "SIGNAL_ID", 
                                how = "left").copy()

In [58]:
primary_cab.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 988 entries, 0 to 987
Data columns (total 6 columns):
SIGNAL_ID            988 non-null object
PRIMARY_SIGNAL_ID    988 non-null object
CONTROL              988 non-null object
LOCATION_NAME        988 non-null object
CONTROLLER_IP        988 non-null object
CABINET_ID           921 non-null object
dtypes: object(6)
memory usage: 54.0+ KB


In [59]:
primary_cab["CABINET_ID"].value_counts().head(5)

740    1
78     1
552    1
737    1
385    1
Name: CABINET_ID, dtype: int64

In [60]:
# primary_cab["CABINET_ID"] = primary_cab["CABINET_ID"].astype(str)
# primary_cab["CABINET_ID"] = primary_cab["CABINET_ID"].str.split('.', expand = True)[0]

### 3. put primary and secondary signals (with cab ID) together

In [61]:
signal_cab = primary_cab.append(secondary_cab, sort=False)

In [62]:
signal_cab.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1124 entries, 0 to 135
Data columns (total 7 columns):
SIGNAL_ID            1124 non-null object
PRIMARY_SIGNAL_ID    1124 non-null object
CONTROL              1124 non-null object
LOCATION_NAME        1124 non-null object
CONTROLLER_IP        1124 non-null object
CABINET_ID           1042 non-null object
SIGNAL_ID_y          121 non-null object
dtypes: object(7)
memory usage: 70.2+ KB


In [63]:
signal_cab["SIGNAL_ID"].value_counts()

672     2
914     2
245     2
725     2
871     2
671     2
513     2
683     1
737     1
552     1
1004    1
919     1
463     1
78      1
366     1
21      1
429     1
990     1
681     1
49      1
77      1
269     1
580     1
727     1
109     1
814     1
10      1
348     1
732     1
836     1
       ..
4018    1
826     1
6       1
168     1
602     1
835     1
33      1
98      1
912     1
986     1
562     1
272     1
653     1
506     1
896     1
4079    1
35      1
656     1
524     1
558     1
961     1
4068    1
255     1
493     1
936     1
68      1
863     1
464     1
574     1
457     1
Name: SIGNAL_ID, Length: 1117, dtype: int64

In [64]:
# drop duplicate signals
signal_cab = signal_cab.drop_duplicates(subset="SIGNAL_ID")

In [65]:
signal_cab.shape

(1117, 7)

In [66]:
signal_cab["CABINET_ID"].value_counts().head(100)

79     4
767    4
741    4
776    4
701    4
34     3
600    2
869    2
549    2
64     2
773    2
83     2
114    2
891    2
511    2
282    2
832    2
45     2
445    2
142    2
110    2
434    2
430    2
673    2
905    2
51     2
738    2
419    2
816    2
472    2
      ..
489    2
504    2
807    2
283    2
835    2
41     2
53     2
903    2
35     2
805    2
197    2
133    2
889    2
737    2
21     2
878    2
739    2
70     2
796    2
663    2
748    2
293    2
733    2
582    2
632    2
722    2
890    2
84     2
825    2
392    2
Name: CABINET_ID, Length: 100, dtype: int64

In [67]:
signal_cab.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1117 entries, 0 to 135
Data columns (total 7 columns):
SIGNAL_ID            1117 non-null object
PRIMARY_SIGNAL_ID    1117 non-null object
CONTROL              1117 non-null object
LOCATION_NAME        1117 non-null object
CONTROLLER_IP        1117 non-null object
CABINET_ID           1035 non-null object
SIGNAL_ID_y          120 non-null object
dtypes: object(7)
memory usage: 69.8+ KB


### 4. save to CSV

In [68]:
signal_cab.to_csv("all_signals_with_cabinet_ID.csv")