# Sankey diagram of IoT devices' communication traffic
- This program analyses the traffic of IoT devices using a Sankey diagram.

In [1]:
import pandas as pd
import numpy as np
import os
import plotly.graph_objects as go

## Functions for creating a Y label

In [2]:
def findexact(lst, key):
    """Return the position of the first element of ``lst`` equal to ``key``.

    Elements are compared with ``key == element`` in list order. When no
    element compares equal (including for an empty list) the result is
    ``None``.
    """
    hits = (position for position, candidate in enumerate(lst) if key == candidate)
    return next(hits, None)
            
def Y_label(IoT_df1, df2):
    """Label each traffic record with the index of the known device it involves.

    Parameters
    ----------
    IoT_df1 : pandas.DataFrame
        IoT traffic table; the ``macSrc`` and ``macDst`` columns are read.
    df2 : pandas.DataFrame
        Device list; the ``macAddress`` column is read and surrounding
        spaces are stripped from every entry before matching.

    Returns
    -------
    list of int
        For every record whose source MAC (checked first) or destination MAC
        is in the device list, the position of that MAC in the device list.
        Records matching neither print ``null`` and contribute no entry, so
        the result can be shorter than ``IoT_df1``.
    """
    macADD_list = [x.strip(' ') for x in df2["macAddress"].tolist()]
    print('macADD_list = ',macADD_list)

    # Build the MAC -> index lookup once instead of scanning the list twice
    # per record. setdefault keeps the first position of a repeated MAC,
    # which is what a left-to-right scan would return.
    mac_to_index = {}
    for idx, mac in enumerate(macADD_list):
        mac_to_index.setdefault(mac, idx)

    y_label = []
    # Iterate the two columns directly; positional row access per record is
    # far slower on large captures.
    for mac_src, mac_dst in zip(IoT_df1['macSrc'], IoT_df1['macDst']):
        device = mac_to_index.get(mac_src)
        if device is None:
            device = mac_to_index.get(mac_dst)
        if device is None:
            print('null')
        else:
            y_label.append(device)

    return y_label

## genSankey function

In [3]:
def genSankey(df, cat_cols=None, value_cols='', title='Sankey Diagram'):
    """Build a plotly-compatible Sankey figure dict from a flat table.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table holding the categorical columns and the value column.
    cat_cols : sequence of str
        Column names ordered from the left-most to the right-most level of
        the diagram. Each adjacent pair of columns yields one set of links.
        At least two columns are required.
    value_cols : str
        Name of the numeric column that is summed to obtain link widths.
    title : str
        Accepted for compatibility; the layout does not currently set a
        title, so this value is not used.

    Returns
    -------
    dict
        ``{'data': [sankey_trace], 'layout': layout}``. Node labels are
        listed in order of first appearance, level by level; link ``source``,
        ``target`` and ``value`` are pandas Series.

    Raises
    ------
    ValueError
        If fewer than two categorical columns are given.
    """
    cat_cols = [] if cat_cols is None else list(cat_cols)
    if len(cat_cols) < 2:
        raise ValueError(
            'genSankey needs at least two columns in cat_cols to form source-target links'
        )

    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']

    # Collect node labels and their colours together so both lists always
    # have the same length. A label occurring in several columns keeps the
    # colour of the first level it appears in. The palette is cycled so more
    # levels than colours does not raise.
    label_index = {}
    colorList = []
    for level, catCol in enumerate(cat_cols):
        level_color = colorPalette[level % len(colorPalette)]
        # dict.fromkeys de-duplicates while keeping first-appearance order,
        # which makes the node order reproducible between runs.
        for label in dict.fromkeys(df[catCol].tolist()):
            if label not in label_index:
                label_index[label] = len(label_index)
                colorList.append(level_color)
    labelList = list(label_index)

    # Stack every adjacent (source, target, value) column triple, then
    # aggregate once over the whole stack.
    pair_frames = []
    for src_col, dst_col in zip(cat_cols, cat_cols[1:]):
        pairs = df[[src_col, dst_col, value_cols]].copy()
        pairs.columns = ['source','target','count']
        pair_frames.append(pairs)
    sourceTargetDf = (
        pd.concat(pair_frames)
        .groupby(['source','target'])
        .agg({'count':'sum'})
        .reset_index()
    )

    # Translate labels to node positions with the prebuilt lookup.
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].map(label_index)
    sourceTargetDf['targetID'] = sourceTargetDf['target'].map(label_index)

    # Sankey trace definition.
    data = dict(
        type='sankey',
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = colorList
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['count']
        )
      )

    # Only the font size is set; no title is placed in the layout.
    layout =  dict(
        font = dict(
          size = 20
        )
    )

    fig = dict(data=[data], layout=layout)
    return fig


In [5]:
def getDF(file_dir, headernames):
    """Load the ``*model0_train_paper.csv`` file found under ``file_dir``.

    The directory tree is walked recursively. If several files match, the
    last one visited is the one loaded.

    Parameters
    ----------
    file_dir : str
        Root directory to search.
    headernames : list of str
        Column names assigned to the loaded table (the CSV is read without
        a header row).

    Returns
    -------
    pandas.DataFrame
        Contents of the matching CSV with ``headernames`` as columns.

    Raises
    ------
    FileNotFoundError
        If no file ending in ``model0_train_paper.csv`` exists under
        ``file_dir``.
    """
    entry = None
    for root, _dirs, files in os.walk(file_dir):
        for file in files:
            if file.endswith("model0_train_paper.csv"):
                entry = os.path.join(root, file)

    if entry is None:
        raise FileNotFoundError(
            "no file ending in 'model0_train_paper.csv' found under %r" % (file_dir,)
        )

    # Parse only the file that is actually returned.
    IoT_df2 = pd.read_csv(entry,header=None)
    IoT_df2.columns=headernames

    return IoT_df2
            
                        

## Load the CSV file of the IoT dataset

In [6]:
# Root directory searched by getDF() for the traffic CSV.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_dir = '/Users/kalika/Documents/CPS-IoT_security/TOR_tawan/database/Tawan_device_list/dataset_model_0/' 

In [7]:
# Column names applied, in order, to the header-less traffic CSV.
headernames = [
    'ipSrc',
    'ipDst',
    'macSrc',
    'macDst',
    'portSrc',
    'portDst',
    'pktLength',
    'deviceName',
    'protocol',
    'detail',
]


In [8]:
# Device list (device name, MAC address), read without a header row.
# The path is derived from file_dir rather than repeating the absolute
# directory, so the dataset location is defined in one place only.
list_name = os.path.join(file_dir, 'Device_list_tw.csv')
device_name = pd.read_csv(list_name,header=None)

In [9]:

# Name the two device-list columns; Y_label() reads 'macAddress' and the
# label list is taken from 'device'.
device_name.columns = ['device','macAddress']

In [11]:
# Load the traffic table, map each record to a device index, and collect the
# device names.
IoT_df2 = getDF(file_dir,headernames)
y_label = Y_label(IoT_df2,device_name)
Label = device_name['device'].values.tolist()

# NOTE(review): the line below matches what Y_label() prints and looks like
# captured cell output rather than intended code — confirm before relying on it.
macADD_list =  ['cc:50:e3:da:00:7f', 'cc:50:e3:da:00:3f', '60:01:94:ac:93:31', '60:01:94:ac:8f:fd', '60:01:94:74:22:a6', 'cc:50:e3:00:68:c8', '84:f3:eb:3d:fa:f5', 'ac:84:c6:21:07:3e', 'b0:4e:26:ae:47:e5', '50:c7:bf:8d:87:b6', 'ac:84:c6:bf:fc:a5', '00:17:88:b2:6b:0c', '18:b4:30:8f:88:a8', '54:e5:bd:8c:5c:5e', '4c:ef:c0:a9:b1:c1', '68:9a:87:31:d8:15', '4c:17:44:df:1f:b6', '44:65:0d:56:cc:d3']


## Create source Sankey

In [None]:
# Keep only the records whose deviceName equals 6, then report the size of
# the selection next to the size of the full table.
is_selected_device = IoT_df2['deviceName'] == 6
df = IoT_df2[is_selected_device]
print(df.shape)
print(IoT_df2.shape)

In [None]:
# Pull the device id, destination MAC and packet length columns out of the
# filtered table as plain Python lists.
source, target, value = (
    df[column].values.tolist()
    for column in ('deviceName', 'macDst', 'pktLength')
)

In [None]:
# Count how often each distinct entry of `target` occurs. Despite the
# "srcPort" naming, `target` holds the macDst values of the filtered table.
srcPort_list, srcPort_count  = np.unique(target,return_counts=True)
print(srcPort_list)
print(srcPort_list.shape)
print(srcPort_count)
src_port = dict(zip(srcPort_list, srcPort_count))
src_port_df1 = pd.DataFrame({'srcPort': srcPort_list,'count': srcPort_count})

# Keep only the entries seen more than 100 times.
Device_srcport = src_port_df1['count'] > 100
# Fix: the mask was previously referenced through the undefined name
# `Amazon_srcport`, which raised NameError.
Device_src = src_port_df1[Device_srcport]
Device_src.shape


In [None]:
# One source label per retained destination row, so every link starts at the
# same node. Fix: the row count was previously read from the undefined name
# `Amazon_src` (NameError); the filtered table is `Device_src`.
source = ['Nest IQ Cam'] * Device_src.shape[0]

In [None]:
# Link targets are the retained destination entries; link values are their
# occurrence counts.
target = Device_src['srcPort'].values.tolist()
value = Device_src['count'].values.tolist()

In [None]:
# Assemble the three parallel lists into the source/target/count table that
# genSankey() consumes, then display it.
df = pd.DataFrame(dict(source=source, target=target, count=value))
df

In [None]:
# Build the Sankey figure dict from the source/target/count table (the title
# argument is left at its default) and render it with plotly.
fig = genSankey(df,cat_cols=['source','target'],value_cols='count')
fig2 = go.Figure(fig)
fig2.show()

