In [None]:
import glob
import dpkt
import subprocess
import socket
import pandas as pd
import re
import csv
import sys

import concurrent.futures
import math

In [None]:
# Benign Data
# glob.glob returns paths in arbitrary (filesystem-dependent) order; sorting makes
# the split of files across workers and the row order of the generated CSVs
# reproducible from one run to the next.
BEN_1 = sorted(glob.glob('../../datasets/d/Benign/p2pbox1/*'))
BEN_2 = sorted(glob.glob('../../datasets/d/Benign/p2pbox2/*'))
BEN_T = sorted(glob.glob('../../datasets/d/Benign/torrent/*'))

# NOTE(review): presumably the monitored host address of each benign capture set;
# nothing in the visible part of this notebook reads this dict.
BEN_IP = {'p2p_1':'192.168.1.2', 'p2p_2':'192.168.2.2', 'tor':'172.27.28.106'}

# print(BEN_1, BEN_2, BEN_T, BEN_IP)

In [None]:
# Botnet Data
# glob.glob returns paths in arbitrary (filesystem-dependent) order; sorting keeps
# the per-worker file split and the CSV row order reproducible between runs.
BOT_S = sorted(glob.glob('../../datasets/d/Botnet/storm/*'))
BOT_Z = sorted(glob.glob('../../datasets/d/Botnet/zeus/*'))
BOT_V = sorted(glob.glob('../../datasets/d/Botnet/vinchuca/*'))

# Addresses treated as botnet hosts when marking flows (see the IPMARK column
# produced by extr_feats_tr), one list per botnet family.
STORM_IP = '''
66.154.80.101
66.154.80.105
66.154.80.111
66.154.80.125
66.154.83.107
66.154.83.113
66.154.83.138
66.154.83.80
66.154.87.39
66.154.87.41
66.154.87.57
66.154.87.58
66.154.87.61
'''.split()
ZEUS_IP = ['10.0.2.15']
VIN_IP = ['172.27.22.206']

In [None]:
TIME_WINDOW = 3600 # 1 hr in sec

# tshark "-e" options, in the positional order parse_cap_tr indexes them
# (0 protocol, 1-2 tcp ports, 3-4 udp ports, 5 time, 6 source, 7 destination,
#  8 frame length, 9 tcp.len, 10 udp.length, 11 tcp presence marker).
TSHARK_FIELDS = " -e _ws.col.Protocol -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e _ws.col.Time -e _ws.col.Source -e _ws.col.Destination -e _ws.col.Length -e tcp.len -e udp.length -e tcp"

# Column names of one parsed packet row, in the order parse_cap_tr appends them.
# This is NOT the same layout as TSHARK_FIELDS: tcp/udp ports and lengths are
# merged, and the two grouping keys (src_dst, sip_dip) are added at the end.
fields = '''
protocol
srcport
dstport

time

source
destination

totallen
datalen

istcp

src_dst
sip_dip
'''.split()

# Column name -> position inside a parsed packet row.
fields_dict = {field: i for i, field in enumerate(fields)}

In [None]:
# Process file function
def parse_cap_tr(file_name):
    """Parse one capture file with tshark into per-time-window packet DataFrames.

    tshark is run on ``file_name`` with a display filter that keeps IPv4 TCP/UDP
    packets that are not DNS. Every output line becomes one row laid out as
    ``fields``; rows are bucketed by ``int(int(time) / TIME_WINDOW)``.

    Parameters
    ----------
    file_name : str
        Path of the capture file handed to ``tshark -r``.

    Returns
    -------
    list of pandas.DataFrame
        One frame (``columns=fields``) per window that received at least one
        packet, in ascending window order. An empty list is returned when tshark
        cannot be started, when no line could be parsed, or when building the
        frames fails. Lines that cannot be parsed are reported with an "ER1"
        message and skipped.
    """
    # Argument list instead of a shell string: the path is passed verbatim, so
    # names containing spaces or shell metacharacters neither break the command
    # nor get interpreted by a shell.
    command = [
        "tshark", "-r", file_name,
        "-Y", "ip.version==4&&(tcp||udp)&&(!dns)",
        "-T", "fields", *TSHARK_FIELDS.split(),
        "-E", "separator=|",
    ]

    try:
        proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    except OSError as e:
        # e.g. tshark is not installed / not on PATH
        print("ER0", e, file_name)
        return []

    # window index -> list of packet rows. A dict (rather than appending to a
    # list whenever index >= len) keeps packets in the right window even when
    # the capture has a gap longer than TIME_WINDOW.
    windows = {}

    # The context manager closes the pipe and waits for tshark on exit.
    with proc:
        for packet_no, raw_line in enumerate(proc.stdout):
            # Progress indicator, throttled: one print per packet dominates runtime.
            if packet_no % 1000 == 0:
                print(f'\rPacket: {packet_no}', end='')

            line = raw_line.decode('utf-8', errors='replace').strip()
            splits = line.split('|')

            try:
                protocol = splits[0].strip()

                # The last tshark field ("tcp") is non-empty only for TCP packets.
                is_tcp = splits[11].strip() != ''

                if is_tcp:
                    srcport = int(splits[1].strip())
                    dstport = int(splits[2].strip())
                else:
                    srcport = int(splits[3].strip())
                    dstport = int(splits[4].strip())

                timestamp = float(splits[5].strip())
                source, destination = splits[6], splits[7]
                totallen = int(splits[8].strip())

                # tcp.len for TCP, udp.length for UDP; a missing/blank value counts as 0.
                # NOTE(review): udp.length presumably includes the UDP header while
                # tcp.len is payload only - confirm this asymmetry is intended.
                try:
                    datalen = int(splits[9 if is_tcp else 10].strip())
                except ValueError:
                    datalen = 0

                # Direction-independent keys used later for grouping into flows.
                src_dst = min(source, destination) + '__' + max(source, destination)
                sip_dip = str(min(srcport, dstport)) + '__' + str(max(srcport, dstport))

                row = [protocol, srcport, dstport, timestamp, source, destination,
                       totallen, datalen, 1 if is_tcp else 0, src_dst, sip_dip]

                index = int(int(timestamp) / TIME_WINDOW)
                windows.setdefault(index, []).append(row)

            except Exception as e:
                print("ER1", e, line, splits)

    print('\r')

    try:
        return [pd.DataFrame(windows[index], columns=fields) for index in sorted(windows)]
    except Exception as e:
        print("ER2", e, fields)
        # Always a list, so callers can iterate or truth-test the result safely.
        return []

In [None]:
# Feature extraction function (returns a list of 24-element rows: 18 features used in classification, 5 flow-detail fields and the botnet IP mark)
def extr_feats_tr(df, file_c, min_duration=900):
    """Compute per-flow feature rows for one time window of parsed packets.

    Packets are grouped into flows by (src_dst, protocol, sip_dip). "Forward"
    packets are those whose ``source`` equals the first address in ``src_dst``;
    all others are "backward". Flows with fewer than two packets in either
    direction, or whose first-to-last packet span is below ``min_duration``,
    are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Packets of one window; must provide the columns time, source, totallen,
        datalen, protocol, src_dst and sip_dip.
    file_c : str
        Dataset code. 'z', 's' and 'v' select ZEUS_IP, STORM_IP and VIN_IP as the
        botnet address list; any other value means no address is marked.
    min_duration : float, optional
        Minimum flow duration in seconds (default 900, i.e. a quarter of the
        one-hour window).

    Returns
    -------
    list of list
        One 24-slot row per retained flow:
        0  mean gap between consecutive packets
        1  forward packet count          2  backward packet count
        3  forward datalen sum           4  backward datalen sum
        5  sum of totallen               6  min totallen       7  max totallen
        8  max gap                       9  min non-zero gap
        10 duration (sum of gaps)        11 duration / (packets - 1)
        12 forward gap sum / forward packet count
        13 backward gap sum / backward packet count
        14 max forward gap               15 min non-zero forward gap
        16 max backward gap              17 min non-zero backward gap
        18 first address                 19 second address
        20 lower port                    21 higher port       22 protocol
        23 botnet mark: 0 first address only, 1 second address only, 2 both,
           -1 neither
        A value of 3601.0 in slots 8, 9, 12-17 marks "not available".
    """
    if df is None or len(df) == 0:
        return []

    # Resolved lazily so that only the list for the requested dataset is touched.
    if file_c == 'z':
        iplist = ZEUS_IP
    elif file_c == 's':
        iplist = STORM_IP
    elif file_c == 'v':
        iplist = VIN_IP
    else:
        iplist = []
    bot_ips = set(iplist)

    features = []

    for key, group in df.groupby(['src_dst', 'protocol', 'sip_dip']):

        f = [0] * 24

        f[0] = 0.0
        f[6] = 9223372036854775807          # running minimum starts at "infinity"
        f[14] = f[16] = 0.0
        f[9] = f[15] = f[17] = 3601.0       # running minima start above any in-window gap

        tmp_src = key[0].split('__')[0]
        tmp_dst = key[0].split('__')[1]

        # Last slot is the botnet mark:
        #   0 -> only tmp_src is a botnet address
        #   1 -> only tmp_dst is a botnet address
        #   2 -> both are botnet addresses
        #  -1 -> neither
        src_is_bot = tmp_src in bot_ips
        dst_is_bot = tmp_dst in bot_ips
        if src_is_bot and dst_is_bot:
            f[23] = 2
        elif src_is_bot:
            f[23] = 0
        elif dst_is_bot:
            f[23] = 1
        else:
            f[23] = -1

        # Flow details, not used in classification (only in the detection model's output).
        f[18] = tmp_src
        f[19] = tmp_dst
        f[20] = key[2].split('__')[0]
        f[21] = key[2].split('__')[1]
        f[22] = key[1]

        time_last = time_last_f = time_last_b = None
        n_packets = f_count = b_count = 0

        packets = group[['time', 'source', 'totallen', 'datalen']]
        for ts, source, totallen, datalen in packets.itertuples(index=False, name=None):
            n_packets += 1

            if time_last is None:
                time_last = ts
            t_diff = ts - time_last
            time_last = ts

            # Sum of gaps; slot 0 is turned into a mean after the loop,
            # slot 10 stays as the flow duration.
            f[0] += t_diff
            f[10] += t_diff

            f[5] += totallen
            f[6] = min(f[6], totallen)
            f[7] = max(f[7], totallen)

            f[8] = max(f[8], t_diff)
            if t_diff != 0.0:
                f[9] = min(f[9], t_diff)

            if source == tmp_src:
                f[1] += 1
                f[3] += datalen

                if time_last_f is None:
                    time_last_f = ts
                t_diff_f = ts - time_last_f
                # Advance the reference so the next gap is measured from this
                # packet, not from the first forward packet of the flow.
                time_last_f = ts

                f[12] += t_diff_f
                f_count += 1

                f[14] = max(f[14], t_diff_f)
                if t_diff_f != 0.0:
                    f[15] = min(f[15], t_diff_f)
            else:
                f[2] += 1
                f[4] += datalen

                if time_last_b is None:
                    time_last_b = ts
                t_diff_b = ts - time_last_b
                # Same as above, for the backward direction.
                time_last_b = ts

                f[13] += t_diff_b
                b_count += 1

                f[16] = max(f[16], t_diff_b)
                if t_diff_b != 0.0:
                    f[17] = min(f[17], t_diff_b)

        # Keep only bidirectional flows (at least two packets each way) that span
        # at least min_duration seconds.
        if f_count <= 1 or b_count <= 1 or f[10] < min_duration:
            continue

        # At least four packets remain here, so there is at least one gap.
        n_gaps = n_packets - 1
        f[0] = f[0] / n_gaps
        f[11] = f[10] / n_gaps

        # NOTE(review): these divide by the packet count, whereas the overall
        # means above divide by the gap count - confirm which is intended.
        f[12] /= f_count
        f[13] /= b_count

        # 3601.0 marks a feature as unavailable (e.g. every packet of one
        # direction carried the same timestamp, leaving a zero gap sum).
        for slot in [8, 9, 12, 13, 14, 15, 16, 17]:
            if f[slot] == 0:
                f[slot] = 3601.0

        features.append(f)

    return features

In [None]:
# Get features util
def get_feats(dfs, file_c):
    """Run extr_feats_tr on every window frame and concatenate the resulting rows.

    dfs is an iterable of per-window packet frames, file_c the dataset code
    forwarded to extr_feats_tr. Returns a flat list of feature rows.
    """
    collected = []
    for window_df in dfs:
        window_rows = extr_feats_tr(window_df, file_c)
        try:
            # Windows that produced no flow rows contribute nothing.
            if window_rows and len(window_rows) > 0:
                collected += window_rows
        except Exception as err:
            print("Caught exception while adding feats:", err)
    return collected

In [None]:
def build_csv(files, file_c):
    """Parse each capture file and return the feature rows of all of them.

    Parameters
    ----------
    files : iterable of str
        Capture file paths, processed one after another.
    file_c : str
        Dataset code forwarded to get_feats.

    Returns
    -------
    list
        Feature rows of every file, concatenated in input order. Files for
        which parsing produced no windows contribute nothing.
    """
    ans = []

    for file in files:
        print("PARSING "+file)
        dfs = parse_cap_tr(file)
        # Only a non-empty list of window frames is usable. A bare truth test
        # (`if dfs:`) raises ValueError if the parser hands back a DataFrame,
        # which is what its error path can return.
        if isinstance(dfs, list) and dfs:
            print("EXTRACTING FEATURES...")
            feats = get_feats(dfs, file_c)
            ans.extend(feats)

    print('EXTRACTED FEATURES!')
    return ans

In [None]:
def build_csv_parallel(files, csv_name, file_c, max_workers=16):
    """Extract features from ``files`` in parallel and write them to ``csv_name``.

    The file list is cut into ``max_workers`` contiguous slices; every non-empty
    slice is handed to build_csv in a worker process. The header line and all
    returned rows are then written in one pass.

    Parameters
    ----------
    files : list of str
        Capture file paths.
    csv_name : str
        Output CSV path. Missing parent directories are created.
    file_c : str
        Dataset code forwarded to build_csv.
    max_workers : int, optional
        Number of slices / worker processes (default 16).

    Notes
    -----
    The CSV is written only after all workers have finished, so a failure while
    processing does not leave a header-only file behind. Header and rows all end
    with '\\n'.
    """
    import os

    n_files = len(files)
    # Integer arithmetic gives the same boundaries as floor(i * n / workers).
    chunks = [
        files[i * n_files // max_workers:(i + 1) * n_files // max_workers]
        for i in range(max_workers)
    ]
    # With fewer files than workers some slices are empty; don't spawn work for them.
    chunks = [chunk for chunk in chunks if chunk]

    ans = []

    if chunks:
        with concurrent.futures.ProcessPoolExecutor(max_workers=len(chunks)) as e:
            futures = [e.submit(build_csv, chunk, file_c) for chunk in chunks]
            # Collect in submission order so row order follows file order.
            for future in futures:
                rows = future.result()
                if rows:
                    ans.extend(rows)

    out_dir = os.path.dirname(csv_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # newline='' is what the csv module expects of file objects it writes to;
    # lineterminator='\n' keeps the rows consistent with the header line.
    with open(csv_name, 'w', newline='') as f:
        f.write('F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,TMP_SRC_IP,TMP_DST_IP,SRC_P,DST_P,PROTO,IPMARK\n')
        writer = csv.writer(f, lineterminator='\n')
        writer.writerows(ans)
    print("DONE", end = '\n\n')

In [None]:
def util(x):
    """Build the feature CSV of the dataset selected by the one-character code x.

    Codes: 'v', 'z', 's' for the botnet file lists BOT_V, BOT_Z, BOT_S and
    '1', '2', 't' for the benign file lists BEN_1, BEN_2, BEN_T. Any other
    value does nothing.
    """
    datasets = (
        ('v', BOT_V, './csv_generated/bot_v.csv'),
        ('z', BOT_Z, './csv_generated/bot_z.csv'),
        ('s', BOT_S, './csv_generated/bot_s.csv'),
        ('1', BEN_1, './csv_generated/ben_1.csv'),
        ('2', BEN_2, './csv_generated/ben_2.csv'),
        ('t', BEN_T, './csv_generated/ben_t.csv'),
    )
    for code, capture_files, csv_path in datasets:
        if x == code:
            build_csv_parallel(capture_files, csv_path, x)
            return

In [None]:
# Generate the feature CSV of every dataset: the three botnet sets first
# (vinchuca, zeus, storm), then the benign ones (torrent, p2pbox1, p2pbox2).
for dataset_code in ('v', 'z', 's', 't', '1', '2'):
    util(dataset_code)