# Feature Extraction

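Flow aggregation by time window: flows are sorted by start time, split into consecutive time windows (60 seconds by default), and aggregated per source address within each window.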

Generated features:
* NumSrcPorts
* NumDestAddr
* NumDestPorts
* NumFlows
* NumBytesSum
* NumBytesMean
* NumBytesVar
* NumPacketsSum
* NumPacketsMean
* NumPacketsVar

## Imports

In [2]:
import numpy as np
import pandas as pd
from datetime import datetime
import os
import glob
import time

import matplotlib.pyplot as plt
import itertools

import warnings

import pickle

warnings.filterwarnings("ignore")

## Functions

In [6]:
# Names of the aggregated feature columns, in output order.
features = [
    "NumSrcPorts",
    "NumDestAddr",
    "NumDestPorts",
    "NumFlows",
    "NumBytesSum",
    "NumBytesMean",
    "NumBytesVar",
    "NumPacketsSum",
    "NumPacketsMean",
    "NumPacketsVar",
]

def extract_features(input_df):
    input_df['IsBotnet'] = input_df['Label'].apply(lambda x: 1 if "flow=From-Botnet" in x else 0)
    
    input_df.loc[:, "StartTime"] = pd.to_datetime(input_df.StartTime, format='%Y/%m/%d %H:%M:%S.%f')
    input_df.sort_values(by=['StartTime'], inplace=True)
    input_df.reset_index(drop=True, inplace=True)
#     input_df.head()
    
    
    # Determine the time windows
    time_windows = [0]
    for i in range(1, len(input_df)):
        # Find the optimal time window duration
        if (input_df["StartTime"][i] - input_df["StartTime"][time_windows[-1]]).seconds > 60:
            time_windows.append(i)

    time_windows.append(len(input_df)) # Added it for easier indexing             
    
    column_keys = ["NumSrcPorts", "NumDestAddr", "NumDestPorts", "NumFlows",
                   "NumBytesSum", "NumBytesMean", "NumBytesVar",
                   "NumPacketsSum", "NumPacketsMean", "NumPacketsVar", "IsBotnet"]
    gen_df = pd.DataFrame(None, columns=column_keys)

    print(len(time_windows))
    
    rnd_id = list(range(len(time_windows)-1))
    np.random.shuffle(rnd_id)

    # Generate the features for each time window
    for i in range(len(time_windows)-1):
        current_df = input_df.iloc[time_windows[i]:time_windows[i+1],:]
        
        if not(1 in current_df['IsBotnet'].values or i in rnd_id[:len(time_windows)//10]):
            continue

        print(str(i) + ' ', end='')
        
        group = current_df.groupby("SrcAddr")

        for address, addr_df in group:
            is_botnet = addr_df['IsBotnet'].mode()[0]
            
            if (i not in rnd_id[:len(time_windows)//10] and is_botnet == 0):
                continue
                
            # TODO: Optionally source ip address can be added
#             print(address)

            num_src_ports = len(addr_df.Sport.unique())

            num_dest_addr = len(addr_df.DstAddr.unique())

            num_dest_ports = len(addr_df.Dport.unique())

            num_flows = len(addr_df)

            # TODO: SrcBytes or TotBytes?
            num_bytes_sum = np.sum(addr_df.SrcBytes)
            
            num_bytes_mean = np.mean(addr_df.SrcBytes)
            
            num_bytes_var = np.var(addr_df.SrcBytes)

            num_packets_sum = np.sum(addr_df.TotPkts)
            
            num_packets_mean = np.mean(addr_df.TotPkts)
            
            num_packets_var = np.var(addr_df.TotPkts)
            
            
            

            curr_gen_df = pd.DataFrame([[num_src_ports, num_dest_addr, num_dest_ports,num_flows,
                                         num_bytes_sum, num_bytes_mean, num_bytes_var, num_packets_sum,
                                         num_packets_mean, num_packets_var, is_botnet]], columns=column_keys)
            gen_df = gen_df.append(curr_gen_df, ignore_index=True)
            
    return gen_df


def extract_features_all(data_path):
    """Extract features for every scenario directory under ``data_path``.

    Each sub-directory of ``data_path`` is treated as one scenario whose name
    is an integer. The first ``*.binetflow`` file found in it is read as CSV,
    passed to ``extract_features``, tagged with a ``Scenario`` column and
    pickled to ``anomaly_cache2/anomaly_scenario_features_<scenario>.p``
    (relative to the current working directory).

    Args:
        data_path: Directory containing one sub-directory per scenario.
            Entries that are not directories are ignored.

    Returns:
        List with one feature DataFrame per scenario, in ``os.listdir`` order.

    Raises:
        IndexError: If a scenario directory contains no ``*.binetflow`` file.
        ValueError: If a scenario directory name is not an integer.
    """
    scenario_list = os.listdir(data_path)
    print(scenario_list)

    feat_list = []

    for scenario in scenario_list:
        scenario_path = os.path.join(data_path, scenario)

        if not os.path.isdir(scenario_path):
            continue

        print(scenario)
        flow_file_path = glob.glob(os.path.join(scenario_path, "*.binetflow"))[0]

        # Extract features for the current scenario
        input_df = pd.read_csv(flow_file_path)
        scenario_features = extract_features(input_df)

        # Append scenario label
        scenario_features.loc[:, "Scenario"] = int(scenario)

        p_filename = "anomaly_cache2/anomaly_scenario_features_"+str(scenario)+".p"
        # Make sure the cache directory exists, and close the file handle
        # deterministically instead of leaving it to the garbage collector.
        os.makedirs(os.path.dirname(p_filename), exist_ok=True)
        with open(p_filename, "wb") as cache_file:
            pickle.dump(scenario_features, cache_file)

        feat_list.append(scenario_features)

    return feat_list

## Extract Features

In [7]:
# Directory that holds one sub-directory per scenario. The components are
# passed separately so os.path.join picks the platform's separator; the
# previous "Project\ctu-13" literal relied on the invalid escape "\c" and
# only formed a valid path on Windows.
DATA_PATH = os.path.join("..", "Project", "ctu-13")

# Time the full extraction run over all scenarios.
start_time = time.time()
feat_list = extract_features_all(DATA_PATH)

# Cache the combined result; `with` closes the file once the dump is done.
with open("anomaly_cache2/anomaly_feat_list.p", "wb") as cache_file:
    pickle.dump(feat_list, cache_file)

print()
print("--- %s seconds ---" % (time.time() - start_time))

['1', '10', '11', '12', '13', '2', '3', '4', '5', '6', '7', '8', '9']
1
363
29 35 76 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 3

1 15 28 36 37 45 62 91 93 122 124 127 183 185 195 227 237 239 244 246 250 278 287 296 298 305 309 318 320 324 326 335 341 343 355 360 366 367 381 391 404 419 420 427 442 443 451 483 488 513 528 532 537 548 555 569 593 607 609 615 632 633 641 646 648 653 657 664 669 674 707 712 715 718 730 739 747 753 757 785 789 791 802 811 822 833 852 857 869 890 892 897 913 928 934 947 949 954 956 960 969 973 974 984 992 993 1003 1015 1022 1030 1035 1044 1055 1062 1064 1086 1095 1106 1110 1113 1127 1132 1133 1141 1145 1156 1171 1179 1183 1204 1217 1232 1249 1251 1277 1278 1284 1295 1299 1309 1323 1335 1338 1357 1364 1372 1373 1386 1392 1399 1411 1446 1456 1460 1468 1479 1481 1487 1490 1500 1514 1521 1522 1525 1541 1556 1564 1565 1569 1576 1582 1621 1636 1643 1650 1653 1654 1657 1685 1687 1691 1694 1704 1707 1709 1716 1717 1721 1731 1741 1745 1758 1764 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1

0 1 2 5 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 234 235 236 237 238 239 240 241 246 256 261 262 263 264 265 267 276 300 301 302 303 304 305 306 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 