In [3]:
import json
import pandas as pd

import machine_learning.model as model
from resources.firewall_rule import FirewallRule

In [6]:
# Data processing / Feature engineering

def pre_process_firewall_data(resource_data_json,
                              selected_features):
    """Build an integer-encoded feature table from firewall resource data.

    Args:
        resource_data_json (list): A list of resource data in json format.
        selected_features (list): A list of selected features, if the
            list is empty (or None), we will include all the features.
            Names that are not columns of the data are ignored, and
            duplicated names are kept only once.

    Returns:
        DataFrame: DataFrame table with the requested resource_data
            columns, in the order they were requested (or in the data's
            own column order when every feature is included). The known
            categorical columns are replaced by their integer category
            codes, where -1 marks a missing value; any other column is
            returned unchanged.
    """
    # Columns whose raw values are replaced by integer category codes.
    categorical_columns = (
        'creation_timestamp',
        'direction',
        'action',
        'disabled',
        'ip_protocol',
        'ports',
        'source_ip_addr',
        'dest_ip_addr',
        'service_account',
        'tag',
        'full_name',
        'network',
    )

    df = pd.DataFrame(resource_data_json)
    existing_columns = set(df.columns)

    if selected_features:
        # Keep the caller's ordering (a plain set intersection would make
        # the column order arbitrary) and skip unknown/duplicate names.
        new_columns = []
        for feature in selected_features:
            if feature in existing_columns and feature not in new_columns:
                new_columns.append(feature)
    else:
        # Empty selection means "use every available feature".
        new_columns = list(df.columns)

    # Work on a copy of the selection so the assignments below never write
    # into a view of the original frame.
    df = df[new_columns].copy()

    # Encode each categorical column exactly once, and only when it is
    # actually present, so a missing column no longer raises KeyError.
    for column in new_columns:
        if column in categorical_columns:
            df[column] = df[column].astype('category').cat.codes

    return df

# Read the raw firewall rule dump; the file is only needed for parsing.
with open('sample_datasets/dataset_firewall.json') as firewall_dataset:
    firewall_rules = json.load(firewall_dataset)

# Expand the raw rules into flattened FirewallRule objects, then into plain
# dicts that pandas can turn into rows.
flattened_firewall_rules = FirewallRule.flatten_firewall_rules(firewall_rules)
flattened_firewall_rules_dict = [
    rule.to_dict() for rule in flattened_firewall_rules
]

# Features kept for the models below.
selected_features = [
    'creation_timestamp',
    'source_ip_addr',
    'dest_ip_addr',
    'service_account',
    'tag',
    'full_name',
    'action',
    'ip_protocol',
    'ports',
    'direction',
    'disabled',
    'network',
]

df_filtered = pre_process_firewall_data(flattened_firewall_rules_dict,
                                        selected_features)

# Show the first encoded rule as a quick sanity check.
print(df_filtered.iloc[0])


'NoneType' object is not iterable
{u'policy_update_counter': 0, u'display_name': u'', u'name': u'1752190369480707101', u'data': u'{"allowed": [{"IPProtocol": "tcp"}], "creationTimestamp": "2017-07-19T12:28:50.295-07:00", "description": "", "direction": "INGRESS", "disabled": false, "id": "1752190369480707101", "kind": "compute#firewall", "name": "fw-tag-match", "network": "https://www.googleapis.com/compute/beta/projects/iap-1-174217/global/networks/default", "priority": 1000, "selfLink": "https://www.googleapis.com/compute/beta/projects/iap-1-174217/global/firewalls/fw-tag-match", "sourceTags": ["tag-match"], "targetTags": ["othertag", "sometag"]}', u'type_name': u'firewall/1752190369480707101', u'parent_type_name': u'project/iap-1-174217', u'full_name': u'organization/660570133860/project/iap-1-174217/firewall/1752190369480707101/', u'type': u'firewall', u'email': u''}
'NoneType' object is not iterable
{u'policy_update_counter': 0, u'display_name': u'', u'name': u'8719472173724601402

NameError: global name 'pd' is not defined

In [None]:
# PCA Visualization
#
# Reduce the encoded firewall feature table (df_filtered, built in the data
# processing cell) with model.dimensionality_reduce and plot the result.
# The second argument is presumably the number of target dimensions --
# confirm against machine_learning.model.
# NOTE(review): `pac_2` looks like a typo for `pca_2`; it is left unchanged
# here because other cells may refer to this name.

pac_2 = model.dimensionality_reduce(df_filtered, 2)

# Draw the reduced data with the project's 2D plotting helper.
model.visualize_2d(pac_2)

In [None]:
# K means
#
# Cluster the reduced data produced by the PCA cell. The original code
# referenced `reduced_data`, which is never defined in this notebook and
# raises NameError on a fresh kernel; `pac_2` is the output of
# model.dimensionality_reduce above.
# NOTE(review): confirm that the reduced data (and not df_filtered) is the
# intended clustering input.

m = model.k_means(pac_2, 3, max_iter=100, seed=0)

print(m)