In [1]:
import numpy as np
import pandas as pd
from pgmpy.factors.discrete import DiscreteFactor
from pgmpy.models import FactorGraph
from pgmpy.inference import BeliefPropagation

# Task 2 

## Task 2.1

In [2]:
# Task 2.1: two-variable factor graph with hidden state S1 and observed event E1.
G = FactorGraph()

# f_1 is the unary factor on S1; f_2 is the pairwise factor on (S1, E1).
f_1 = DiscreteFactor(variables=['S1'], cardinality=[2], values=[0.85, 0.15])
f_2 = DiscreteFactor(
    variables=['S1', 'E1'],
    cardinality=[2, 2],
    values=[[0.1, 0.2],
            [0, 0.5]],  # rows follow S1, columns follow E1
)

# Register the random variables and the factors with the graph.
G.add_nodes_from(['S1', 'E1'])
G.add_factors(f_1, f_2)

# Connect every factor to each variable in its scope:
# (S1, f_1), (S1, f_2), (E1, f_2).
G.add_edges_from([(variable, factor)
                  for factor in (f_1, f_2)
                  for variable in factor.scope()])

## Task 2.2: Marginal Probability of S1

In [3]:
# Task 2.2: marginal of S1 with no evidence.
bp = BeliefPropagation(G)

margin = bp.query(variables=['S1'])
margin.normalize(inplace=True)  # rescale in place so the entries sum to 1
print(margin)

Eliminating: E1: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 247.47it/s]

+-------+-----------+
| S1    |   phi(S1) |
| S1(0) |    0.7727 |
+-------+-----------+
| S1(1) |    0.2273 |
+-------+-----------+





## Task 2.3: Value of S1 that maximizes its probability

In [4]:
# MAP query: the single most probable assignment of S1 when no evidence is given.
bp.map_query(variables=['S1'])

Eliminating: E1: 100%|██████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 334.47it/s]


{'S1': 0}

### The most probable state for S1 is 0

## Task 2.4: P(S1) when E1 = 1 is given

In [5]:
# Task 2.4: distribution of S1 after observing E1 = 1.
margin = bp.query(
    variables=['S1'],
    evidence={'E1': 1},
)
margin.normalize(inplace=True)  # rescale in place so the entries sum to 1
print(margin)

0it [00:00, ?it/s]

+-------+-----------+
| S1    |   phi(S1) |
| S1(0) |    0.6939 |
+-------+-----------+
| S1(1) |    0.3061 |
+-------+-----------+





## Task 2.5 Most probable state for S1 when E1=1

In [6]:
# MAP query: the most probable assignment of S1 given the observation E1 = 1.
bp.map_query(variables=['S1'], evidence = {'E1': 1})

0it [00:00, ?it/s]


{'S1': 0}

### The most probable state for S1 when E1 = 1 is: 0

### Compare above results with hand calculations

# Task 3

In [7]:
# 1-based integer codes for the observable event types, in declaration order.
ATTACK_EVENTS_MAP = {
    name: code
    for code, name in enumerate(
        ['Scan', 'Login', 'Sensitive_URI', 'New_Kernel_Module', 'DNS_Tunneling'],
        start=1,
    )
}

# 1-based integer codes for the attack states, in declaration order.
ATTACK_STATES_MAP = {
    name: code
    for code, name in enumerate(
        [
            'benign',
            'discovery',
            'access',
            'lateral_movement',
            'privilege_escalation',
            'persistence',
            'defense_evasion',
            'collection',
            'exfiltration',
            'command_control',
            'execution',
        ],
        start=1,
    )
}

### Task 3.0 

In [114]:
# Task 3.0: estimate the event/state factor table f from the reviewed events.
# f[s, e] counts the lines that mention both state s and event e, using
# 0-based indices (code - 1) into ATTACK_STATES_MAP / ATTACK_EVENTS_MAP.
# The context manager closes the file as soon as it has been read.
with open('data/event_review.txt', 'r') as event_review:
    Lines = event_review.readlines()

f = np.zeros([len(ATTACK_STATES_MAP), len(ATTACK_EVENTS_MAP)])

# NOTE(review): events and states are matched by substring, so a name that
# happens to occur inside another token of the line would also be counted --
# confirm against the file format.
for line in Lines:
    for event, event_num in ATTACK_EVENTS_MAP.items():
        if event not in line:
            continue
        for state, state_num in ATTACK_STATES_MAP.items():
            if state in line:
                f[state_num-1, event_num-1] += 1
print("Factor function values are:\n", f)

# Normalize each column (one event type) so that it sums to 1.
# The counts are non-negative, so the column sum equals the L1 norm.
# Columns whose total is 0 (an event never seen in the review file) are
# left as all zeros instead of turning into NaN through 0/0.
column_totals = f.sum(axis=0)
np.divide(f, column_totals, out=f, where=column_totals != 0)

print("\nFactor functions (in terms of probability):\n", f)

Factor function values are:
 [[468. 100. 166. 175.   2.]
 [ 32.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]
 [  0.   0. 134.   0.   0.]
 [  0.   0.   0.  25.   0.]
 [  0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.  98.]
 [  0.   0.   0.   0.   0.]
 [  0.   0.   0.   0.   0.]]

Factor functions (in terms of probability):
 [[0.936      1.         0.55333333 0.875      0.02      ]
 [0.064      0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.44666667 0.         0.        ]
 [0.         0.         0.         0.125      0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.98      ]
 [0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0

### Task 3.1

In [9]:
# Task 3.1: count every window of three consecutive events in the recorded
# attack sequences, once over all windows ("commonness") and once restricted
# to windows whose three events are identical ("repetitiveness").
# The context manager closes the file as soon as it has been read.
with open('data/attack_sequences.txt', 'r') as attack_sequences:
    Lines = attack_sequences.readlines()

table = {} # Counts every three-event window, keyed by the concatenated event codes
table_rep = {} # Counts only the windows made of one repeated event

for line in Lines: # Extract one attack sequence
    line = line.split(" ")
    # NOTE(review): the last token is discarded unconditionally; this assumes
    # every line ends with a separator followed by the newline -- confirm
    # against the data file, otherwise the final event of a line is lost.
    line.pop()
    # Encode the sequence with the integer event codes for easier processing
    seq = [ATTACK_EVENTS_MAP[key] for key in line]

    # Sliding window of width three over the encoded sequence
    for i in range(2, len(seq)):
        a, b, c = str(seq[i-2]), str(seq[i-1]), str(seq[i])
        pattern = a + b + c

        # Repetitiveness: all three events in the window are the same
        if a == b and b == c:
            table_rep[pattern] = table_rep.get(pattern, 0) + 1

        # Commonness: every window is counted
        table[pattern] = table.get(pattern, 0) + 1

print("The number of counts observed:\n")
print("1: Commonness")

# Show the raw counts, then turn them into relative frequencies sorted in descending order
common = pd.DataFrame(list(table.items()), columns = ['Combination', 'Number'])
print(common.set_index('Combination'))
common['Number'] = common['Number'] / common['Number'].sum()
common = common.sort_values(['Number'], ascending = False)

print("2: Repetiveness")
# Same treatment for the repetitive windows
common_rep = pd.DataFrame(list(table_rep.items()), columns = ['Combination', 'Number'])
print(common_rep.set_index('Combination'))
common_rep['Number'] = common_rep['Number'] / common_rep['Number'].sum()
common_rep = common_rep.sort_values(['Number'], ascending = False)

# Sorting keeps the original index labels, so the top-ranked row has to be
# taken by position (.iloc[0]); .loc[0] would return whichever combination
# happened to be inserted first.
top_common = common.iloc[0]
top_rep = common_rep.iloc[0]

# Convert the digit encoding back to event names, e.g. "1" -> "Scan"
event_name_by_code = {code: name for name, code in ATTACK_EVENTS_MAP.items()}
most_common_seq = [event_name_by_code[int(x)] for x in top_common['Combination']] # Most common seq overall
most_common_rep_seq = [event_name_by_code[int(x)] for x in top_rep['Combination']] # Most common repetitive seq

common_prob = round(top_common['Number'], 3)
rep_prob = round(top_rep['Number'], 3)

# Summary tables. The builtin name `dict` is deliberately not reused as a
# variable name, so later cells can still call dict().
commonness_summary = {'Most Common Event Sequence': [most_common_seq],
                      'Factor Function': [[0,0,0,0,0,common_prob,0,0,0,0,0]],
                      'Attack States': ['persistence'],
                      'Probability': [common_prob]}
print("\nCOMMONNESS:")
display(pd.DataFrame(commonness_summary))

# NOTE(review): the non-zero entry below sits at index 4, while 'persistence'
# has code 6 (index 5) in ATTACK_STATES_MAP; the label and the vector position
# look inconsistent -- confirm which one is intended.
repetitiveness_summary = {'Most Common Event Sequence': [most_common_rep_seq],
                          'Factor Function': [[0,0,0,0,rep_prob,0,0,0,0,0,0]],
                          'Attack States': ['persistence'],
                          'Probability': [rep_prob]}
print("\nREPETITIVENESS:")
display(pd.DataFrame(repetitiveness_summary))

The number of counts observed:

1: Commonness
             Number
Combination        
134             200
212              15
121              27
215              22
151              20
...             ...
511              11
114              11
525              14
544              11
244               8

[125 rows x 1 columns]
2: Repetiveness
             Number
Combination        
333             186
222               7
111              25
444              25
555              15

COMMONNESS:


Unnamed: 0,Most Common Event Sequence,Factor Function,Attack States,Probability
0,"[Scan, Sensitive_URI, New_Kernel_Module]","[0, 0, 0, 0, 0, 0.071, 0, 0, 0, 0, 0]",persistence,0.071



REPETITIVENESS:


Unnamed: 0,Most Common Event Sequence,Factor Function,Attack States,Probability
0,"[Sensitive_URI, Sensitive_URI, Sensitive_URI]","[0, 0, 0, 0, 0.721, 0, 0, 0, 0, 0, 0]",persistence,0.721


### Task 3.2  You will have to submit the graph you draw through Compass

### Done!

In [18]:
# Sanity check: f has one row per attack state and one column per event type.
f.shape

(11, 5)

### Task 3.3

In [127]:
# Names of the form 'E<k>' for k = 1, 3, 3, 3, 4.
# NOTE(review): this list is not read by any later cell in view.
evidence_sequence_dep = list(map('E{}'.format, (1, 3, 3, 3, 4)))
evidence_sequence_dep


In [130]:
G = FactorGraph() ## Create FactorGraph object

# ---------------------------------------------------------------
# Factor functions
# ---------------------------------------------------------------
# One event/state factor per time step; every one reuses the table f
# (11 states x 5 events) estimated in Task 3.0.
f_1 = DiscreteFactor(['S1', 'E1'], [11,5], f)
f_2 = DiscreteFactor(['S2', 'E2'], [11,5], f)
f_3 = DiscreteFactor(['S3', 'E3'], [11,5], f)
f_4 = DiscreteFactor(['S4', 'E4'], [11,5], f)
f_5 = DiscreteFactor(['S5', 'E5'], [11,5], f)
f_6 = DiscreteFactor(['S6', 'E6'], [11,5], f)
f_7 = DiscreteFactor(['S7', 'E7'], [11,5], f)
f_8 = DiscreteFactor(['S8', 'E8'], [11,5], f)
f_9 = DiscreteFactor(['S9', 'E9'], [11,5], f)
state_margin = [[] for x in range(9)]

# common / common_rep were sorted by 'Number' in descending order in Task 3.1
# and keep their original index labels, so the top-ranked frequency is read
# by position (.iloc[0]); .loc[0] would read the first-inserted row instead.

# Repetitiveness factor over (S5, E3, E4, E5): the only non-zero entry is
# state index 4 with all three events at index 2.
r_temp = np.zeros([11,5,5,5])
r_temp[4,2,2,2] = common_rep['Number'].iloc[0]
r = DiscreteFactor(['S5', 'E3', 'E4', 'E5'], [11,5,5,5], r_temp)

# Commonness factor over (S6, E1, E3, E6): the only non-zero entry is
# state index 5 with event indices (0, 2, 3).
c_temp = np.zeros([11,5,5,5])
c_temp[5,0,2,3] = common['Number'].iloc[0]
c = DiscreteFactor(['S6', 'E1', 'E3' ,'E6'], [11,5,5,5], c_temp)

# ---------------------------------------------------------------
# Connected part of the graph: time steps 1, 3, 4, 5 and 6
# ---------------------------------------------------------------
G.add_nodes_from(['S1', 'S3', 'S4', 'S5', 'S6',
                  'E1', 'E3', 'E4', 'E5', 'E6'])  ## Add random variables
G.add_factors(f_1, f_3, f_4, f_5, f_6, r, c)     ## Add factor functions

G.add_edges_from([('S1', f_1),('S3', f_3),('S4', f_4),('S5', f_5),('S6', f_6),
                  ('E1', f_1),('E3', f_3),('E4', f_4),('E5', f_5),('E6', f_6),
                  ('E1', c),('E3', c),('E6', c),('S6', c),
                  ('E3', r),('E4', r),('E5', r),('S5', r)])

# ---------------------------------------------------------------
# Inference
# ---------------------------------------------------------------
bp_dep = BeliefPropagation(G)
MPS = [[] for x in range(9)] # Most Probable State at time t

# Observed events as 0-based indices (event code - 1); defined once so the
# marginal query and the MAP query are guaranteed to use the same evidence.
dep_evidence = {'E1': 0, 'E3': 2, 'E4': 2, 'E5': 2, 'E6': 3}

for i in [1, 3, 4, 5, 6]:
    state_name = 'S' + str(i)
    # A fresh copy of the evidence is handed to every call.
    state_margin[i-1] = bp_dep.query(variables=[state_name],
                                     evidence=dep_evidence.copy(), show_progress=False)
    state_margin[i-1].normalize()
    MPS[i-1] = bp_dep.map_query(variables=[state_name],
                                evidence=dep_evidence.copy(), show_progress=False)

# The graph has a connected part (above) and four disconnected (S_t, E_t) pairs.
# pgmpy gives an error if the entire graph is run together, so each
# disconnected pair is treated as a separate factor graph and analyzed
# individually.
#
# Each entry is (time step, factor, observed event index). The pairing is
# written out explicitly instead of relying on the pop() order of a list.
indep_steps = [(2, f_2, 1), (7, f_7, 4), (8, f_8, 4), (9, f_9, 4)]

for i, factor_current, value in indep_steps:
    state_name = 'S' + str(i)
    event_name = 'E' + str(i)

    G_indep = FactorGraph()
    G_indep.add_nodes_from([state_name, event_name])  ## Add random variables
    G_indep.add_factors(factor_current)
    G_indep.add_edges_from([(state_name, factor_current), (event_name, factor_current)])
    bp_indep = BeliefPropagation(G_indep)

    state_margin[i-1] = bp_indep.query(variables=[state_name],
                                       evidence={event_name: value}, show_progress=False)
    state_margin[i-1].normalize()
    MPS[i-1] = bp_indep.map_query(variables=[state_name],
                                  evidence={event_name: value}, show_progress=False)

print(state_margin)

[<DiscreteFactor representing phi(S1:11) at 0x148378dc4c8>, <DiscreteFactor representing phi(S2:11) at 0x14836a51608>, <DiscreteFactor representing phi(S3:11) at 0x148365ab688>, <DiscreteFactor representing phi(S4:11) at 0x14836a51108>, <DiscreteFactor representing phi(S5:11) at 0x14836b70648>, <DiscreteFactor representing phi(S6:11) at 0x148365ab348>, <DiscreteFactor representing phi(S7:11) at 0x14836fbcd48>, <DiscreteFactor representing phi(S8:11) at 0x1483710a848>, <DiscreteFactor representing phi(S9:11) at 0x14836e1eb48>]


### Task 3.4

#### a. At every time point, provide the marginal probability of each state (Since we have 9 time points and 11 possible states, you should provide 99 probability values here)

In [131]:
# Print the normalized marginal of every state variable S1..S9 in turn.
for step in range(1, 10):
    print(f"Marginal Probability of S{step}")
    print(state_margin[step - 1])
    print("\n")

Marginal Probability of S1
+--------+-----------+
| S1     |   phi(S1) |
| S1(0)  |    0.9360 |
+--------+-----------+
| S1(1)  |    0.0640 |
+--------+-----------+
| S1(2)  |    0.0000 |
+--------+-----------+
| S1(3)  |    0.0000 |
+--------+-----------+
| S1(4)  |    0.0000 |
+--------+-----------+
| S1(5)  |    0.0000 |
+--------+-----------+
| S1(6)  |    0.0000 |
+--------+-----------+
| S1(7)  |    0.0000 |
+--------+-----------+
| S1(8)  |    0.0000 |
+--------+-----------+
| S1(9)  |    0.0000 |
+--------+-----------+
| S1(10) |    0.0000 |
+--------+-----------+


Marginal Probability of S2
+--------+-----------+
| S2     |   phi(S2) |
| S2(0)  |    1.0000 |
+--------+-----------+
| S2(1)  |    0.0000 |
+--------+-----------+
| S2(2)  |    0.0000 |
+--------+-----------+
| S2(3)  |    0.0000 |
+--------+-----------+
| S2(4)  |    0.0000 |
+--------+-----------+
| S2(5)  |    0.0000 |
+--------+-----------+
| S2(6)  |    0.0000 |
+--------+-----------+
| S2(7)  |    0.0000 |
+

#### b. At every time point, provide the most probable state

In [132]:
# One MAP assignment per time step, as {state variable: 0-based state index}.
print(MPS)

[{'S1': 0}, {'S2': 0}, {'S3': 0}, {'S4': 0}, {'S5': 4}, {'S6': 5}, {'S7': 8}, {'S8': 8}, {'S9': 8}]


### Task 3.5

In [133]:
# Action weights per attack stage: position k in each vector belongs to the
# attack stage with 0-based index k.
ACTIONS = {
    'NO-OP':   [1.0, 0.61, 0.69, 0.09, 0.2,  0.0, 0.0,  0.0, 0.0, 0.0, 0.0],
    'MONITOR': [0.0, 0.39, 0.31, 0.84, 0.63, 0.7, 0.07, 0.1, 0.0, 0.0, 0.0],
    'STOP':    [0.0, 0.0,  0.0,  0.07, 0.17, 0.3, 0.93, 0.9, 1.0, 1.0, 1.0],
}

# For every time step, look up the most probable stage and print the action
# with the largest weight for that stage (the first one wins on ties).
for step, assignment in enumerate(MPS, start=1):
    stage = assignment['S' + str(step)]
    stage_weights = [weights[stage] for weights in ACTIONS.values()]
    action = list(ACTIONS)[np.argmax(np.array(stage_weights))]
    print(action)

NO-OP
NO-OP
NO-OP
NO-OP
MONITOR
MONITOR
STOP
STOP
STOP


### Task 3.6

#### Indicate the earliest stage in which your model should recommend stopping the attack

#### S7

### Task 3.7

#### a. Judge whether the most probable states for $s_1-s_6,s_8,s_9$ remain the same as Task3.2
#### b. State the reason for your judgement

In [135]:
G = FactorGraph() ## Create FactorGraph object

# ---------------------------------------------------------------
# Factor functions
# ---------------------------------------------------------------
# One event/state factor per time step, each reusing the table f
# (11 states x 5 events) estimated in Task 3.0. Time step 7 is deliberately
# left out in this task, so no f_7 is created here.
f_1 = DiscreteFactor(['S1', 'E1'], [11,5], f)
f_2 = DiscreteFactor(['S2', 'E2'], [11,5], f)
f_3 = DiscreteFactor(['S3', 'E3'], [11,5], f)
f_4 = DiscreteFactor(['S4', 'E4'], [11,5], f)
f_5 = DiscreteFactor(['S5', 'E5'], [11,5], f)
f_6 = DiscreteFactor(['S6', 'E6'], [11,5], f)
f_8 = DiscreteFactor(['S8', 'E8'], [11,5], f)
f_9 = DiscreteFactor(['S9', 'E9'], [11,5], f)
state_margin = [[] for x in range(9)]

# common / common_rep were sorted by 'Number' in descending order in Task 3.1
# and keep their original index labels, so the top-ranked frequency is read
# by position (.iloc[0]); .loc[0] would read the first-inserted row instead.

# Repetitiveness factor over (S5, E3, E4, E5): the only non-zero entry is
# state index 4 with all three events at index 2.
r_temp = np.zeros([11,5,5,5])
r_temp[4,2,2,2] = common_rep['Number'].iloc[0]
r = DiscreteFactor(['S5', 'E3', 'E4', 'E5'], [11,5,5,5], r_temp)

# Commonness factor over (S6, E1, E3, E6): the only non-zero entry is
# state index 5 with event indices (0, 2, 3).
c_temp = np.zeros([11,5,5,5])
c_temp[5,0,2,3] = common['Number'].iloc[0]
c = DiscreteFactor(['S6', 'E1', 'E3' ,'E6'], [11,5,5,5], c_temp)

# ---------------------------------------------------------------
# Connected part of the graph: time steps 1, 3, 4, 5 and 6
# ---------------------------------------------------------------
G.add_nodes_from(['S1', 'S3', 'S4', 'S5', 'S6',
                  'E1', 'E3', 'E4', 'E5', 'E6'])  ## Add random variables
G.add_factors(f_1, f_3, f_4, f_5, f_6, r, c)     ## Add factor functions

G.add_edges_from([('S1', f_1),('S3', f_3),('S4', f_4),('S5', f_5),('S6', f_6),
                  ('E1', f_1),('E3', f_3),('E4', f_4),('E5', f_5),('E6', f_6),
                  ('E1', c),('E3', c),('E6', c),('S6', c),
                  ('E3', r),('E4', r),('E5', r),('S5', r)])

# ---------------------------------------------------------------
# Inference
# ---------------------------------------------------------------
bp_dep = BeliefPropagation(G)
MPS = [[] for x in range(9)] # Most Probable State at time t

# Observed events as 0-based indices (event code - 1); defined once so the
# marginal query and the MAP query are guaranteed to use the same evidence.
dep_evidence = {'E1': 0, 'E3': 2, 'E4': 2, 'E5': 2, 'E6': 3}

for i in [1, 3, 4, 5, 6]:
    state_name = 'S' + str(i)
    # A fresh copy of the evidence is handed to every call.
    state_margin[i-1] = bp_dep.query(variables=[state_name],
                                     evidence=dep_evidence.copy(), show_progress=False)
    state_margin[i-1].normalize()
    MPS[i-1] = bp_dep.map_query(variables=[state_name],
                                evidence=dep_evidence.copy(), show_progress=False)

# The graph has a connected part (above) and disconnected (S_t, E_t) pairs.
# pgmpy gives an error if the entire graph is run together, so each
# disconnected pair is treated as a separate factor graph and analyzed
# individually. Slot 7 of state_margin / MPS stays an empty list.
#
# Each entry is (time step, factor, observed event index). The pairing is
# written out explicitly instead of relying on the pop() order of a list.
indep_steps = [(2, f_2, 1), (8, f_8, 4), (9, f_9, 4)]

for i, factor_current, value in indep_steps:
    state_name = 'S' + str(i)
    event_name = 'E' + str(i)

    G_indep = FactorGraph()
    G_indep.add_nodes_from([state_name, event_name])  ## Add random variables
    G_indep.add_factors(factor_current)
    G_indep.add_edges_from([(state_name, factor_current), (event_name, factor_current)])
    bp_indep = BeliefPropagation(G_indep)

    state_margin[i-1] = bp_indep.query(variables=[state_name],
                                       evidence={event_name: value}, show_progress=False)
    state_margin[i-1].normalize()
    MPS[i-1] = bp_indep.map_query(variables=[state_name],
                                  evidence={event_name: value}, show_progress=False)

print(state_margin)

[<DiscreteFactor representing phi(S1:11) at 0x148367abd48>, <DiscreteFactor representing phi(S2:11) at 0x1483734a608>, <DiscreteFactor representing phi(S3:11) at 0x14836d91ac8>, <DiscreteFactor representing phi(S4:11) at 0x14836a84648>, <DiscreteFactor representing phi(S5:11) at 0x14836572d48>, <DiscreteFactor representing phi(S6:11) at 0x14836b7d148>, [], <DiscreteFactor representing phi(S8:11) at 0x14837a9b108>, <DiscreteFactor representing phi(S9:11) at 0x14836e9fc48>]


In [139]:
print(MPS)

# Action weights per attack stage: position k in each vector belongs to the
# attack stage with 0-based index k.
ACTIONS = {
    'NO-OP':   [1.0, 0.61, 0.69, 0.09, 0.2,  0.0, 0.0,  0.0, 0.0, 0.0, 0.0],
    'MONITOR': [0.0, 0.39, 0.31, 0.84, 0.63, 0.7, 0.07, 0.1, 0.0, 0.0, 0.0],
    'STOP':    [0.0, 0.0,  0.0,  0.07, 0.17, 0.3, 0.93, 0.9, 1.0, 1.0, 1.0],
}

# Print the highest-weighted action for every time step that has a MAP
# result; slots still holding the empty-list placeholder are skipped.
for step, assignment in enumerate(MPS, start=1):
    if assignment == []:
        continue
    stage = assignment['S' + str(step)]
    stage_weights = [weights[stage] for weights in ACTIONS.values()]
    action = list(ACTIONS)[np.argmax(np.array(stage_weights))]
    print(action)

[{'S1': 0}, {'S2': 0}, {'S3': 0}, {'S4': 0}, {'S5': 4}, {'S6': 5}, [], {'S8': 8}, {'S9': 8}]
NO-OP
NO-OP
NO-OP
NO-OP
MONITOR
MONITOR
STOP
STOP


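The most probable states remain the same, because the node that was removed is independent of the others: S7 and E7 are connected only to their own factor f_7, so removing them does not change the marginal of any other state.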

### Task 3.8

#### a. Draw an HMM model for the attack scenario given the provided states and events.
#### b. What parameters are needed for this HMM model to work?
#### c. Give an example of an advantage of the FG over the HMM model.

a. Just remove the functions r and c, connect all states by factor functions, and add priors to the states. Everything else remains the same.

b. State transition probabilities are needed for the factor functions between consecutive states, and prior probabilities are needed for the factor functions attached to the states.

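c. An HMM is restricted by the Markov assumption that the state at time $t$ depends only on the state at time $t-1$. That assumption does not hold in this case, because the functions r and c connect variables from time steps that are not immediate neighbors (for example, c links $S_6$ to $E_1$ and $E_3$). A factor graph can represent such dependencies directly.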

### Task 4.0:

No. It is not possible to predict with 100 percent accuracy even if all events were given. This is because there is a nonzero probability of the activity being benign at every stage.

### Task 4.3:

Factor Functions common: f_1 to f_8 (connecting events to states) and r
