In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
import networkx as nx

In [None]:
# Environment definition: an undirected graph whose nodes are the states.
edges = [(0, 1), (1, 5), (5, 6), (5, 4), (1, 2),
         (1, 3), (9, 10), (2, 4), (0, 6), (6, 7),
         (8, 9), (7, 8), (1, 7), (3, 9)]

# Node the agent has to reach.
goal = 10
G = nx.Graph()
G.add_edges_from(edges)
# spring_layout starts from random positions; a fixed seed keeps the figure
# identical across "Restart & Run All".
pos = nx.spring_layout(G, seed=42)
nx.draw_networkx_nodes(G, pos)
nx.draw_networkx_edges(G, pos)
nx.draw_networkx_labels(G, pos)
pl.show()

In [None]:
# STEP 2: Build the reward matrix (M)
# -1 marks "no edge"; 0 marks an ordinary edge; 100 marks a move that lands
# on the goal node.
MATRIX_SIZE = 11
M = np.matrix(np.full((MATRIX_SIZE, MATRIX_SIZE), -1.0))

for src, dst in edges:
    # The graph is undirected, so each edge is written in both directions.
    M[src, dst] = 100 if dst == goal else 0
    M[dst, src] = 100 if src == goal else 0

# Staying on the goal is rewarded as well.
M[goal, goal] = 100

# --- show the reward table ---
print("Tabel Reward (M):")
print(np.asarray(M).astype(int))

In [None]:
 #LANGKAH 3: Menyiapkan Q-table dan aturan belajar Q-Learning
# Weight applied to the best Q-value of the next state inside update().
gamma = 0.75
# State used for the single warm-up update performed later in this cell.
initial_state = 1

# Q[state, action] starts at zero for every pair.
Q = np.matrix(np.zeros((MATRIX_SIZE, MATRIX_SIZE)))

def available_actions(state):
    """Return the actions that may be taken from ``state``.

    An action is allowed when its entry in row ``state`` of the reward
    matrix ``M`` is non-negative.
    """
    # M is an np.matrix, so a row is still 2-D (1 x N); np.where therefore
    # yields (row_indices, column_indices) and the columns are the actions.
    _, allowed_columns = np.where(M[state, ] >= 0)
    return allowed_columns


available_action = available_actions(initial_state)

def sample_next_action(available_actions_range):
    """Pick one action uniformly at random from ``available_actions_range``.

    Parameters
    ----------
    available_actions_range : array-like of int
        Candidate actions; any shape is accepted and flattened first.

    Returns
    -------
    int
        The chosen action.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when no candidate is supplied.
    """
    # Sample from the argument itself rather than from a module-level
    # variable, so the result always matches what the caller passed in.
    candidates = np.asarray(available_actions_range).ravel()
    # .item() converts the size-1 result to a plain int; calling int() on a
    # 1-element array is deprecated since NumPy 1.25.
    return np.random.choice(candidates, 1).item()


action = sample_next_action(available_action)


def update(current_state, action, gamma):
    """Apply one Q-learning update for the move ``current_state -> action``.

    Sets ``Q[current_state, action]`` to
    ``M[current_state, action] + gamma * max(Q[action, :])``.

    Parameters
    ----------
    current_state : int
        Row of ``Q`` / ``M`` the agent is leaving.
    action : int
        Column of ``Q`` / ``M``; also the state the agent arrives in.
    gamma : float
        Weight of the best Q-value available from the next state.

    Returns
    -------
    float or int
        Sum of ``Q`` rescaled so that its largest entry equals 100, or ``0``
        while ``Q`` has no positive entry.
    """
    next_row = Q[action, ]
    max_index = np.where(next_row == np.max(next_row))[1]
    # Break ties between equally good next actions at random. .item() is used
    # because int() on a 1-element array is deprecated since NumPy 1.25.
    if max_index.shape[0] > 1:
        max_index = np.random.choice(max_index, size=1).item()
    else:
        max_index = max_index.item()
    max_value = Q[action, max_index]
    Q[current_state, action] = M[current_state, action] + gamma * max_value
    if np.max(Q) > 0:
        return np.sum(Q / np.max(Q) * 100)
    return 0


update(initial_state, action, gamma)

In [None]:
# Train and test the Q-learning agent
scores = []
for _ in range(1000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_action = available_actions(current_state)
    action = sample_next_action(available_action)
    score = update(current_state, action, gamma)
    scores.append(score)

# Testing: follow the highest Q-value from node 0 until the goal is reached.
current_state = 0
steps = [current_state]
# A greedy walk over a Q-table that has not converged can cycle forever, so
# the number of moves is capped generously.
max_steps = Q.shape[0] ** 2

while current_state != goal and len(steps) <= max_steps:
    q_row = Q[current_state, ]
    next_step_index = np.where(q_row == np.max(q_row))[1]
    # Ties are broken at random. .item() replaces int() on a 1-element array,
    # which is deprecated since NumPy 1.25.
    if next_step_index.shape[0] > 1:
        next_step_index = np.random.choice(next_step_index, size=1).item()
    else:
        next_step_index = next_step_index.item()
    steps.append(next_step_index)
    current_state = next_step_index

if current_state == goal:
    print("Rute paling efisien yang ditemukan:")
else:
    print("Peringatan: tujuan tidak tercapai, rute parsial:")
print(steps)

pl.plot(scores)
pl.xlabel('Jumlah Iterasi Pelatihan')
pl.ylabel('Nilai Reward')
pl.show()


In [None]:
# Locations of the police and of the drug traces
police = [2, 4, 5]
drug_traces = [3, 8, 9]

G = nx.Graph()
G.add_edges_from(edges)

# Role shown next to each node number. The police and drug-trace labels are
# derived from the two lists above, so the figure cannot drift away from the
# data the learning code uses.
roles = {0: 'Detektif', goal: 'Lokasi bandar narkoba'}
roles.update((node, 'Polisi') for node in police)
roles.update((node, 'Jejak narkoba') for node in drug_traces)

mapping = {
    node: '{} - {}'.format(node, roles[node]) if node in roles else str(node)
    for node in sorted(G.nodes)
}

H = nx.relabel_nodes(G, mapping)
# Fixed seed so the layout is the same on every run.
pos = nx.spring_layout(H, seed=42)
nx.draw_networkx_nodes(H, pos, node_size=200)
nx.draw_networkx_edges(H, pos)
nx.draw_networkx_labels(H, pos)
pl.show()


In [None]:
# STEP 5: Use information about the environment during Q-learning
initial_state = 1

# Start again from an empty Q-table, plus one counter matrix per kind of
# observation; update() increments the counters per (state, action) pair.
Q = np.matrix(np.zeros((MATRIX_SIZE, MATRIX_SIZE)))
env_police = np.zeros_like(Q)
env_drugs = np.zeros_like(Q)

def available_actions(state):
    """Return the actions that may be taken from ``state``.

    An action is allowed when its entry in row ``state`` of the reward
    matrix ``M`` is non-negative.

    Returns
    -------
    numpy.ndarray
        1-D array of allowed action (column) indices.
    """
    # M is an np.matrix, so M[state, ] is a 1 x N matrix. Flatten it before
    # searching: the first index array of np.where on that 2-D row holds row
    # numbers, which are all 0, not the action numbers.
    reward_row = np.asarray(M[state, ]).ravel()
    return np.flatnonzero(reward_row >= 0)

def sample_next_action(available_actions_range):
    """Draw one action at random from ``available_actions_range``.

    Inputs with more than one dimension are flattened before sampling.
    Returns the drawn element as a Python scalar.
    """
    needs_flatten = available_actions_range.ndim > 1
    pool = available_actions_range.flatten() if needs_flatten else available_actions_range
    return np.random.choice(pool, 1).item()

def collect_environmental_data(action):
    """Report what is observed at node ``action``.

    Returns a list containing ``'p'`` when the node is listed in ``police``
    and ``'d'`` when it is listed in ``drug_traces`` (in that order); the
    list is empty when neither applies.
    """
    markers = (('p', police), ('d', drug_traces))
    return [tag for tag, nodes in markers if action in nodes]

available_action = available_actions(initial_state)
action = sample_next_action(available_action)


def update(current_state, action, gamma):
    """Apply one Q-learning update and record what is seen at ``action``.

    ``Q[current_state, action]`` becomes
    ``M[current_state, action] + gamma * max(Q[action, :])``. The matching
    entry of ``env_police`` / ``env_drugs`` is then incremented when
    ``collect_environmental_data(action)`` reports ``'p'`` / ``'d'``.

    Returns the sum of ``Q`` rescaled so its largest entry equals 100, or
    ``0`` while ``Q`` has no positive entry.
    """
    next_row = Q[action, ]
    best_columns = np.where(next_row == np.max(next_row))[1]
    # Equally good next actions are resolved by a random draw.
    if best_columns.shape[0] > 1:
        chosen = np.random.choice(best_columns, size=1).item()
    else:
        chosen = best_columns.item()
    Q[current_state, action] = M[current_state, action] + gamma * Q[action, chosen]

    observed = collect_environmental_data(action)
    for tag, counter in (('p', env_police), ('d', env_drugs)):
        if tag in observed:
            counter[current_state, action] += 1

    peak = np.max(Q)
    if peak > 0:
        return np.sum(Q / peak * 100)
    return 0


update(initial_state, action, gamma)

# Snapshot of the observations: -1 for every (state, action) pair with at
# least one counted police sighting or drug trace, 0 everywhere else.
# NOTE(review): in top-to-bottom order only the single update() call above
# has run when this executes, so at most one entry can be flagged; the
# snapshot probably needs to be rebuilt after the training loop - confirm.
# NOTE(review): drug traces are penalised exactly like police here - confirm
# that this is intended.
env_matrix_snap = np.where(
    np.asarray(env_police > 0) | np.asarray(env_drugs > 0), -1.0, 0.0
)

def available_actions_with_env_help(state):
    """Return the allowed actions from ``state``, avoiding flagged moves.

    Starts from every action whose reward ``M[state, action]`` is
    non-negative, then drops the actions whose ``env_matrix_snap`` entry is
    negative. If that would leave nothing, the unfiltered set is returned.

    Returns
    -------
    numpy.ndarray
        1-D array of action (column) indices.
    """
    # M is an np.matrix, so its row is 1 x N; flatten it to obtain column
    # (action) indices instead of row indices, which are all 0.
    reward_row = np.asarray(M[state, ]).ravel()
    av_action = np.flatnonzero(reward_row >= 0)

    # Flattening makes the mask line up element-wise with av_action whether
    # env_matrix_snap is an ndarray or an np.matrix.
    env_pos_row = np.asarray(env_matrix_snap[state, av_action]).ravel()

    if np.any(env_pos_row < 0):
        temp_av_action = av_action[env_pos_row >= 0]
        # Only narrow the choice when at least one unflagged action remains.
        if len(temp_av_action) > 0:
            av_action = temp_av_action
    return av_action

In [None]:
# Train the agent while recording police and drug-trace sightings
scores = []
for _ in range(1000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_action = available_actions(current_state)
    action = sample_next_action(available_action)
    score = update(current_state, action, gamma)
    # Keep the per-iteration score; without this the list stays empty.
    scores.append(score)

# Show the Q-table
print("\nTabel Q setelah training:")
Q_rounded = np.round(np.asarray(Q, dtype=float), 2)
print(Q_rounded)

print('Lokasi Polisi yang Terdeteksi')
print(env_police)
print('')
print('Lokasi Jejak Narkoba yang Terdeteksi')
print(env_drugs)



Tabel Q setelah training:
[[-1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [-1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
Lokasi Polisi yang Terdeteksi
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

Lokasi Jejak Narkoba yang T

In [None]:
import pandas as pd

# Reward table as integers and learned Q-table rounded to two decimals.
df_M = pd.DataFrame(np.asarray(M).astype(int))
df_Q = pd.DataFrame(np.asarray(Q, dtype=float).round(2))

# Rich display: reward table first, then the Q-table.
display(df_M, df_Q)