In [None]:
import math
import matplotlib.pyplot as plt
import random
import numpy as np

We generate synthetic data to simulate requests to a web application.

In [None]:
# Seed the random number generator so that "Restart Kernel -> Run All"
# reproduces exactly the same synthetic data.
SEED = 42
random.seed(SEED)

# Relative amount of traffic our webserver gets on each weekday: Mon, Tue, ... Sun
factors = [0.9, 0.8, 0.85, 0.79, 0.85, 0.6, 0.5]

# In the simulation each day has just 12 hours to keep the visualizations readable.
hours = 12

# Relative amount of traffic for each hour of a day: half a sine period spread
# over the day (low at the first and last hour, peak in the middle) on top of a
# base load of 0.2. math.pi is used instead of the literal 3.14 so that the
# half period ends exactly at the last hour.
hour_factors = [0.2 + math.sin(i / (hours - 1) * math.pi) for i in range(hours)]

# Number of requests per hour without weekday and hour variations.
req = 1000

# Create the data: scale the base load by the weekday and hour factors and by a
# random factor in [0.9, 1.0), then truncate to a whole number of requests.
data = [
    int((random.random() * 0.1 + 0.9) * req * hour_factor * weekday_factor)
    for weekday_factor in factors
    for hour_factor in hour_factors
]

# data[i] = number of requests for hour i (i=0..11 -> first day, i=12..23 -> second day, ...)

In [None]:
plt.rcParams['figure.figsize'] = [15, 7]
plt.rcParams.update({'font.size': 22})

# Bar chart of the number of requests for every simulated hour of the week.
# The explicit Axes interface is used so the figure can carry axis labels.
fig, ax = plt.subplots()
ax.grid()
ax.bar(range(len(data)), data)
ax.set_xlabel("hour")
ax.set_ylabel("requests")
plt.show()

In our simulation, 10 different types of log messages are written into the log file. We now
create this data as a matrix in which the entry in row $i$ and column $j$ contains the number of requests for log type $i$ and hour $j$.

In [None]:
# Share of the total traffic that each log type accounts for
# (entry i belongs to log type i).
LOG_TYPE_DISTRIBUTION = [0.1, 0.05, 0.1, 0.18, 0.05, 0.3, 0.02, 0.02, 0.08, 0.1]

def create_requests_for_log_type(requests_per_hour, log_type):
    """Return the simulated number of requests of one log type for every hour.

    requests_per_hour: iterable with the total number of requests per hour.
    log_type: fraction of the requests that is attributed to the log type
        (the caller passes an entry of LOG_TYPE_DISTRIBUTION).

    Every hourly value is multiplied by `log_type` and by an independent
    random factor drawn from [0.7, 1.0). The result is a list of floats with
    one entry per hour.
    """
    per_hour = []
    for requests in requests_per_hour:
        jitter = random.random() * 0.3 + 0.7
        per_hour.append(requests * log_type * jitter)
    return per_hour
    
# Row i holds the per-hour request counts of log type i; one column per simulated hour.
rows_per_log_type = [create_requests_for_log_type(data, share) for share in LOG_TYPE_DISTRIBUTION]
request_matrix = np.array(rows_per_log_type)

Plot the data. We use a stacked bar chart so that we can see the distribution of log types for each hour.

In [None]:
plt.rcParams['figure.figsize'] = [18, 7]

# Stacked bar chart: one bar per simulated hour, one segment per log type.
fig, ax = plt.subplots()
x = np.arange(len(data))
# Running height of each stack; every log type is drawn on top of the previous ones.
acc = np.zeros(len(x))
for row in request_matrix:
    ax.bar(x, row, bottom=acc)
    # Rebind instead of `+=` so the array just handed to `bar` is never modified in place.
    acc = acc + row
ax.set_xlabel("hour")
ax.set_ylabel("requests")
ax.grid()
plt.show()

As it's difficult to see variations in the distribution of the log types in this chart (i.e. anomalies are difficult to spot), we use a histogram matrix as described in [this paper](https://ieeexplore.ieee.org/document/4529398).

In [None]:
def plot_data(data):
    """Plot a histogram matrix with one circle per (hour, log type) cell.

    data: 2-D numpy array; the value in row i and column j is drawn as a
        circle centred at x = j + 1 (hour), y = i + 1 (log type).
        `data.shape` is used for the axis limits.

    The circle radius is proportional to the value; the largest value in
    `data` gets radius 0.5. If the largest value is 0, all radii are 0.

    Side effects: changes the global figure size and font size in
    `plt.rcParams` and shows the figure.
    """
    plt.rcParams['figure.figsize'] = [18, 18]
    plt.rcParams.update({'font.size': 22})
    _, axes = plt.subplots()

    # Compute the maximum once instead of once per circle. The size check keeps
    # an empty matrix from raising here (no circle is drawn for it anyway).
    largest = np.max(data) if np.size(data) else 0

    for logtype, row in enumerate(data):
        for hour, val in enumerate(row):
            # Avoid a division by zero when every value is 0.
            radius = val / largest * 0.5 if largest != 0 else 0.0
            c = plt.Circle((hour + 1, logtype + 1), radius)
            axes.add_artist(c)

    axes.set_aspect(1)
    axes.set_ylabel("log type")
    axes.set_xlabel("hour")
    # Leave a margin of one unit around the grid of circles.
    plt.xlim([0, data.shape[1] + 1])
    plt.ylim([0, data.shape[0] + 1])
    plt.grid()
    plt.show()

# We visualize the distribution of log types for each hour of one day.
first_day_counts = request_matrix[:, 0:hours]

# Compute the relative occurrence of log types within each hour slot (every
# column then sums to 1). The division builds a new array on purpose: the slice
# above is a view into `request_matrix`, so an in-place `/=` would overwrite the
# raw request counts of the first day inside `request_matrix` itself.
first_day = first_day_counts / np.sum(first_day_counts, axis=0)

plot_data(first_day)

Compute the average of all weekdays.

In [None]:
# Average relative log type distribution per hour slot over all simulated days.
# The sizes are taken from the data instead of the literals 10 / 12 / 7, so the
# cell keeps working when the number of log types, hours or days changes.
num_log_types, total_hours = request_matrix.shape
num_days = total_hours // hours

avg = np.zeros((num_log_types, hours))
for i in range(num_days):
    current_day = request_matrix[:, i * hours:(i + 1) * hours]
    # Normalize every hour column to sum to 1 before accumulating.
    avg += current_day / np.sum(current_day, axis=0)
avg /= num_days

Now, we visualize the difference between the first day and the average of all days.

In [None]:
# Element-wise absolute deviation of the first day's distribution from the average.
difference = np.absolute(avg - first_day)

Now, we plot the histogram matrix. The circles are green if the deviation from the average is small. If the deviation is large, the circles will be red.

In [None]:
def plot_diff(data, difference):
    """Plot a histogram matrix whose circles are colored by their deviation.

    data: 2-D numpy array; the value in row i and column j is drawn as a
        circle centred at x = j + 1 (hour), y = i + 1 (log type). The radius
        is proportional to the value; the largest value gets radius 0.5.
        If the largest value is 0, all radii are 0.
    difference: 2-D array indexed as `difference[logtype, hour]`; the
        deviation that determines the color of the corresponding circle.

    Side effects: changes the global figure size and font size in
    `plt.rcParams` and shows the figure.
    """
    def get_color(val):
        # Map a deviation to an RGB tuple: 0 = green, 1 = red (yellow in between).
        # 1 - exp(-7 * val) squashes a non-negative deviation into [0, 1).
        val = 1 - math.exp(-7 * val)
        if val <= 0.5:
            # First half: raise the red channel from green towards yellow.
            return (val * 2, 1, 0)
        # Second half: lower the green channel from yellow towards red.
        return (1, 1 - (val - 0.5) * 2, 0)

    plt.rcParams['figure.figsize'] = [18, 18]
    plt.rcParams.update({'font.size': 22})
    _, axes = plt.subplots()

    # Compute the maximum once instead of once per circle. The size check keeps
    # an empty matrix from raising here (no circle is drawn for it anyway).
    largest = np.max(data) if np.size(data) else 0

    for logtype, row in enumerate(data):
        for hour, val in enumerate(row):
            color = get_color(difference[logtype, hour])
            # Avoid a division by zero when every value is 0.
            radius = val / largest * 0.5 if largest != 0 else 0.0
            c = plt.Circle((hour + 1, logtype + 1), radius, color=color)
            axes.add_artist(c)

    axes.set_aspect(1)
    axes.set_ylabel("log type")
    axes.set_xlabel("hour")
    # Leave a margin of one unit around the grid of circles.
    plt.xlim([0, data.shape[1] + 1])
    plt.ylim([0, data.shape[0] + 1])
    plt.grid()
    plt.show()

plot_diff(first_day, difference)

As we can see, there is no significant deviation. Let's manipulate one value and see what happens.

In [None]:
# Simulate an anomaly on a copy of the first day: halve the relative number of
# requests for logtype 5 in hour 0.
first_day_modified = first_day.copy()
first_day_modified[5, 0] /= 2

# Absolute deviation of the manipulated day from the average.
difference = np.absolute(avg - first_day_modified)

# The manipulated cell now differs significantly from the average, so its
# circle should not be green anymore.
plot_diff(first_day_modified, difference)