# Sanity testing notebook

The purpose of this notebook is to perform sanity testing of the different modules of the Monte Carlo simulation:<br>
cyber_risk_simulator.py<br>
analysis_service.py<br>
web_service.py<br>

In [1]:
import json
import random
import csv
import numpy as np
import pandas as pd

In [11]:
# Load the statistics file into a DataFrame purely for a quick visual check of its contents.
industries_revenue_df = pd.read_json('industries_revenue_stats.json')
industries_revenue_df

Unnamed: 0,healthcare,finance,retail,manufacturing,construction
10M,"{'frequency': 0.2, 'cost': 1000000}","{'frequency': 0.4, 'cost': 5000000}","{'frequency': 0.8, 'cost': 7000000}","{'frequency': 0.08, 'cost': 900000}","{'frequency': 0.01, 'cost': 700000}"
100M,"{'frequency': 0.30000000000000004, 'cost': 200...","{'frequency': 0.5, 'cost': 70000000}","{'frequency': 0.9, 'cost': 80000000}","{'frequency': 0.23, 'cost': 40000000}","{'frequency': 0.02, 'cost': 20000000}"
500M,"{'frequency': 0.4, 'cost': 60000000}","{'frequency': 0.6000000000000001, 'cost': 1000...","{'frequency': 1.1, 'cost': 90000000}","{'frequency': 0.4, 'cost': 80000000}","{'frequency': 0.12, 'cost': 30000000}"
1B,"{'frequency': 0.4, 'cost': 120000000}","{'frequency': 0.7000000000000001, 'cost': 2000...","{'frequency': 1, 'cost': 180000000}","{'frequency': 0.7000000000000001, 'cost': 1450...","{'frequency': 0.24, 'cost': 120000000}"


In [145]:
# Load the same statistics file as a plain nested dict. simulate_attack() reads this
# notebook-level `data` as data[industry][revenue]['frequency'] / ['cost'].
with open('industries_revenue_stats.json') as json_file:
    data = json.load(json_file)
data

{'healthcare': {'10M': {'frequency': 0.2, 'cost': 1000000},
  '100M': {'frequency': 0.3, 'cost': 20000000},
  '500M': {'frequency': 0.4, 'cost': 60000000},
  '1B': {'frequency': 0.4, 'cost': 120000000}},
 'finance': {'10M': {'frequency': 0.4, 'cost': 5000000},
  '100M': {'frequency': 0.5, 'cost': 70000000},
  '500M': {'frequency': 0.6, 'cost': 100000000},
  '1B': {'frequency': 0.7, 'cost': 200000000}},
 'retail': {'10M': {'frequency': 0.8, 'cost': 7000000},
  '100M': {'frequency': 0.9, 'cost': 80000000},
  '500M': {'frequency': 1.1, 'cost': 90000000},
  '1B': {'frequency': 1, 'cost': 180000000}},
 'manufacturing': {'10M': {'frequency': 0.08, 'cost': 900000},
  '100M': {'frequency': 0.23, 'cost': 40000000},
  '500M': {'frequency': 0.4, 'cost': 80000000},
  '1B': {'frequency': 0.7, 'cost': 145000000}},
 'construction': {'10M': {'frequency': 0.01, 'cost': 700000},
  '100M': {'frequency': 0.02, 'cost': 20000000},
  '500M': {'frequency': 0.12, 'cost': 30000000},
  '1B': {'frequency': 0.24, 

In [210]:
def simulate_attack(industry, revenue, num_simulations = 10000, stats = None):
    """Generate simulated attack-cost rows for one industry / revenue bucket.

    Args:
        industry: Key of the industry in the statistics table (e.g. 'retail').
        revenue: Key of the revenue bucket in the statistics table (e.g. '500M').
        num_simulations: Number of simulation rounds to run.
        stats: Optional nested mapping ``stats[industry][revenue]`` holding the
            'frequency' and 'cost' entries. When omitted, the notebook-level
            ``data`` table is used, which keeps existing calls working.

    Returns:
        A list of ``(simulation_id, attack_id, cost)`` tuples, ten per
        simulation, with ``cost`` truncated to ``int``. Ids start at 1.

    Raises:
        KeyError: If the table has no entry for the industry / revenue pair.
    """
    # Passing the table explicitly avoids depending on whatever the global
    # `data` happens to be bound to when the cell is run.
    table = data if stats is None else stats
    try:
        segment_stats = table[industry][revenue]
    except KeyError as exc:
        raise KeyError(
            f"no statistics for industry={industry!r}, revenue={revenue!r}"
        ) from exc

    frequency_dist = segment_stats['frequency']
    cost_dist = segment_stats['cost']

    simulation_results = []

    for simulation_id in range(1, num_simulations + 1):
        # NOTE(review): the sampled Poisson counts are never used; the draw only
        # fixes the number of rows per simulation (always 10), so 'frequency'
        # currently has no effect on the output. Sampling is kept as-is until
        # the intended model is confirmed.
        attack_counts = np.random.poisson(frequency_dist, size=10)
        for attack_id in range(1, len(attack_counts) + 1):
            # Cost is drawn around the bucket's cost with a 10% standard deviation.
            attack_cost = np.random.normal(cost_dist, 0.1 * cost_dist)
            simulation_results.append((simulation_id, attack_id, int(attack_cost)))

    return simulation_results


In [None]:
def main():
    """Prompt for simulation parameters, run the simulation and save it as CSV.

    Reads the industry, the revenue bucket and the number of simulations from
    standard input, passes them to simulate_attack() and writes the returned
    rows, preceded by a header row, to 'simulation_results.csv'.
    """
    chosen_industry = input("Enter the industry: ")
    chosen_revenue = input("Enter the revenue: ")
    requested_runs = int(input("Enter the number of simulations: "))

    result_rows = simulate_attack(chosen_industry, chosen_revenue, requested_runs)

    header = ["simulation", "attack_id", "cost"]
    with open('simulation_results.csv', 'w', newline='') as out_file:
        # Header and data rows go out through a single writerows() call.
        csv.writer(out_file).writerows([header, *result_rows])

    print("Simulation results saved to 'simulation_results.csv'.")

if __name__ == "__main__":
    main()

In [107]:
# Define industries and revenues
industries = ['healthcare', 'finance', 'retail', 'manufacturing', 'construction']
revenues = np.round(np.random.uniform(1, 1000, size=1000)).astype(int)  # Rounded to nearest million

# Create synthetic dataset.
# The columns are built directly instead of going through a notebook-level dict
# named `data`: that name already holds the statistics loaded from
# 'industries_revenue_stats.json', which simulate_attack() reads. Rebinding it
# here made every later simulation fail on a top-to-bottom run.
company_ids = np.arange(1, 1001)
synthetic_df = pd.DataFrame({
    'company_id': company_ids,
    # One random revenue and one random industry per company.
    'Revenue (USD)': np.random.choice(revenues, size=len(company_ids)),
    'Industry': np.random.choice(industries, size=len(company_ids)),
})

synthetic_df

Unnamed: 0,company_id,Revenue (USD),Industry
0,1,432,retail
1,2,455,manufacturing
2,3,288,healthcare
3,4,207,construction
4,5,633,healthcare
...,...,...,...
995,996,800,healthcare
996,997,470,healthcare
997,998,317,retail
998,999,768,healthcare


In [95]:
# Persist the synthetic companies. With pandas' default index=True the row index
# is written as an extra, unnamed first column.
synthetic_df.to_csv('synthetic_data.csv')

In [221]:
# Upper bounds of the revenue buckets, in ascending order.
revenue_buckets = [10, 100, 500, 1000]

def _bucket_label(size):
    """Return the label of a bucket bound: '1B' for 1000, otherwise '<size>M'."""
    return '1B' if size == 1000 else f'{size}M'

def get_bucket(number):
    """Return the label of the smallest revenue bucket that contains `number`.

    Args:
        number: Revenue value, compared against the bounds in `revenue_buckets`.

    Returns:
        The bucket label ('10M', '100M', '500M' or '1B'). Values above the
        largest bound are assigned to the largest bucket instead of falling
        through and returning None.
    """
    for size in revenue_buckets:
        if number <= size:
            return _bucket_label(size)
    # Larger than every bound: clamp to the top bucket.
    return _bucket_label(revenue_buckets[-1])

def simulate_company(company):
    """Run a single simulation for one company row.

    Args:
        company: Row-like object exposing 'Industry' and 'Revenue (USD)'.

    Returns:
        A ``(average_cost, bucket_revenue)`` tuple: the mean of the simulated
        attack costs and the revenue bucket label used for the simulation.
    """
    sector = company['Industry']
    bucket_revenue = get_bucket(company['Revenue (USD)'])

    attack_rows = simulate_attack(sector, bucket_revenue, num_simulations=1)
    # The cost is the third field of every (simulation_id, attack_id, cost) row.
    costs = [row[2] for row in attack_rows]

    return np.mean(costs), bucket_revenue

In [203]:
# from flask import Flask, request, jsonify

def get_results_by_id(company_id):
    """Simulate the company with the given id and return its average cost.

    Looks the company up in `synthetic_df`, prints the row being simulated and
    returns the first element of simulate_company()'s result. Raises IndexError
    when no row has that `company_id`.
    """
    id_matches = synthetic_df['company_id'] == company_id
    company = synthetic_df[id_matches].iloc[0]
    print(f"getting results for: {company.company_id = }, {company['Revenue (USD)'] = }, {company['Industry'] = }")
    average_cost, _ = simulate_company(company)
    return average_cost
    # A web-service variant would wrap the value in a JSON response instead.


In [222]:
# Smoke test: simulate the company whose company_id is 1.
get_results_by_id(1)

getting results for: company.company_id = 1, company['Revenue (USD)'] = 432, company['Industry'] = 'retail'


87867201.1

In [236]:
# Work on a copy so synthetic_df stays untouched, and pre-create the two result
# columns filled with NaN.
results_df = synthetic_df.copy()
results_df[['bucket_revenue','average_cost']] = [np.nan,np.nan]
results_df

Unnamed: 0,company_id,Revenue (USD),Industry,bucket_revenue,average_cost
0,1,432,retail,,
1,2,455,manufacturing,,
2,3,288,healthcare,,
3,4,207,construction,,
4,5,633,healthcare,,
...,...,...,...,...,...
995,996,800,healthcare,,
996,997,470,healthcare,,
997,998,317,retail,,
998,999,768,healthcare,,


In [219]:
# Check simulate_company() on the first row before applying it to the whole frame.
simulate_company(results_df.iloc[0])

(89631594.1, '500M')

In [237]:
# simulate_company() returns (average_cost, bucket_revenue), so the target
# columns are listed in that order; result_type='expand' splits the tuple
# into two columns.
results_df[['average_cost','bucket_revenue']] = results_df.apply(simulate_company, axis=1,result_type='expand')
results_df

Unnamed: 0,company_id,Revenue (USD),Industry,bucket_revenue,average_cost
0,1,432,retail,500M,85741880.7
1,2,455,manufacturing,500M,78564247.6
2,3,288,healthcare,500M,59388874.5
3,4,207,construction,500M,30301509.0
4,5,633,healthcare,1B,122198755.4
...,...,...,...,...,...
995,996,800,healthcare,1B,120075349.2
996,997,470,healthcare,500M,62470371.0
997,998,317,retail,500M,88038196.0
998,999,768,healthcare,1B,118010337.1


In [248]:
def get_results_by_segmentation(revenues, industries):
    """Return the mean `average_cost` of the companies in a segment.

    Args:
        revenues: Collection of revenue bucket labels to include.
        industries: Collection of industry names to include.

    Returns:
        The mean of `average_cost` over the rows of `results_df` whose
        `bucket_revenue` is in `revenues` and whose `Industry` is in
        `industries`.
    """
    in_revenue_segment = results_df['bucket_revenue'].isin(revenues)
    in_industry_segment = results_df['Industry'].isin(industries)

    # Reuses the per-company averages already stored in results_df rather than
    # re-running the simulation for every selected company.
    segment = results_df[in_revenue_segment & in_industry_segment]
    return segment.average_cost.mean()


In [253]:
# Segment made of two revenue buckets and two industries.
get_results_by_segmentation(['1B','10M'],['retail', 'healthcare'])

144553727.69126213

In [256]:
# Inspect the rows behind a segment: same kind of filter as in
# get_results_by_segmentation(), here restricted to the '1B' bucket.
results_df[(results_df['bucket_revenue'].isin(['1B'])) & (results_df['Industry'].isin(['retail', 'healthcare']))]

Unnamed: 0,company_id,Revenue (USD),Industry,bucket_revenue,average_cost
4,5,633,healthcare,1B,122198755.4
7,8,873,retail,1B,176968795.9
8,9,515,retail,1B,182319357.1
15,16,948,retail,1B,177475743.6
17,18,577,retail,1B,184530779.9
...,...,...,...,...,...
970,971,539,healthcare,1B,123774863.8
978,979,585,healthcare,1B,112242546.3
982,983,860,retail,1B,168325260.5
995,996,800,healthcare,1B,120075349.2


In [247]:
# Mean of average_cost over all companies, as a reference for the segment means.
results_df.average_cost.mean()

105428715.844

In [263]:
from scipy.stats import poisson, norm
# Draw ten Poisson samples with mean 2 using scipy.
poisson.rvs(2, size=10)

array([4, 1, 1, 0, 1, 1, 3, 0, 3, 2], dtype=int64)

In [272]:
# Draw ten Poisson samples with numpy, this time with a small mean (0.2).
np.random.poisson(0.2, size=10)

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

## cyber_risk_simulator

In [3]:
%%time
from cyber_risk_simulator import CyberRiskSimulator
from simulation_types import Industry
# import cyber_risk_simulator

# Example usage
simulator = CyberRiskSimulator(industry=Industry.FINANCE, revenue=301)
# NOTE(review): the meaning of the positional True is not visible in this notebook; confirm against run_simulation().
metrics = simulator.run_simulation(True)

print("Total Loss:", metrics.total_loss)
print("Mean Loss:", metrics.mean_loss)
# Display the simulator's `results` attribute as the cell output.
simulator.results

Total Loss: 599239957373.336
Mean Loss: 59923995.7373336
CPU times: total: 125 ms
Wall time: 130 ms


Unnamed: 0,simulation_id,attack_id,cost
0,1,1,9.891960e+07
1,1,2,1.092235e+08
2,4,1,1.044015e+08
3,4,2,8.951213e+07
4,6,1,9.506875e+07
...,...,...,...
5995,9983,1,1.040321e+08
5996,9986,1,1.116147e+08
5997,9992,1,8.700610e+07
5998,9993,1,1.039610e+08


## analysis_service

In [4]:
%%time
from analysis_service import AnalysisService
# Create the service and run its simulations; the cell is timed with %%time.
service = AnalysisService()
service.run_simulations()

CPU times: total: 2min 15s
Wall time: 2min 15s


In [6]:
# Show the companies held by the service.
service.companies

Unnamed: 0,company_id,revenue_usd,industry
0,1,780.0,healthcare
1,2,179.0,finance
2,3,795.0,finance
3,4,868.0,retail
4,5,8.0,healthcare
...,...,...,...
995,996,643.0,manufacturing
996,997,97.0,healthcare
997,998,389.0,finance
998,999,292.0,retail


In [7]:
# Show only the first rows of what get_results_from_db() returns.
service.get_results_from_db().head()

Unnamed: 0,company_id,total_loss,mean_loss,timestamp
0,1,478484300000.0,47848430.0,2023-08-27 04:14:59.818239
1,2,600967900000.0,60096790.0,2023-08-27 04:14:59.997676
2,3,1371723000000.0,137172300.0,2023-08-27 04:15:00.157564
3,4,1802172000000.0,180217200.0,2023-08-27 04:15:00.392708
4,5,1926397000.0,192639.7,2023-08-27 04:15:00.437511


In [8]:
# Show what get_synthetic_companies_from_db() returns, for comparison with service.companies.
service.get_synthetic_companies_from_db()

Unnamed: 0,company_id,revenue_usd,industry
0,1,780.0,healthcare
1,2,179.0,finance
2,3,795.0,finance
3,4,868.0,retail
4,5,8.0,healthcare
...,...,...,...
995,996,643.0,manufacturing
996,997,97.0,healthcare
997,998,389.0,finance
998,999,292.0,retail


In [9]:
import sqlite3
from contextlib import closing

import pandas as pd

# Read the stored simulation results written after the given timestamp
# directly from the SQLite file.
query = "SELECT * FROM simulation_results where timestamp > '2023-08-26 19:16'"
# query = f"SELECT *FROM simulation_results --where company_id = 1"

# Using a sqlite3 connection as a context manager only commits or rolls back the
# transaction; it does not close the connection. closing() guarantees the
# connection is released when the block exits.
with closing(sqlite3.connect('monte_carlo.db')) as conn:
    df_from_db = pd.read_sql_query(query, conn)

print(f'{df_from_db.shape=}\n')
df_from_db

df_from_db.shape=(1000, 4)



Unnamed: 0,company_id,total_loss,mean_loss,timestamp
0,1,4.784843e+11,4.784843e+07,2023-08-27 04:14:59.818239
1,2,6.009679e+11,6.009679e+07,2023-08-27 04:14:59.997676
2,3,1.371723e+12,1.371723e+08,2023-08-27 04:15:00.157564
3,4,1.802172e+12,1.802172e+08,2023-08-27 04:15:00.392708
4,5,1.926397e+09,1.926397e+05,2023-08-27 04:15:00.437511
...,...,...,...,...
995,996,1.027305e+12,1.027305e+08,2023-08-27 04:17:14.417727
996,997,5.944474e+10,5.944474e+06,2023-08-27 04:17:14.477992
997,998,5.975196e+11,5.975196e+07,2023-08-27 04:17:14.617972
998,999,9.874500e+11,9.874500e+07,2023-08-27 04:17:14.897637


In [10]:
# import sqlite3
# query = f"DELETE FROM simulation_results;"

# with sqlite3.connect('monte_carlo.db') as conn:
#     cursor = conn.cursor()
#     cursor.execute(query)
#     conn.commit()


## web_service

### welcome endpoint

In [16]:
import requests

# Call the root endpoint of the locally running web service (127.0.0.1:5000).
response = requests.get("http://127.0.0.1:5000/")

print(f'{response.status_code=}')
# The body is printed as raw bytes here rather than parsed as JSON.
print(response.content)

response.status_code=200
b'Hello world from Monte-Carlo simulation web service! :)'


### get_results_by_id

In [17]:
import requests

# non-existing company (negative id); the recorded run answered 404
response = requests.get("http://127.0.0.1:5000/get_results_by_id/-3")

print(f'{response.status_code=}')
print(response.json())


response.status_code=404
{'error': 'no company found'}


In [18]:
# existing company (id 4); relies on `requests` imported in an earlier cell
response = requests.get("http://127.0.0.1:5000/get_results_by_id/4")

print(f'{response.status_code=}')
print(response.json())

response.status_code=200
{'average simulation cost': 180217168.16648653}


### get_results_by_segmentation

In [19]:
import requests

# no segments provided: the query string is empty
response = requests.get("http://127.0.0.1:5000/get_results_by_segmentation?")

print(f'{response.status_code=}')
print(response.json())


response.status_code=404
{'error': 'no companies found'}


In [20]:
import requests

# segments with no company: only a revenue filter is sent, no industry
response = requests.get("http://127.0.0.1:5000/get_results_by_segmentation?revenue=500M")

print(f'{response.status_code=}')
print(response.json())


response.status_code=404
{'error': 'no companies found'}


In [21]:
import requests

# segments with companies: `revenue` and `industry` are each repeated to pass
# several values per filter
response \
    = requests.get("http://127.0.0.1:5000/get_results_by_segmentation?revenue=100M&revenue=500M&industry=retail&industry=healthcare")

print(f'{response.status_code=}')
print(response.json())

response.status_code=200
{'average simulation cost': 55376770.16564594}
