# Analysis of the In-Degree Ratio of the PR Holders
This script analyzes the in-degree ratio of PR holders among the K closest peers we track over the hoards.

The in-degree ratio corresponds to the number of PR holders that remain inside the K closest peers over the rounds.

In [None]:
## Import dependencies
import sqlalchemy as sa
import pandas as pd
import seaborn as sns
import multihash as mh
import hashlib as hl
from cid import make_cid
from math import log2
import matplotlib.pyplot as plt
import numpy as np

## DB Credentials
# Connection parameters interpolated into the PostgreSQL URL below.
# NOTE(review): these look like placeholder values — adjust to the local setup
# before running.
HOST="localhost"
PORT="5432"
DB="cid_hoarder_db"
USER="user"
PASSWD="password"

# Connect to the DB
# The engine created here is reused by the pd.read_sql_query() calls in the
# following cells.
engine = sa.create_engine(f'postgresql://{USER}:{PASSWD}@{HOST}:{PORT}/{DB}')

## plotting style
# Default figure size; not referenced by the cells visible in this notebook
# section (the boxplot passes its own figsize).
fig_size= (7,4)
sns.set_context("talk", font_scale=1)


In [None]:
## All kinds of necessary helper formulas

def get_in_degree(org, new) -> int:
    """Count how many entries of ``new`` are also contained in ``org``.

    Membership is checked with the ``in`` operator for every element of
    ``new``, so an element repeated in ``new`` is counted once per occurrence.

    Args:
        org: Reference collection that supports ``in`` lookups.
        new: Iterable of candidates to look up in ``org``.

    Returns:
        The number of elements of ``new`` found in ``org`` (0 if none).
    """
    return sum(1 for candidate in new if candidate in org)

def hash_int_from_peer_id(peer_id: str) -> int:
    """Return the SHA-256 digest of a decoded peer ID as an integer.

    The base58 string is decoded with ``mh.from_b58_string``, the decoded
    value is hashed with SHA-256, and the digest bytes are interpreted as a
    big-endian integer.

    Args:
        peer_id: Base58-encoded peer ID string.

    Returns:
        The SHA-256 digest as a non-negative integer.
    """
    decoded = mh.from_b58_string(peer_id)
    digest = hl.sha256(decoded).digest()
    return int.from_bytes(digest, "big")

def hash_int_from_cid(cid: str) -> int:
    """Return the SHA-256 digest of a CID's multihash as an integer.

    The CID string is parsed with ``make_cid``; the ``multihash`` attribute of
    the parsed object is hashed with SHA-256 and the digest bytes are
    interpreted as a big-endian integer.

    Args:
        cid: String representation of the CID.

    Returns:
        The SHA-256 digest as a non-negative integer.
    """
    parsed = make_cid(cid)
    digest = hl.sha256(parsed.multihash).digest()
    return int.from_bytes(digest, "big")


def get_xor_distance(base: int, comp: int) -> int:
    """Return the absolute value of the bitwise XOR of two integers.

    Args:
        base: First integer key.
        comp: Second integer key, compared against ``base``.

    Returns:
        ``abs(base ^ comp)``; 0 when both keys are equal.
    """
    xored = base ^ comp
    return abs(xored)


# hour distributions from the study
# Builds `hours_dist`: one value per ping round, holding the mean
# "fetch_time_since_publication_m" of that round, shifted so the first round is
# at 0 and divided by 60.
# presumably the column is in minutes (so the result is in hours) — TODO confirm
sql_query="""
    SELECT 
        cid_hash,
        ping_round, 
        fetch_time_since_publication_m
    FROM fetch_results
    ORDER BY ping_round;
"""
ping_rounds = pd.read_sql_query(sql_query, engine)

# Average the fetch time over all rows that share the same ping round.
avg_fetcht = ping_rounds.groupby(by="ping_round").agg({"fetch_time_since_publication_m": "mean"})
hours_dist = avg_fetcht["fetch_time_since_publication_m"].to_numpy()

# Make the series relative to the first round, then rescale by 60.
# NOTE: indexing hours_dist[0] raises IndexError when the query returned no rows.
hours_dist = (hours_dist - hours_dist[0]) / 60

In [None]:
## Count, for every ping round and CID, how many of the K closest peers are also PR holders of that CID

# The INNER JOIN keeps only k_closest_peers rows whose (cid_hash, peer_id) pair
# also appears in pr_holders. A (ping_round, cid_hash) pair with no matching
# peer therefore produces no row at all rather than a row with count 0.
sql_query = """
    SELECT 
        k_peers.ping_round,
        k_peers.cid_hash,
        count(k_peers.peer_id)
    FROM k_closest_peers as k_peers
    INNER JOIN pr_holders
    ON k_peers.cid_hash = pr_holders.cid_hash
        and k_peers.peer_id=pr_holders.peer_id
    GROUP BY k_peers.ping_round, k_peers.cid_hash
    ORDER BY ping_round asc;
"""

in_degree = pd.read_sql_query(sql_query, engine)

ping_r = in_degree.groupby("ping_round")

# make dist
# `dist` collects one "count" column (one value per CID row) per ping round.
dist = []
cids = 0
# Iterating a groupby yields (ping_round value, sub-DataFrame of that round).
for i, row in ping_r:
    ## Keep track of how many cids are published
    # Taken from the number of rows in the first ping round group only.
    if cids == 0:
        cids = len(row["count"])
    dist.append(row["count"])

## Recompose the In-Degree ratio from the publication of the CIDs
# get an array of len(CIDs) full of K values (at publication we have 100% in-degree)
# K is approximated by the largest count seen in the query result.
# NOTE(review): assumes at least one (round, CID) pair kept all K holders — confirm.
# NOTE: max() raises ValueError when the query returned no rows.
init_dist = [max(in_degree["count"])] * cids
dist.insert(0, init_dist)

## Make a boxplot with the distribution
# After the insert above, `dist` has one entry more than the number of ping
# rounds in `in_degree`.
# NOTE(review): `positions` is expected to supply one x value per box, so
# `hours_dist` must have the same length as `dist` — confirm against the data.
fig, ax = plt.subplots(figsize=(12,6))
ax.boxplot(dist, positions=hours_dist, showfliers=True) 
ticks = np.linspace(0.0, 2.0, 10) ###### <---- *UPDATE THIS* to fit the study duration (start, finish, number of ticks)
# Tick labels are the tick values truncated to integers, so fractional ticks
# that truncate to the same integer share a label.
plt.xticks(ticks, ticks.astype(int))
plt.xlabel("Time Since Publication (Hours)")
# NOTE(review): the label text reads "Ration" (likely meant "Ratio"); the
# plotted values are the raw per-CID counts, not a normalized ratio.
plt.ylabel("In Degree Ration")
plt.show()



In [None]:
# Close the connections held by the SQLAlchemy engine now that all queries ran.
engine.dispose()