# Creating Delta Frequency Data
Some of the figures and tables in the article are based on the "delta_frequency_under_1b.csv" file. This notebook contains the code required to reproduce that file.

In [1]:
import sys
from pathlib import Path
from fractions import Fraction
import numpy as np
import pandas as pd
from numba import jit, uint64

First of all, we need some prime numbers. For this purpose, `eratosthenes_sieve` is defined as follows:

In [2]:
@jit(uint64[:](uint64), nopython=True, cache=True)
def eratosthenes_sieve(stop: int):
    """Return every prime strictly below ``stop`` as a ``uint64`` array.

    The candidates ``2, 3, ..., stop - 1`` are stored in one array and every
    multiple of a surviving candidate is overwritten with ``0``; whatever is
    still non-zero at the end is prime.

    Parameters
    ----------
    stop : int
        Exclusive upper bound of the search range.

    Returns
    -------
    numpy.ndarray
        The primes below ``stop`` in ascending order (empty if there are none).
    """
    # Candidate k is stored at index k - 2, so the valid indices are 0 .. stop - 3.
    numbers = np.arange(2, stop, dtype=np.uint64)
    for n in numbers:
        # A zeroed slot is a composite already crossed out by a smaller prime.
        if n != 0:
            i = 2
            # Only cross out multiples i*n < stop, i.e. indices <= stop - 3.
            # The previous bound (stop - 1) also admitted i*n == stop, which is
            # one element past the end of `numbers`; compiled code does not
            # bounds-check, so that write silently corrupted memory.
            while i*n-2 < stop-2:
                numbers[i*n-2] = 0
                i += 1
    primes = numbers[numbers != 0]
    return primes

Set `maxnumber` to any number you want and pass it to the `eratosthenes_sieve` function as the `stop` argument; the function returns all primes below `maxnumber`.

In [3]:
# Exclusive upper bound handed to the sieve; raise it to generate more primes.
maxnumber = 10000
primes = eratosthenes_sieve(maxnumber)
# Quick sanity check on how many primes were produced.
print(f'Number of produced primes: {len(primes)}')

Number of produced primes: 1229


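The following function returns the frequency of each delta as a dictionary. For $p_1$ = `primes[i]`, $p_2$ = `primes[i+order]` and $p_3$ = `primes[i+2*order]`, delta is the reduced fraction $(p_3 - p_2)/(p_2 - p_1)$: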

In [4]:
def get_delta_frequency(primes, order):
    """Count how often each ratio of successive prime gaps occurs.

    For every valid index ``i`` the triple ``p1 = primes[i]``,
    ``p2 = primes[i + order]``, ``p3 = primes[i + 2*order]`` is formed and
    ``delta = (p3 - p2) / (p2 - p1)`` is computed as an exact, reduced
    ``Fraction``.

    Parameters
    ----------
    primes : numpy.ndarray
        One-dimensional array whose elements support ``.item()``.
        Assumed to be strictly increasing so that ``p2 - p1`` is never zero.
    order : int
        Index distance between the members of each triple; must be at least 1.

    Returns
    -------
    dict
        Maps ``str(delta)`` (e.g. ``'2'`` or ``'1/2'``) to its number of
        occurrences, with keys in order of first appearance.

    Raises
    ------
    ValueError
        If ``order`` is smaller than 1, or if ``primes`` is too short to form
        even one triple for the requested ``order``.
    """
    # order == 0 would build Fraction(0, 0) and negative orders would wrap
    # around the array via negative indices, so reject both up front.
    if order < 1:
        raise ValueError(f'order must be a positive integer, got {order}')
    last_index = primes.size - 1
    max_order = last_index//2
    if 2*order > last_index:
        raise ValueError(f'Maximum allowable order is {max_order}')
    delta_table = {}
    # i runs over every start index for which i + 2*order is still in range.
    for i in range(last_index - 2*order + 1):
        # .item() yields Python ints, so the subtractions are exact and
        # cannot wrap around the way unsigned NumPy integers could.
        p1 = primes[i].item()
        p2 = primes[i+order].item()
        p3 = primes[i+2*order].item()
        delta = str(Fraction(p3-p2, p2-p1))
        delta_table[delta] = delta_table.get(delta, 0) + 1
    return delta_table

In [5]:
# order=1: every delta is built from three neighbouring entries of `primes`.
delta_table = get_delta_frequency(primes, 1)

Now calculate the other required data using `get_delta_frequency_dataframe`, which returns the results as a pandas `DataFrame`:

In [61]:
def get_delta_frequency_dataframe(delta_table):
    """Turn a delta -> count mapping into a frequency-ranked summary table.

    Parameters
    ----------
    delta_table : dict
        Maps a fraction string accepted by ``Fraction`` (e.g. ``'3/2'``) to
        its number of occurrences.

    Returns
    -------
    pandas.DataFrame
        Columns ``delta``, ``delta_log2`` (base-2 logarithm of the fraction),
        ``frequency``, ``density`` (frequency divided by the sum of all
        frequencies) and ``cumulative_density`` (running sum of ``density``),
        with rows ordered by descending ``frequency`` and a fresh 0..n-1 index.
    """
    labels = list(delta_table.keys())
    counts = list(delta_table.values())
    total = sum(counts)

    log_ratios = []
    for label in labels:
        log_ratios.append(np.log2(float(Fraction(label))))
    shares = [count/total for count in counts]

    table = pd.DataFrame({'delta': labels,
                          'delta_log2': log_ratios,
                          'frequency': counts,
                          'density': shares})
    ranked = table.sort_values(by=['frequency'], ascending=False, ignore_index=True)
    # The running total is taken after ranking, so it accumulates from the
    # most frequent delta downwards.
    ranked['cumulative_density'] = ranked['density'].cumsum()
    return ranked

In [7]:
# Build the summary table from the counts gathered above.
df = get_delta_frequency_dataframe(delta_table)
# A bare expression on the last line makes the notebook display the frame.
df

Unnamed: 0,delta,delta_log2,frequency,density,cumulative_density
0,2,1.000000,121,0.098615,0.098615
1,1,0.000000,65,0.052975,0.151589
2,1/2,-1.000000,116,0.094540,0.246129
3,3/2,0.584963,78,0.063570,0.309698
4,1/3,-1.584963,78,0.063570,0.373268
...,...,...,...,...,...
80,7/11,-0.652077,1,0.000815,0.996740
81,4,2.000000,1,0.000815,0.997555
82,7/12,-0.777608,1,0.000815,0.998370
83,6/17,-1.502500,1,0.000815,0.999185


# Validate `delta_frequency_under_1b.csv`

Now let's compare the results from these functions to the data in `'delta_frequency_under_1b.csv'`.

In [9]:
delta_frequency_path = Path('delta_frequency_under_1b.csv')
try:
    file_df = pd.read_csv(delta_frequency_path)
except FileNotFoundError as error:
        print(error)
        in_colab = 'google.colab' in sys.modules
        if in_colab:
            print('running on colab, downloading data to session...')
            # Download delta_frequency_under_1b.csv
            !wget https://bit.ly/3zcycOz -O delta_frequency_under_1b.csv
            file_df = pd.read_csv(delta_frequency_path)
        else:
            print('Please download required data from paper repo')
file_df.head(10)

Unnamed: 0,delta,delta_log2,frequency,density,cumulative_density
0,2,1.0,2047829,0.040274,0.040274
1,1/2,-1.0,2046288,0.040244,0.080518
2,3/2,0.584963,1460663,0.028726,0.109244
3,2/3,-0.584963,1460572,0.028725,0.137968
4,1/3,-1.584963,1384018,0.027219,0.165187
5,3,1.584963,1382907,0.027197,0.192384
6,1,0.0,1328401,0.026125,0.21851
7,5,2.321928,1117019,0.021968,0.240478
8,1/5,-2.321928,1116795,0.021964,0.262441
9,3/4,-0.415037,789598,0.015529,0.27797


In [10]:
# Re-run the sieve with a bound of one billion (presumably the range the
# "under_1b" CSV covers). This overwrites the earlier `maxnumber` and `primes`.
maxnumber = 1000000000
primes = eratosthenes_sieve(maxnumber)
print(f'Number of produced primes: {len(primes)}')

Number of produced primes: 50847534


In [11]:
# Recount the order-1 deltas on the larger prime set; replaces the earlier `delta_table`.
delta_table = get_delta_frequency(primes, 1)

In [63]:
# Summary table computed in this session, to be compared against `file_df`.
generated_df = get_delta_frequency_dataframe(delta_table)

Validating the `delta` and `frequency` columns is easy, so let's validate those columns first:

In [64]:
# These two columns hold exact values (strings and integer counts), so they
# are compared element by element for equality.
delta_matches = (file_df['delta'] == generated_df['delta']).all()
print('delta column IS VALID' if delta_matches else 'delta column IS NOT VALID')

frequency_matches = (file_df['frequency'] == generated_df['frequency']).all()
print('frequency column IS VALID' if frequency_matches else 'frequency column IS NOT VALID')

delta column IS VALID
frequency column IS VALID


Due to floating-point representation, use the maximum absolute difference of the float columns to validate the results:

In [67]:
# Largest absolute element-wise deviation between the generated table and the
# file, one value per floating-point column.
delta_log2_diff = generated_df['delta_log2'].sub(file_df['delta_log2']).abs().max()
density_diff = generated_df['density'].sub(file_df['density']).abs().max()
cumulative_density_diff = (
    generated_df['cumulative_density'].sub(file_df['cumulative_density']).abs().max()
)

In [68]:
# Report the worst-case deviation found for each floating-point column.
print(f'Max difference in "delta_log2" column: {delta_log2_diff}')
print(f'Max difference in "density" column: {density_diff}')
print(f'Max difference in "cumulative_density" column: {cumulative_density_diff}')


Max difference in "delta_log2" column: 2.6645352591003757e-15
Max difference in "density" column: 9.93129189996722e-17
Max difference in "cumulative_density" column: 2.5424107263916085e-14
