In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import matplotlib.cm as cm
import numpy as np

In [None]:
# Load the tab-separated mempool export into a DataFrame.
mempool = pd.read_csv('new_pool.csv', sep='\t')

# Trim leading/trailing whitespace from every column name so columns can be
# addressed by their bare names in later cells.
mempool.columns = [label.strip() for label in mempool.columns]

# Show the cleaned column index as this cell's output.
mempool.columns

In [None]:
# Classify a transaction's order flow from its status and pending time.
def determine_order_flow(row):
    """Label a transaction row as 'private', 'public' or 'others'.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name, e.g. the Series passed by
        ``DataFrame.apply(..., axis=1)``. ``row['status']`` is always read;
        ``row['timepending']`` is read only for confirmed rows.

    Returns
    -------
    str
        * ``'private'`` - status is 'confirmed' and ``timepending`` is 0 or
          missing (NaN / None / pd.NA).
        * ``'public'``  - status is 'confirmed' and ``timepending`` is > 0.
        * ``'others'``  - every other case (status is not 'confirmed', or
          ``timepending`` is negative).
    """
    # Rows that are not confirmed are always 'others'. Deciding this first
    # keeps a missing pending time from reaching the numeric comparisons.
    if row['status'] != 'confirmed':
        return 'others'

    pending = row['timepending']
    # Test for missing values before comparing: `pd.NA == 0` evaluates to NA,
    # which cannot be used as a condition, and `None > 0` raises TypeError.
    if pd.isna(pending) or pending == 0:
        return 'private'
    if pending > 0:
        return 'public'
    return 'others'

In [None]:
def plot_scatter_colors(df):
    """Scatter per-block private gas used against block number, coloured by inclusion time.

    Reads three columns of ``df``: ``curblocknumber`` (x axis),
    ``gasused_private`` (y axis) and ``next_block_inclusion_time`` (point
    colour, 'cool' colormap). No outlier filtering is applied here;
    ``plot_scatter_colors_outliers`` is the IQR-filtered variant.

    Draws through pyplot on the current figure, resizes it to 15x5 inches,
    shows it, and returns None.
    """
    points = plt.scatter(
        df['curblocknumber'],
        df['gasused_private'],
        c=df['next_block_inclusion_time'],
        cmap='cool',
    )
    axes = plt.gca()
    axes.set_title('Relationship between Gas Used in a Block in Private O.F. and Varieties of Inclusion Time')
    axes.set_xlabel('Block number')
    axes.set_ylabel('Gas Used in Private Order Flow at a Block')
    # Pass the scatter explicitly instead of relying on pyplot's "current image".
    plt.colorbar(points, label='Inclusion Time')
    plt.gcf().set_size_inches(15, 5)
    plt.grid(True)
    plt.show()

def plot_scatter_colors_outliers(df):
    """Same plot as ``plot_scatter_colors``, after dropping ``gasused_private`` outliers.

    Rows are kept only when ``gasused_private`` lies inside the closed range
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; rows where it is NaN fail the
    comparison and are dropped as well. The remaining rows are plotted as
    ``curblocknumber`` (x) vs ``gasused_private`` (y), coloured by
    ``next_block_inclusion_time``. Shows the figure and returns None.
    """
    gas = df['gasused_private']

    # Tukey fences built from the first and third quartiles.
    lower_q = gas.quantile(0.25)
    upper_q = gas.quantile(0.75)
    spread = upper_q - lower_q
    inliers = df[gas.between(lower_q - 1.5 * spread, upper_q + 1.5 * spread)]

    points = plt.scatter(
        inliers['curblocknumber'],
        inliers['gasused_private'],
        c=inliers['next_block_inclusion_time'],
        cmap='cool',
    )
    axes = plt.gca()
    axes.set_title('Relationship between Gas Used in a Block in Private O.F. and Varieties of Inclusion Time')
    axes.set_xlabel('Block number')
    axes.set_ylabel('Gas Used in Private Order Flow at a Block')
    # Pass the scatter explicitly instead of relying on pyplot's "current image".
    plt.colorbar(points, label='Inclusion Time')
    plt.gcf().set_size_inches(15, 5)
    plt.grid(True)
    plt.show()

def plot_scatter_inclusion_vs_gasprice(df):
    """Scatter inclusion time against per-block private gas used.

    Despite the function name, the x axis is ``gasused_private`` (gas used),
    not a gas price; the y axis is ``next_block_inclusion_time``. No outlier
    filtering is applied. Draws through pyplot on the current figure, resizes
    it to 15x5 inches, shows it, and returns None.
    """
    plt.scatter(df['gasused_private'], df['next_block_inclusion_time'])
    axes = plt.gca()
    axes.set_title('Relationship between Gas Used per Block by Private O.F vs. Varieties of Inclusion Time')
    axes.set_xlabel('Gas Used By Block in Private O.F')
    axes.set_ylabel('Inclusion Time')
    plt.gcf().set_size_inches(15, 5)
    plt.grid(True)
    plt.show()

def plot_scatter_inclusion_vs_gasprice_outliers(df):
    """Same plot as ``plot_scatter_inclusion_vs_gasprice``, after dropping ``gasused_private`` outliers.

    Rows are kept only when ``gasused_private`` lies inside the closed range
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]; rows where it is NaN fail the
    comparison and are dropped as well. Only the x-axis column is filtered;
    ``next_block_inclusion_time`` is plotted as-is. Shows the figure and
    returns None.
    """
    gas = df['gasused_private']

    # Tukey fences built from the first and third quartiles.
    lower_q = gas.quantile(0.25)
    upper_q = gas.quantile(0.75)
    spread = upper_q - lower_q
    inliers = df[gas.between(lower_q - 1.5 * spread, upper_q + 1.5 * spread)]

    plt.scatter(inliers['gasused_private'], inliers['next_block_inclusion_time'])
    axes = plt.gca()
    axes.set_title('Relationship between Gas Used per Block by Private O.F vs. Varieties of Inclusion Time')
    axes.set_xlabel('Gas Used By Block in Private O.F')
    axes.set_ylabel('Inclusion Time')
    plt.gcf().set_size_inches(15, 5)
    plt.grid(True)
    plt.show()


In [None]:
# Keep only rows whose region is 'us-east-1' and whose status is 'confirmed'.
# .copy() gives an independent frame so the column assignments below do not
# write into a slice of `mempool`.
confirmed_df = mempool[(mempool['region'] == 'us-east-1') & (mempool['status'] == 'confirmed')].copy()
confirmed_df['order_flow'] = confirmed_df.apply(determine_order_flow, axis=1)

# Sum of 'gasused' over the 'private' rows of each 'curblocknumber'.
# The transform result is index-aligned on assignment, so rows whose
# order_flow is not 'private' receive NaN in 'gasused_private'.
confirmed_df['gasused_private'] = confirmed_df[confirmed_df['order_flow'] == 'private'].groupby('curblocknumber')['gasused'].transform('sum')

# Per-block median of 'timepending'; NaN values are skipped by the aggregation.
blocknumber_medians = confirmed_df.groupby('curblocknumber')['timepending'].median()
# rolling(window=5) is a trailing window over consecutive rows of the per-block
# series (the block numbers present in the data, gaps are not filled in).
# shift(-4) moves each result four rows earlier, so a block receives the
# rolling median over itself and the next four rows; the last four blocks
# therefore end up NaN.
next_block_inclusion_time = blocknumber_medians.rolling(window=5, min_periods=1).median().shift(-4)
# Map the per-block values back onto every transaction row.
confirmed_df['next_block_inclusion_time'] = confirmed_df['curblocknumber'].map(next_block_inclusion_time)

### SAMPLE CHECK ###########################
# Hand-checkable toy data (including NaN values) exercising the same
# group -> rolling -> shift -> map pattern. Note that it uses mean with a
# 2-row window and shift(-1), not the median / 5-row window / shift(-4) above.
# Every name here carries a `_sample` suffix so the real results computed
# above are not overwritten.
confirmed_df_sample = pd.DataFrame({
    'curblocknumber': [1, 1, 1, 1, 2, 2, 2, 2, 3, 4, 5, 6, 6, 7, 7, 8, 8, 9, 9],
    'timepending': [5, 10, 15, 20, 50, 100, 150, 200, 7, 8, 9, np.nan, 12, 13, 14, 16, 18, np.nan, np.nan]
})
# Per-block mean; NaN values are skipped, and the all-NaN block 9 yields NaN.
blocknumber_means_sample = confirmed_df_sample.groupby('curblocknumber')['timepending'].mean()
next_block_inclusion_time_sample = blocknumber_means_sample.rolling(window=2, min_periods=1).mean().shift(-1)
# Map the computed values to the sample DataFrame
confirmed_df_sample['next_block_inclusion_time'] = confirmed_df_sample['curblocknumber'].map(next_block_inclusion_time_sample)
# Expected values worked out by hand. Block means for blocks 1..9 are
# [12.5, 125, 7, 8, 9, 12, 13.5, 17, NaN]; each block then gets the mean of
# itself and the following block, and the final block has no successor.
np.testing.assert_allclose(
    next_block_inclusion_time_sample.to_numpy(),
    [68.75, 66.0, 7.5, 8.5, 10.5, 12.75, 15.25, 17.0, np.nan],
)
### END OF SAMPLE CHECK ####################

# Last filter before plotting (disabled):
# NOTE(review): no column named 'running_n_block_avg_inclusion_time' is created
# in this cell - confirm where it comes from before re-enabling.
# confirmed_df = confirmed_df[confirmed_df['running_n_block_avg_inclusion_time'] < 2.5]



In [None]:
# Block number vs. private gas used, coloured by inclusion time, with
# gasused_private outliers (outside 1.5 * IQR) removed.
plot_scatter_colors_outliers(confirmed_df)


In [None]:
# Private gas used vs. inclusion time, with gasused_private outliers
# (outside 1.5 * IQR) removed.
plot_scatter_inclusion_vs_gasprice_outliers(confirmed_df)
