In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
def plot_hist(df, col, quantile, title, filename):
    """Plot a histogram of ``df[col]`` and save it as ``{filename}.pdf``.

    Dashed vertical lines mark the mean (red) and the median (orange). Both
    are computed on the full column, before the upper tail is trimmed, so
    they describe all the data even when the plot is cut off at ``quantile``.

    Args:
        df: Object indexable by ``col`` that yields a pandas Series
            (e.g. a DataFrame).
        col: Name of the numeric column to plot.
        quantile: Upper cut-off in [0, 1]. Values above that quantile of the
            column are left out of the histogram. The bound is inclusive, so
            ``quantile=1.0`` plots every value.
        title: Text for the figure title; the column name is appended in
            parentheses.
        filename: Output path without extension; ``.pdf`` is appended.
    """
    plt.figure()
    values = df[col]
    mean = values.mean()
    median = values.median()
    # Inclusive comparison: a strict `<` would drop the maximum at
    # quantile=1.0 and would leave nothing to plot for a constant column.
    shown = values[values <= values.quantile(quantile)]
    # One bin per distinct value, capped at 100. At least one bin is always
    # requested so an empty selection does not ask for zero bins.
    n_bins = max(1, min(100, len(shown.unique())))
    shown.hist(bins=n_bins, label="count", align='left', rwidth=1)
    plt.axvline(mean, color="red", linestyle='dashed', label=f"mean = {mean:.2f}")
    plt.axvline(median, color="orange", linestyle='dashed', label=f"median = {median:.2f}")
    plt.legend()
    plt.title(f"{title} ({col})")
    plt.savefig(f'{filename}.pdf')

In [None]:
def plot_hist_simple_title(df, col, quantile, title, filename):
    """Plot a histogram of ``df[col]`` and save it as ``{filename}.pdf``.

    Dashed vertical lines mark the mean (red) and the median (orange). Both
    are computed on the full column, before the upper tail is trimmed.

    Args:
        df: Object indexable by ``col`` that yields a pandas Series
            (e.g. a DataFrame).
        col: Name of the numeric column to plot.
        quantile: Upper cut-off in [0, 1]. Values above that quantile of the
            column are left out of the histogram. The bound is inclusive, so
            ``quantile=1.0`` plots every value.
        title: Text for the figure title; the column name is appended in
            parentheses.
        filename: Output path without extension; ``.pdf`` is appended.
    """
    plt.figure()
    values = df[col]
    mean = values.mean()
    median = values.median()
    # Inclusive comparison: a strict `<` would drop the maximum at
    # quantile=1.0 and would leave nothing to plot for a constant column.
    shown = values[values <= values.quantile(quantile)]
    # One bin per distinct value, capped at 100. At least one bin is always
    # requested so an empty selection does not ask for zero bins.
    n_bins = max(1, min(100, len(shown.unique())))
    shown.hist(bins=n_bins, label="count", align='left', rwidth=1)
    plt.axvline(mean, color="red", linestyle='dashed', label=f"mean = {mean:.2f}")
    plt.axvline(median, color="orange", linestyle='dashed', label=f"median = {median:.2f}")
    plt.legend()
    plt.title(f"{title} ({col})")
    plt.savefig(f'{filename}.pdf')

In [None]:
def plot_hist_xticks(df, col, quantile, title, filename):
    """Plot a histogram of ``df[col]`` with integer x ticks and save it as a PDF.

    Dashed vertical lines mark the mean (red) and the median (orange). Both
    are computed on the full column, before the upper tail is trimmed. The
    x axis gets one tick per integer from 0 to the largest plotted value.

    Args:
        df: Object indexable by ``col`` that yields a pandas Series
            (e.g. a DataFrame).
        col: Name of the numeric column to plot.
        quantile: Upper cut-off in [0, 1]. Values above that quantile of the
            column are left out of the histogram. The bound is inclusive, so
            ``quantile=1.0`` plots every value.
        title: Text for the figure title; the column name is appended in
            parentheses.
        filename: Output path without extension; ``.pdf`` is appended.
    """
    plt.figure()
    values = df[col]
    mean = values.mean()
    median = values.median()
    # Inclusive comparison: a strict `<` would drop the maximum at
    # quantile=1.0 and would leave nothing to plot for a constant column.
    shown = values[values <= values.quantile(quantile)]
    # One bin per distinct value, capped at 100. At least one bin is always
    # requested so an empty selection does not ask for zero bins.
    n_bins = max(1, min(100, len(shown.unique())))
    shown.hist(bins=n_bins, label="count", align='left', rwidth=1)
    if not shown.empty:
        # An empty selection has a NaN maximum, which np.arange rejects.
        plt.xticks(np.arange(0, shown.max() + 1, 1))
    plt.axvline(mean, color="red", linestyle='dashed', label=f"mean = {mean:.2f}")
    plt.axvline(median, color="orange", linestyle='dashed', label=f"median = {median:.2f}")
    plt.legend()
    plt.title(f"{title} ({col})")
    plt.savefig(f'{filename}.pdf')

# Number of tactic tokens by proof

In [None]:
# Load the two per-proof tables from CSV files in the working directory.
# NOTE(review): judging by the file names these hold per-proof token counts
# and TF-IDF weights; confirm against whatever produces the CSVs.
counts_by_proof = pd.read_csv("counts_proofs.csv")
tfidf_by_proof = pd.read_csv("tfidf_proofs.csv")

In [None]:
# Report how many rows were loaded, then preview the first few of them.
print(counts_by_proof.shape[0])
counts_by_proof.head()

In [None]:
# Names of the first five columns. The cell that builds 'num_tokens' drops
# these before summing, so they are presumably identifier/metadata columns
# rather than token counts (confirm against the CSV schema).
init_cols = list(counts_by_proof.columns[:5])
init_cols

In [None]:
# Total per row = sum over every column except the leading init_cols.
# 'num_tokens' itself is excluded as well, so re-running this cell does not
# fold the previous total into the new one. errors='ignore' covers the first
# run, when the column does not exist yet.
counts_by_proof['num_tokens'] = (
    counts_by_proof
    .drop(columns=init_cols + ['num_tokens'], errors='ignore')
    .sum(axis=1)
)
max(counts_by_proof['num_tokens'])

In [None]:
# Narrow the table to the leading columns plus the per-row token total.
df = counts_by_proof[[*init_cols, 'num_tokens']]
df.head()

In [None]:
# plot_hist expects (df, col, quantile, title, filename). The quantile comes
# before the title, and an output filename is required.
plot_hist(df, 'num_tokens', 0.99, 'tactics by proof', 'hist_num_tactics_by_proof_p99')
plot_hist(df, 'num_tokens', 1.00, 'tactics by proof', 'hist_num_tactics_by_proof_p100')

# Number of tactic tokens by proof step

In [None]:
# Load the per-proof-step table from a CSV file in the working directory.
# The plotting cell below reads its 'num_tactics' column.
counts_by_pstep = pd.read_csv('counts_proof_steps.csv')

In [None]:
# Report how many rows were loaded, then preview the first few of them.
print(counts_by_pstep.shape[0])
counts_by_pstep.head()

In [None]:
# Draw the same histogram twice: once cut at the 99th percentile and once
# over the full range, each saved under its own file name.
for quantile, filename in (
    (0.99, 'hist_num_tactics_by_proof_step_p99'),
    (1.00, 'hist_num_tactics_by_proof_step_p100'),
):
    plot_hist_xticks(counts_by_pstep, 'num_tactics', quantile, 'number of tactics by proof step', filename)

# Number of terms by proof step

In [None]:
import pickle

In [None]:
# Load the pickled per-proof-step statistics from the parent directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only run this on a file produced by a trusted pipeline.
with open('../proof_step_stats.pkl', 'rb') as f:
    proof_step_stats = pickle.load(f)

In [None]:
# n_terms is the element-wise sum of the 'n_env' and 'n_lc' columns.
# NOTE(review): presumably environment terms plus local-context terms;
# confirm against the code that writes the pickle.
proof_step_stats['n_terms'] = proof_step_stats['n_env'].add(proof_step_stats['n_lc'])
proof_step_stats.head()

In [None]:
# Draw the same histogram twice: once cut at the 99th percentile and once
# over the full range, each saved under its own file name.
for quantile, filename in (
    (0.99, 'hist_num_terms_by_proof_step_p99'),
    (1.00, 'hist_num_terms_by_proof_step_p100'),
):
    plot_hist_simple_title(proof_step_stats, 'n_terms', quantile, 'number of terms by proof step', filename)