In [1]:
import pandas as pd
import plotly.express as px
from pymongo import MongoClient
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from dln import (
    true_dln_learning_coefficient
)

# Querying data from DB and parsing

In [112]:

# Connect to the local MongoDB instance and select the collection of runs.
client = MongoClient('mongodb://localhost:27017/')
db = client['dln_lambdahat']
# db = client['dln_lambdahat_dev']
collection = db['runs']

# Experiment to analyse. Earlier experiments are kept (commented out) as a
# record; exactly one EXPT_NAME assignment should be active.
# EXPT_NAME = "dev"
# EXPT_NAME = "random_sv_20231213"
# EXPT_NAME = "batch500_width10-100_layer2-20_sv_202312130129"
# EXPT_NAME = "batch500_width10-100_layer2-10_sv_202312131012"
# EXPT_NAME = "batch500_width10-100_layer2-10_sv_202312131121"
# EXPT_NAME = "batch500_width2-10_layer5_sv_202312131342" # fix layer=5, vary widths 2-20
# EXPT_NAME = "batch500_width2-10_layer5_sv_202312131424" # fix widths=5, vary layer=2-20
# EXPT_NAME = "zerotruth_batch500_width10-100_layer2-20_sv_202312132339"

# EXPT_NAME = "randsv_batch500_width10-50_layer2-15_sv_202312141156"
# EXPT_NAME = "randsv_batch500_width10-50_layer2-15_sv_202312141203"

# EXPT_NAME = "randsv_batch500_width10-50_layer2-15_withtraining_202312150141"
# EXPT_NAME = "zero_batch500_width10-50_layer2-15_withtraining_202312150205"

# EXPT_NAME = "randsv_batch500_width2-5_layer2-5_notraining_202312152208"
EXPT_NAME = "zero_batch500_width2-5_layer2-5_notraining_funcrank_202312152247"
# EXPT_NAME = "randsv_batch500_width2-5_layer2-5_notraining_funcrank_202312162227"
# EXPT_NAME = "randsv_batch500_width15_layer2-5_notraining_funcrank_202312162325"
# EXPT_NAME = "randsv_batch500_width15_layer2-5_notraining_funcrank_202312180036"


def _first_nan_index(trace):
    """Return the position of the first NaN in `trace`, or None if there is none."""
    nan_positions = np.flatnonzero(np.isnan(trace))
    return int(nan_positions[0]) if nan_positions.size else None


def _consecutive_width_products(input_dim, layer_widths):
    """Sum widths[k] * widths[k + 1] over the chain [input_dim, *layer_widths]."""
    widths = [input_dim] + list(layer_widths)
    return int(sum(n_in * n_out for n_in, n_out in zip(widths[:-1], widths[1:])))


# Restrict to runs of the selected experiment whose status is COMPLETED.
query = {
    "config.expt_name": EXPT_NAME, 
    "status": "COMPLETED", 
}
# Map nested config/info fields of each run document to flat column names.
# Fields that a run did not record may be absent from the result, which is why
# some columns are re-derived below when missing.
projection = {
    "epsilon": "$config.sgld_config.epsilon", 
    "num_steps": "$config.sgld_config.num_steps",
    "input_dim": "$config.input_dim",
    "layer_widths": "$config.layer_widths",
    "lambdahat": "$info.lambdahat",
    "true_lambda": "$info.true_lambda",
    "true_rank": "$info.true_rank",
    "loss_trace": "$info.loss_trace",
    "init_loss": "$info.init_loss",
    "true_multiplicity": "$info.true_multiplicity",
    "lambdahat_trained": "$info.trained_param_info.lambdahat",
    "model_dim": "$info.model_dim",
    "functional_rank": "$info.functional_rank_info.functional_rank"
}
# One full (unprojected) run document; later cells read its config and traces.
sample_doc = collection.find_one(query)


# Create DataFrame
print("Creating dataframe...")
df_data = pd.DataFrame(list(collection.find(query, projection)))

# Fail early with a clear message instead of a KeyError on the first column access.
if df_data.empty:
    raise ValueError(
        f"No COMPLETED runs found for config.expt_name={EXPT_NAME!r}; "
        "check EXPT_NAME and the selected database."
    )

# Per-run summaries of the architecture and of where the loss trace first went NaN.
df_data["avg_width"] = [np.mean(x) for x in df_data["layer_widths"]]
df_data["num_layers"] = [len(x) for x in df_data["layer_widths"]]
df_data["first_nan_index"] = [_first_nan_index(trace) for trace in df_data["loss_trace"]]

# Derive the parameter count from the architecture when the runs did not store it.
if "model_dim" not in df_data.columns:
    df_data["model_dim"] = [
        _consecutive_width_products(input_dim, layer_widths)
        for input_dim, layer_widths in zip(df_data["input_dim"], df_data["layer_widths"])
    ]

# Derive the multiplicity (second element returned by the helper) when not stored.
if "true_multiplicity" not in df_data.columns:
    print("computing multiplicty... ")
    df_data["true_multiplicity"] = [
        true_dln_learning_coefficient(row["true_rank"], row["layer_widths"], row["input_dim"])[1]
        for _, row in df_data.iterrows()
    ]

print(f"Dataframe shape: {df_data.shape}")
df_data.head()

Creating dataframe...
Dataframe shape: (100, 16)


Unnamed: 0,_id,epsilon,num_steps,input_dim,layer_widths,lambdahat,true_lambda,true_rank,loss_trace,init_loss,true_multiplicity,model_dim,functional_rank,avg_width,num_layers,first_nan_index
0,1583,5e-06,10000,17,"[8, 7, 14]",19.962383,28.0,0,"[4.249676999190655e-12, 2.7847899333743875e-11...",0.0,2.0,290,0,9.666667,3,
1,1584,5e-06,10000,3,"[16, 15]",19.641167,22.0,0,"[3.611976140405204e-08, 1.5706770284396043e-07...",0.0,1.0,288,0,15.5,2,
2,1585,5e-06,10000,17,"[12, 10, 8]",24.961357,35.5,0,"[1.1483021998548448e-11, 5.944719022599187e-11...",0.0,1.0,404,0,10.0,3,
3,1586,5e-06,10000,6,"[6, 8]",12.749203,16.0,0,"[2.2979822489332946e-08, 1.0878977718675742e-0...",0.0,1.0,84,0,7.0,2,
4,1587,5e-06,10000,15,"[4, 2]",3.785163,4.0,0,"[1.0341667433522161e-07, 2.8220958370184235e-0...",0.0,1.0,68,0,3.0,2,


---
# Plotting
---

## Some diagnostics

In [113]:
# Count runs whose lambdahat is missing; .isna() also copes with None in an
# object-dtype column, where np.isnan would raise.
nan_count = df_data["lambdahat"].isna().value_counts()
print(nan_count)
if True in nan_count.index:
    # first_nan_index holds None for traces without NaNs, so the column can be
    # object dtype; coerce to numbers and drop the missing entries before plotting.
    first_nan = pd.to_numeric(df_data["first_nan_index"], errors="coerce").dropna()
    if first_nan.empty:
        print("lambdahat contains NaN, but no loss trace contains a NaN.")
    else:
        fig, ax = plt.subplots(1, 1, figsize=(8, 4))
        first_nan.plot(kind="hist", ax=ax)
        ax.set_title("First NaN index");

lambdahat
False    100
Name: count, dtype: int64


In [114]:
# Histogram of the parameter count (model_dim) over all queried runs.
param_count_hist = go.Histogram(x=df_data["model_dim"], nbinsx=20)
fig = go.Figure(data=[param_count_hist])
fig.update_layout(title="Model Parameter Count")

## $\lambda$ vs $\hat{\lambda}$

In [115]:
df = df_data
s = 8  # marker size shared by every scatter trace

# Each entry is (legend name, y-values, fixed marker colour or None), listed in
# drawing order. Every series is plotted against true_lambda on the x-axis.
metric_traces = [
    ("lambdahat", df["lambdahat"], None),
    ("true_rank", df["true_rank"], None),
    ("model dimension / 2", df["model_dim"] / 2, "green"),
    ("multiplicity", df["true_multiplicity"], "purple"),
]
# These two columns are only plotted when the dataframe actually has them.
if "lambdahat_trained" in df.columns:
    metric_traces.append(("lambdahat_trained", df["lambdahat_trained"], None))
if "functional_rank" in df.columns:
    metric_traces.append(("functional_rank / 2", df["functional_rank"] / 2, None))

fig = go.Figure()
for trace_name, y_values, marker_color in metric_traces:
    marker = dict(size=s)
    if marker_color is not None:
        marker["color"] = marker_color
    fig.add_trace(go.Scatter(
        x=df["true_lambda"],
        y=y_values,
        mode="markers",
        marker=marker,
        name=trace_name,
    ))

# Add the y=x line: a point on it has a y-value equal to true_lambda.
sorted_true_lambda = sorted(df["true_lambda"])
identity_line = go.Scatter(
    x=sorted_true_lambda,
    y=sorted_true_lambda,
    mode="lines",
    line=dict(dash="dash", color="red", width=2),
    name="true_lambda",
)
fig.add_trace(identity_line)

fig.update_layout(
    title="Comparison of Metrics",
    xaxis_title="True lambda",
    yaxis_title="Metrics",
    legend_title="Legend",
    width=800,
    height=800,
)


In [17]:
df = df_data

# functional_rank is not present for every experiment (the comparison plot in
# this notebook guards on it too), so fall back to an uncoloured scatter
# instead of letting px.scatter raise on a missing column.
# Other colourings that have been used here: "true_multiplicity", "avg_width",
# "model_dim", "num_layers"; alternative marker size: "true_rank".
color_column = "functional_rank" if "functional_rank" in df.columns else None
if color_column is None:
    print("Column 'functional_rank' not found; plotting without colour.")

fig = px.scatter(
    df, 
    x="true_lambda", 
    y="lambdahat", 
    color=color_column,
    size="num_layers",
    labels={
        "true_lambda": "True lambda",
        "lambdahat": "lambdahat",
        "model_dim": "Model Dimension",
        "num_layers": "Number of Layers",
        "true_rank": "True Rank",
        "functional_rank": "Functional Rank",
    },
    title="Scatter Plot of True Lambda vs Lambdahat"
)

# Add the y=x line, reusing the trace that px.line builds so it keeps
# Plotly Express' default line styling.
sorted_true_lambda = sorted(df["true_lambda"])
fig.add_trace(
    px.line(
        x=sorted_true_lambda, 
        y=sorted_true_lambda
    ).data[0]
)

# Update the layout
fig.update_layout(
    xaxis_title="True lambda",
    yaxis_title="Lambdahat",
    legend_title="Legend",
    width=800,
    height=800
)


## Loss trace

In [66]:
# Plot the loss trace of the sample run and, when recorded, its SGLD distances
# on a secondary y-axis.
sample_loss_trace = sample_doc["info"]["loss_trace"]
# Not every run is guaranteed to store distances; None means "not recorded".
distances = sample_doc["info"].get("sgld_distances")
nsamples = len(sample_loss_trace)

fig = go.Figure()

# Add trace for loss_trace (left axis)
fig.add_trace(
    go.Scatter(
        x=list(range(nsamples)),
        y=sample_loss_trace,
        name="Loss Trace",
        yaxis="y1"
    )
)

# Add trace for distances (right axis). Its x-values come from its own length
# so a series of a different length than the loss trace is not misaligned.
if distances is not None:
    fig.add_trace(
        go.Scatter(
            x=list(range(len(distances))),
            y=distances,
            name="Distances",
            yaxis="y2"
        )
    )
else:
    print("No 'sgld_distances' recorded for the sample run; plotting the loss trace only.")

# Set layout for the graph: two independent y-axes sharing one x-axis.
fig.update_layout(
    title="Loss Trace and Distances",
    xaxis_title="Sample Index",
    yaxis=dict(
        title="Loss Trace",
        side="left",
        showgrid=False,
        zeroline=False
    ),
    yaxis2=dict(
        title="Distances",
        side="right",
        overlaying="y",
        showgrid=False,
        zeroline=False
    )
)



## Effects of `sgld_config.num_steps`

In [106]:
# Recompute lambdahat from growing prefixes of each run's loss trace to see how
# the estimate changes with the number of SGLD steps used.
s = 500  # spacing between successive prefix lengths
sgld_num_steps = sample_doc["config"]["sgld_config"]["num_steps"]
n = sample_doc["config"]["num_training_data"]
data = []

# df = df_data[~np.isnan(df_data["lambdahat"])]
df = df_data

for _, row in df.iterrows():
    expt_id = row["_id"]
    trace = row["loss_trace"]
    init_loss = row["init_loss"]
    true_lambda = row["true_lambda"]
    # Never ask for more steps than the trace holds: trace[:nstep] would
    # silently saturate and emit duplicate points labelled with step counts
    # that were never sampled.
    max_nstep = min(sgld_num_steps, len(trace))
    for nstep in range(s, max_nstep + 1, s):
        # Estimate from the first `nstep` samples:
        # (mean loss - initial loss) * n / log(n).
        nstep_lambdhat = (np.mean(trace[:nstep]) - init_loss) * n / np.log(n)
        data.append([expt_id, nstep, nstep_lambdhat, true_lambda])
df = pd.DataFrame(data, columns=["_id", "nstep", "lambdahat", "true_lambda"])

fig = px.scatter(
    df, 
    x="true_lambda", 
    y="lambdahat", 
    color="nstep",
    labels={
        "true_lambda": "True lambda",
        "lambdahat": "lambdahat",
        "nstep": "Num SGLD step"
    },
    title="True Lambda vs Lambdahat"
)

# Add the y=x line
sorted_true_lambda = sorted(df["true_lambda"])
fig.add_trace(
    px.line(
        x=sorted_true_lambda, 
        y=sorted_true_lambda
    ).data[0]
)

# Update the layout
fig.update_layout(
    xaxis_title="True lambda",
    yaxis_title="Lambdahat",
    legend_title="Legend",
    width=800,
    height=800, 
    # yaxis_type="log", 
    # xaxis_type="log",
)


{'py/object': 'numpy.ndarray',
 'values': 'eJxjYBgFo2AUjIKBBQAEiAAB',
 'shape': [290],
 'dtype': 'float32',
 'byteorder': '<'}