In [59]:
import shutil
import re

import pandas as pd
from tensorboardX import SummaryWriter

In [65]:
class VWOutputParser:
    
    def __init__(self, fname):
        self.fname = fname

    def parse_vw_output(self):
        """Parse the output from `vw` command, return dataframe/dictionnaries with the associated data"""
        table_lst = []
        params = {}
        metrics = {}

        inside_table = False
        after_table = False

        with open(self.fname, "r") as f:
            for row in f:
                if "=" in row:
                    param_name, value = [element.strip() for element in row.split("=")]
                    if after_table:
                        metrics[param_name] = value
                    else:
                        params[param_name] = value
                elif ":" in row:
                    key, value = [element.strip() for element in row.split(":")]
                    params[param_name] = value
                
                elif not after_table:
                    if re.match("average\s+since", row):
                        inside_table = True
                    if row == "\n":
                        inside_table = False
                        after_table = True
                    if inside_table:
                        table_lst += [row.split()]

        df = pd.DataFrame(data=table_lst[2:], columns=[f"{x}_{y}" for x,y in zip(*table_lst[:2])])
        
        self.params = params
        self.df = df
        self.metrics = metrics

    def write_to_tensorboard(self, writer):
        """Write VW parsed data into tensorboard"""
        clean_params = {}
        for (key, value) in self.params.items():
            try:
                out = float(value)
            except:
                out = value
            key = f"hparam/{key}"
            clean_params[key] = out

        clean_metrics = {}
        for (key, value) in self.metrics.items():
            try:
                out = float(value)
            except:
                out = value
            key = key.replace(" ", "_").replace("'", '_')
            key = f"metric/{key}"
            clean_metrics[key] = out

        # Write hparams & metrics into tensorboard
        writer.add_hparams(hparam_dict=clean_params, metric_dict=clean_metrics)

        # Write scalar data into tensorboard
        for _, row in self.df.iterrows():
            for key in row.keys():
                writer.add_scalar(f'data/{key}', float(row[key]), row.example_counter)

# Parsing output files

Assume we have 2 files "out_lr_0.1.out" and "out_lr_0.5.out" that contain the output of `vw` command. We use the class i) parse the VW output and ii) write these parsed output to tensorboard.

In [67]:
shutil.rmtree('runs', ignore_errors=True) # Remove the "runs" folder (if it exists)

for fname in ["out_lr_0.1.out", "out_lr_0.5.out"]:
    with SummaryWriter(f"runs/{fname}") as writer:
        output_parser = VWOutputParser(fname)
        output_parser.parse_vw_output() # Parse the output and store the parsed result in instance attributes
        output_parser.write_to_tensorboard(writer) # Write the parsed result to tensorboard files

We can inspect the parsed output by displaying the attributes `params`, `df` and `metrics` (in this example I use the file "out_lr_0.5.out"):

In [68]:
print("Parameters:")
display(output_parser.params)
print("\n\n")
print("Dataframe:")
display(output_parser.df)
print("\n")

print("Metrics")
display(output_parser.metrics)

Parameters:


{'Num weight bits': '18',
 'learning rate': '0.5',
 'initial_t': '0',
 'power_t': '0.5',
 'Reading datafile': 'boston.txt',
 'num sources': 'gd, scorer'}




Dataframe:


Unnamed: 0,average_loss,since_last,example_counter,example_weight,current_label,current_predict,current_features
0,1.0,1.0,1,1.0,1.0,0.0,15
1,0.5,0.0,2,2.0,1.0,1.0,15
2,0.50493,0.50986,4,4.0,1.0,0.5378,15
3,0.32317,0.14141,8,8.0,1.0,1.0,15
4,0.161594,1.8e-05,16,16.0,1.0,1.0,15
5,0.217465,0.273335,32,32.0,1.0,0.4531,16
6,0.150157,0.08285,64,64.0,0.0,0.026,16
7,0.099326,0.048496,128,128.0,1.0,0.8743,15
8,0.113688,0.128049,256,256.0,0.0,0.1814,15
9,0.098885,0.084081,512,512.0,0.0,0.0,14




Metrics


{'number of examples': '16599',
 'weighted example sum': '16599.000000',
 'weighted label sum': '5130.000000',
 'average loss': '0.085618',
 'best constant': '0.309055',
 "best constant's loss": '0.213540',
 'total feature number': '250958'}

It is the same information as in original output fiule:

In [69]:
with open(fname, "r") as f:
    for line in f:
        print(line)

Num weight bits = 18

learning rate = 0.5

initial_t = 0

power_t = 0.5

using no cache

Reading datafile = boston.txt

num sources = 1

Enabled reductions: gd, scorer

average  since         example        example  current  current  current

loss     last          counter         weight    label  predict features

1.000000 1.000000            1            1.0   1.0000   0.0000       15

0.500000 0.000000            2            2.0   1.0000   1.0000       15

0.504930 0.509860            4            4.0   1.0000   0.5378       15

0.323170 0.141410            8            8.0   1.0000   1.0000       15

0.161594 0.000018           16           16.0   1.0000   1.0000       15

0.217465 0.273335           32           32.0   1.0000   0.4531       16

0.150157 0.082850           64           64.0   0.0000   0.0260       16

0.099326 0.048496          128          128.0   1.0000   0.8743       15

0.113688 0.128049          256          256.0   0.0000   0.1814       15

0.098885 0.084081

# Visualizing on tensorboard

In [70]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [71]:
%tensorboard --logdir runs --host localhost

Reusing TensorBoard on port 6008 (pid 25242), started 0:10:08 ago. (Use '!kill 25242' to kill it.)