In [None]:
import copy
import glob
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyscipopt
import seaborn as sns
from IPython.display import display
from tqdm import tqdm

In [None]:
# N.B. Need to unzip .mps.gz* files to make them .mps files

# Root folder holding one sub-folder of instances per data set.
INSTANCES_DIR = '/scratch/datasets/retro_branching/ml4co/instances/'

# set paths to folders with .mps files
# (a data set whose path is None is skipped when the .mps files are listed)
paths = {
    'item_placement': INSTANCES_DIR + '1_item_placement/train/',
    'load_balancing': INSTANCES_DIR + '2_load_balancing/train/',
    'anonymous': INSTANCES_DIR + '3_anonymous/train/',
}

In [None]:
# get .mps file names (data sets whose path is None are skipped)
filenames = {}
for dataset, path in paths.items():
    if path is not None:
        # os.path.join works whether or not the folder path ends with a separator.
        # glob returns matches in arbitrary order, so sort them to keep the
        # instance order (and anything indexed by position later) reproducible.
        filenames[dataset] = sorted(glob.glob(os.path.join(path, '*.mps')))
        print('{} #mps files: {}'.format(path, len(filenames[dataset])))

In [None]:
# characterise

def characterise_pyscipopt_instance(m):
    """Summarise the size of a problem instance.

    Args:
        m: Either an already-loaded model object exposing ``getProbName``,
            ``getNVars``, ``getNConss``, ``getNIntVars`` and ``getNBinVars``
            (e.g. a ``pyscipopt.Model``), or a path (``str`` or
            ``os.PathLike``) to a problem file, which is read into a new
            ``pyscipopt.Model`` via ``readProblem``.

    Returns:
        dict with keys ``'filename'`` (the value of ``m.getProbName()``),
        ``'num_vars'``, ``'num_constraints'``, ``'num_int_vars'`` and
        ``'num_binary_vars'`` (the values of ``getNVars``, ``getNConss``,
        ``getNIntVars`` and ``getNBinVars`` respectively).
    """
    if isinstance(m, (str, os.PathLike)):
        # m is a file path: load it into a fresh model before querying it
        filename = os.fspath(m)
        m = pyscipopt.Model()
        m.readProblem(filename)
    return {
        'filename': m.getProbName(),
        'num_vars': m.getNVars(),
        'num_constraints': m.getNConss(),
        'num_int_vars': m.getNIntVars(),
        'num_binary_vars': m.getNBinVars(),
    }
    
# Per data set, gather each characteristic into a list with one entry per instance.
summary_dict = {}
for dataset, dataset_files in filenames.items():
    per_key_values = defaultdict(list)
    summary_dict[dataset] = per_key_values
    # iterating the tqdm wrapper advances the bar by one per file and closes it at the end
    pbar = tqdm(dataset_files, total=len(dataset_files), desc=dataset, leave=True)
    for file in pbar:
        summary = characterise_pyscipopt_instance(file)
        for key, value in summary.items():
            per_key_values[key].append(value)

In [None]:
# Show compact per-data-set statistics instead of printing the raw dict, which
# holds one entry per instance per characteristic and floods the cell output.
for dataset, dataset_summary in summary_dict.items():
    print(dataset)
    if len(dataset_summary) == 0:
        # no instance was characterised for this data set, so there is nothing to describe
        print('  no instances characterised')
        continue
    display(pd.DataFrame(dataset_summary).describe())

In [None]:
sns.set_context(context='paper') # paper notebook talk poster
sns.set_style("whitegrid")

# One histogram per (data set, characteristic) pair.
for dataset, dataset_summary in summary_dict.items():
    print(dataset)
    # build the frame once per data set rather than once per plotted column
    summary_df = pd.DataFrame(dataset_summary)
    for key in dataset_summary.keys():
        if key == 'filename' or len(dataset_summary[key]) == 0:
            continue
        fig, ax = plt.subplots()
        sns.histplot(data=summary_df, x=key, ax=ax)
        ax.set_title(dataset)
        # despine must run on the axes before the figure is shown; calling it
        # after plt.show() only touches a new, empty figure
        sns.despine(ax=ax)
        plt.show()
        # release the figure so many plots do not accumulate in memory
        plt.close(fig)

In [None]:
# print table (assuming from above plots that all instances in each data set have same characteristics)
# NOTE: only the first instance's values are reported per data set; the assumption above is not checked here.
table_datasets = [dataset for dataset in summary_dict.keys()
                  if dataset != 'anonymous'] # anonymous dataset instances do not all have same characteristics

# Collect the characteristic columns up front so that every row gets a value for
# every column; otherwise a data set with no instances leaves the columns with
# unequal lengths and the DataFrame construction fails.
characteristic_keys = []
for dataset in table_datasets:
    for key in summary_dict[dataset].keys():
        if key != 'filename' and key not in characteristic_keys:
            characteristic_keys.append(key)

table_dict = defaultdict(list)
for dataset in table_datasets:
    dataset_summary = summary_dict[dataset]
    table_dict['dataset'].append(dataset)
    # .get avoids inserting an empty 'filename' entry into the defaultdict as a side effect
    table_dict['num_instances'].append(len(dataset_summary.get('filename', [])))
    for key in characteristic_keys:
        values = dataset_summary.get(key, [])
        table_dict[key].append(values[0] if len(values) != 0 else 0)
display(pd.DataFrame(table_dict))