# Baselines for Bayes Minimum Risk Pruning

In [1]:
# here we comparet the performance of 
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, recall_score,\
accuracy_score, f1_score, classification_report, confusion_matrix, roc_auc_score
import numpy as np
from joblib import Parallel, delayed

In [2]:
from c45 import C45

In [3]:
mainDir = "./ahmedDatasets/"

### Zoo Data

In [4]:
zooDir = mainDir + "Zoo/zoo.data"
zooData = pd.read_csv(zooDir,delimiter=",",header=None)
zooData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [5]:
zooTree = C45()
zooTrainX, zooTestX, zooTrainY, zooTestY = train_test_split(zooData[range(1,17)],
                                                            zooData[17],
                                                            test_size=0.4)
zooTree.fit(zooTrainX, zooTrainY)
accuracy_score(zooTree.predict(zooTestX),zooTestY)

0.9024390243902439

In [6]:
zooRoot = zooTree.getTree()

In [12]:
%%time
import copy
import pickle

def makePhishFile(root,data,outCol):
    terminals = []
    d = dict()
    reverseD = dict()

    def isLeaf(node):
        children = [child for child in node]
        return len(children) == 0

    def modifyDict(root,terminals,d,reverseD):
        if (isLeaf(root)):
            d[root] = []
        else:
            children = [child for child in root]
            for child in children:
                if d.get(root) is None:
                    d[root] = [child]
                else:
                    d[root].append(child)
                if reverseD.get(child) is None:
                    reverseD[child] = [root]
                else:
                    reverseD[child].append(root)
                if (not isLeaf(child)):
                    modifyDict(child,terminals,d,reverseD)
                else:
                    modifyDict(child,
                               terminals.append(child),
                               d,
                               reverseD)

    def populateDict(mainRoot,d,reverseD,terminals,df,output):
        root = copy.deepcopy(mainRoot)
        top = root
        modifyDict(root,terminals,d,reverseD)
        return np.array([d, 
                         top, 
                         reverseD, 
                         terminals,output,
                         df], dtype=object)

    values = populateDict(root,d,reverseD,terminals,data, outCol)

    pickle.dump(values, open('uciPhishGraph.p','wb'))

makePhishFile(zooRoot,zooData,17)

NameError: name 'zooRoot' is not defined

In [12]:
%%time
import sys
import pickle
import time

import numpy as np
from functools import reduce

'''
Takes in a dictionary representation of node: [list of node's neighbors],
Each node has only 1 parent.

'''

def getCategoryProportions(colData):
    categoryDict = dict()
    for i in range(len(colData)):
        if (categoryDict.get(colData[i]) is None):
            categoryDict[colData[i]] = [[i],1]
        else:
            indices = categoryDict[colData[i]][0]
            numPerClass = categoryDict[colData[i]][1]
            categoryDict[colData[i]] = [indices+[i],numPerClass+1]
    # classDict is a dictionary containing categories as keys
    # and the values are two-element lists where the first element is
    # the list of indices where that category exists and the second
    # element is the proportion of the category in the column of data
    outputClasses = []
    for key in categoryDict:
        outputClasses.append(key)
        categoryDict[key] = [categoryDict[key][0],
                     float(categoryDict[key][1])/len(colData)]
    return (categoryDict, outputClasses)

class Node:
    def __init__(self, children, splitFeature,
                 splitChildren, output, label = None, depth = 0):
        self.children = children
        self.splitFeature = splitFeature
        self.splitChildren = splitChildren
        self.output = output
        self.label = label
        self.depth = depth

class Graph(object):
    def __init__(self, dagList, parentList, root, terminals,outAttribute,trainData):
        self.root = root
        self.terminals = terminals
        self.dagList = dagList
        self.parentList = parentList
        self.outAttribute = outAttribute
        self.trainData = trainData
    def preprocessNodesBelow(self):
        nodeDict = dict()
        i = 0
        for node in self.dagList:
            lstNodes = []
            self.lookBelow(node,lstNodes)
            nodeDict[node] = set(lstNodes)
            i += 1
        return nodeDict
    def lookBelow(self, node, lstNodes):
        for child in self.dagList[node]:
            lstNodes.append(child)
            self.lookBelow(child, lstNodes)
    # destructively modifies the graph
    # we prune the graph over all training examples
    # when "turning" a parent into a leaf, the dagList[parent] = []
    def runPruning(self,par=True):
        def modifyDict(node,catProps):
            if len(node.output) != 0:
                catProps[node] = getCategoryProportions(node.output)
            else:
                catProps[node] = None,None
        parents = self.parentList
        currTerminals = set(self.terminals)
        nodeBelowDict = self.preprocessNodesBelow()
        self.catProps = dict()
        _ = [modifyDict(node,self.catProps) \
                     for node in self.dagList]
        if par:
            (currTerminals) = \
                self.parPruning(currTerminals,
                                nodeBelowDict)
        else:
            (currGraph, parents, currTerminals) = \
                self.seqPruning(currGraph,
                                currFrontier,
                                parents,
                                currTerminals,
                                nodeBelowDict)
    # leaves should be kept track of.
    def getBayesRisk(self, node):
        risk = 0
        examples = node.output
        for trueClass in examples:
            probDict, outputClasses = self.catProps[node]
            if probDict is None and outputClasses is None:
                return 0
            for _,c in enumerate(outputClasses):
                if trueClass != c:
                    risk += abs(float(trueClass)-float(c))*(probDict[c][1])
        return risk
    # when the root is reached, all leaves are considered.
    def seqPruning(self,currGraph,currFrontier,parents, currTerminals, nodeBelowDict):
        def getRisks(node,leaves):
            return (self.getBayesRisk(node),\
                   sum(map(lambda leaf: self.getBayesRisk(leaf), leaves)))
        def riskFrontier(node, currTerminals, nodeBelowDict):
            leaves = \
                list(nodeBelowDict[node].intersection(currTerminals))
            (parentRisk,leavesRisk) = getRisks(node,leaves)
            if (parentRisk < leavesRisk):
                currGraph[node] = []
                currTerminals = currTerminals.difference(leaves)
                currTerminals.add(node)
        numRound = 0
        while (len(currFrontier) > 1):
            start = time.time()
            _ = [riskFrontier(node, currTerminals, nodeBelowDict) \
                         for node in currFrontier]
            nextFrontier = set()
            # moving onto the next frontier from current frontier
            for node in currFrontier:
                if node == self.root:
                    parent = node
                else:
                    parent = self.parentList[node][0]
                nextFrontier.add(parent)
            currFrontier = nextFrontier
            print("{} & {} & {}".format(numRound,\
                                                len(currFrontier),\
                                                time.time()-start))
            numRound += 1
        return (currGraph, parents, currTerminals)
    def parPruning(self,currTerminals, nodeBelowDict):
        def getRisks(node,leaves):
            return (self.getBayesRisk(node),\
                   sum(map(lambda leaf: self.getBayesRisk(leaf), leaves)))
        def riskFrontier(node, currTerminals, nodeBelowDict):
            leaves = \
                list(nodeBelowDict[node].intersection(currTerminals))
            (parentRisk,leavesRisk) = getRisks(node,leaves)
            if (parentRisk < leavesRisk):
                self.dagList[node] = []
                currTerminals = currTerminals.difference(leaves)
                currTerminals.add(node)
        numRound = 0
        while (len(self.terminals) > 1):
            start = time.time()
            _ = Parallel(n_jobs=50,prefer="threads")(delayed(riskFrontier)(node, currTerminals, nodeBelowDict) \
                         for node in self.terminals)
            nextFrontier = set()
            # moving onto the next frontier from current frontier
            for node in self.terminals:
                if node == self.root:
                    parent = node
                else:
                    parent = self.parentList[node][0]
                nextFrontier.add(parent)
            self.terminals = nextFrontier
            print("{} & {} & {}".format(numRound,\
                                                len(self.terminals),\
                                                time.time()-start))
            numRound += 1
        return currTerminals
    
def addToGraph(currGraph, node, d):
    d[node] = currGraph[node]
    for otherNode in currGraph[node]:
        addToGraph(currGraph, otherNode, d)
        

print("Benchmarking...")
with open('uciPhishGraph.p','rb') as f:
    graphData = pickle.load(f)
dagList,root,parentList,terminals,outAttribute,trainData = graphData
g = Graph(dagList, parentList, root, terminals, outAttribute, trainData)
print("Starting...")
g.runPruning()
currGraph = g.dagList
newGraph = dict()
addToGraph(currGraph, g.root, newGraph)

Benchmarking...
Starting...
0 & 179 & 0.1080167293548584
1 & 141 & 0.11407995223999023
2 & 107 & 0.11007380485534668
3 & 75 & 0.10689806938171387
4 & 52 & 0.1083059310913086
5 & 36 & 0.10647702217102051
6 & 25 & 0.10843300819396973
7 & 18 & 0.1160588264465332
8 & 13 & 0.1087958812713623
9 & 9 & 0.11147522926330566
10 & 6 & 0.1093149185180664
11 & 4 & 0.10693216323852539
12 & 3 & 0.10615205764770508
13 & 1 & 0.1079859733581543
{<__main__.Node object at 0x7fe8eee7ba90>: [<__main__.Node object at 0x7fe8ed6bab00>, <__main__.Node object at 0x7fe8ef53e438>, <__main__.Node object at 0x7fe8ef53ef60>], <__main__.Node object at 0x7fe8f1bf46d8>: [], <__main__.Node object at 0x7fe8ed6bab00>: [<__main__.Node object at 0x7fe8f29a8f60>, <__main__.Node object at 0x7fe8f29a89e8>], <__main__.Node object at 0x7fe8f1bf4ac8>: [<__main__.Node object at 0x7fe8f1bf4160>, <__main__.Node object at 0x7fe8f29b1630>], <__main__.Node object at 0x7fe8f1be8208>: [<__main__.Node object at 0x7fe8f1be8e10>, <__main__.No

In [89]:
  
import sys
import pickle
import time

import numpy as np
from functools import reduce
import multiprocessing as mp
from multiprocessing import Manager

'''
Takes in a dictionary representation of node: [list of node's neighbors],
Each node has only 1 parent.

'''

def getCategoryProportions(colData):
    categoryDict = dict()
    for i in range(len(colData)):
        if (categoryDict.get(colData[i]) is None):
            categoryDict[colData[i]] = [[i],1]
        else:
            indices = categoryDict[colData[i]][0]
            numPerClass = categoryDict[colData[i]][1]
            categoryDict[colData[i]] = [indices+[i],numPerClass+1]
    # classDict is a dictionary containing categories as keys
    # and the values are two-element lists where the first element is
    # the list of indices where that category exists and the second
    # element is the proportion of the category in the column of data
    outputClasses = []
    for key in categoryDict:
        outputClasses.append(key)
        categoryDict[key] = [categoryDict[key][0],
                     float(categoryDict[key][1])/len(colData)]
    return (categoryDict, outputClasses)

class Node:
    def __init__(self, children, splitFeature,
                 splitChildren, output, label = None, depth = 0):
        self.children = children
        self.splitFeature = splitFeature
        self.splitChildren = splitChildren
        self.output = output
        self.label = label
        self.depth = depth
class Graph(object):
    def __init__(self, dagList, parentList, root, terminals,outAttribute,trainData):
        self.root = root
        self.terminals = terminals
        self.dagList = dagList
        self.parentList = parentList
        self.outAttribute = outAttribute
        self.trainData = trainData
    def preprocessNodesBelow(self):
        nodeDict = dict()
        i = 0
        for node in self.dagList:
            lstNodes = []
            self.lookBelow(node,lstNodes)
            nodeDict[node] = set(lstNodes)
            i += 1
        return nodeDict
    def lookBelow(self, node, lstNodes):
        for child in self.dagList[node]:
            lstNodes.append(child)
            self.lookBelow(child, lstNodes)
    # destructively modifies the graph
    # we prune the graph over all training examples
    # when "turning" a parent into a leaf, the dagList[parent] = []
    def runPruning(self,par=True):
        def modifyDict(node,catProps):
            if len(node.attrib) != 0:
                catProps[node] = getCategoryProportions(node.attrib['son_category'])
            else:
                catProps[node] = None,None
        parents = self.parentList
        currTerminals = set(self.terminals)
        nodeBelowDict = self.preprocessNodesBelow()
        self.catProps = dict()
        _ = [modifyDict(node,self.catProps) \
                     for node in self.dagList]
        if par:
            currTerminals = \
                self.parPruning(currTerminals,
                                nodeBelowDict)
        else:
            (currGraph, parents, currTerminals) = \
                self.seqPruning(currGraph,
                                currFrontier,
                                parents,
                                currTerminals,
                                nodeBelowDict)
    # leaves should be kept track of.
    def getBayesRisk(self, node):
        risk = 0
        if len(node.attrib) == 0:
            return sum([self.getBayesRisk(child) for child in node])
        examples = node.attrib['son_category']
        for trueClass in examples:
            probDict, outputClasses = self.catProps[node]
            if probDict is None and outputClasses is None:
                return 0
            for _,c in enumerate(outputClasses):
                if trueClass != c:
                    risk += abs(float(trueClass)-float(c))*(probDict[c][1])
        return risk
    # when the root is reached, all leaves are considered.
    def seqPruning(self,currGraph,currFrontier,parents, currTerminals, nodeBelowDict):
        def getRisks(node,leaves):
            return (self.getBayesRisk(node),\
                   sum(map(lambda leaf: self.getBayesRisk(leaf), leaves)))
        def riskFrontier(node, currTerminals, nodeBelowDict):
            leaves = \
                list(nodeBelowDict[node].intersection(currTerminals))
            (parentRisk,leavesRisk) = getRisks(node,leaves)
            if (parentRisk < leavesRisk):
                currGraph[node] = []
                currTerminals = currTerminals.difference(leaves)
                currTerminals.add(node)
        numRound = 0
        while (len(currFrontier) > 1):
            start = time.time()
            _ = [riskFrontier(node, currTerminals, nodeBelowDict) \
                         for node in currFrontier]
            nextFrontier = set()
            # moving onto the next frontier from current frontier
            for node in currFrontier:
                if node == self.root:
                    parent = node
                else:
                    parent = self.parentList[node][0]
                nextFrontier.add(parent)
            currFrontier = nextFrontier
            print("{} & {} & {}".format(numRound,\
                                                len(currFrontier),\
                                                time.time()-start))
            numRound += 1
        return (currGraph, parents, currTerminals)
    def parPruning(self,currTerminals, nodeBelowDict):
        def getRisks(node,leaves):
            return (self.getBayesRisk(node),\
                   sum(map(lambda leaf: self.getBayesRisk(leaf), leaves)))
        def riskFrontier(node, currTerminals, nodeBelowDict):
            leaves = \
                list(nodeBelowDict[node].intersection(currTerminals))
            (parentRisk,leavesRisk) = getRisks(node,leaves)
            if (parentRisk > leavesRisk):
                self.dagList[node] = []
                currTerminals = currTerminals.difference(leaves)
                currTerminals.add(node)
        numRound = 0
        while (len(currFrontier) > 1):
            start = time.time()
            _ = Parallel(n_jobs=50,prefer="threads")(delayed(riskFrontier)(node, currTerminals, nodeBelowDict) \
                         for node in self.terminals)
            nextFrontier = set()
            # moving onto the next frontier from current frontier
            for node in self.terminals:
                if node == self.root:
                    parent = node
                else:
                    parent = self.parentList[node][0]
                nextFrontier.add(parent)
            self.terminals = nextFrontier
            print("{} & {} & {}".format(numRound,\
                                                len(self.terminals),\
                                                time.time()-start))
            numRound += 1
        return (currGraph, currTerminals)
    
def addToGraph(currGraph, node, d):
    d[node] = currGraph[node]
    for otherNode in currGraph[node]:
        addToGraph(currGraph, otherNode, d)
        
#ray.init()
print("Benchmarking...")
with open('uciPhishGraph.p','rb') as f:
    graphData = pickle.load(f)
dagList,root,parentList,terminals,outAttribute,trainData = graphData
print(dagList)
g = Graph(dagList, parentList, root, terminals, outAttribute, trainData)
print("Starting...")
g.runPruning()
currGraph = g.dagList
newGraph = dict()
addToGraph(currGraph, g.root, newGraph)
print(newGraph)

Traceback (most recent call last):
  File "/Users/dylanc_home/Desktop/2021 Spring/Internships/TalkMeUp/env/lib/python3.7/site-packages/ray/function_manager.py", line 496, in _load_actor_class_from_gcs
    actor_class = pickle.loads(pickled_class)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/managers.py", line 920, in RebuildProxy
    return func(token, serializer, incref=incref, **kwds)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/managers.py", line 770, in __init__
    self._incref()
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/managers.py", line 824, in _incref
    conn = self._Client(self._token.address, authkey=self._authkey)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/connection.py", line 498, in Client
    answer_challenge(c, authkey)
  File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/mult

Benchmarking...
{<Element 'DecisionTree' at 0x7f97a3b2ccc8>: [<Element 'attr1' at 0x7f97a3b2cd68>, <Element 'attr1' at 0x7f97a3b2cbd8>], <Element 'attr1' at 0x7f97a3b2cd68>: [<Element 'attr3' at 0x7f97a3b2cdb8>, <Element 'attr3' at 0x7f97a3b2c8b8>], <Element 'attr3' at 0x7f97a3b2cdb8>: [<Element 'attr7' at 0x7f97a3b2ce08>, <Element 'attr7' at 0x7f97a3b2ce58>], <Element 'attr7' at 0x7f97a3b2ce08>: [<Element 'attr4' at 0x7f97a3b2cea8>, <Element 'attr4' at 0x7f97a3b2cd18>], <Element 'attr4' at 0x7f97a3b2cea8>: [<Element 'attr5' at 0x7f97a3b2c9a8>, <Element 'attr5' at 0x7f97a3b2cf48>], <Element 'attr5' at 0x7f97a3b2c9a8>: [<Element 'attr12' at 0x7f97a3b2cef8>, <Element 'attr12' at 0x7f97a3b2cf98>], <Element 'attr12' at 0x7f97a3b2cef8>: [], <Element 'attr12' at 0x7f97a3b2cf98>: [], <Element 'attr5' at 0x7f97a3b2cf48>: [], <Element 'attr4' at 0x7f97a3b2cd18>: [], <Element 'attr7' at 0x7f97a3b2ce58>: [<Element 'attr11' at 0x7f97a3b2cae8>, <Element 'attr11' at 0x7f97a3b2cb38>], <Element 'attr1

[2m[36m(pid=32150)[0m 2021-01-20 17:30:03,313	ERROR function_manager.py:498 -- Failed to load actor class Graph.
[2m[36m(pid=32150)[0m Traceback (most recent call last):
[2m[36m(pid=32150)[0m   File "/Users/dylanc_home/Desktop/2021 Spring/Internships/TalkMeUp/env/lib/python3.7/site-packages/ray/function_manager.py", line 496, in _load_actor_class_from_gcs
[2m[36m(pid=32150)[0m     actor_class = pickle.loads(pickled_class)
[2m[36m(pid=32150)[0m   File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/managers.py", line 920, in RebuildProxy
[2m[36m(pid=32150)[0m     return func(token, serializer, incref=incref, **kwds)
[2m[36m(pid=32150)[0m   File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/managers.py", line 770, in __init__
[2m[36m(pid=32150)[0m     self._incref()
[2m[36m(pid=32150)[0m   File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/multiprocessing/managers.py", line 8

RayTaskError(RuntimeError): [36mray::Graph.runPruning()[39m (pid=32150, ip=192.168.1.3)
  File "python/ray/_raylet.pyx", line 422, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 456, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 459, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 463, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 415, in ray._raylet.execute_task.function_executor
RuntimeError: The actor with name Graph failed to be imported, and so cannot execute this method.

### Iris Data

In [11]:
irisDir = mainDir + "Iris/iris.data"
irisData = pd.read_csv(irisDir, delimiter = ",", header=None)
irisData.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [12]:
irisesDict = {'Iris-setosa':0, 'Iris-versicolor':1, 'Iris-virginica':2}
irisData[5] = irisData[4].apply(lambda row: irisesDict[row])
irisData.head()

Unnamed: 0,0,1,2,3,4,5
0,5.1,3.5,1.4,0.2,Iris-setosa,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0


In [13]:
irisTree = C45()
irisTrainX, irisTestX, irisTrainY, irisTestY = train_test_split(irisData[[0,1,2,3]],
                                                                irisData[5], test_size=0.4)
irisTree.fit(irisTrainX, irisTrainY)

C45(attrNames=['attr0', 'attr1', 'attr2', 'attr3'])

In [14]:

accuracy_score(irisTree.predict(irisTestX),irisTestY)

0.9333333333333333

In [15]:
irisRoot = irisTree.getTree()

In [16]:
testData = pd.read_csv("test.data",delimiter=",")
testData["class"] = testData["class"].apply(lambda row: 1 if row == "Yes" else 0)
testTree = C45()
testTree.fit(testData[['a','b','c']],testData['class'])

C45(attrNames=['attr0', 'attr1', 'attr2'])

In [17]:
# using the makePhishFile to update the uciPhishGraph.p file:
makePhishFile(irisRoot,irisData)

In [45]:
print("Benchmarking...")
with open('uciPhishGraph.p','rb') as f:
    graphData = pickle.load(f)
dagList,root,parentList,terminals,outAttribute,trainData = graphData
print(dagList)
g = Graph(dagList, parentList, root, terminals, outAttribute, trainData)
print("Starting...")
currGraph,_,currTerminals = g.runPruning()
newGraph = dict()
addToGraph(currGraph, g.root, newGraph)
print(newGraph)

### Diabetes Data

In [23]:
diabetesDir = mainDir + "Diabetes/Diabetes-Data/"
for i in range(1,71):
    dataDiabetesDir = diabetesDir
    if i < 10:
        dataDiabetesDir += "data-0{}".format(i)
    else:
        dataDiabetesDir += "data-{}".format(i)
    dataDiabetesI = pd.read_csv(dataDiabetesDir, delimiter="\t",header=None)
    print(dataDiabetesDir)
    print(dataDiabetesI[0])
    break

./ahmedDatasets/Diabetes/Diabetes-Data/data-01
0      04-21-1991
1      04-21-1991
2      04-21-1991
3      04-21-1991
4      04-21-1991
          ...    
938    09-02-1991
939    09-02-1991
940    09-03-1991
941    09-03-1991
942    09-03-1991
Name: 0, Length: 943, dtype: object


### Labor Data

In [31]:
laborDir = mainDir+"Labor/"
laborData = pd.read_csv(laborDir+"laborTrain.data")

In [32]:
laborData

Unnamed: 0,#,dur,wage1,wage2,wage3,cola,hours,pension,stby_pay,shift_diff,educ_allw,holidays,vacation,lngtrm_disabil,dntl_ins,bereavement,empl_hplan,event
0,1,2,3.0,7.0,*,*,38,*,12,25,true,11,ba,true,half,true,*,good
1,2,2,4.0,5.0,*,tcf,35,*,13,5,*,15,gnr,*,*,*,*,good
2,3,2,4.5,5.8,*,*,35,ret_allw,*,*,true,11,ba,*,full,*,full,good
3,4,2,7.0,5.3,*,*,*,*,*,*,*,11,*,true,full,*,*,good
4,5,2,4.3,4.4,*,*,38,*,*,4,*,12,gnr,*,full,*,full,good
5,6,3,3.7,4.0,5.0,tc,*,*,*,*,true,*,*,*,*,true,*,good
6,7,3,4.0,5.0,5.0,tc,*,empl_contr,*,*,*,12,gnr,true,none,true,half,good
7,8,2,4.5,4.5,*,tcf,*,*,*,*,true,10,ba,true,none,*,half,good
8,9,1,2.8,*,*,*,35,*,*,2,*,12,ba,*,*,*,*,good
9,10,1,5.0,*,*,*,40,*,*,2,*,11,avg,*,*,true,*,good
