In [136]:
import os
import re
from collections import deque

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext

from pyspark.mllib.tree import RandomForestModel

In [2]:
# Build a Spark configuration with master "local[*]" and app name 'pyspark',
# then create the SparkContext and the SQLContext on top of it.
# NOTE: `sql_context` is not referenced by any other cell visible in this notebook.
conf = (SparkConf().setMaster("local[*]").setAppName('pyspark'))
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)

# Configuration

In [3]:
# Directory of the saved RandomForestModel to inspect.
# The default is the original, machine-specific path; set the MODEL_LOCATION
# environment variable to point at another copy without editing the notebook.
MODEL_LOCATION = os.environ.get(
    "MODEL_LOCATION",
    "/Users/georg/Downloads/random_forest_d15_n5/model_40.72_-73.94/",
)

In [122]:
# Placeholder display names: the zero-based feature index i used in the model's
# debug string maps to the label "Feature <i + 1>" (100000 labels in total).
FEATURE_MAPPING = ["Feature %d" % number for number in range(1, 100001)]

# Load Model & Get Debug String

In [5]:
# Load the saved random forest from MODEL_LOCATION using the SparkContext `sc`.
model = RandomForestModel.load(sc, MODEL_LOCATION)

In [170]:
# Split the model's debug dump into lines and drop the first two lines and the
# last one, so the list starts at the first "Tree N:" header (see the preview
# printed below).
# NOTE: despite its name, `debug_string` is a list of lines, not a single string.
debug_string = model.toDebugString().split('\n')[2:-1]
# Preview the first ten lines.
print(debug_string[:10])

[u'  Tree 0:', u'    If (feature 5 <= 368.0)', u'     If (feature 5 <= 153.0)', u'      If (feature 4 <= 26.0)', u'       If (feature 4 <= 14.0)', u'        If (feature 2 <= 8.0)', u'         If (feature 2 <= 3.0)', u'          If (feature 10 <= 127953.0)', u'           If (feature 1 <= 5.0)', u'            If (feature 4 <= 5.0)']


# Parse Debug String

In [139]:
class InternalNode(object):
    """Binary decision node of a parsed tree.

    The node's own split (``split_feature`` / ``split_value``) is not passed
    in directly; it is taken from the ``parent_split_*`` attributes of its
    children, which must agree with each other.

    Attributes:
        is_leaf: Always ``False`` for this class.
        parent_split_feature: Feature of the split that led to this node.
        parent_split_value: Threshold of the split that led to this node.
        left, right: The two child nodes.
        split_feature, split_value: The split performed at this node.
    """

    def __init__(self, parent_split_feature, parent_split_value, left, right):
        # Both children describe the same split (this node's), so their
        # recorded parent split must be identical.
        assert left.parent_split_feature == right.parent_split_feature
        assert left.parent_split_value == right.parent_split_value

        self.is_leaf = False
        self.parent_split_feature = parent_split_feature
        self.parent_split_value = parent_split_value
        self.left, self.right = left, right
        self.split_feature = left.parent_split_feature
        self.split_value = left.parent_split_value

    def to_string(self, indention):
        """Render this subtree, one node per line, indented by `indention` spaces.

        Each level of children is indented by one additional space.
        """
        header = "%sInternalNode: %s, %s" % (
            " " * indention, self.split_feature, self.split_value)
        child_indention = indention + 1
        return "\n".join((
            header,
            self.left.to_string(child_indention),
            self.right.to_string(child_indention),
        ))

    def __str__(self):
        """Render the subtree starting at indentation zero."""
        return self.to_string(0)
    
        
class LeafNode(object):
    """Terminal node of a parsed tree, holding a prediction value.

    Attributes:
        is_leaf: Always ``True`` for this class.
        parent_split_feature: Feature of the split that led to this leaf
            (``None`` when the leaf has no parent split).
        parent_split_value: Threshold of the split that led to this leaf.
        prediction: The value stored at this leaf.
    """

    def __init__(self, parent_split_feature, parent_split_value, prediction):

        self.is_leaf = True

        self.parent_split_feature = parent_split_feature
        self.parent_split_value = parent_split_value
        self.prediction = prediction

    def __str__(self):
        # Mirrors InternalNode.__str__ so that str() also works on a tree
        # that consists of a single leaf.
        return self.to_string(0)

    def to_string(self, indention):
        """Render this leaf on one line, indented by `indention` spaces."""
        return (" " * indention) + "LeafNode: %s" % self.prediction

In [140]:
def split_debug_string(debug_string):
    """Group indented debug-string lines into one sub-list per top-level entry.

    The indentation of the first line defines the "top level". Every line
    indented by more than that is attached to the most recent top-level line;
    any other line starts a new group.

    Args:
        debug_string: List of lines (strings) indented with spaces.

    Returns:
        A list of groups, each a list of consecutive lines beginning with a
        top-level line. An empty input yields an empty list.
    """
    if not debug_string:
        return []

    # Measure the *leading* spaces of the first line. (Searching for the first
    # run of spaces anywhere in the line would pick up the space inside
    # "Tree 0:" when the line is not indented at all.)
    first_line = debug_string[0]
    indention = len(first_line) - len(first_line.lstrip(" "))
    child_prefix = " " * (indention + 1)

    result = []
    for line in debug_string:
        # The first line never starts with `child_prefix`, so a group always
        # exists before a nested line is appended.
        if not line.startswith(child_prefix):
            result.append([])
        result[-1].append(line)

    return result

In [141]:
# Group the debug lines into one list per top-level entry (one per "Tree N:" header).
tree_debug_strings = split_debug_string(debug_string)

In [142]:
# Number format accepted in debug-string lines: a plain decimal ("368.0")
# optionally followed by an exponent ("1.27953E7", "1.5E-4"). The exponent part
# matters because JVM-formatted doubles switch to that notation for very small
# or very large magnitudes; without it "Predict: 1.5E-4" would parse as 1.5.
_DEBUG_NUMBER = r"-?\d+\.\d+(?:[eE][-+]?\d+)?"
_SPLIT_PATTERN = re.compile(
    r"(If|Else) \(feature (\d+) (<=|>) (" + _DEBUG_NUMBER + r")\)")
_PREDICT_PATTERN = re.compile(r"Predict: (" + _DEBUG_NUMBER + r")")


def get_node_from_debug_string(debug_string):
    """Recursively parse the debug lines of one (sub)tree into node objects.

    Args:
        debug_string: List of lines. The first line is either a "Tree ..."
            header or an "If (...)" / "Else (...)" line describing the split
            that leads to this node; the remaining, more deeply indented lines
            describe the node itself.

    Returns:
        A LeafNode when the remaining lines consist of a single "Predict: ..."
        line, otherwise an InternalNode whose two children are parsed from the
        two groups of remaining lines.

    Raises:
        ValueError: If a split or prediction line does not have the expected
            "feature N <= x" / "feature N > x" / "Predict: x" form (for
            example a categorical "in {...}" split, which is not supported).
        AssertionError: If the node has neither exactly one prediction line
            nor exactly two child groups.
    """
    node_debug_string = debug_string[0].strip()
    if node_debug_string.startswith("Tree"):
        # The root of a tree has no parent split.
        parent_split_feature = None
        parent_split_value = None
    else:
        match = _SPLIT_PATTERN.match(node_debug_string)
        if match is None:
            raise ValueError(
                "Cannot parse split line: %r" % node_debug_string)
        parent_split_feature = FEATURE_MAPPING[int(match.group(2))]
        parent_split_value = float(match.group(4))

    split = split_debug_string(debug_string[1:])
    if len(split) == 1:
        # A single group must be a lone "Predict: ..." line, i.e. a leaf.
        assert len(split[0]) == 1
        prediction_line = split[0][0].strip()
        match = _PREDICT_PATTERN.match(prediction_line)
        if match is None:
            raise ValueError(
                "Cannot parse prediction line: %r" % prediction_line)
        prediction_value = float(match.group(1))
        return LeafNode(parent_split_feature, parent_split_value, prediction_value)

    # Otherwise the node has an "If" group and an "Else" group.
    assert len(split) == 2
    left_child = get_node_from_debug_string(split[0])
    right_child = get_node_from_debug_string(split[1])

    return InternalNode(parent_split_feature, parent_split_value, left_child, right_child)

In [143]:
# Parse each per-tree group of debug lines; `trees` holds one root node per group.
trees = [get_node_from_debug_string(tree_debug_string) for tree_debug_string in tree_debug_strings]

In [173]:
# Show only the first 500 characters of the first tree's text rendering.
print(str(trees[0])[:500] + "...")

InternalNode: Feature 6, 368.0
 InternalNode: Feature 6, 153.0
  InternalNode: Feature 5, 26.0
   InternalNode: Feature 5, 14.0
    InternalNode: Feature 3, 8.0
     InternalNode: Feature 3, 3.0
      InternalNode: Feature 11, 127953.0
       InternalNode: Feature 2, 5.0
        InternalNode: Feature 5, 5.0
         InternalNode: Feature 10, 16502.0
          InternalNode: Feature 12, 39174.0
           InternalNode: Feature 12183, 0.0
            InternalNode: Feature 5, 4.0
             Intern...


In [145]:
# Number of parsed trees.
len(trees)

5

# Get Top Features (Features Split On in the Top Levels of Each Tree)

In [153]:
def get_features_and_levels(tree, maxlevel=10):
    """List the split features of a tree in breadth-first order.

    Args:
        tree: Root node; nodes expose `is_leaf` and, for non-leaf nodes,
            `split_feature`, `left` and `right`.
        maxlevel: Deepest level (root is level 0) whose nodes are reported.

    Returns:
        A list of `(split_feature, level)` tuples, one per non-leaf node at
        level `maxlevel` or above, in breadth-first order.
    """
    collected = []
    pending = deque()
    pending.append((tree, 0))

    while pending:
        current, depth = pending.popleft()
        # Nodes are dequeued in non-decreasing depth, so the first node that
        # is too deep ends the traversal.
        if depth > maxlevel:
            break
        if current.is_leaf:
            continue
        collected.append((current.split_feature, depth))
        pending.extend(((current.left, depth + 1), (current.right, depth + 1)))

    return collected

In [168]:
def get_top_features(trees, maxlevel=3):
    """Collect the distinct features split on in the top levels of the trees.

    Args:
        trees: Iterable of tree root nodes.
        maxlevel: Deepest level (root is level 0) to include, passed through
            to `get_features_and_levels`.

    Returns:
        A set of the split features found at level `maxlevel` or above in any
        of the trees.
    """
    top_features = set()
    for tree in trees:
        for feature, _level in get_features_and_levels(tree, maxlevel):
            top_features.add(feature)
    return top_features

In [169]:
# Features split on within the default top levels (maxlevel=3) of any parsed tree.
get_top_features(trees)

{'Feature 1',
 'Feature 10',
 'Feature 11',
 'Feature 12',
 'Feature 14149',
 'Feature 16989',
 'Feature 17',
 'Feature 19',
 'Feature 2',
 'Feature 2093',
 'Feature 3',
 'Feature 4',
 'Feature 5',
 'Feature 6',
 'Feature 7',
 'Feature 8',
 'Feature 9'}