Skip to content

Commit

Permalink
added support for text features (using bag of words model encoded int…
Browse files Browse the repository at this point in the history
…o fixed size vector) and added support for arbitrary feature interactions
  • Loading branch information
eandrejko committed Dec 9, 2010
1 parent 193dd34 commit 6fb68b5
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 16 deletions.
69 changes: 54 additions & 15 deletions src/random_forests/core.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
(ns random-forests.core
(:require [clojure.contrib.duck-streams :as duck-streams]))
(:require [clojure.contrib.duck-streams :as duck-streams])
(:require [clojure.contrib.str-utils :as str-utils])
(:require [clojure.contrib.combinatorics :as combinatorics]))

(defn targets
"returns collection of targets from examples"
Expand Down Expand Up @@ -58,19 +60,38 @@
([name i]
(feature name i :categorical))
([name i type]
(hash-map :name name :index i :type type)))
(hash-map :name name :index i :type type))
([name i type vector-size]
(hash-map :name name :index i :type type :vector-size vector-size)))

(defn interaction?
  "returns true when feature is anything other than a map; single features are built as hash-maps, so a non-map value is treated as an interaction of several features"
  [feature]
  (if (map? feature)
    false
    true))

(defn feature-value
  "creates a filter function for a feature value pair

  the returned function takes an example and answers whether it satisfies the pair:
    interaction  - feature is a collection of features and value a matching collection of
                   values; every component pair must hold
    :continuous  - the example's column is <= value
    :text        - the example's column is an encoded vector whose slot number value holds 1
    otherwise    - the example's column equals value

  the function carries metadata {:feature .. :value .. :text ..} where :text is a printable
  description of the condition"
  [feature value]
  ;; for an interaction (:index feature) is nil; i is only used by the single-feature branches
  (let [i (:index feature)]
    (cond
      (interaction? feature)
      ;; build one filter per component feature/value pair, pairing them positionally
      (let [truth-conditions (doall (map feature-value feature value))
            text (map #(:text (meta %)) truth-conditions)]
        (with-meta
          ;; every? stops at the first failing component and is defined for zero components
          (fn [example] (every? #(% example) truth-conditions))
          {:feature feature :value value :text (str-utils/str-join " and " text)}))

      (= :continuous (:type feature))
      (with-meta
        (fn [example] (<= (nth example i) value))
        {:feature feature :value value :text (str (:name feature) "<=" value)})

      (= :text (:type feature))
      (with-meta
        ;; value is a slot index into the encoded vector stored in column i
        (fn [example] (= (nth (nth example i) value) 1))
        {:feature feature :value value :text (str (:name feature) " contains " value)})

      :else
      (with-meta
        (fn [example] (= (nth example i) value))
        {:feature feature :value value :text (str (:name feature) "==" value)}))))

(defn pairs
"returns seq of pairs from collection"
Expand All @@ -80,10 +101,17 @@
(defn feature-values
  "determines set of values for feature

  interaction  - the cartesian product of the component features' values (a seq of tuples,
                 one position per component feature)
  :text        - the slot indices 0 .. (dec (:vector-size feature))
  :continuous  - the midpoints of the pairs produced by `pairs` over the sorted column values
  otherwise    - the distinct values seen in the feature's column"
  [examples feature]
  (cond
    (interaction? feature)
    (apply combinatorics/cartesian-product (map #(feature-values examples %) feature))

    (= (:type feature) :text)
    ;; candidate values are slot positions, independent of the examples
    (range 0 (:vector-size feature))

    (= :continuous (:type feature))
    (let [values (map #(nth % (:index feature)) examples)]
      ;; candidate thresholds are averages of the first and last element of each pair
      (set (map #(/ (+ (last %) (first %)) 2) (pairs (sort values)))))

    :else
    (set (map #(nth % (:index feature)) examples))))

(defn determine-split
"returns a feature value pair as {:feature feature, :value value} representing the best split of the provided examples from the provided features"
Expand Down Expand Up @@ -116,7 +144,7 @@
(if (feature-value x)
(child-eq x)
(child-neq x)))
{:tree (str "if(" (:name (:feature mfv)) (:op-t mfv) (:value mfv) "){" (:tree (meta child-eq)) "}else{" (:tree (meta child-neq)) "}" )})))
{:tree (str "if(" (:text mfv) "){" (:tree (meta child-eq)) "}else{" (:tree (meta child-neq)) "}" )})))
;; examples cannot be split because all features are identical
(let [t (target-mode examples)]
(with-meta (fn [x] t) {:tree t}))))
Expand All @@ -137,6 +165,12 @@
;; else determine best splitting node and recurse with new examples and features
(build-tree-with-split examples features (determine-split examples features))))))

(defn encode-text-into-vector
  "encodes text into a binary vector of the specified size

  coll is a collection of tokens; each token is assigned the slot (mod (.hashCode token) size).
  the result is a vector of exactly size entries holding 1 in every slot that at least one
  token maps to and 0 elsewhere, so different tokens may share a slot"
  [coll size]
  (let [occupied-slots (set (map #(mod (.hashCode %) size) coll))]
    ;; walk all size slots; stopping at (dec size) would drop the last slot, losing tokens
    ;; that hash to it and making (nth encoded (dec size)) throw
    (vec (map #(if (contains? occupied-slots %) 1 0) (range 0 size)))))

(defn read-dataset
"reads dataset for training and test from a csv file"
[file-name]
Expand Down Expand Up @@ -222,9 +256,14 @@

;; everything but the last column is an input feature
(def features
  ;; one feature named V<column> per column index, excluding the final column
  (let [column-count (count (first (:training data)))]
    (set (for [column (range (dec column-count))]
           (feature (str "V" column) column)))))

(def features-with-interactions
  ;; the single features plus every ordered pair of two different features;
  ;; both [a b] and [b a] are produced
  (let [pairwise (for [a features
                       b features
                       :when (not= a b)]
                   [a b])]
    (set (concat features pairwise))))

;; realizes the first 50 items produced by build-random-forest over the training data,
;; drawing from the single features and their pairwise interactions
;; NOTE(review): the trailing 3 is presumably the number of features sampled per tree - confirm
(def forest
  (doall
    (take 50 (build-random-forest (:training data) features-with-interactions 3))))

(println "AUC: " (auc forest (:test data)))

Expand Down
25 changes: 24 additions & 1 deletion test/random_forests/test/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -95,4 +95,27 @@
;; the first column alone separates the two targets, so an "M" example must be classified as 0
(deftest build-tree-builds-decision-tree
  (let [examples (list ["M" "<25" 0] ["M" "<25" 0] ["F" "<30" 1] ["F" "<30" 1])
        features (set (map #(feature % %) #{0 1}))]
    (is (= 0 ((build-tree examples features) ["M" "<25"])))))

;; a :text filter built for the slot that "hat" hashes to accepts only the example containing "hat"
(deftest feature-value-selects-text-features
  (let [hat-example   ["M" (encode-text-into-vector #{"the" "hat"} 5) 0]
        shoe-example  ["M" (encode-text-into-vector #{"the" "shoe"} 5) 1]
        hat-slot      (mod (.hashCode "hat") 5)
        contains-hat? (feature-value (feature "description" 1 :text) hat-slot)]
    (is (= true (contains-hat? hat-example)))
    (is (= false (contains-hat? shoe-example)))))

;; the candidate values of a :text feature are the slot indices of its vector
(deftest feature-values-determines-set-of-values-for-text-features
  (let [hat-example  ["M" (encode-text-into-vector #{"the" "hat"} 5) 0]
        shoe-example ["M" (encode-text-into-vector #{"the" "shoe"} 5) 1]
        description  (feature "description" 1 :text 5)
        slots        (feature-values (list hat-example shoe-example) description)]
    (is (= (range 0 5) slots))))

;; an interaction filter holds only when every component condition holds:
;; both examples contain "hat", but only the first has gender "M"
(deftest feature-value-selects-interaction-of-features
  (let [encoded       (encode-text-into-vector #{"the" "hat"} 5)
        male-example   ["M" encoded 0]
        female-example ["F" encoded 1]
        interaction    [(feature "description" 1 :text) (feature "gender" 0)]
        wanted         [(mod (.hashCode "hat") 5) "M"]
        matches?       (feature-value interaction wanted)]
    (is (= true (matches? male-example)))
    (is (= false (matches? female-example)))))

;; the candidate values of an interaction are all combinations of its components' values;
;; compared as sets because the categorical values come out of a set, whose iteration
;; order is not something the test should depend on
(deftest feature-values-determines-set-of-values-for-interaction-features
  (let [examples    (list ["M" (encode-text-into-vector #{"the" "hat"} 5) 0]
                          ["F" (encode-text-into-vector #{"the" "hat"} 5) 1])
        interaction [(feature "description" 1 :text 5) (feature "gender" 0)]
        expected    (set (for [x (range 0 5) y ["F" "M"]] (list x y)))]
    (is (= expected (set (feature-values examples interaction))))))

0 comments on commit 6fb68b5

Please sign in to comment.