Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
747deaa
stuct of vectors to vector of structs
valeriy42 Feb 13, 2020
8624ca3
eliminate s_NextIndex
valeriy42 Feb 13, 2020
cacc369
single instance of path
valeriy42 Feb 14, 2020
a16c113
single instance of path
valeriy42 Feb 14, 2020
f0c4f8b
magic formula for memory reservation
valeriy42 Feb 14, 2020
9cc441a
scale extracted
valeriy42 Feb 14, 2020
f30da51
formatting
valeriy42 Feb 17, 2020
16f3906
performance optimization. all tests passed
valeriy42 Feb 17, 2020
af3c587
simplify extendPath
valeriy42 Feb 17, 2020
9731f41
sumUnwoundPath simplified
valeriy42 Feb 17, 2020
1cb6bd0
unwindPath simplified
valeriy42 Feb 17, 2020
33f8429
formatting
valeriy42 Feb 17, 2020
d827cb5
fix for root method
valeriy42 Feb 17, 2020
ed86747
minor refactorings, undo test changes
valeriy42 Feb 17, 2020
da4d17d
comment formatting
valeriy42 Feb 17, 2020
488e6d1
move scale into element accessor
valeriy42 Feb 18, 2020
685ad17
fix numerical issue with random test
valeriy42 Feb 18, 2020
5106e28
formatting
valeriy42 Feb 18, 2020
33026b9
move find method
valeriy42 Feb 18, 2020
cbd1558
extendPath refactoring
valeriy42 Feb 19, 2020
3276b9d
sumUnwoundPath refactoring
valeriy42 Feb 19, 2020
ff42b5e
formatting
valeriy42 Feb 19, 2020
5e58c57
refactorings
valeriy42 Feb 19, 2020
82ff3f8
reduce memory requirement
valeriy42 Feb 19, 2020
323b676
Merge branch 'master' of https://github.com/elastic/ml-cpp into Perfo…
valeriy42 Feb 19, 2020
ca3d891
Comments
valeriy42 Feb 19, 2020
b864d9f
Change method signature
valeriy42 Feb 19, 2020
9152695
reviewers comments
valeriy42 Feb 19, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/CHANGELOG.asciidoc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ progress, memory usage, etc. (See {ml-pull}906[#906].)

=== Enhancements

* Improve computational performance of the feature importance computation. (See {ml-pull}1005[#1005].)
* Improve initialization of learn rate for better and more stable results in regression
and classification. (See {ml-pull}948[#948].)
* Add number of processed training samples to the definition of decision tree nodes.
Expand Down
133 changes: 80 additions & 53 deletions include/maths/CTreeShapFeatureImportance.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,67 +50,89 @@ class MATHS_EXPORT CTreeShapFeatureImportance {
private:
using TSizeVec = std::vector<std::size_t>;

//! Manages variables for the current path through the tree as the main algorithm proceeds.
struct SPath {
explicit SPath(std::size_t length)
: s_FractionOnes(length), s_FractionZeros(length),
s_FeatureIndex(length, -1), s_Scale(length), s_MaxLength(length) {}

void extend(int featureIndex, double fractionZero, double fractionOne) {
if (s_NextIndex < s_MaxLength) {
s_FeatureIndex[s_NextIndex] = featureIndex;
s_FractionZeros[s_NextIndex] = fractionZero;
s_FractionOnes[s_NextIndex] = fractionOne;
if (s_NextIndex == 0) {
s_Scale[s_NextIndex] = 1.0;
} else {
s_Scale[s_NextIndex] = 0.0;
}
++s_NextIndex;
}
//! \brief One entry of the path through the decision tree.
//!
//! The three values are grouped in a single struct because they are always
//! updated together for the same position in the path.
struct SPathElement {
    //! "Ones" fraction for this entry; 1.0 until explicitly set.
    double s_FractionOnes{1.0};
    //! "Zeros" fraction for this entry; 1.0 until explicitly set.
    double s_FractionZeros{1.0};
    //! Index of the feature for this entry; -1 until explicitly set.
    int s_FeatureIndex{-1};
};

//! Vector of path elements and the iterator type used to address into it.
using TElementVec = std::vector<SPathElement>;
using TElementItr = TElementVec::iterator;
//! Iterator type of TDoubleVec; CSplitPath uses it to address the scaling coefficients.
using TDoubleVecItr = TDoubleVec::iterator;

class CSplitPath {
public:
//! Create a path whose elements start at \p fractionsIterator and whose
//! scaling coefficients start at \p scaleIterator.
//!
//! Only the two iterators are stored; no elements are copied here.
//!
//! \param[in] fractionsIterator Iterator to the first path element.
//! \param[in] scaleIterator Iterator to the first scaling coefficient.
CSplitPath(TElementItr fractionsIterator, TDoubleVecItr scaleIterator)
    // Initialise the members directly rather than default constructing
    // them and then assigning in the constructor body.
    : m_FractionsIterator(fractionsIterator), m_ScaleIterator(scaleIterator) {
}

void reduce(std::size_t pathIndex) {
for (int i = static_cast<int>(pathIndex); i < this->depth(); ++i) {
s_FeatureIndex[i] = s_FeatureIndex[i + 1];
s_FractionZeros[i] = s_FractionZeros[i + 1];
s_FractionOnes[i] = s_FractionOnes[i + 1];
}
--s_NextIndex;
//! Create a path derived from \p parentSplitPath.
//!
//! The new path starts \p nextIndex positions after the start of the parent
//! path. The parent's first \p nextIndex path elements and scaling
//! coefficients are then copied to the start of the new path.
//!
//! \param[in] parentSplitPath Path to copy the leading entries from.
//! \param[in] nextIndex Number of leading entries of the parent to copy.
CSplitPath(const CSplitPath& parentSplitPath, int nextIndex)
    : CSplitPath(parentSplitPath.fractionsBegin() + nextIndex,
                 parentSplitPath.scaleBegin() + nextIndex) {
    const auto parentFractions = parentSplitPath.fractionsBegin();
    const auto parentScales = parentSplitPath.scaleBegin();
    std::copy(parentFractions, parentFractions + nextIndex, this->fractionsBegin());
    std::copy(parentScales, parentScales + nextIndex, this->scaleBegin());
}

//! Indicator whether or not the feature \p pathIndex is decisive for the path.
double fractionOnes(std::size_t pathIndex) const {
return s_FractionOnes[pathIndex];
//! Get a writable reference to the iterator addressing the first path element.
TElementItr& fractions() {
    return this->m_FractionsIterator;
}
//! Get a read-only reference to the iterator addressing the first path element.
const TElementItr& fractions() const {
    return this->m_FractionsIterator;
}
//! Get a writable reference to the iterator addressing the first scaling coefficient.
TDoubleVecItr& scale() {
    return this->m_ScaleIterator;
}
//! Get a read-only reference to the iterator addressing the first scaling coefficient.
const TDoubleVecItr& scale() const {
    return this->m_ScaleIterator;
}

//! Get a writable reference to the path element at position \p index,
//! counted from the start of this path. No bounds checking is performed.
SPathElement& operator[](int index) {
    return *(m_FractionsIterator + index);
}

//! Fraction of all training data that reached the \p pathIndex in the path.
double fractionZeros(std::size_t pathIndex) const {
return s_FractionZeros[pathIndex];
//! Get a writable reference to the iterator at the start of the path elements.
TElementItr& fractionsBegin() {
    return this->m_FractionsIterator;
}
//! Get a read-only reference to the iterator at the start of the path elements.
const TElementItr& fractionsBegin() const {
    return this->m_FractionsIterator;
}

int featureIndex(std::size_t pathIndex) const {
return s_FeatureIndex[pathIndex];
//! Get a writable reference to the iterator at the start of the scaling coefficients.
TDoubleVecItr& scaleBegin() {
    return this->m_ScaleIterator;
}
//! Get a read-only reference to the iterator at the start of the scaling coefficients.
const TDoubleVecItr& scaleBegin() const {
    return this->m_ScaleIterator;
}

void setValues(int index, double fractionOnes, double fractionZeros, int featureIndex) {
m_FractionsIterator[index].s_FractionOnes = fractionOnes;
m_FractionsIterator[index].s_FractionZeros = fractionZeros;
m_FractionsIterator[index].s_FeatureIndex = featureIndex;
}

//! Scaling coefficients (factorials), see Equation (2) in the paper by Lundberg et al.
double scale(std::size_t pathIndex) const { return s_Scale[pathIndex]; }
//! Set the scaling coefficient at position \p index to \p value.
void scale(int index, double value) {
    *(m_ScaleIterator + index) = value;
}

//! Current depth in the tree
int depth() const { return static_cast<int>(s_NextIndex) - 1; }
//! Get the scaling coefficient at position \p index.
double scale(int index) const {
    return *(m_ScaleIterator + index);
}

//! Get next index.
std::size_t nextIndex() const { return s_NextIndex; }
int featureIndex(int nextIndex) const {
return m_FractionsIterator[nextIndex].s_FeatureIndex;
}

double fractionZeros(int nextIndex) const {
return m_FractionsIterator[nextIndex].s_FractionZeros;
}

//! Set next index.
void nextIndex(std::size_t nextIndex) { s_NextIndex = nextIndex; }
double fractionOnes(int nextIndex) const {
return m_FractionsIterator[nextIndex].s_FractionOnes;
}

int find(int feature, int nextIndex) {
auto featureIndexEnd{(this->fractionsBegin() + nextIndex)};
auto it = std::find_if(this->fractionsBegin(), featureIndexEnd,
[feature](const SPathElement& el) {
return el.s_FeatureIndex == feature;
});
if (it != featureIndexEnd) {
return std::distance(this->fractionsBegin(), it);
} else {
return -1;
}
}

TDoubleVec s_FractionOnes;
TDoubleVec s_FractionZeros;
TIntVec s_FeatureIndex;
TDoubleVec s_Scale;
std::size_t s_NextIndex = 0;
std::size_t s_MaxLength = 0;
private:
//! Iterator to the first path element of this path.
TElementItr m_FractionsIterator;
//! Iterator to the first scaling coefficient of this path.
TDoubleVecItr m_ScaleIterator;
};

private:
Expand All @@ -119,19 +141,24 @@ class MATHS_EXPORT CTreeShapFeatureImportance {
void shapRecursive(const TTree& tree,
const CDataFrameCategoryEncoder& encoder,
const CEncodedDataFrameRowRef& encodedRow,
SPath& splitPath,
std::size_t nodeIndex,
double parentFractionZero,
double parentFractionOne,
int parentFeatureIndex,
const CSplitPath& path,
std::size_t offset,
core::CDataFrame::TRowItr& row) const;
core::CDataFrame::TRowItr& row,
int nextIndex) const;
//! Extend the \p path object, update the variables and factorial scaling coefficients.
static void extendPath(SPath& path, double fractionZero, double fractionOne, int featureIndex);
//! Sum the scaling coefficients for the \p path without the feature defined in \p pathIndex.
static double sumUnwoundPath(const SPath& path, std::size_t pathIndex);
static void extendPath(CSplitPath& splitPath,
double fractionZero,
double fractionOne,
int featureIndex,
int& nextIndex);
//! Sum the scaling coefficients for the \p path without the feature defined in \p pathIndex.
static double sumUnwoundPath(const CSplitPath& path, int pathIndex, int nextIndex);
//! Update the scaling coefficients in the \p path if the feature defined in \p pathIndex was seen again.
static void unwindPath(SPath& path, std::size_t pathIndex);
static void unwindPath(CSplitPath& path, int pathIndex, int& nextIndex);

private:
TTreeVec m_Trees;
Expand Down
Loading