Skip to content

Commit

Permalink
Improve robustness of PAM initialization. Fixes Github#4
Browse files Browse the repository at this point in the history
  • Loading branch information
kno10 committed Aug 3, 2015
1 parent a5bb9f1 commit b48eb01
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,19 +46,19 @@
/**
* The original PAM algorithm or k-medoids clustering, as proposed by Kaufman
* and Rousseeuw in "Partitioning Around Medoids".
*
*
* Reference:
* <p>
* Clustering by means of Medoids<br />
* Kaufman, L. and Rousseeuw, P.J.<br />
* in: Statistical Data Analysis Based on the L1-Norm and Related Methods
* </p>
*
*
* @author Erich Schubert
*
*
* @apiviz.has MedoidModel
* @apiviz.composedOf KMedoidsInitialization
*
*
* @param <V> vector datatype
*/
@Title("Partioning Around Medoids")
Expand Down Expand Up @@ -93,7 +93,7 @@ public class KMedoidsPAM<V> extends AbstractDistanceBasedAlgorithm<V, Clustering

/**
* Constructor.
*
*
* @param distanceFunction distance function
* @param k k parameter
* @param maxiter Maxiter parameter
Expand All @@ -108,7 +108,7 @@ public KMedoidsPAM(DistanceFunction<? super V> distanceFunction, int k, int maxi

/**
* Run k-medoids
*
*
* @param database Database
* @param relation relation to use
* @return result
Expand Down Expand Up @@ -155,7 +155,7 @@ public Clustering<MedoidModel> run(Database database, Relation<V> relation) {

/**
* Run the PAM optimization phase.
*
*
* @param distQ Distance query
* @param ids IDs to process
* @param medoids Medoids list
Expand Down Expand Up @@ -268,7 +268,7 @@ protected void runPAMOptimization(DistanceQuery<V> distQ, DBIDs ids, ArrayModifi
/**
* Returns a list of clusters. The k<sup>th</sup> cluster contains the ids of
* those FeatureVectors that are nearest to the k<sup>th</sup> mean.
*
*
* @param means Object centroids
* @param ids Object ids
* @param nearest Distance to nearest medoid
Expand Down Expand Up @@ -296,6 +296,9 @@ else if(dist < mindist2) {
mindist2 = dist;
}
}
if(minIndex < 0) {
throw new AbortException("Too many infinite distances. Cannot assign objects.");
}
assignment.put(iditer, minIndex);
nearest.put(iditer, mindist);
second.put(iditer, mindist2);
Expand All @@ -316,9 +319,9 @@ protected Logging getLogger() {

/**
* Parameterization class.
*
*
* @author Erich Schubert
*
*
* @apiviz.exclude
*/
public static class Parameterizer<V> extends AbstractDistanceBasedAlgorithm.Parameterizer<V> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,18 +47,16 @@

/**
* PAM initialization for k-means (and of course, PAM).
*
*
* Reference:
* <p>
* Clustering by means of Medoids<br />
* Kaufman, L. and Rousseeuw, P.J.<br />
* in: Statistical Data Analysis Based on the L1-Norm and Related Methods
* </p>
*
* TODO: enforce using a distance matrix?
*
*
* @author Erich Schubert
*
*
* @param <O> Object type for KMedoids initialization
*/
@Reference(title = "Clustering my means of Medoids", //
Expand Down Expand Up @@ -98,11 +96,11 @@ public <T extends NumberVector, V extends NumberVector> List<V> chooseInitialMea
public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> distQ) {
ArrayModifiableDBIDs medids = DBIDUtil.newArray(k);
DBIDVar bestid = DBIDUtil.newVar();
double best = Double.POSITIVE_INFINITY;
WritableDoubleDataStore mindist = null;

// First mean is chosen by having the smallest distance sum to all others.
{
double best = Double.POSITIVE_INFINITY;
WritableDoubleDataStore newd = null;
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Choosing initial mean", ids.size(), LOG) : null;
for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
Expand Down Expand Up @@ -138,6 +136,7 @@ public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> dis
FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Choosing initial centers", k, LOG) : null;
LOG.incrementProcessed(prog); // First one was just chosen.
for(int i = 1; i < k; i++) {
double best = Double.POSITIVE_INFINITY;
WritableDoubleDataStore bestd = null, newd = null;
for(DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
if(medids.contains(iter)) {
Expand All @@ -162,8 +161,8 @@ public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> dis
newd = null;
}
}
if(bestid == null) {
throw new AbortException("No median found that improves the criterion function?!?");
if(bestd == null) {
throw new AbortException("No median found that improves the criterion function?!? Too many infinite distances.");
}
medids.add(bestid);
if(newd != null) {
Expand All @@ -181,9 +180,9 @@ public DBIDs chooseInitialMedoids(int k, DBIDs ids, DistanceQuery<? super O> dis

/**
* Parameterization class.
*
*
* @author Erich Schubert
*
*
* @apiviz.exclude
*/
public static class Parameterizer<V> extends AbstractParameterizer {
Expand Down

0 comments on commit b48eb01

Please sign in to comment.