This repository has been archived by the owner on Jan 13, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 142
/
PiecewiseLinearDistribution.java
270 lines (239 loc) · 8.09 KB
/
PiecewiseLinearDistribution.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
/*
* Copyright 2012, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.LinkBench.distributions;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Random;
/**
* A distribution where the cumulative density function is an arbitrary
* piecewise linear function.
*
* Rather confusingly there are two possible ways of looking at the
* distribution. The first is to divide the keyspace by ids, and order
* these IDs by the number of accesses. Then DIST-A determines how likely
it is that a given key will be chosen. The second is to divide the
* keyspace into buckets, where there are multiple keys in each bucket which
* have been accessed the same number of times. There DIST-B determines how
* likely a random key is to fall into each bucket. The input data is
* represented as DIST-B, but the probability distribution represented by
* this class is DIST-A, so we need to convert from one representation to
* another.
*
* The conversion process works as follows.
* Suppose you have items numbered 0 to n - 1. Then item i gets assigned
* the percentile rank p = i / (n - 1), a number between 0 and 1.
*
The input is a set of tuples (p, v), where v is the total number of
observations of the item at percentile p. So the values of the distribution
are denominated not in probability density, but rather in number of observations.
*
* This means that to convert the input to a probability density distribution,
* we need to calculate the expected value of the distribution, and then divide
* the value by that.
*
* This is an abstract class: the init method needs to be implemented
* @author tarmstrong
*
*/
public abstract class PiecewiseLinearDistribution implements ProbabilityDistribution {

  /**
   * Helper class storing one point of the input distribution: an
   * observation count (value) together with the cumulative probability
   * of that count.
   */
  public static class Point implements Comparable<Point> {
    public int value;
    public double probability;

    public Point(int input_value, double input_probability) {
      this.value = input_value;
      this.probability = input_probability;
    }

    /**
     * Orders points by value only; probability does not participate.
     * NOTE: this ordering is inconsistent with the inherited
     * identity-based equals().
     */
    @Override
    public int compareTo(Point obj) {
      // Integer.compare avoids the overflow that the subtraction idiom
      // (this.value - obj.value) exhibits when the operands have opposite
      // signs near Integer.MIN_VALUE / Integer.MAX_VALUE.
      return Integer.compare(this.value, obj.value);
    }

    @Override
    public String toString() {
      return "(" + value + ", " + probability + ")";
    }
  }

  /**
   * Initialize the distribution from a discrete CDF, deriving the
   * auxiliary arrays (cumulative sums of the complementary CDF and
   * the per-bucket cursors used by choose()).
   * @param min minimum id (inclusive)
   * @param max maximum id (exclusive)
   * @param cdf cumulative distribution points, sorted by value
   */
  protected void init(long min, long max, ArrayList<Point> cdf) {
    double pdf[] = getPDF(cdf);
    double ccdf[] = getCCDF(pdf);
    double cs[] = getCumulativeSum(ccdf);

    // Cursors start at zero; choose() advances them as ids are drawn.
    long right_points[] = new long[cs.length];

    init(min, max, cdf, cs, right_points, expectedValue(cdf));
  }

  /**
   * Init with precalculated values
   * @param min minimum id (inclusive)
   * @param max maximum id (exclusive)
   * @param cdf cumulative distribution points, sorted by value
   * @param cs cumulative sums of the complementary CDF
   * @param right_points per-bucket cursors consumed and updated by choose()
   * @param expectedValue expected number of observations per item
   */
  protected void init(long min, long max, ArrayList<Point> cdf,
      double cs[], long right_points[], double expectedValue) {
    this.min = min;
    this.max = max;
    this.cdf = cdf;
    this.cs = cs;
    this.right_points = right_points;
    this.expected_val = expectedValue;
  }

  protected long max;                // exclusive upper bound of id range
  protected long min;                // inclusive lower bound of id range
  protected ArrayList<Point> cdf;    // input distribution (DIST-B)
  protected double[] cs;             // cumulative sums of complementary CDF
  protected long[] right_points;     // mutable cursors used by choose()

  /**
   * Expected number of observations per item in the data
   */
  private double expected_val;

  @Override
  public double pdf(long id) {
    long n = (max - min);
    // Total observations over the whole keyspace normalizes the
    // per-id expected count into a probability.
    double totalSum = expected_val * n;
    return expectedCount(id) / totalSum;
  }

  @Override
  public double expectedCount(long id) {
    return expectedCount(min, max, id, cdf);
  }

  /**
   * Expected number of observations of a given id under the distribution.
   * @param min minimum id (inclusive)
   * @param max maximum id (exclusive)
   * @param id id to look up
   * @param cdf cumulative distribution points, sorted by value
   * @return expected observation count, or 0.0 for ids outside [min, max)
   */
  public static double expectedCount(long min, long max, long id,
      ArrayList<Point> cdf) {
    if (id < min || id >= max) {
      return 0.0;
    }
    long n = (max - min);
    // Put id into range [0.0, 1.0] with most popular at 0.0
    double u = 1.0 - (id - min) / (double) n;
    int ix = binarySearch(cdf, u);
    Point p1 = cdf.get(ix);
    assert(u <= p1.probability);
    // Assuming piecewise linear, so equally as probable as p1.value
    return p1.value;
  }

  @Override
  public double cdf(long id) {
    // Since this should be the CDF function for DIST-A, rather
    // than DIST-B, it is non-trivial to calculate (requires some kind
    // of integration of DIST-B).
    // UnsupportedOperationException is the idiomatic type for an
    // unimplemented operation; it remains a RuntimeException subclass,
    // so existing catch blocks are unaffected.
    throw new UnsupportedOperationException("Cdf not implemented yet");
  }

  @Override
  public long quantile(double p) {
    // This is not implemented, due to similar reasons to cdf
    throw new UnsupportedOperationException("Quantile not implemented yet");
  }

  @Override
  public long choose(Random rng) {
    return choose(rng, min, max, cs, right_points);
  }

  /**
   * Pick an id in [startid1, maxid1) by sampling a bucket proportionally
   * to its cumulative weight and then cycling through that bucket's ids
   * via the shared right_points cursor.
   */
  protected static long choose(Random rng, long startid1, long maxid1,
      double[] cs, long[] right_points) {
    double max_probability = cs[cs.length - 1];
    double p = max_probability * rng.nextDouble();
    int idx = binarySearch(cs, p);
    if (idx == 0) idx = 1; // cs[0] is a placeholder, never a valid bucket

    /*
     * TODO: this algorithm does not appear to generate data
     * faithful to the distribution.
     * Additional problems include data races if multiple threads are
     * concurrently modifying the shared arrays, and the fact
     * that a workload cannot be reproduced.
     */
    long result = right_points[idx] % (maxid1 - startid1);
    right_points[idx] = (result + 1) % (maxid1 - startid1);
    long id1 = startid1 + result;
    return id1;
  }

  /**
   * Get the expected value of the distribution (e.g. the
   * average number of links).
   * @param cdf cumulative distribution points, sorted by value
   * @return expected value, or 0 for an empty distribution
   */
  protected static double expectedValue(ArrayList<Point> cdf) {
    // This function is not entirely precise since it assumes
    // that the ID space is continuous, which is not an accurate
    // approximation for small ID counts
    if (cdf.size() == 0) return 0;
    // Assume CDF is piecewise linear: each value is weighted by the
    // probability mass of its step in the CDF.
    double sum = 0;
    sum = cdf.get(0).probability * cdf.get(0).value;
    for (int i = 1; i < cdf.size(); i++) {
      Point prev = cdf.get(i-1);
      Point curr = cdf.get(i);
      double p = curr.probability - prev.probability;
      sum += p * curr.value;
    }
    return sum;
  }

  /**
   * Find the lowest index whose point has probability >= p.
   * @param points points sorted by ascending probability
   * @param p probability to search for
   * @return lowest index i with points.get(i).probability >= p, or
   *         points.size() if no such point exists
   */
  public static int binarySearch(ArrayList<Point> points, double p) {
    int left = 0, right = points.size() - 1;
    while (left < right) {
      // Unsigned shift is the overflow-safe midpoint; (left + right) / 2
      // can wrap negative for very large indices.
      int mid = (left + right) >>> 1;
      if (points.get(mid).probability >= p) {
        right = mid;
      } else {
        left = mid + 1;
      }
    }
    if (points.get(left).probability >= p) {
      return left;
    } else {
      return left + 1;
    }
  }

  /**
   * Find the insertion point for p in a sorted array.
   * @param a array sorted ascending
   * @param p value to search for
   * @return index of p if found (an arbitrary one if duplicated),
   *         otherwise the index where p would be inserted
   */
  public static int binarySearch(double[] a, double p) {
    // Use built-in binary search
    int res = Arrays.binarySearch(a, p);
    if (res >= 0) {
      return res;
    } else {
      // Arrays.binarySearch returns (-(insertion point) - 1) when not found
      return -(res + 1);
    }
  }

  /**
   * Convert a sparse CDF into a dense PDF array indexed by value.
   * @param cdf cumulative distribution points, sorted by value
   * @return array of length (max value + 1) where slot v holds the
   *         probability mass at value v (0 for values not present);
   *         empty array for an empty cdf
   */
  protected static double[] getPDF(ArrayList<Point> cdf) {
    if (cdf.isEmpty()) {
      return new double[0];
    }
    int max_value = cdf.get(cdf.size() - 1).value;
    // Java arrays are zero-initialized, so no explicit fill is needed.
    double[] pdf = new double[max_value + 1];

    // convert cdf to pdf: the mass at each value is the CDF step height
    pdf[cdf.get(0).value] = cdf.get(0).probability;
    for (int i = 1; i < cdf.size(); ++i) {
      pdf[cdf.get(i).value] = cdf.get(i).probability -
                              cdf.get(i - 1).probability;
    }
    return pdf;
  }

  /**
   * Build the complementary CDF: ccdf[i] = sum of pdf[j] for j >= i.
   * @param pdf dense probability mass array
   * @return complementary CDF array of the same length
   */
  protected static double[] getCCDF(double[] pdf) {
    int length = pdf.length;
    double[] ccdf = new double[length];
    if (length == 0) {
      return ccdf;
    }
    ccdf[length - 1] = pdf[length - 1];
    for (int i = length - 2; i >= 0; --i) {
      ccdf[i] = ccdf[i + 1] + pdf[i];
    }
    return ccdf;
  }

  /**
   * Running sum of cdf[1..i]; cdf[0] is deliberately excluded.
   * @param cdf array to accumulate
   * @return array of the same length of partial sums, with cs[0] == 0
   */
  protected static double[] getCumulativeSum(double[] cdf) {
    int length = cdf.length;
    if (length == 0) {
      return new double[0];
    }
    double[] cs = new double[length];
    cs[0] = 0; //ignore cdf[0]
    for (int i = 1; i < length; ++i) {
      cs[i] = cs[i - 1] + cdf[i];
    }
    return cs;
  }
}