Merge pull request apache#7 from apache/master
merge upstream changes
nchammas committed Aug 6, 2014
2 parents 4e98236 + e537b33 commit 8f641ac
Showing 6 changed files with 100 additions and 10 deletions.
4 changes: 2 additions & 2 deletions mllib/src/main/scala/org/apache/spark/mllib/feature/HashingTF.scala
@@ -32,12 +32,12 @@ import org.apache.spark.util.Utils
* :: Experimental ::
* Maps a sequence of terms to their term frequencies using the hashing trick.
*
* @param numFeatures number of features (default: 1000000)
* @param numFeatures number of features (default: 2^20^)
*/
@Experimental
class HashingTF(val numFeatures: Int) extends Serializable {

def this() = this(1000000)
def this() = this(1 << 20)

/**
* Returns the index of the input term.
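For context, a minimal usage sketch of the hashing trick described above (not part of this commit; assumes a SparkContext `sc`, and the variable names are illustrative):

    import org.apache.spark.mllib.feature.HashingTF
    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.rdd.RDD

    // The default constructor now maps terms into 1 << 20 = 1,048,576 buckets (2^20).
    val hashingTF = new HashingTF()
    val documents: RDD[Seq[String]] = sc.parallelize(Seq(
      Seq("spark", "hashing", "trick"),
      Seq("term", "frequency", "hashing")))
    // Each document becomes a sparse vector of term frequencies indexed by hash bucket.
    val tf: RDD[Vector] = hashingTF.transform(documents)
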
6 changes: 3 additions & 3 deletions mllib/src/main/scala/org/apache/spark/mllib/feature/Normalizer.scala
@@ -19,11 +19,11 @@ package org.apache.spark.mllib.feature

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Vector, Vectors}

/**
* :: DeveloperApi ::
* :: Experimental ::
* Normalizes samples individually to unit L^p^ norm
*
* For any 1 <= p < Double.PositiveInfinity, normalizes samples using
@@ -33,7 +33,7 @@ import org.apache.spark.mllib.linalg.{Vector, Vectors}
*
* @param p Normalization in L^p^ space, p = 2 by default.
*/
@DeveloperApi
@Experimental
class Normalizer(p: Double) extends VectorTransformer {

def this() = this(2)
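A short sketch of the L^p^ normalization described above (not part of this commit; uses the single-vector `transform` from `VectorTransformer`):

    import org.apache.spark.mllib.feature.Normalizer
    import org.apache.spark.mllib.linalg.Vectors

    val l2 = new Normalizer()     // p = 2 by default
    val l1 = new Normalizer(1.0)  // any 1 <= p < Double.PositiveInfinity is accepted
    val v = Vectors.dense(3.0, 4.0)
    l2.transform(v)  // [0.6, 0.8], since the L2 norm of v is 5
    l1.transform(v)  // [3/7, 4/7], since the L1 norm of v is 7
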
6 changes: 3 additions & 3 deletions mllib/src/main/scala/org/apache/spark/mllib/feature/StandardScaler.scala
@@ -19,22 +19,22 @@ package org.apache.spark.mllib.feature

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.annotation.Experimental
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.rdd.RDDFunctions._
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.rdd.RDD

/**
* :: DeveloperApi ::
* :: Experimental ::
* Standardizes features by removing the mean and scaling to unit variance using column summary
* statistics on the samples in the training set.
*
* @param withMean False by default. Centers the data with mean before scaling. It will build a
* dense output, so this does not work on sparse input and will raise an exception.
* @param withStd True by default. Scales the data to unit standard deviation.
*/
@DeveloperApi
@Experimental
class StandardScaler(withMean: Boolean, withStd: Boolean) extends VectorTransformer {

def this() = this(false, true)
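A short sketch of the standardization described above (not part of this commit; assumes a SparkContext `sc`; the fit-then-transform chaining is written so it does not depend on the exact return type of `fit`):

    import org.apache.spark.mllib.feature.StandardScaler
    import org.apache.spark.mllib.linalg.Vectors

    val data = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0),
      Vectors.dense(2.0, 20.0),
      Vectors.dense(3.0, 30.0)))
    // withMean = true builds dense output, so it should only be used on dense input.
    val scaled = new StandardScaler(withMean = true, withStd = true)
      .fit(data)        // computes column means and variances from the training set
      .transform(data)  // centers each column and scales it to unit standard deviation
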
19 changes: 17 additions & 2 deletions mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -17,6 +17,9 @@

package org.apache.spark.mllib.feature

import java.lang.{Iterable => JavaIterable}

import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

@@ -25,6 +28,7 @@ import com.github.fommil.netlib.BLAS.{getInstance => blas}
import org.apache.spark.Logging
import org.apache.spark.SparkContext._
import org.apache.spark.annotation.Experimental
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.rdd.RDDFunctions._
import org.apache.spark.rdd._
@@ -239,7 +243,7 @@ class Word2Vec extends Serializable with Logging {
a += 1
}
}

/**
* Computes the vector representation of each word in vocabulary.
* @param dataset an RDD of words
@@ -369,11 +373,22 @@ class Word2Vec extends Serializable with Logging {

new Word2VecModel(word2VecMap.toMap)
}

/**
* Computes the vector representation of each word in vocabulary (Java version).
* @param dataset a JavaRDD of words
* @return a Word2VecModel
*/
def fit[S <: JavaIterable[String]](dataset: JavaRDD[S]): Word2VecModel = {
fit(dataset.rdd.map(_.asScala))
}
}

/**
* Word2Vec model
* :: Experimental ::
* Word2Vec model
*/
@Experimental
class Word2VecModel private[mllib] (
private val model: Map[String, Array[Float]]) extends Serializable {

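For comparison with the new Java suite below, the same flow from Scala (not part of this commit; mirrors the test data so the default minimum word count is satisfied; assumes a SparkContext `sc`):

    import org.apache.spark.mllib.feature.Word2Vec

    val sentence = Seq.fill(100)(Seq("a", "b")).flatten ++ Seq.fill(10)(Seq("a", "c")).flatten
    val doc = sc.parallelize(Seq(sentence, sentence))
    val model = new Word2Vec().setVectorSize(10).setSeed(42L).fit(doc)
    // findSynonyms returns (word, cosine similarity) pairs, most similar first.
    val syms = model.findSynonyms("a", 2)
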
66 changes: 66 additions & 0 deletions mllib/src/test/java/org/apache/spark/mllib/feature/JavaWord2VecSuite.java
@@ -0,0 +1,66 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.mllib.feature;

import java.io.Serializable;
import java.util.List;

import scala.Tuple2;

import com.google.common.collect.Lists;
import com.google.common.base.Strings;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class JavaWord2VecSuite implements Serializable {
  private transient JavaSparkContext sc;

  @Before
  public void setUp() {
    sc = new JavaSparkContext("local", "JavaWord2VecSuite");
  }

  @After
  public void tearDown() {
    sc.stop();
    sc = null;
  }

  @Test
  @SuppressWarnings("unchecked")
  public void word2Vec() {
    // The tests are to check Java compatibility.
    String sentence = Strings.repeat("a b ", 100) + Strings.repeat("a c ", 10);
    List<String> words = Lists.newArrayList(sentence.split(" "));
    List<List<String>> localDoc = Lists.newArrayList(words, words);
    JavaRDD<List<String>> doc = sc.parallelize(localDoc);
    Word2Vec word2vec = new Word2Vec()
      .setVectorSize(10)
      .setSeed(42L);
    Word2VecModel model = word2vec.fit(doc);
    Tuple2<String, Object>[] syms = model.findSynonyms("a", 2);
    Assert.assertEquals(2, syms.length);
    Assert.assertEquals("b", syms[0]._1());
    Assert.assertEquals("c", syms[1]._1());
  }
}
9 changes: 9 additions & 0 deletions python/pyspark/rdd.py
@@ -134,6 +134,7 @@ class MaxHeapQ(object):

"""
An implementation of MaxHeap.
>>> import pyspark.rdd
>>> heap = pyspark.rdd.MaxHeapQ(5)
>>> [heap.insert(i) for i in range(10)]
@@ -381,6 +382,7 @@ def mapPartitionsWithSplit(self, f, preservesPartitioning=False):
def getNumPartitions(self):
"""
Returns the number of partitions in RDD
>>> rdd = sc.parallelize([1, 2, 3, 4], 2)
>>> rdd.getNumPartitions()
2
@@ -570,6 +572,7 @@ def sortByKey(self, ascending=True, numPartitions=None, keyfunc=lambda x: x):
"""
Sorts this RDD, which is assumed to consist of (key, value) pairs.
# noqa
>>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
>>> sc.parallelize(tmp).sortByKey(True, 2).collect()
[('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
@@ -1209,6 +1212,7 @@ def collectAsMap(self):
def keys(self):
"""
Return an RDD with the keys of each tuple.
>>> m = sc.parallelize([(1, 2), (3, 4)]).keys()
>>> m.collect()
[1, 3]
@@ -1218,6 +1222,7 @@ def keys(self):
def values(self):
"""
Return an RDD with the values of each tuple.
>>> m = sc.parallelize([(1, 2), (3, 4)]).values()
>>> m.collect()
[2, 4]
@@ -1642,6 +1647,7 @@ def repartition(self, numPartitions):
Internally, this uses a shuffle to redistribute data.
If you are decreasing the number of partitions in this RDD, consider
using `coalesce`, which can avoid performing a shuffle.
>>> rdd = sc.parallelize([1,2,3,4,5,6,7], 4)
>>> sorted(rdd.glom().collect())
[[1], [2, 3], [4, 5], [6, 7]]
@@ -1656,6 +1662,7 @@ def coalesce(self, numPartitions, shuffle=False):
def coalesce(self, numPartitions, shuffle=False):
"""
Return a new RDD that is reduced into `numPartitions` partitions.
>>> sc.parallelize([1, 2, 3, 4, 5], 3).glom().collect()
[[1], [2, 3], [4, 5]]
>>> sc.parallelize([1, 2, 3, 4, 5], 3).coalesce(1).glom().collect()
@@ -1694,6 +1701,7 @@ def name(self):
def setName(self, name):
"""
Assign a name to this RDD.
>>> rdd1 = sc.parallelize([1,2])
>>> rdd1.setName('RDD1')
>>> rdd1.name()
@@ -1753,6 +1761,7 @@ class PipelinedRDD(RDD):

"""
Pipelined maps:
>>> rdd = sc.parallelize([1, 2, 3, 4])
>>> rdd.map(lambda x: 2 * x).cache().map(lambda x: 2 * x).collect()
[4, 8, 12, 16]
