In [1]:
# Display the active SparkContext. `sc` is not created in any visible cell, so it is
# presumably injected by the PySpark kernel/shell — TODO confirm before running this
# notebook on a plain Python kernel.
sc

<pyspark.context.SparkContext at 0x7f2c484c9710>

In [2]:
# Build a SQLContext on top of the existing SparkContext.
# NOTE(review): `sqlc` is not referenced by any later visible cell.
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

In [3]:
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.regression import LabeledPoint

## Vectors - Dense and Sparse

In [4]:
# Dense vector built from separate float arguments; the bare name on the last
# line displays it as the cell output.
vd1 = Vectors.dense(1.0, 0.0, 3.0)
vd1

DenseVector([1.0, 0.0, 3.0])

In [5]:
# Dense vector built from a single Python list instead of separate arguments.
vd2 = Vectors.dense([4.0, 10.0, 6.0])
vd2

DenseVector([4.0, 10.0, 6.0])

In [6]:
# Sparse vector of size 3 given as parallel lists: positions [0, 2] hold the
# values [12.0, 30.0]; position 1 is implicitly zero.
vs1 = Vectors.sparse(3, [0, 2], [12.0, 30.0])
vs1

SparseVector(3, {0: 12.0, 2: 30.0})

In [7]:
# Sparse vector of size 3 again, this time specified as an {index: value} dict.
vs2 = Vectors.sparse(3, {0: 5.0, 2: 7.0})
vs2

SparseVector(3, {0: 5.0, 2: 7.0})

## Labeled Points

In [8]:
# Pair the label 1.0 with the dense feature vector vd1.
LabeledPoint(1.0, vd1)

LabeledPoint(1.0, [1.0,0.0,3.0])

In [9]:
# Labeled point with sparse features; its repr prints the vector as
# (size, indices, values).
LabeledPoint(0.0, vs1)

LabeledPoint(0.0, (3,[0,2],[12.0,30.0]))

## Vector Operations

In [10]:
# Dense-dense dot product: 1*4 + 0*10 + 3*6 = 22.
vd1.dot(vd2)

22.0

In [11]:
# Dense-sparse dot product; only the sparse vector's stored positions
# contribute: 1*5 + 3*7 = 26.
vd1.dot(vs2)

26.0

In [12]:
# 2-norm (Euclidean length) of vd1: sqrt(1 + 0 + 9) = sqrt(10).
vd1.norm(2)

3.1622776601683795

In [13]:
# Squared Euclidean distance: (1-4)^2 + (0-10)^2 + (3-6)^2 = 9 + 100 + 9 = 118.
vd1.squared_distance(vd2)

118.0

In [14]:
# NOTE(review): repeats the notebook's first cell (same SparkContext object is
# displayed). Looks like a leftover from joining two notebooks — candidate for removal.
sc

<pyspark.context.SparkContext at 0x7f2c484c9710>

In [15]:
# NOTE(review): duplicate of the earlier SQLContext cell — re-imports SQLContext
# and rebinds `sqlc` to a new instance. `sqlc` is not used in the visible cells;
# candidate for removal.
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Local Matrices

In [16]:
from pyspark.rdd import RDD
from pyspark.mllib.linalg import Vectors, Vector, Matrix, Matrices

In [17]:
# 3x2 local dense matrix. The flat value list is read column by column, so the
# columns are [1, 3, 5] and [2, 4, 6]; integer inputs are stored as floats.
md = Matrices.dense(3, 2, [1, 3, 5, 2, 4, 6])
md

DenseMatrix(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0], False)

In [18]:
# 5x4 local sparse matrix in compressed-sparse-column form:
#   column pointers [0, 0, 1, 2, 2] (one more entry than there are columns),
#   row indices     [1, 2],
#   values          [34, 55].
# Columns 0 and 3 are empty; 34 sits at (row 1, col 1) and 55 at (row 2, col 2).
ms = Matrices.sparse(5, 4, [0,0,1,2,2], [1,2], [34,55])
ms

SparseMatrix(5, 4, [0, 0, 1, 2, 2], [1, 2], [34.0, 55.0], False)

In [19]:
# Expand to a dense matrix and show its flat, column-major value array:
# 34 lands at index 6 (= col 1 * 5 rows + row 1) and 55 at index 12 (= 2*5 + 2).
ms.toDense().values

array([  0.,   0.,   0.,   0.,   0.,   0.,  34.,   0.,   0.,   0.,   0.,
         0.,  55.,   0.,   0.,   0.,   0.,   0.,   0.,   0.])

## Distributed Matrices

In [20]:
from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry, CoordinateMatrix

### RowMatrix

In [21]:
# Distribute three 2-element dense vectors as an RDD; it is used below as the
# row source of a RowMatrix.
rows = sc.parallelize([Vectors.dense(1.0,2.0), 
                       Vectors.dense(4.0,5.0), 
                       Vectors.dense(7.0,8.0)])

In [22]:
# Bring the RDD's elements back as a Python list to inspect them
# (fine here: there are only three vectors).
rows.collect()

[DenseVector([1.0, 2.0]), DenseVector([4.0, 5.0]), DenseVector([7.0, 8.0])]

In [23]:
# Wrap the RDD of vectors as a distributed RowMatrix (one matrix row per vector).
mat = RowMatrix(rows)

In [24]:
# Matrix dimensions: one row per vector in `rows`, one column per vector element.
# The call form print(x) with a single argument produces the same output on
# Python 2 and Python 3, whereas the statement form is a SyntaxError on Python 3.
print(mat.numRows())
print(mat.numCols())

3
2


In [25]:
# Covariance matrix of the two columns ([1, 4, 7] and [2, 5, 8]); the result is a
# local 2x2 DenseMatrix and every entry is 9.0 for this data.
mat.computeCovariance()

DenseMatrix(2, 2, [9.0, 9.0, 9.0, 9.0], 0)

### IndexedRowMatrix

In [26]:
# RDD of IndexedRow objects: each pairs an explicit row index (0, 1, 2) with a
# dense vector holding that row's values.
idx_rows = sc.parallelize([IndexedRow(0,Vectors.dense(1.0,2.0)), 
                           IndexedRow(1,Vectors.dense(4.0,5.0)), 
                           IndexedRow(2,Vectors.dense(7.0,8.0))])

In [27]:
# Displaying the RDD object itself shows only its repr, not its elements.
idx_rows

ParallelCollectionRDD[5] at parallelize at PythonRDD.scala:475

In [28]:
# Shell escape: recursively deletes the `metastore_db/` directory in the current
# working directory before the matrix is built.
# NOTE(review): presumably a workaround for a stale local metastore left by an
# earlier session that breaks IndexedRowMatrix construction — TODO confirm, and
# consider moving this destructive command to a dedicated setup cell.
!rm -rf metastore_db/ 

# Wrap the RDD of IndexedRow objects as a distributed IndexedRowMatrix.
idx_mat = IndexedRowMatrix(idx_rows)

In [29]:
# `.rows` exposes the matrix's rows as an RDD; again only the RDD repr is shown.
idx_mat.rows

PythonRDD[20] at RDD at PythonRDD.scala:48

In [30]:
# Collect the rows to see the actual IndexedRow(index, vector) elements.
idx_mat.rows.collect()

[IndexedRow(0, [1.0,2.0]), IndexedRow(1, [4.0,5.0]), IndexedRow(2, [7.0,8.0])]

### CoordinateMatrix

In [31]:
# RDD of MatrixEntry(row, col, value) triples; only these three cells are given
# explicitly.
entries = sc.parallelize([MatrixEntry(0,0,9.0),
                          MatrixEntry(1,1,8.0),
                          MatrixEntry(2,1,6.0)])

In [32]:
# Build a distributed CoordinateMatrix from the RDD of entries.
coord_mat = CoordinateMatrix(entries)

In [33]:
# Convert to an IndexedRowMatrix and collect its rows: each row comes back as a
# sparse vector of size 2, printed as (size, indices, values).
coord_mat.toIndexedRowMatrix().rows.collect()

[IndexedRow(0, (2,[0],[9.0])),
 IndexedRow(1, (2,[1],[8.0])),
 IndexedRow(2, (2,[1],[6.0]))]