In [1]:
import os

# Ask the PySpark launcher to load the spark-csv package when it starts the JVM.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages com.databricks:spark-csv_2.10:1.1.0 pyspark-shell'
# Respect a SPARK_HOME that is already exported; only fall back to the
# original hard-coded install location when nothing is configured.
spark_home = os.environ.setdefault("SPARK_HOME", '/home/cheshire/spark/')

import sys

# Make the PySpark sources and the py4j archive under SPARK_HOME importable.
# The archive name is version-specific: adjust it if this Spark distribution
# ships a different py4j. The membership check keeps sys.path free of
# duplicates when the cell is executed more than once.
for spark_lib in (
        os.path.join(spark_home, "python"),
        os.path.join(spark_home, "python", "lib", "py4j-0.8.2.1-src.zip")):
    if spark_lib not in sys.path:
        sys.path.append(spark_lib)

import py4j
from pyspark import SparkContext,SparkConf,SQLContext

# Spark configuration for this demo: local master with two worker threads.
# NOTE(review): with a local[2] master the executor-memory / cores.max
# settings probably have no effect — confirm before relying on them.
conf = (SparkConf().setMaster("local[2]")
        .setAppName("ML demo")
        .set("spark.executor.memory", "2g")
        .set("spark.cores.max", "2"))

# getOrCreate() returns the already-running context if there is one, so
# re-executing this cell no longer fails because a SparkContext exists.
# On a fresh kernel it builds the context from `conf`, exactly as before.
sc = SparkContext.getOrCreate(conf)

# SQL entry point bound to the context above.
sqlCtx = SQLContext(sc)

In [2]:
# Display the version of the Spark installation this context is running on.
sc.version

u'1.5.0'

In [3]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from numpy import array
import random
import math

In [4]:
# Synthetic regression data: label = 25*x + 17 + uniform noise in [-10, 10],
# with a single feature x for x = 0 .. 99999.
# cache() matters here: the RDD is lazy and the map draws random noise, so
# without persistence every action (takeSample, take, each gradient pass)
# would regenerate the points with *different* labels and redo the work.
points = (sc.parallelize(range(100000))
   .map(lambda x: LabeledPoint(x*25+17+random.uniform(-10,10), [float(x)]))
   .cache())

In [5]:
# Look at 10 randomly chosen points, sampled without replacement (first argument False).
points.takeSample(False, 10)

[LabeledPoint(2034737.08514, [81389.0]),
 LabeledPoint(406994.669373, [16279.0]),
 LabeledPoint(293826.072539, [11752.0]),
 LabeledPoint(1217683.60407, [48707.0]),
 LabeledPoint(1001757.769, [40070.0]),
 LabeledPoint(1905722.73116, [76228.0]),
 LabeledPoint(1654993.16047, [66199.0]),
 LabeledPoint(1710859.82861, [68434.0]),
 LabeledPoint(1604484.89625, [64179.0]),
 LabeledPoint(414690.048177, [16587.0])]

In [6]:
# Split the labeled points into two parallel RDDs:
# x holds each point's feature vector, y holds the matching label.
x, y = (points.map(lambda pt: pt.features),
        points.map(lambda pt: pt.label))

In [7]:
# Peek at the first five feature vectors.
x.take(5)

[DenseVector([0.0]),
 DenseVector([1.0]),
 DenseVector([2.0]),
 DenseVector([3.0]),
 DenseVector([4.0])]

In [8]:
# Peek at the first five labels.
y.take(5)

[21.283261109702135,
 50.75498884962461,
 59.11946298258715,
 101.15115966717568,
 115.14982690022691]

In [9]:
# Settings for the hand-written gradient descent below.
numIters, gamma = 20, 1e-15   # number of passes over the data, step size
w = array([0])                # initial weight vector (one entry per feature)

In [10]:
# Batch gradient descent for the linear model w.x ~ y: every pass sums
# (w.x - y) * x over all points (the least-squares gradient up to a constant
# factor) and moves w against it by the step size gamma.

# The (features, label) pairing is the same in every iteration, so build it once.
samples = x.zip(y)
for i in range(numIters):
    # Index into the pair instead of using a tuple parameter
    # (`lambda (x, y): ...`): that form is Python-2-only syntax and it also
    # shadowed the x / y RDDs. fl[0] is the feature vector, fl[1] the label.
    grad = samples.map(lambda fl: (w.dot(fl[0]) - fl[1]) * fl[0]).sum()
    w = w - gamma*grad
    # Produces the same text as `print i, w, grad`, but is valid on Python 2 and 3.
    print("%s %s %s" % (i, w, grad))

0 [ 8.33329329] [-8.33329328683e+15]
1 [ 13.88886381] [-5.55557052421e+15]
2 [ 17.59260546] [-3.70374164507e+15]
3 [ 20.06178496] [-2.46917950421e+15]
4 [ 21.70791697] [-1.64613201214e+15]
5 [ 22.80534652] [-1.09742954672e+15]
6 [ 23.53697204] [-7.31625517382e+14]
7 [ 24.02472603] [-4.87753994351e+14]
8 [ 24.34989757] [-3.25171540389e+14]
9 [ 24.56668047] [-2.16782897158e+14]
10 [ 24.71120361] [-1.44523143566e+14]
11 [ 24.80755274] [-9.63491262522e+13]
12 [ 24.87178597] [-6.42332280374e+13]
13 [ 24.91460851] [-4.28225398808e+13]
14 [ 24.94315674] [-2.85482379429e+13]
15 [ 24.96218905] [-1.9032310625e+13]
16 [ 24.97487725] [-1.26881971057e+13]
17 [ 24.98333621] [-8.45896040766e+12]
18 [ 24.98897585] [-5.63963610556e+12]
19 [ 24.99273538] [-3.75952726609e+12]
