In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--conf spark.sql.catalogImplementation=in-memory pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.2
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from numpy import array
import random

In [3]:
def gen_point(i):
    x1 = random.uniform(-100,100)
    x2 = random.uniform(-20,60)
    return LabeledPoint(x1 * 10.0 + x2 * 25 + random.uniform(-2,2), [x1, x2])

points = sc.parallelize(range(100000)).map(gen_point)

In [4]:
points.takeSample(False, 10)

[LabeledPoint(1045.3687323617437, [9.989702114274678,37.8834649301104]),
 LabeledPoint(-861.2328368839092, [-73.94622253366974,-4.815575435722215]),
 LabeledPoint(1175.1804739227186, [78.05962913215089,15.75845737278619]),
 LabeledPoint(1892.9646003244372, [86.28148056034837,41.22765255487302]),
 LabeledPoint(1341.3478674754544, [50.156191148039255,33.54443176166531]),
 LabeledPoint(1100.2469136647467, [19.55681522777077,36.1508956145733]),
 LabeledPoint(1828.942237069067, [89.07555717521817,37.53444120026234]),
 LabeledPoint(-292.78017106731767, [-50.637567209638235,8.498824561278234]),
 LabeledPoint(-138.02836578152443, [-34.306488397251826,8.174382014414618]),
 LabeledPoint(1602.7332661927396, [23.82022532450756,54.530313886252145])]

## Do it the easy way

In [5]:
model = LinearRegressionWithSGD.train(points, 
            iterations=100, step=1e-4, intercept=True)

In [6]:
model

(weights=[9.976290677363732,20.70170508262199], intercept=1.4439299695365633)

## Do it the hard way

In [5]:
weights = array([0, 0])
step = 1e-9

In [6]:
x = points.map(lambda p: p.features)

In [7]:
y = points.map(lambda p: p.label)

In [8]:
x.take(5)

[DenseVector([-65.7918, 55.1743]),
 DenseVector([-33.6416, 19.6272]),
 DenseVector([-50.3937, -16.2745]),
 DenseVector([-35.947, 6.919]),
 DenseVector([88.8145, 31.7886])]

In [9]:
y.take(5)

[-140.2251203501606,
 859.1893897753312,
 699.240988008658,
 869.5942206335931,
 1414.2990293729565]

In [10]:
prediction = x.map(lambda point: point.dot(weights))

In [11]:
prediction.take(5)

[0.0, 0.0, 0.0, 0.0, 0.0]

In [12]:
gradient = x.zip(y).map(lambda xy: (xy[0].dot(weights) - xy[1]) * xy[0])

In [13]:
gradient.take(5)

[DenseVector([370.2101, -945.1173]),
 DenseVector([16326.1865, -36005.2544]),
 DenseVector([8922.5447, -23150.5864]),
 DenseVector([-39944.1573, -14274.4316]),
 DenseVector([-68287.7384, -52683.5141])]

In [14]:
gradient_average = gradient.mean()

In [15]:
weights = weights - step * gradient_average

In [16]:
weights

array([3.35182652e-05, 2.33382322e-05])

In [17]:
iterations = 20
weights = array([0, 0])
step = 1e-4

In [18]:
for i in range(iterations):
    gradient = x.zip(y).map(lambda xy: (xy[0].dot(weights) - xy[1]) * xy[0]).mean()
    weights = weights - step * gradient
    mse = x.zip(y).map(lambda xy: (xy[0].dot(weights) - xy[1]) ** 2).mean()
    print(i, weights, mse)

0 [3.35182652 2.33382322] 627666.002275555
1 [5.58419883 4.44817586] 459431.56725968904
2 [7.07062439 6.36423042] 352641.30401024123
3 [8.06002392 8.10094066] 278835.13536749297
4 [8.7182852  9.67533085] 224361.14399650635
5 [ 9.1559576  11.10273046] 182324.3834678253
6 [ 9.44670976 12.39696864] 148978.78048046472
7 [ 9.63963105 13.57053836] 122098.26973536759
8 [ 9.76743024 14.63473708] 100231.73383566189
9 [ 9.85189951 15.59978893] 82354.34383443753
10 [ 9.9075561  16.47495185] 67698.12116435009
11 [ 9.9440693  17.26861238] 55664.69378070585
12 [ 9.96787792 17.98836994] 45776.68892692989
13 [ 9.98326825 18.64111231] 37648.03712468688
14 [ 9.99309243 19.23308326] 30964.115052335626
15 [ 9.99924723 19.76994344] 25467.442085875446
16 [10.00299303 20.25682534] 20946.816999393148
17 [10.00516644 20.69838283] 17228.78326595479
18 [10.00632197 21.09883604] 14170.787120216513
19 [10.00682638 21.46201187] 11655.629077069301


In [19]:
sc.stop()