In [1]:
# %load_ext autoreload
# %autoreload 2

### Import Libraries

In [2]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session named "h6" with a local[*] master, then
# keep a handle on its SparkContext for the cells that need one.
spark = (
    SparkSession.builder
    .appName("h6")
    .master('local[*]')
    .getOrCreate()
)
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Import Data

In [3]:
# Read the CSV from the local filesystem, treating the first row as the
# header and letting Spark infer the column types.
spark_df = spark.read.csv(
    "file:///root/code/pca.csv",
    header='true',
    inferSchema='true',
)

In [4]:
# Split into roughly 70% train / 30% test with a fixed seed.
tr_data, te_data = spark_df.randomSplit(weights=[0.7, 0.3], seed=22)

In [5]:
# Mark both splits for caching; they are reused by every model below.
tr_data, te_data = tr_data.cache(), te_data.cache()

In [6]:
import numpy as np

In [7]:
# w: number of fields in the first training row minus one.
# NOTE(review): presumably the subtracted field is the label column, leaving
# the feature count — confirm against the h6 model classes.
w = len(tr_data.first()) - 1
# m: number of rows in the training split (triggers a Spark count job).
m = tr_data.count()

**BGD**

In [8]:
from h6 import DistributedLRwithGD

In [9]:
# Construct the BGD model from w and m, then run `forward` on the training split.
# NOTE(review): `forward` lives in h6 (not shown); presumably it runs the
# training loop with its default settings — confirm.
bgd = DistributedLRwithGD(w, m)
bgd.forward(tr_data)

                                                                                

In [11]:
# BGD on the training split; the metric is whatever h6's `validate` returns.
bgd.validate(tr_data)

0.7616127833519137

In [12]:
# BGD on the held-out test split.
bgd.validate(te_data)

0.7635033354852593

**SGD**

In [13]:
from h6 import DistributedLRwithSGD

In [14]:
# Construct the SGD model and run `forward` on the training split.
# NOTE(review): the meaning of the third argument (0.1) is not visible here —
# TODO confirm in h6 whether it is a sampling fraction or a step size.
sgd = DistributedLRwithSGD(w, m, 0.1)
sgd.forward(tr_data)

In [15]:
# SGD on the training split.
sgd.validate(tr_data)

0.7627276105536975

In [16]:
# SGD on the held-out test split.
sgd.validate(te_data)

0.7626425650957607

**SGD with Hogwild**

In [17]:
from h6 import DistributedLRwithSGDHogwild
# from pyspark.util import InheritableThread

In [18]:
# Seed NumPy's global RNG in the driver process before building the model.
# NOTE(review): whether this seed also governs randomness inside Spark
# workers depends on the h6 implementation — confirm.
np.random.seed(22)
# Same constructor arguments as the plain SGD model above (w, m, 0.1).
sgdH = DistributedLRwithSGDHogwild(w, m, 0.1)

In [19]:
# Run the Hogwild SGD model's `forward` on the training split.
sgdH.forward(tr_data)

In [20]:
# Hogwild SGD on the training split.
sgdH.validate(tr_data)

0.7538089929394277

In [21]:
# Hogwild SGD on the held-out test split.
sgdH.validate(te_data)

0.7579083279535184

**Accumulator**

In [22]:
from pyspark.accumulators import AccumulatorParam

class VecAccumulator(AccumulatorParam):
    """Accumulator parameter that sums list-like vectors element-wise.

    Spark invokes both methods on an *instance* of this class, so each one
    must accept ``self``; without it the calls fail with ``TypeError``.
    """

    def zero(self, value):
        """Return the additive identity for ``value``.

        Args:
            value: The initial vector passed when the accumulator is created.

        Returns:
            A new list of ``0.0`` with the same length as ``value``.
        """
        return [0.0] * len(value)

    def addInPlace(self, val1, val2):
        """Add ``val2`` into ``val1`` element-wise and return ``val1``.

        Args:
            val1: Vector that is updated in place.
            val2: Vector to add; must be at least as long as ``val1``.

        Returns:
            ``val1`` after the update (the same object, mutated).

        Raises:
            IndexError: If ``val2`` is shorter than ``val1``.
        """
        for i in range(len(val1)):
            val1[i] += val2[i]
        return val1

**BGD with Broadcast**

In [23]:
from h6 import BroadcastLRwithGD

In [24]:
# Broadcast variant of the BGD model, built from the same w and m.
Bbgd = BroadcastLRwithGD(w, m)

In [25]:
# Unlike the non-broadcast models, `forward` here also takes the SparkContext.
# NOTE(review): presumably used to create broadcast variables — confirm in h6.
Bbgd.forward(sc, tr_data)

In [26]:
# Broadcast BGD on the training split.
Bbgd.validate(tr_data)

0.7616127833519137

In [27]:
# Broadcast BGD on the held-out test split.
Bbgd.validate(te_data)

0.7635033354852593

**Steepest Gradient Descent**

In [28]:
from h6 import LRwithSteepGD

In [29]:
# Steep-GD model built from the same w and m as the other models.
Stgd = LRwithSteepGD(w, m)

In [30]:
# Run `forward` on the training split with a second argument of 10.
# NOTE(review): presumably the iteration count — confirm in h6.
Stgd.forward(tr_data, 10)

In [31]:
# Steep GD on the training split.
Stgd.validate(tr_data)

0.7628205128205128

In [32]:
# Steep GD on the held-out test split.
Stgd.validate(te_data)

0.7622121799010114

**Comparison: plain GD with 10 iterations**

In [33]:
# Baseline for the steep-GD result: a fresh plain-GD model whose `forward`
# receives the same second argument (10) as the steep-GD run.
bgd10 = DistributedLRwithGD(w, m)
bgd10.forward(tr_data, 10)

In [34]:
# 10-iteration GD baseline on the training split.
bgd10.validate(tr_data)

0.6174284652545522

In [35]:
# 10-iteration GD baseline on the held-out test split.
bgd10.validate(te_data)

0.6176027544652464

**Steepest GD with Broadcast**

In [36]:
from h6 import BroadcastLRwithSteepGD

In [37]:
# Broadcast variant of the steep-GD model, built from the same w and m.
BStgd = BroadcastLRwithSteepGD(w, m)

In [38]:
# Takes the SparkContext, the training split, and 10 as the third argument.
# NOTE(review): presumably 10 is the iteration count — confirm in h6.
BStgd.forward(sc, tr_data, 10)

In [39]:
# Broadcast steep GD on the training split.
BStgd.validate(tr_data)

0.7628205128205128

In [40]:
# Broadcast steep GD on the held-out test split.
BStgd.validate(te_data)

0.7622121799010114

**End**

In [41]:
# Release the cached train/test splits now that all models are evaluated.
tr_data.unpersist()
# Last expression of the cell, so its return value is what gets displayed.
te_data.unpersist()

DataFrame[PC1: double, PC2: double, PC3: double, PC4: double, PC5: double, y: int]