In [28]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
!ls

gdrive	sample_data


In [0]:
DIR = '/content/gdrive/My Drive/Spark/'

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

In [10]:
!ls

gdrive	sample_data  spark-2.4.4-bin-hadoop2.7	spark-2.4.4-bin-hadoop2.7.tgz


In [0]:
import findspark
findspark.init("spark-2.4.4-bin-hadoop2.7")# SPARK_HOME
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
import numpy as np
import pandas as pd

# load data

from sklearn.datasets import load_boston

boston = load_boston()

In [0]:
boston_pd = pd.DataFrame(data = np.c_[boston['data'],boston['target']],
                         columns = np.append(boston['feature_names'],'target')).sample(frac=1)

In [14]:
print(boston_pd.shape)
boston_pd.head(10)

(506, 14)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
300,0.04417,70.0,2.24,0.0,0.4,6.871,47.4,7.8278,5.0,358.0,14.8,390.86,6.07,24.8
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88,11.9
345,0.03113,0.0,4.39,0.0,0.442,6.014,48.5,8.0136,3.0,352.0,18.8,385.64,10.53,17.5
493,0.17331,0.0,9.69,0.0,0.585,5.707,54.0,2.3817,6.0,391.0,19.2,396.9,12.01,21.8
485,3.67367,0.0,18.1,0.0,0.583,6.312,51.9,3.9917,24.0,666.0,20.2,388.62,10.58,21.2
218,0.11069,0.0,13.89,1.0,0.55,5.951,93.8,2.8893,5.0,276.0,16.4,396.9,17.92,21.5
173,0.09178,0.0,4.05,0.0,0.51,6.416,84.1,2.6463,5.0,296.0,16.6,395.5,9.04,23.6
251,0.21409,22.0,5.86,0.0,0.431,6.438,8.9,7.3967,7.0,330.0,19.1,377.07,3.59,24.8
239,0.09252,30.0,4.93,0.0,0.428,6.606,42.2,6.1899,6.0,300.0,16.6,383.78,7.37,23.3
439,9.39063,0.0,18.1,0.0,0.74,5.627,93.9,1.8172,24.0,666.0,20.2,396.9,22.88,12.8


In [15]:
from sklearn.linear_model import LinearRegression
from scipy.stats.stats import pearsonr

y = boston_pd['target']
X = boston_pd.drop('target', axis=1)

X_train = X[:400]
X_test = X[400:]
y_train = y[:400]
y_test = y[400:]

linear = LinearRegression()
linear.fit(X_train, y_train)

y_pred = linear.predict(X_test)

# error metrics
r = pearsonr(y_pred, y_test)
mae = sum(abs(y_pred - y_test))/len(y_test)
print("R-sqaured: " + str(r[0]**2))
print("MAE: " + str(mae))

R-sqaured: 0.6881658441577952
MAE: 3.6765852900103853


In [0]:
from pyspark.ml.feature import VectorAssembler

boston_sp = spark.createDataFrame(boston_pd)

In [17]:
display(boston_sp.take(5))

[Row(CRIM=0.04417, ZN=70.0, INDUS=2.24, CHAS=0.0, NOX=0.4, RM=6.871, AGE=47.4, DIS=7.8278, RAD=5.0, TAX=358.0, PTRATIO=14.8, B=390.86, LSTAT=6.07, target=24.8),
 Row(CRIM=0.04741, ZN=0.0, INDUS=11.93, CHAS=0.0, NOX=0.573, RM=6.03, AGE=80.8, DIS=2.505, RAD=1.0, TAX=273.0, PTRATIO=21.0, B=396.9, LSTAT=7.88, target=11.9),
 Row(CRIM=0.03113, ZN=0.0, INDUS=4.39, CHAS=0.0, NOX=0.442, RM=6.014, AGE=48.5, DIS=8.0136, RAD=3.0, TAX=352.0, PTRATIO=18.8, B=385.64, LSTAT=10.53, target=17.5),
 Row(CRIM=0.17331, ZN=0.0, INDUS=9.69, CHAS=0.0, NOX=0.585, RM=5.707, AGE=54.0, DIS=2.3817, RAD=6.0, TAX=391.0, PTRATIO=19.2, B=396.9, LSTAT=12.01, target=21.8),
 Row(CRIM=3.67367, ZN=0.0, INDUS=18.1, CHAS=0.0, NOX=0.583, RM=6.312, AGE=51.9, DIS=3.9917, RAD=24.0, TAX=666.0, PTRATIO=20.2, B=388.62, LSTAT=10.58, target=21.2)]

In [0]:
boston_train = spark.createDataFrame(boston_pd[:400])
boston_test = spark.createDataFrame(boston_pd[400:])

assembler = VectorAssembler(inputCols= boston_train.schema.names
                            [:(boston_pd.shape[1]-1)],outputCol='features')

In [0]:
boston_train = assembler.transform(boston_train).select('features', 'target') 
boston_test = assembler.transform(boston_test).select('features', 'target') 

In [20]:
display(boston_train.take(5))

[Row(features=DenseVector([0.0442, 70.0, 2.24, 0.0, 0.4, 6.871, 47.4, 7.8278, 5.0, 358.0, 14.8, 390.86, 6.07]), target=24.8),
 Row(features=DenseVector([0.0474, 0.0, 11.93, 0.0, 0.573, 6.03, 80.8, 2.505, 1.0, 273.0, 21.0, 396.9, 7.88]), target=11.9),
 Row(features=DenseVector([0.0311, 0.0, 4.39, 0.0, 0.442, 6.014, 48.5, 8.0136, 3.0, 352.0, 18.8, 385.64, 10.53]), target=17.5),
 Row(features=DenseVector([0.1733, 0.0, 9.69, 0.0, 0.585, 5.707, 54.0, 2.3817, 6.0, 391.0, 19.2, 396.9, 12.01]), target=21.8),
 Row(features=DenseVector([3.6737, 0.0, 18.1, 0.0, 0.583, 6.312, 51.9, 3.9917, 24.0, 666.0, 20.2, 388.62, 10.58]), target=21.2)]

In [21]:
from pyspark.ml.regression import LinearRegression

linear = LinearRegression(maxIter = 10, regParam=0.1,
                          elasticNetParam = 0.5, labelCol='target')

model = linear.fit(boston_train)
boston_pred = model.transform(boston_test)

r = boston_pred.stat.corr('prediction','target')
print('R-squared:' + str(r**2))

R-squared:0.6838911265390709


In [22]:
display(boston_pred.take(5))

[Row(features=DenseVector([4.0384, 0.0, 18.1, 0.0, 0.532, 6.229, 90.7, 3.0993, 24.0, 666.0, 20.2, 395.33, 12.87]), target=19.6, prediction=22.04745996058116),
 Row(features=DenseVector([0.0839, 0.0, 12.83, 0.0, 0.437, 5.874, 36.6, 4.5026, 5.0, 398.0, 18.7, 396.06, 9.1]), target=20.3, prediction=22.416722588491734),
 Row(features=DenseVector([0.0716, 0.0, 25.65, 0.0, 0.581, 6.004, 84.1, 2.1974, 2.0, 188.0, 19.1, 377.67, 14.27]), target=20.3, prediction=21.10472343071149),
 Row(features=DenseVector([0.4357, 0.0, 10.59, 1.0, 0.489, 5.344, 100.0, 3.875, 4.0, 277.0, 18.6, 396.9, 23.09]), target=20.0, prediction=17.75514265690711),
 Row(features=DenseVector([0.069, 0.0, 2.18, 0.0, 0.458, 7.147, 54.2, 6.0622, 3.0, 222.0, 18.7, 396.9, 5.33]), target=36.2, prediction=28.43188721570101)]

In [23]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

crossval = CrossValidator(estimator=LinearRegression(labelCol = "target"),  
                         estimatorParamMaps=ParamGridBuilder().addGrid(
                           LinearRegression.elasticNetParam, [0, 0.5, 1.0]).build(),
                         evaluator=RegressionEvaluator(
                           labelCol = "target", metricName = "r2"),
                         numFolds=10)

# cross validate the model and select the best fit
cvModel = crossval.fit(boston_train) 
model = cvModel.bestModel

# calculate results 
boston_pred = model.transform(boston_test)
r = boston_pred.stat.corr("prediction", "target")
print("R-sqaured: " + str(r**2))

R-sqaured: 0.6881658441577725


In [0]:
# sklearn version 
from sklearn.ensemble import RandomForestRegressor as RFR
from multiprocessing.pool import ThreadPool

# allow up to 5 concurrent threads
pool = ThreadPool(5)

# hyperparameters to test out (n_trees)
parameters = [10, 20, 50]

# define a function to train a RF model and return metrics 
def sklearn_random_forest(trees, X_train, X_test, y_train, y_test):

    # train a random forest regressor with the specified number of trees
    rf= RFR(n_estimators = trees)
    model = rf.fit(X_train, y_train)

    # make predictions
    y_pred = model.predict(X_test)
    r = pearsonr(y_pred, y_test)

    # return the number of trees, and the R value 
    return [trees, r[0]**2]  

  

In [25]:
# run the tasks 
pool.map(lambda trees: sklearn_random_forest(trees, X_train,
                                           X_test, y_train, y_test), parameters)

[[10, 0.8392271491578889], [20, 0.8785870074065042], [50, 0.8893608363223877]]

In [26]:
from pyspark.ml.regression import RandomForestRegressor

def mllib_random_forest(trees, boston_train, boston_test):

  rf = RandomForestRegressor(numTrees = trees, labelCol='target')
  model = rf.fit(boston_train)

  boston_pred = model.transform(boston_test)
  r = boston_pred.stat.corr('prediction','target')

  return [trees, r**2]

# run the tasks 
pool.map(lambda trees: mllib_random_forest(trees, boston_train, boston_test), parameters)


[[10, 0.9088046609171109], [20, 0.9009897943400009], [50, 0.9033009014714433]]