# Create entry points to Spark

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Start (or reuse) a local SparkContext. SparkContext.getOrCreate() returns the
# already-running context when this cell is executed again, whereas calling the
# SparkContext(...) constructor a second time raises
# "ValueError: Cannot run multiple SparkContexts at once".
from pyspark import SparkConf, SparkContext
sc = SparkContext.getOrCreate(SparkConf().setMaster('local'))

# Build the SparkSession; getOrCreate() reuses an existing session (and the
# context created above) instead of starting a new one.
from pyspark.sql import SparkSession
spark = SparkSession.builder \
          .appName("Python Spark SQL basic example") \
          .config("spark.some.config.option", "some-value") \
          .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/02 17:46:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Linear regression without cross-validation

## Import data

In [3]:
# Load the advertising data: the first line of the file supplies the column
# names and the column types are inferred from the values.
ad = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('data/Advertising.csv')
)
# Preview the first five rows.
ad.show(n=5)

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3|  9.3|
|151.5| 41.3|     58.5| 18.5|
|180.8| 10.8|     58.4| 12.9|
+-----+-----+---------+-----+
only showing top 5 rows



## Transform data structure

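This is an alternative to `VectorAssembler()`: each row is mapped to a `features` vector built from its first three columns and a `label` taken from its last column.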

In [4]:
from pyspark.ml.linalg import Vectors


def _to_features_and_label(row):
    """Return [dense vector of the first three columns of `row`, last column of `row`]."""
    return [Vectors.dense(row[0:3]), row[-1]]


# Map every row to a (features, label) pair and name the two resulting columns.
ad_df = ad.rdd.map(_to_features_and_label).toDF(['features', 'label'])
# To visualize, features must be one-dimensional; use the first column only:
#ad_df = ad.rdd.map(lambda x: [Vectors.dense(x[0:1]), x[-1]]).toDF(['features', 'label'])
ad_df.show(n=5)

                                                                                

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows



[Stage 4:>                                                          (0 + 1) / 1]                                                                                

In [5]:
#x=[] 
#y=[]
#for i in ad_df.select("features").collect():
#    x.append(i[0][0])

#for i in ad_df.select("label").collect():
#    y.append(i[0])

In [6]:
#plt.figure(figsize = (10, 8))
#plt.scatter(x, y);

## Build linear regression model

In [7]:
from pyspark.ml.regression import LinearRegression

# Unfitted estimator: only the input (features) and target (label) column
# names are set explicitly; every other parameter keeps its default.
lr = LinearRegression().setFeaturesCol('features').setLabelCol('label')

## Fit the model

In [8]:
# Fit the linear regression estimator on the whole `ad_df` dataset.
lr_model = lr.fit(ad_df)

[Stage 5:>                                                          (0 + 1) / 1]                                                                                

23/05/02 17:46:56 WARN Instrumentation: [b7093648] regParam is zero, which might cause numerical instability and overfitting.


[Stage 6:>                                                          (0 + 1) / 1]

23/05/02 17:46:57 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/02 17:46:57 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/05/02 17:46:57 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

## Prediction

In [9]:
# Apply the fitted model to the same data it was trained on; the result
# carries an additional `prediction` column.
pred = lr_model.transform(ad_df)
# Preview the first five rows of predictions.
pred.show(5)

+-----------------+-----+------------------+
|         features|label|        prediction|
+-----------------+-----+------------------+
|[230.1,37.8,69.2]| 22.1| 20.52397440971517|
| [44.5,39.3,45.1]| 10.4|12.337854820894362|
| [17.2,45.9,69.3]|  9.3|12.307670779994238|
|[151.5,41.3,58.5]| 18.5| 17.59782951168913|
|[180.8,10.8,58.4]| 12.9|13.188671856831299|
+-----------------+-----+------------------+
only showing top 5 rows



## Model evaluation

In [10]:
from pyspark.ml.evaluation import RegressionEvaluator

# Compare the `prediction` column against the `label` column using R-squared.
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='r2',
)
# Last expression of the cell: the R-squared of the predictions.
evaluator.evaluate(pred)

                                                                                

0.897210638178952

# Linear regression with cross-validation

## Training and test datasets

In [11]:
# Randomly split the rows with weights 0.8 (training) and 0.2 (test); the
# explicit seed fixes the random sampling.
training, test = ad_df.randomSplit([0.8, 0.2], seed=123)

## Build cross-validation model

In [12]:
##===== build cross-validation model =====

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# estimator
lr = LinearRegression(featuresCol='features', labelCol='label')

# parameter grid: 3 x 3 = 9 combinations of the regularization strength
# (regParam) and the elastic-net mixing parameter (elasticNetParam)
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0, 0.5, 1])
    .addGrid(lr.elasticNetParam, [0, 0.5, 1])
    .build()
)

# evaluator: candidate models are ranked by R-squared
evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='r2')

# cross-validation model: 4 folds; the explicit seed makes the random fold
# assignment (and therefore the selected model) reproducible across runs
cv = CrossValidator(
    estimator=lr,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=4,
    seed=123,
)

## Fit cross-validation model

In [13]:
# Run the cross-validated parameter search on the training split only.
cv_model = cv.fit(training)

23/05/02 17:47:53 WARN Instrumentation: [b1fc64bb] regParam is zero, which might cause numerical instability and overfitting.
23/05/02 17:47:54 WARN Instrumentation: [398cf2ce] regParam is zero, which might cause numerical instability and overfitting.
23/05/02 17:47:54 WARN Instrumentation: [078938e9] regParam is zero, which might cause numerical instability and overfitting.
23/05/02 17:47:56 WARN Instrumentation: [d57d1278] regParam is zero, which might cause numerical instability and overfitting.
23/05/02 17:47:57 WARN Instrumentation: [f4657757] regParam is zero, which might cause numerical instability and overfitting.
23/05/02 17:47:57 WARN Instrumentation: [ec862b9c] regParam is zero, which might cause numerical instability and overfitting.
23/05/02 17:47:58 WARN Instrumentation: [458f73d9] regParam is zero, which might cause numerical instability and overfitting.
23/05/02 17:47:59 WARN Instrumentation: [612b80a4] regParam is zero, which might cause numerical instability and overf

## Prediction

In [14]:
# Generate predictions for both splits with the fitted cross-validation model.
pred_training_cv = cv_model.transform(training)
pred_test_cv = cv_model.transform(test)

## Evaluation

In [15]:
# Performance on the training data, measured as R-squared.
evaluator.setMetricName('r2').evaluate(pred_training_cv)

0.8952845631627804

In [16]:
# Performance on the held-out test data, measured as R-squared.
evaluator.setMetricName('r2').evaluate(pred_test_cv)

0.9013819610158471

## Intercept and coefficients

In [17]:
# Report the intercept and the coefficient vector of the model selected by
# cross-validation.
best_lr_model = cv_model.bestModel
print('Intercept: ', best_lr_model.intercept, "\n",
      'coefficients: ', best_lr_model.coefficients)

Intercept:  2.9592600706772934 
 coefficients:  [0.046137295249098154,0.19200356629524304,-0.006269704193266607]


## Get parameter values from the best model

The selected hyperparameter values can be read from the best model, for example through its underlying Java object.

In [18]:
# Read the selected hyperparameters through the model's public getters
# (available on fitted models since Spark 3.0) instead of reaching into the
# private `_java_obj` handle.
best_model = cv_model.bestModel
print('best regParam: ' + str(best_model.getRegParam()) + "\n" +
      'best ElasticNetParam:' + str(best_model.getElasticNetParam()))

best regParam: 0.0
best ElasticNetParam:0.0


In [19]:
#y_pred=[]
#for i in x:
#    y_pred.append(cv_model.bestModel.coefficients[0]*i+cv_model.bestModel.intercept)

#plt.figure(figsize = (10, 8))
#plt.scatter(x, y);
#plt.plot(x, y_pred,'r-');
