In [1]:
# findspark locates the local Spark installation and makes `pyspark` importable;
# it has to run before any `pyspark` import below.
import findspark
findspark.init()

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session running on 4 worker threads.
# The session is bound to `spark` -- the name the later cells use -- instead of
# `pyspark`, which shadowed the package name and left `spark` undefined.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("LinearRegression")
    # The key was misspelled "spark.executer.memory", which Spark accepts but never reads.
    .config("spark.executor.memory", "3g")
    .config("spark.driver.memory", "3g")
    .getOrCreate()
)

# Low-level context handle, kept for RDD-based code.
sc = spark.sparkContext

### Reading the dataset

In [7]:
# Load the advertising CSV: the first row supplies the column names and the
# column types are inferred from the data.
ad_df = spark.read.csv(
    "data/Advertising.csv",
    header=True,
    inferSchema=True,
    sep=",",
)

In [8]:
# Preview the first five rows. Limiting on the Spark side avoids collecting the
# entire DataFrame to the driver just to display a handful of rows.
ad_df.limit(5).toPandas()

Unnamed: 0,_c0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


## 1. Data Exploring 

#### TV, radio and newspaper ad spend are summed into a single advertising total

In [21]:
# Collapse the three per-channel spend columns into one total-spend feature.
# The CSV column is lowercase "sales" (see the preview of ad_df); naming it
# explicitly means the rename no longer depends on Spark's case-insensitive
# column resolution, under which "Sales" -> "Sales" only happened to work.
ad_df2 = (
    ad_df
    .withColumn("Ads Total", ad_df.TV + ad_df.radio + ad_df.newspaper)
    .withColumnRenamed("sales", "Sales")
    .drop("TV", "radio", "newspaper")
)

# Preview five rows without collecting the whole DataFrame to the driver.
ad_df2.limit(5).toPandas()

Unnamed: 0,_c0,Sales,Ads Total
0,1,22.1,337.1
1,2,10.4,128.9
2,3,9.3,132.4
3,4,18.5,251.3
4,5,12.9,250.0


In [24]:
# Summary statistics (count, mean, stddev, min, max) for the label and the feature.
ad_df2.select("Sales", "Ads Total").describe().toPandas().head()

Unnamed: 0,summary,Sales,Ads Total
0,count,200.0,200.0
1,mean,14.022500000000004,200.86049999999992
2,stddev,5.217456565710477,92.9851805869837
3,min,1.6,11.7
4,max,27.0,433.6


We have now checked our dataset. It has no categorical features, so we don't need StringIndexer or OneHotEncoder. Preprocessing starts with VectorAssembler.

## 2. Data Preparation

### 2.1 Transforming by VectorAssembler

In [34]:
from pyspark.ml.feature import VectorAssembler

# Pack the single numeric predictor into the "features" vector column that the
# regression stage reads.
vector_assembler = VectorAssembler(inputCols=["Ads Total"], outputCol="features")

### 2.2 Creating of Regression Model 

In [35]:
from pyspark.ml.regression import LinearRegression

# Linear regression of "Sales" on the assembled feature vector; all other
# hyper-parameters are left at their defaults.
linear_reg = LinearRegression(featuresCol="features", labelCol="Sales")

### 2.3 Using of Pipeline

In [32]:
from pyspark.ml import Pipeline

# Chain feature assembly and the estimator so they are fitted and applied as one unit.
pipeline = Pipeline(stages=[vector_assembler, linear_reg])

### 2.4 Splitting Train-Test dataset

In [33]:
# 80/20 train/test split; the fixed seed keeps the split repeatable across runs.
train_df, test_df = ad_df2.randomSplit(weights=[0.8, 0.2], seed=142)

### 2.5 Model Training

In [36]:
# Fit every pipeline stage (assembler, then regression) on the training split only.
pipeline_model = pipeline.fit(train_df)

### 2.6 Model Testing

In [39]:
# Score the held-out split; the output gains the "features" and "prediction" columns.
result_df = pipeline_model.transform(test_df)

In [40]:
# Show the first scored rows (`headd` was a typo that raises AttributeError).
result_df.toPandas().head()

Unnamed: 0,_c0,Sales,Ads Total,features,prediction
0,3,9.3,132.4,[132.39999999999998],10.791216
1,6,7.2,132.6,[132.6],10.800663
2,9,4.8,11.7,[11.7],5.089785
3,10,10.6,223.6,[223.6],15.099174
4,17,12.5,218.4,[218.4],14.853545


In [41]:
# List the fitted stages, in pipeline order (assembler first, regression model second).
pipeline_model.stages

[VectorAssembler_94014994f03d, LinearRegression_cb22fe86a40b]

In [42]:
# The fitted regression model is the final stage of the two-stage pipeline.
lr_model = pipeline_model.stages[-1]

In [43]:
# Fitted slope(s): one coefficient per input feature -- here only "Ads Total" (B1).
lr_model.coefficients

DenseVector([0.0472])

In [44]:
# Fitted intercept (B0 in y = B0 + B1*X).
lr_model.intercept

4.537119328969264

#### Our model explains about 72 percent of the variability in Sales (R² on the training data)

In [45]:
# R-squared taken from the model's training summary.
lr_model.summary.r2

0.7204575410444246

In [48]:
# p-values from the training summary. Two values come back for the single
# feature -- presumably the coefficient first and the intercept last
# (NOTE(review): confirm ordering against the Spark ML docs).
lr_model.summary.pValues

[0.0, 3.1086244689504383e-15]

In [50]:
# t-statistics from the training summary; presumably in the same order as the
# p-values (NOTE(review): confirm against the Spark ML docs).
lr_model.summary.tValues

[19.986932892719693, 8.769612805482973]

#### RMSE value --> it is lower than the standard deviation of Sales (≈5.22), so the model does better than always predicting the mean

In [51]:
# Root mean squared error taken from the model's training summary.
lr_model.summary.rootMeanSquaredError

2.678303634403656

## 3. Linear Regression Prediction

Linear Regression ---> y = B0 + B1*X

B0 = 4.5371193 

B1 = 0.0472

y = 4.537 + 0.0472* AdsTotal 

---> What will sales be if 100.0 is spent on advertising? (Ads Total = 100.0)


In [78]:
# One-row input for the prediction: wrap the scalar in a 1-tuple so that it
# becomes a single-column row, then name that column like the training feature.
df_predict_rdd = spark.sparkContext.parallelize([100.0])
predict_rows = df_predict_rdd.map(lambda value: (value,))
predict_df = spark.createDataFrame(predict_rows, ["Ads Total"])

In [79]:
# Print the one-row input frame to confirm the column name and value.
predict_df.show()

+---------+
|Ads Total|
+---------+
|    100.0|
+---------+



In [81]:
# Reuse the assembler defined for the pipeline to add the "features" vector
# column that the regression model reads.
vector_df = vector_assembler.transform(predict_df)

In [None]:
# Hand-check of y = B0 + B1 * X at X = 100.0, using the fitted parameters.
# The original cell used rounded constants and then overwrote the result with a
# hard-coded 9.2, although 4.537 + 0.0472 * 100.0 is about 9.257.
y = lr_model.intercept + lr_model.coefficients[0] * 100.0
y

In [82]:
# Let the fitted model score the assembled row; the "prediction" column holds the estimate.
lr_model.transform(vector_df).toPandas().head()

Unnamed: 0,Ads Total,features,prediction
0,100.0,[100.0],9.260757
