<a href="https://colab.research.google.com/github/consultantleonardoferreira/machine-learning-with-pyspark/blob/master/lrmdm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Tutorial para instalação do PySpark in Google Colab**  - V.1 - 17/03/2020 - 
V.2  - 22/03/2020 

---

---

**Passo 1 - Instalação das dependências para funcionamento do PySpark**

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

**Passo 2 - Configuração das Variáveis de Ambiente**

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

**Passo 3 - Rodar o Spark localmente para testar nossa instalação**

In [0]:
import findspark 
findspark.init('spark-2.4.4-bin-hadoop2.7')
from pyspark.sql import SparkSession
sc = SparkSession.builder.master('local[*]').getOrCreate()

** Passo 4 - Instalando o pacote Pyspark**

In [66]:
!pip install pyspark



**Linear Regression using Pyspark**

In [0]:
#import Linear Regression from spark's MLlib
from pyspark.ml.regression import LinearRegression

In [8]:
# Carregando o dataset no CO
from google.colab import files
files.upload()

Saving Linear_regression_dataset.csv to Linear_regression_dataset (1).csv


{'Linear_regression_dataset.csv': b'var_1,var_2,var_3,var_4,var_5,output\n734,688,81,0.328,0.259,0.418\n700,600,94,0.32,0.247,0.389\n712,705,93,0.311,0.247,0.417\n734,806,69,0.315,0.26,0.415\n613,759,61,0.302,0.24,0.378\n748,676,85,0.318,0.255,0.422\n669,588,97,0.315,0.251,0.411\n667,845,68,0.324,0.251,0.381\n758,890,64,0.33,0.274,0.436\n726,670,88,0.335,0.268,0.422\n583,794,55,0.302,0.236,0.371\n676,746,72,0.317,0.265,0.4\n767,699,89,0.332,0.274,0.433\n637,597,86,0.317,0.252,0.374\n609,724,69,0.308,0.244,0.382\n776,733,83,0.325,0.259,0.437\n701,832,66,0.325,0.26,0.39\n650,709,74,0.316,0.249,0.386\n804,668,95,0.337,0.265,0.453\n713,614,94,0.31,0.238,0.404\n684,680,81,0.317,0.255,0.4\n651,674,79,0.304,0.243,0.395\n651,710,76,0.319,0.247,0.38\n619,651,75,0.296,0.234,0.369\n718,649,94,0.327,0.269,0.397\n765,648,88,0.338,0.271,0.421\n697,577,90,0.317,0.24,0.394\n808,707,93,0.334,0.273,0.446\n716,784,73,0.309,0.245,0.407\n731,594,98,0.322,0.261,0.428\n731,662,94,0.322,0.25,0.413\n641,605,89

In [0]:
#Load the dataset
df = sc.read.csv("Linear_regression_dataset.csv",inferSchema=True,header=True)

In [15]:
#validate the size of data
print((df.count(), len(df.columns)))

(1232, 6)


In [16]:
#explore the data
df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)



In [18]:
#view statistical measures of data 
df.describe().show(5,False)

+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|summary|var_1            |var_2            |var_3             |var_4               |var_5               |output             |
+-------+-----------------+-----------------+------------------+--------------------+--------------------+-------------------+
|count  |1232             |1232             |1232              |1232                |1232                |1232               |
|mean   |715.0819805194806|715.0819805194806|80.90422077922078 |0.3263311688311693  |0.25927272727272715 |0.39734172077922014|
|stddev |91.5342940441652 |93.07993263118064|11.458139049993724|0.015012772334166148|0.012907228928000298|0.03326689862173776|
|min    |463              |472              |40                |0.277               |0.214               |0.301              |
|max    |1009             |1103             |116               |0.373               |0.294               |0.491

In [20]:
#sneak into the dataset
df.head(10)

[Row(var_1=734, var_2=688, var_3=81, var_4=0.328, var_5=0.259, output=0.418),
 Row(var_1=700, var_2=600, var_3=94, var_4=0.32, var_5=0.247, output=0.389),
 Row(var_1=712, var_2=705, var_3=93, var_4=0.311, var_5=0.247, output=0.417),
 Row(var_1=734, var_2=806, var_3=69, var_4=0.315, var_5=0.26, output=0.415),
 Row(var_1=613, var_2=759, var_3=61, var_4=0.302, var_5=0.24, output=0.378),
 Row(var_1=748, var_2=676, var_3=85, var_4=0.318, var_5=0.255, output=0.422),
 Row(var_1=669, var_2=588, var_3=97, var_4=0.315, var_5=0.251, output=0.411),
 Row(var_1=667, var_2=845, var_3=68, var_4=0.324, var_5=0.251, output=0.381),
 Row(var_1=758, var_2=890, var_3=64, var_4=0.33, var_5=0.274, output=0.436),
 Row(var_1=726, var_2=670, var_3=88, var_4=0.335, var_5=0.268, output=0.422)]

In [0]:
#import corr function from pyspark functions
from pyspark.sql.functions import corr

In [23]:
# check for correlation
df.select(corr('var_1','output')).show()

+-------------------+
|corr(var_1, output)|
+-------------------+
| 0.9187399607627283|
+-------------------+



In [24]:
# check for correlation
df.select(corr('var_2','output')).show()

+-------------------+
|corr(var_2, output)|
+-------------------+
|0.43652698913681093|
+-------------------+



In [25]:
# check for correlation
df.select(corr('var_3','output')).show()

+-------------------+
|corr(var_3, output)|
+-------------------+
| 0.4014958408311139|
+-------------------+



In [26]:
# check for correlation
df.select(corr('var_4','output')).show()

+-------------------+
|corr(var_4, output)|
+-------------------+
| 0.7909100204842113|
+-------------------+



In [27]:
# check for correlation
df.select(corr('var_5','output')).show()

+-------------------+
|corr(var_5, output)|
+-------------------+
| 0.7904806260381185|
+-------------------+



In [0]:
#import vectorassembler to create dense vectors
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [29]:
#select the columns to create input vector
df.columns

['var_1', 'var_2', 'var_3', 'var_4', 'var_5', 'output']

In [0]:
#create the vector assembler 
vec_assmebler=VectorAssembler(inputCols=['var_1', 'var_2', 'var_3', 'var_4', 'var_5'],outputCol='features')

In [0]:
#transform the values
features_df=vec_assmebler.transform(df)

In [32]:
#validate the presence of dense vectors 
features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [33]:
#view the details of dense vector
features_df.select('features').show(5,False)

+------------------------------+
|features                      |
+------------------------------+
|[734.0,688.0,81.0,0.328,0.259]|
|[700.0,600.0,94.0,0.32,0.247] |
|[712.0,705.0,93.0,0.311,0.247]|
|[734.0,806.0,69.0,0.315,0.26] |
|[613.0,759.0,61.0,0.302,0.24] |
+------------------------------+
only showing top 5 rows



In [0]:
#create data containing input features and output column
model_df=features_df.select('features','output')

In [36]:
model_df.show(5,False)

+------------------------------+------+
|features                      |output|
+------------------------------+------+
|[734.0,688.0,81.0,0.328,0.259]|0.418 |
|[700.0,600.0,94.0,0.32,0.247] |0.389 |
|[712.0,705.0,93.0,0.311,0.247]|0.417 |
|[734.0,806.0,69.0,0.315,0.26] |0.415 |
|[613.0,759.0,61.0,0.302,0.24] |0.378 |
+------------------------------+------+
only showing top 5 rows



In [37]:
#size of model df
print((model_df.count(), len(model_df.columns)))

(1232, 2)


### Split Data - Train & Test sets

In [0]:
#split the data into 70/30 ratio for train test purpose
train_df,test_df=model_df.randomSplit([0.7,0.3])

In [39]:
print((train_df.count(), len(train_df.columns)))

(874, 2)


In [40]:
print((test_df.count(), len(test_df.columns)))

(358, 2)


In [41]:
train_df.describe().show()

+-------+--------------------+
|summary|              output|
+-------+--------------------+
|  count|                 874|
|   mean| 0.39770823798626953|
| stddev|0.033520344828115486|
|    min|               0.301|
|    max|               0.484|
+-------+--------------------+



In [42]:
test_df.describe().show()

+-------+-------------------+
|summary|             output|
+-------+-------------------+
|  count|                358|
|   mean| 0.3964469273743023|
| stddev|0.03266897429196123|
|    min|              0.315|
|    max|              0.491|
+-------+-------------------+



###Build Linear Regression Model

In [0]:
#Build Linear Regression model 
lin_Reg=LinearRegression(labelCol='output')

In [0]:
#fit the linear regression model on training data set 
lr_model=lin_Reg.fit(train_df)

In [45]:
#intercept
lr_model.intercept

0.18582111500838108

In [46]:
#Coefficients
print(lr_model.coefficients)

[0.00034438984776910124,4.9436512588476106e-05,0.0001596347186933619,-0.6824152637027401,0.5375329988345778]


In [0]:
#make training predictions on train date
training_predictions=lr_model.evaluate(train_df)

In [51]:
training_predictions.predictions.show()

+--------------------+------+-------------------+
|            features|output|         prediction|
+--------------------+------+-------------------+
|[463.0,527.0,67.0...| 0.311| 0.3107737716547627|
|[464.0,640.0,66.0...| 0.301|0.31292700397936235|
|[468.0,746.0,52.0...| 0.329|0.31863278210987755|
|[473.0,499.0,73.0...| 0.315|0.31583850188324475|
|[486.0,610.0,61.0...| 0.332|0.31838608800698354|
|[498.0,615.0,67.0...| 0.318| 0.3218633895897134|
|[501.0,774.0,51.0...| 0.315| 0.3279970367270344|
|[510.0,588.0,72.0...| 0.317| 0.3228326806659396|
|[511.0,576.0,76.0...| 0.329|0.32868169334704245|
|[513.0,698.0,61.0...| 0.339| 0.3302355496825252|
|[514.0,549.0,81.0...| 0.339|0.32886538413650723|
|[516.0,504.0,86.0...| 0.327|0.32800591407184676|
|[519.0,595.0,73.0...| 0.332| 0.3270782988057389|
|[522.0,621.0,72.0...| 0.317|0.32673639628898665|
|[524.0,665.0,65.0...| 0.336| 0.3340871438820543|
|[527.0,569.0,75.0...| 0.341| 0.3332095977492928|
|[528.0,652.0,71.0...| 0.319|0.33013363648651256|


In [48]:
#MSE
training_predictions.meanSquaredError

0.00014546102347069449

In [49]:
#R2
training_predictions.r2

0.8703934731683968

In [50]:
MSA#
training_predictions.meanAbsoluteError

0.009642729129914028

In [0]:
#make predictions on test data 
test_results=lr_model.evaluate(test_df)

In [54]:
test_results.predictions.show()

+--------------------+------+-------------------+
|            features|output|         prediction|
+--------------------+------+-------------------+
|[470.0,509.0,76.0...| 0.319|0.31139434550994954|
|[495.0,628.0,66.0...| 0.315|0.32648281757124853|
|[495.0,752.0,50.0...| 0.327|0.33121784775207097|
|[498.0,672.0,61.0...| 0.325| 0.3316835712733849|
|[531.0,734.0,55.0...|  0.34|0.34149584693047863|
|[543.0,615.0,76.0...| 0.333| 0.3399755974094625|
|[543.0,747.0,60.0...| 0.342|0.34254023498400393|
|[550.0,789.0,54.0...| 0.359| 0.3426574128164298|
|[555.0,741.0,54.0...| 0.348| 0.3489334482922859|
|[556.0,675.0,67.0...| 0.348|0.34734697424493455|
|[558.0,740.0,60.0...|  0.36| 0.3477526778463875|
|[559.0,613.0,75.0...| 0.359| 0.3469848084912696|
|[562.0,587.0,80.0...| 0.344|0.33729457334520224|
|[569.0,711.0,65.0...|  0.34|  0.346563220848934|
|[569.0,776.0,53.0...| 0.348|0.35884051190570043|
|[570.0,786.0,57.0...| 0.366| 0.3585183284321177|
|[572.0,646.0,71.0...| 0.329| 0.3401712678062649|


In [56]:
#view the residual errors based on predictions 
test_results.residuals.show(10)

+--------------------+
|           residuals|
+--------------------+
|0.007605654490050462|
|-0.01148281757124...|
|-0.00421784775207...|
|-0.00668357127338...|
|-0.00149584693047...|
|-0.00697559740946...|
|-5.40234984003906...|
| 0.01634258718357018|
|-9.33448292285921...|
|6.530257550654284E-4|
+--------------------+
only showing top 10 rows



In [57]:
#coefficient of determination value for model
test_results.r2

0.8655587601261803

In [58]:
#RMSE
test_results.rootMeanSquaredError

0.011961739725677989

In [59]:
#MSE
test_results.meanSquaredError

0.00014308321726486293