In [0]:
# Linear regression is used to predict the value of one variable based on the value of another variable.
# The variable you want to predict is called the dependent variable, and the variable you use to predict it is called the independent variable.

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F

In [0]:
# Get the active SparkSession, or create one if none exists yet.
# NOTE(review): the app name "Liner_regression" looks like a typo of "Linear_regression".
spark = SparkSession.builder.appName("Liner_regression").getOrCreate()

In [0]:
# Load the insurance CSV: the first line is treated as the header and column
# types are inferred from the data.
# NOTE(review): the path points at one user's DBFS upload folder; consider
# making the location configurable before sharing this notebook.
df = spark.read.csv(
    'dbfs:/FileStore/shared_uploads/dwija.pce21@sot.pdpu.ac.in/insurance.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Preview the first rows of the loaded data.
df.show()

+---+------+------+--------+------+---------+-----------+
|age|gender|   bmi|children|smoker|   region|    charges|
+---+------+------+--------+------+---------+-----------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|
| 33|  male|22.705|       0|    no|northwest|21984.47061|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552|
| 31|female| 25.74|       0|    no|southeast|  3756.6216|
| 46|female| 33.44|       1|    no|southeast|  8240.5896|
| 37|female| 27.74|       3|    no|northwest|  7281.5056|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107|
| 60|female| 25.84|       0|    no|northwest|28923.13692|
| 25|  male| 26.22|       0|    no|northeast|  2721.3208|
| 62|female| 26.29|       0|   yes|southeast| 27808.7251|
| 23|  male|  34.4|       0|    no|southwest|   1826.843|
| 56|female| 39.82|       0|    no|southeast| 11090.7178|
| 27|  male| 4

In [0]:
# Number of rows in the dataset.
df.count()

Out[8]: 1338

In [0]:
# Number of columns in the dataset.
len(df.columns)

Out[9]: 7

In [0]:
# Column names with the types produced by inferSchema=True at load time.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- children: integer (nullable = true)
 |-- smoker: string (nullable = true)
 |-- region: string (nullable = true)
 |-- charges: double (nullable = true)



In [0]:
# Summary statistics (count, mean, stddev, min, max) for every column.
df.describe().show()

+-------+------------------+------+------------------+-----------------+------+---------+------------------+
|summary|               age|gender|               bmi|         children|smoker|   region|           charges|
+-------+------------------+------+------------------+-----------------+------+---------+------------------+
|  count|              1338|  1338|              1338|             1338|  1338|     1338|              1338|
|   mean| 39.20702541106129|  null|30.663396860986538|  1.0949177877429|  null|     null|13270.422265141257|
| stddev|14.049960379216147|  null| 6.098186911679012|1.205492739781914|  null|     null|12110.011236693992|
|    min|                18|female|             15.96|                0|    no|northeast|         1121.8739|
|    max|                64|  male|             53.13|                5|   yes|southwest|       63770.42801|
+-------+------------------+------+------------------+-----------------+------+---------+------------------+



In [0]:
# First five rows, returned as a list of Row objects.
df.head(5)

Out[13]: [Row(age=19, gender='female', bmi=27.9, children=0, smoker='yes', region='southwest', charges=16884.924),
 Row(age=18, gender='male', bmi=33.77, children=1, smoker='no', region='southeast', charges=1725.5523),
 Row(age=28, gender='male', bmi=33.0, children=3, smoker='no', region='southeast', charges=4449.462),
 Row(age=33, gender='male', bmi=22.705, children=0, smoker='no', region='northwest', charges=21984.47061),
 Row(age=32, gender='male', bmi=28.88, children=0, smoker='no', region='northwest', charges=3866.8552)]

In [0]:
# Last five rows, returned as a list of Row objects.
df.tail(5)

Out[14]: [Row(age=50, gender='male', bmi=30.97, children=3, smoker='no', region='northwest', charges=10600.5483),
 Row(age=18, gender='female', bmi=31.92, children=0, smoker='no', region='northeast', charges=2205.9808),
 Row(age=18, gender='female', bmi=36.85, children=0, smoker='no', region='southeast', charges=1629.8335),
 Row(age=21, gender='female', bmi=25.8, children=0, smoker='no', region='southwest', charges=2007.945),
 Row(age=61, gender='female', bmi=29.07, children=0, smoker='yes', region='northwest', charges=29141.3603)]

In [0]:
# Correlation coefficient between age and the label column (charges).
df.corr('age','charges')

Out[15]: 0.299008193330648

In [0]:
# Correlation coefficient between bmi and the label column (charges).
df.corr('bmi','charges')

Out[16]: 0.19834096883362903

In [0]:
from pyspark.ml.feature import VectorAssembler

In [0]:
# List the column names available as model inputs.
df.columns

Out[18]: ['age', 'gender', 'bmi', 'children', 'smoker', 'region', 'charges']

In [0]:
# Pack the numeric predictor columns into a single vector column named "features".
# The string columns (gender, smoker, region) are not part of the feature vector.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=['age', 'bmi', 'children'],
)

In [0]:
# Display the assembler's repr (its generated uid).
assembler

Out[32]: VectorAssembler_c95f86682e64

In [0]:
# Add the assembled `features` vector column to df; the original columns are kept.
output = assembler.transform(df)

In [0]:
# Inspect the DataFrame with the new `features` column.
output.show()

+---+------+------+--------+------+---------+-----------+-----------------+
|age|gender|   bmi|children|smoker|   region|    charges|         features|
+---+------+------+--------+------+---------+-----------+-----------------+
| 19|female|  27.9|       0|   yes|southwest|  16884.924|  [19.0,27.9,0.0]|
| 18|  male| 33.77|       1|    no|southeast|  1725.5523| [18.0,33.77,1.0]|
| 28|  male|  33.0|       3|    no|southeast|   4449.462|  [28.0,33.0,3.0]|
| 33|  male|22.705|       0|    no|northwest|21984.47061|[33.0,22.705,0.0]|
| 32|  male| 28.88|       0|    no|northwest|  3866.8552| [32.0,28.88,0.0]|
| 31|female| 25.74|       0|    no|southeast|  3756.6216| [31.0,25.74,0.0]|
| 46|female| 33.44|       1|    no|southeast|  8240.5896| [46.0,33.44,1.0]|
| 37|female| 27.74|       3|    no|northwest|  7281.5056| [37.0,27.74,3.0]|
| 37|  male| 29.83|       2|    no|northeast|  6406.4107| [37.0,29.83,2.0]|
| 60|female| 25.84|       0|    no|northwest|28923.13692| [60.0,25.84,0.0]|
| 25|  male|

In [0]:
# Show only the feature vector and the label, without truncating cell contents.
output.select('features','charges').show(truncate=False)

+-----------------+-----------+
|features         |charges    |
+-----------------+-----------+
|[19.0,27.9,0.0]  |16884.924  |
|[18.0,33.77,1.0] |1725.5523  |
|[28.0,33.0,3.0]  |4449.462   |
|[33.0,22.705,0.0]|21984.47061|
|[32.0,28.88,0.0] |3866.8552  |
|[31.0,25.74,0.0] |3756.6216  |
|[46.0,33.44,1.0] |8240.5896  |
|[37.0,27.74,3.0] |7281.5056  |
|[37.0,29.83,2.0] |6406.4107  |
|[60.0,25.84,0.0] |28923.13692|
|[25.0,26.22,0.0] |2721.3208  |
|[62.0,26.29,0.0] |27808.7251 |
|[23.0,34.4,0.0]  |1826.843   |
|[56.0,39.82,0.0] |11090.7178 |
|[27.0,42.13,0.0] |39611.7577 |
|[19.0,24.6,1.0]  |1837.237   |
|[52.0,30.78,1.0] |10797.3362 |
|[23.0,23.845,0.0]|2395.17155 |
|[56.0,40.3,0.0]  |10602.385  |
|[30.0,35.3,0.0]  |36837.467  |
+-----------------+-----------+
only showing top 20 rows



In [0]:
# Keep only what the regression needs: the feature vector and the label (charges).
final = output.select('features','charges')

In [0]:
# Preview the modelling dataset.
final.show()

+-----------------+-----------+
|         features|    charges|
+-----------------+-----------+
|  [19.0,27.9,0.0]|  16884.924|
| [18.0,33.77,1.0]|  1725.5523|
|  [28.0,33.0,3.0]|   4449.462|
|[33.0,22.705,0.0]|21984.47061|
| [32.0,28.88,0.0]|  3866.8552|
| [31.0,25.74,0.0]|  3756.6216|
| [46.0,33.44,1.0]|  8240.5896|
| [37.0,27.74,3.0]|  7281.5056|
| [37.0,29.83,2.0]|  6406.4107|
| [60.0,25.84,0.0]|28923.13692|
| [25.0,26.22,0.0]|  2721.3208|
| [62.0,26.29,0.0]| 27808.7251|
|  [23.0,34.4,0.0]|   1826.843|
| [56.0,39.82,0.0]| 11090.7178|
| [27.0,42.13,0.0]| 39611.7577|
|  [19.0,24.6,1.0]|   1837.237|
| [52.0,30.78,1.0]| 10797.3362|
|[23.0,23.845,0.0]| 2395.17155|
|  [56.0,40.3,0.0]|  10602.385|
|  [30.0,35.3,0.0]|  36837.467|
+-----------------+-----------+
only showing top 20 rows



In [0]:
# Hold out roughly 30% of the rows for testing. randomSplit assigns rows at
# random, so a fixed seed keeps the split (and every metric computed from it)
# the same across re-runs on the same input data.
train,test = final.randomSplit([0.7,0.3], seed=42)

In [0]:
# Number of rows in the training split.
train.count()

Out[40]: 925

In [0]:
# Number of rows in the test split.
test.count()

Out[41]: 413

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Linear regression estimator: learns `charges` from the assembled `features` vector.
lr = LinearRegression(
    labelCol="charges",
    featuresCol="features",
)

In [0]:
# Fit the regression on the training split only; the test split stays unseen.
trained_model = lr.fit(train)

In [0]:
# Display the fitted model's repr (uid and number of features).
trained_model

Out[45]: LinearRegressionModel: uid=LinearRegression_076a75a94c75, numFeatures=3

In [0]:
# Intercept term of the fitted regression.
trained_model.intercept

Out[47]: -7890.359979949355

In [0]:
# Fitted coefficients, one per entry of the `features` vector (age, bmi, children).
trained_model.coefficients

Out[48]: DenseVector([230.4378, 392.2971, 200.3792])

In [0]:
# Evaluate the model on the training split. Despite the name, `predictions` is
# used below as a metrics summary (r2, meanSquaredError, meanAbsoluteError),
# not as row-level predictions.
predictions = trained_model.evaluate(train)

In [0]:
# R² (coefficient of determination) on the training split.
print(predictions.r2)

0.11706042381867132


In [0]:
# Mean squared error on the training split.
predictions.meanSquaredError

Out[51]: 133152751.00669008

In [0]:
# Mean absolute error on the training split.
predictions.meanAbsoluteError

Out[52]: 9134.641708005716

In [0]:
# Evaluate the same fitted model on the held-out test split; like `predictions`,
# this is used below as a metrics summary.
main_predictions = trained_model.evaluate(test)

In [0]:
# R² (coefficient of determination) on the test split.
main_predictions.r2

Out[54]: 0.11807445484369261

In [0]:
# Mean absolute error on the test split.
main_predictions.meanAbsoluteError

Out[55]: 8943.692471593364

In [0]:
# Mean squared error on the test split.
main_predictions.meanSquaredError

Out[56]: 120578344.13161448