In [0]:
# File location and type
file_location = "/FileStore/tables/rest-3.csv"
file_type = "csv"

df=spark.read.csv(file_location,header=True, inferSchema=True,sep=';')
df.printSchema()

root
 |-- bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- gender: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- hr: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- size: integer (nullable = true)



In [0]:
from pyspark.ml.feature import StringIndexer

indexer=StringIndexer(inputCols=['gender','smoker', 'hr','type'],outputCols=['gender_idx','smoker_idx','hr_idx','type_idx'])
indexed=indexer.fit(df).transform(df)
indexed.show()

+-----+----+------+------+---+---------+----+----------+----------+------+--------+
| bill| tip|gender|smoker| hr|     type|size|gender_idx|smoker_idx|hr_idx|type_idx|
+-----+----+------+------+---+---------+----+----------+----------+------+--------+
| 5.31|0.64|     f|     y| 12|    lunch|   2|       0.0|       1.0|   0.0|     2.0|
| 59.7|7.16|     m|     n|  8|breakfast|   3|       1.0|       0.0|   3.0|     1.0|
| 8.44|1.01|     f|     y|  9|breakfast|   1|       0.0|       1.0|  10.0|     1.0|
|14.06|1.69|     f|     y| 14|    lunch|   2|       0.0|       1.0|   4.0|     2.0|
|31.84|3.82|     f|     n| 20|   dinner|   3|       0.0|       0.0|   1.0|     0.0|
|60.04| 7.2|     m|     n| 22|   dinner|   4|       1.0|       0.0|   9.0|     0.0|
|78.97|9.48|     m|     n| 17|   dinner|   2|       1.0|       0.0|   6.0|     0.0|
|15.11|1.81|     f|     n| 15|   dinner|   1|       0.0|       0.0|   5.0|     0.0|
|23.26|2.79|     m|     n| 12|    lunch|   3|       1.0|       0.0|   0.0|  

In [0]:
from pyspark.ml.feature import VectorAssembler
assembler=VectorAssembler(inputCols=['tip','size','gender_idx','smoker_idx','hr_idx','type_idx'], outputCol='indep')
output=assembler.transform(indexed)
output.show()

In [0]:
finalized=output.select('indep','bill')
finalized.show()

+--------------------+-----+
|               indep| bill|
+--------------------+-----+
|[0.64,2.0,0.0,1.0...| 5.31|
|[7.16,3.0,1.0,0.0...| 59.7|
|[1.01,1.0,0.0,1.0...| 8.44|
|[1.69,2.0,0.0,1.0...|14.06|
|[3.82,3.0,0.0,0.0...|31.84|
|[7.2,4.0,1.0,0.0,...|60.04|
|[9.48,2.0,1.0,0.0...|78.97|
|[1.81,1.0,0.0,0.0...|15.11|
|[2.79,3.0,1.0,0.0...|23.26|
|[8.38,2.0,0.0,1.0...|69.85|
|[7.02,2.0,1.0,1.0...|58.52|
|[7.74,1.0,0.0,0.0...|64.46|
|[4.07,3.0,1.0,1.0...|33.91|
|[2.93,5.0,1.0,0.0...|24.44|
|[8.25,6.0,0.0,0.0...|68.77|
+--------------------+-----+



In [0]:
from pyspark.ml.regression import LinearRegression
## train & test
train,test = finalized.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol='indep',labelCol='bill')
regressor=regressor.fit(train)

In [0]:
regressor.coefficients

Out[25]: DenseVector([8.3444, 0.0454, 0.0184, 0.0357, 0.0021, 0.0498])

In [0]:
regressor.intercept

Out[26]: -0.25666612668166205

In [0]:
### Predictions
pred_results = regressor.evaluate(test)

In [0]:
pred_results.predictions.show()

+--------------------+-----+------------------+
|               indep| bill|        prediction|
+--------------------+-----+------------------+
|[1.01,1.0,0.0,1.0...| 8.44| 8.322996922080211|
|[1.69,2.0,0.0,1.0...|14.06| 14.07995626113749|
|[1.81,1.0,0.0,0.0...|15.11|14.902538473999225|
|[2.79,3.0,1.0,0.0...|23.26|23.278554997570218|
|[3.82,3.0,0.0,0.0...|31.84|31.757293050380365|
|[7.2,4.0,1.0,0.0,...|60.04|  60.0418483719427|
|[7.74,1.0,0.0,0.0...|64.46|   64.430528106269|
|[8.25,6.0,0.0,0.0...|68.77| 68.87383768022099|
+--------------------+-----+------------------+



In [0]:
### performance metric
pred_results.r2,pred_results.meanAbsoluteError,pred_results.meanSquaredError

Out[30]: (0.9999824075691232, 0.07260509476782406, 0.00949590720683546)