In [0]:
## Build a machine learning pipeline with PySpark

from pyspark.ml.feature import StringIndexer,VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.regression import LinearRegression


# Where the input data lives and which Spark data source should parse it.
file_location = "/FileStore/tables/tips-3.csv"
file_type = "csv"

# Load the file through the generic reader: the first row supplies the column
# names and Spark infers each column's type from the data.
csv_reader = spark.read.format(file_type).options(header=True, inferSchema=True)
df = csv_reader.load(file_location)


In [0]:
# Hold out roughly 25% of the rows for prediction and train on the remaining ~75%.
# The seed is fixed so that re-running the notebook yields the same split (and
# therefore the same fitted model and predictions) instead of a new random one.
SPLIT_SEED = 42
train_data, test_data = df.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

In [0]:
# Stage 1: convert the categorical string columns into numeric index columns.
# handleInvalid='keep' sends null values and labels that were not present when
# the indexer was fitted to one extra index, so transforming the held-out split
# does not abort (the default, 'error', raises on such rows). Indices assigned
# to labels seen during fitting are unaffected by this setting.
stage_1 = StringIndexer(
    inputCols=['sex', 'smoker', 'day', 'time'],
    outputCols=['sex_indexed', 'smoker_indexed', 'day_indexed', 'time_indexed'],
    handleInvalid='keep',
)

# Stage 2: assemble every feature used by the regression into one vector column.
stage_2 = VectorAssembler(
    inputCols=['tip', 'sex_indexed', 'smoker_indexed', 'day_indexed', 'time_indexed', 'size'],
    outputCol='features',
)

# Stage 3: linear regression that predicts 'total_bill' from the 'features' vector.
stage_3 = LinearRegression(featuresCol='features', labelCol='total_bill')

# Chain the three stages so they are fitted and applied in order.
regression_pipeline = Pipeline(stages=[stage_1, stage_2, stage_3])

# Fit the whole pipeline on the training data.
model = regression_pipeline.fit(train_data)

# Apply the fitted pipeline back to the training data.
sample_data_train = model.transform(train_data)

# Set up the test data without the label column. Note that after this line
# 'total_bill' is no longer available on test_data (or on `prediction`).
test_data = test_data.drop('total_bill')

# Predict with the fitted pipeline on the test data.
prediction = model.transform(test_data)




In [0]:

# Print the first 20 prediction rows, truncating long cell values
# (these are the defaults of DataFrame.show, spelled out explicitly).
prediction.show(n=20, truncate=True)


+----+------+------+----+------+----+-----------+--------------+-----------+------------+--------------------+------------------+
| tip|   sex|smoker| day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|            features|        prediction|
+----+------+------+----+------+----+-----------+--------------+-----------+------------+--------------------+------------------+
|1.25|Female|    No|Thur| Lunch|   2|        1.0|           0.0|        2.0|         1.0|[1.25,1.0,0.0,2.0...|10.672790624833503|
|1.25|  Male|    No| Sat|Dinner|   2|        0.0|           0.0|        0.0|         0.0|(6,[0,5],[1.25,2.0])| 12.40408897111673|
| 2.0|Female|   Yes| Fri| Lunch|   2|        1.0|           1.0|        3.0|         1.0|[2.0,1.0,1.0,3.0,...|14.797450296887355|
|1.71|  Male|    No| Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|[1.71,0.0,0.0,1.0...|13.779345864579536|
| 2.0|Female|    No|Thur| Lunch|   2|        1.0|           0.0|        2.0|         1.0|[