In [1]:
import pyspark
from pyspark.sql import SparkSession

appName = 'MLApp'
master = 'spark://spark-master:7077'

spark = SparkSession.builder \
                    .master(master) \
                    .appName(appName) \
                    .config("spark.executor.memory", "512m") \
                    .config("spark.executor.core", "1") \
                    .getOrCreate()

spark

In [3]:
df = spark.read.csv('hdfs://hadoop-namenode:8020/data', header=True)
df = df.select('city', 'district', 'area (m^2)', 'bedroom', 'price (tỷ)')
df = df.withColumnRenamed('area (m^2)', 'area') \
    .withColumnRenamed('price (tỷ)', 'price')
df.printSchema()

                                                                                

root
 |-- city: string (nullable = true)
 |-- district: string (nullable = true)
 |-- area: string (nullable = true)
 |-- bedroom: string (nullable = true)
 |-- price: string (nullable = true)



In [4]:
from pyspark.sql.functions import col
df = df.withColumn('area', col('area').cast('double'))
df = df.withColumn('price', col('price').cast('double'))
df = df.withColumn('bedroom', col('bedroom').cast('double'))

In [5]:
df.printSchema()

root
 |-- city: string (nullable = true)
 |-- district: string (nullable = true)
 |-- area: double (nullable = true)
 |-- bedroom: double (nullable = true)
 |-- price: double (nullable = true)



In [6]:
#df.show(n=2, truncate=False, vertical=True)
df.show(50)

+-----------+------------+-----+-------+-------+
|       city|    district| area|bedroom|  price|
+-----------+------------+-----+-------+-------+
|    Đà Nẵng|   Thanh Khê|93.58|    2.0|2.05876|
|     Hà Nội| Nam Từ Liêm|225.0|    4.0|    0.0|
|Hồ Chí Minh|      Quận 1|118.5|    3.0|   17.1|
|   Hà Nội|      Tây Hồ| 70.0|    2.0|    3.2|
|     Hà Nội| Nam Từ Liêm|145.0|    4.0|    0.0|
|     Hà Nội|   Long Biên| 80.0|    3.0|    0.0|
|Hồ Chí Minh|  Bình Thạnh| 83.0|    2.0|    4.1|
| Bình Dương|    Thuận An| 72.0|    2.0|   1.89|
|Hồ Chí Minh|  Bình Chánh| 65.0|    2.0|   1.97|
|Hồ Chí Minh|      Quận 8| 78.0|    2.0|   2.45|
|     Hà Nội| Nam Từ Liêm|135.0|    4.0|   5.67|
|Hồ Chí Minh|      Quận 7|125.0|    3.0|    4.7|
|Hồ Chí Minh|  Bình Thạnh|120.0|    3.0|    6.9|
|Hồ Chí Minh|      Quận 7| 70.0|    2.0|   3.65|
|     Hà Nội| Nam Từ Liêm| 88.0|    2.0|    4.3|
|     Hà Nội|   Long Biên| 95.0|    3.0|    0.0|
|     Hà Nội| Nam Từ Liêm|105.0|    2.0|   4.41|
|     Hà Nội| Nam Từ

In [7]:
df=df.filter("area>0").filter("price>0").filter("bedroom>0").filter("area<500").filter("price<20")
df.show(50)

+-----------+-----------+-----+-------+-------+
|       city|   district| area|bedroom|  price|
+-----------+-----------+-----+-------+-------+
|    Đà Nẵng|  Thanh Khê|93.58|    2.0|2.05876|
|Hồ Chí Minh|     Quận 1|118.5|    3.0|   17.1|
|   Hà Nội|     Tây Hồ| 70.0|    2.0|    3.2|
|Hồ Chí Minh| Bình Thạnh| 83.0|    2.0|    4.1|
| Bình Dương|   Thuận An| 72.0|    2.0|   1.89|
|Hồ Chí Minh| Bình Chánh| 65.0|    2.0|   1.97|
|Hồ Chí Minh|     Quận 8| 78.0|    2.0|   2.45|
|     Hà Nội|Nam Từ Liêm|135.0|    4.0|   5.67|
|Hồ Chí Minh|     Quận 7|125.0|    3.0|    4.7|
|Hồ Chí Minh| Bình Thạnh|120.0|    3.0|    6.9|
|Hồ Chí Minh|     Quận 7| 70.0|    2.0|   3.65|
|     Hà Nội|Nam Từ Liêm| 88.0|    2.0|    4.3|
|     Hà Nội|Nam Từ Liêm|105.0|    2.0|   4.41|
|     Hà Nội|Nam Từ Liêm|106.0|    3.0|    4.6|
|Hồ Chí Minh|     Quận 7| 65.0|    2.0|   2.99|
|     Hà Nội|     Tây Hồ| 59.3|    2.0|    2.7|
|     Hà Nội|  Long Biên| 74.0|    2.0|   2.22|
| Bình Dương|      Dĩ An| 70.0|    2.0| 

In [None]:
from pyspark.ml import Pipeline 
from pyspark.ml.regression import LinearRegression 
from pyspark.ml.feature import VectorIndexer 
from pyspark.ml.evaluation import RegressionEvaluator 
def transData(dataset, categoricalCols, continuousCols, labelCol): 
    from pyspark.ml import Pipeline 
    from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler 
    from pyspark.sql.functions import col 
    indexers = [ StringIndexer(inputCol=c, outputCol="{0}_tndexed".format(c)) 
                for c in categoricalCols ] 
    encoders = [ OneHotEncoder (inputCol=indexer.getOutputCol(), 
                                outputCol="{0}_encoded".format(indexer.getOutputCol())) 
                for indexer in indexers ] 
    assenbler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                                + continuousCols, outputCol="features") 
    pipeline = Pipeline(stages=indexers + encoders + [assenbler]) 
    model=pipeline.fit(dataset) 
    data = model.transform(dataset) 
    data = data.withColumn('price',col(labelCol)) 
    return data.select('features','price','city', 'district', 'area', 'bedroom') 
categoricalCols= ["city", "district"] 
continuousCols = [ "area", "bedroom"] 
labelCol = "price"

data= transData(df, categoricalCols, continuousCols, labelCol)
data.show(50)

[Stage 3:>                                                          (0 + 1) / 1]

In [9]:
(trainingData, testData) = data.randomSplit([0.8, 0.2], seed=100)

In [10]:
print(trainingData.count())
trainingData.show(5, trainingData.count())

                                                                                

6274


[Stage 12:>                                                         (0 + 1) / 1]

+---------------------------------------+-----+-----------+--------+----+-------+
|                               features|price|       city|district|area|bedroom|
+---------------------------------------+-----+-----------+--------+----+-------+
| (121,[0,25,119,120],[1.0,1.0,3.0,2.0])| 2.77|Hồ Chí Minh|  Quận 2| 3.0|    2.0|
|(121,[0,25,119,120],[1.0,1.0,30.6,1.0])| 2.26|Hồ Chí Minh|  Quận 2|30.6|    1.0|
|(121,[0,25,119,120],[1.0,1.0,33.0,1.0])|  1.6|Hồ Chí Minh|  Quận 2|33.0|    1.0|
|(121,[0,25,119,120],[1.0,1.0,34.0,1.0])| 1.92|Hồ Chí Minh|  Quận 2|34.0|    1.0|
|(121,[0,25,119,120],[1.0,1.0,34.0,1.0])| 1.92|Hồ Chí Minh|  Quận 2|34.0|    1.0|
+---------------------------------------+-----+-----------+--------+----+-------+
only showing top 5 rows



                                                                                

In [11]:
print(testData.count())
testData.show(5, testData.count())

                                                                                

1558


                                                                                

+---------------------------------------+-----+-----------+--------+----+-------+
|                               features|price|       city|district|area|bedroom|
+---------------------------------------+-----+-----------+--------+----+-------+
|(121,[0,25,119,120],[1.0,1.0,34.0,1.0])|  1.5|Hồ Chí Minh|  Quận 2|34.0|    1.0|
|(121,[0,25,119,120],[1.0,1.0,40.0,1.0])|  2.1|Hồ Chí Minh|  Quận 2|40.0|    1.0|
|(121,[0,25,119,120],[1.0,1.0,44.0,1.0])|  1.8|Hồ Chí Minh|  Quận 2|44.0|    1.0|
|(121,[0,25,119,120],[1.0,1.0,45.0,1.0])| 3.05|Hồ Chí Minh|  Quận 2|45.0|    1.0|
|(121,[0,25,119,120],[1.0,1.0,48.5,1.0])|  2.4|Hồ Chí Minh|  Quận 2|48.5|    1.0|
+---------------------------------------+-----+-----------+--------+----+-------+
only showing top 5 rows



                                                                                

In [12]:
from pyspark.ml.regression import LinearRegression 
from pyspark.ml.regression import DecisionTreeRegressor 
from pyspark.ml.regression import RandomForestRegressor 
from pyspark.ml.regression import GBTRegressor 
#Linear Regression 
lr = LinearRegression(labelCol="price", featuresCol="features") 
lrModel = lr.fit(trainingData) 
#Decision Tree Regression 
dt = DecisionTreeRegressor (labelCol="price", featuresCol="features") 
dtModel = dt.fit(trainingData) 
#Random Forest Regression 
rf = RandomForestRegressor (labelCol="price", featuresCol="features") 
rfModel = rf.fit(trainingData) 
#Gradient-Boosted Tree Regression 
gbt = GBTRegressor (labelCol="price", featuresCol="features") 
gbtModel = gbt.fit(trainingData)

22/01/03 04:08:35 WARN Instrumentation: [c3ea3584] regParam is zero, which might cause numerical instability and overfitting.
22/01/03 04:08:36 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/01/03 04:08:36 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/01/03 04:08:36 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
22/01/03 04:08:36 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
22/01/03 04:08:36 WARN Instrumentation: [c3ea3584] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                

In [13]:
lr_predictions = lrModel.transform(testData) 
dt_predictions = dtModel.transform(testData) 
rf_predictions = rfModel.transform(testData) 
gbt_predictions = gbtModel.transform(testData)

In [14]:
lr_predictions.select('features','city', 'district', 'area', 
                      'bedroom','price', 'prediction').show(10, testData.count())

+---------------------------------------+-----------+--------+----+-------+-----+------------------+
|                               features|       city|district|area|bedroom|price|        prediction|
+---------------------------------------+-----------+--------+----+-------+-----+------------------+
|(121,[0,25,119,120],[1.0,1.0,34.0,1.0])|Hồ Chí Minh|  Quận 2|34.0|    1.0|  1.5|3.3129353835951854|
|(121,[0,25,119,120],[1.0,1.0,40.0,1.0])|Hồ Chí Minh|  Quận 2|40.0|    1.0|  2.1| 3.607181894057821|
|(121,[0,25,119,120],[1.0,1.0,44.0,1.0])|Hồ Chí Minh|  Quận 2|44.0|    1.0|  1.8| 3.803346234366245|
|(121,[0,25,119,120],[1.0,1.0,45.0,1.0])|Hồ Chí Minh|  Quận 2|45.0|    1.0| 3.05|  3.85238731944335|
|(121,[0,25,119,120],[1.0,1.0,48.5,1.0])|Hồ Chí Minh|  Quận 2|48.5|    1.0|  2.4|  4.02403111721322|
|(121,[0,25,119,120],[1.0,1.0,49.0,1.0])|Hồ Chí Minh|  Quận 2|49.0|    1.0|2.687| 4.048551659751774|
|(121,[0,25,119,120],[1.0,1.0,50.0,1.0])|Hồ Chí Minh|  Quận 2|50.0|    1.0|  7.3| 4.0975927

In [15]:
dt_predictions.select('features','city', 'district', 'area', 
                      'bedroom','price', 'prediction').show(10, testData.count())

+---------------------------------------+-----------+--------+----+-------+-----+------------------+
|                               features|       city|district|area|bedroom|price|        prediction|
+---------------------------------------+-----------+--------+----+-------+-----+------------------+
|(121,[0,25,119,120],[1.0,1.0,34.0,1.0])|Hồ Chí Minh|  Quận 2|34.0|    1.0|  1.5|2.4696012590299263|
|(121,[0,25,119,120],[1.0,1.0,40.0,1.0])|Hồ Chí Minh|  Quận 2|40.0|    1.0|  2.1|2.4696012590299263|
|(121,[0,25,119,120],[1.0,1.0,44.0,1.0])|Hồ Chí Minh|  Quận 2|44.0|    1.0|  1.8|2.4696012590299263|
|(121,[0,25,119,120],[1.0,1.0,45.0,1.0])|Hồ Chí Minh|  Quận 2|45.0|    1.0| 3.05|2.4696012590299263|
|(121,[0,25,119,120],[1.0,1.0,48.5,1.0])|Hồ Chí Minh|  Quận 2|48.5|    1.0|  2.4|2.4696012590299263|
|(121,[0,25,119,120],[1.0,1.0,49.0,1.0])|Hồ Chí Minh|  Quận 2|49.0|    1.0|2.687|2.4696012590299263|
|(121,[0,25,119,120],[1.0,1.0,50.0,1.0])|Hồ Chí Minh|  Quận 2|50.0|    1.0|  7.3|2.46960125

In [16]:
rf_predictions.select('features','city', 'district', 'area', 
                      'bedroom','price', 'prediction').show(10, testData.count())

+---------------------------------------+-----------+--------+----+-------+-----+------------------+
|                               features|       city|district|area|bedroom|price|        prediction|
+---------------------------------------+-----------+--------+----+-------+-----+------------------+
|(121,[0,25,119,120],[1.0,1.0,34.0,1.0])|Hồ Chí Minh|  Quận 2|34.0|    1.0|  1.5|2.7504638325277915|
|(121,[0,25,119,120],[1.0,1.0,40.0,1.0])|Hồ Chí Minh|  Quận 2|40.0|    1.0|  2.1|2.7604027214166806|
|(121,[0,25,119,120],[1.0,1.0,44.0,1.0])|Hồ Chí Minh|  Quận 2|44.0|    1.0|  1.8|2.7604027214166806|
|(121,[0,25,119,120],[1.0,1.0,45.0,1.0])|Hồ Chí Minh|  Quận 2|45.0|    1.0| 3.05|2.7604027214166806|
|(121,[0,25,119,120],[1.0,1.0,48.5,1.0])|Hồ Chí Minh|  Quận 2|48.5|    1.0|  2.4|2.9493308276825316|
|(121,[0,25,119,120],[1.0,1.0,49.0,1.0])|Hồ Chí Minh|  Quận 2|49.0|    1.0|2.687|2.9493308276825316|
|(121,[0,25,119,120],[1.0,1.0,50.0,1.0])|Hồ Chí Minh|  Quận 2|50.0|    1.0|  7.3|2.94933082

In [17]:
gbt_predictions.select('features','city', 'district', 'area', 
                      'bedroom','price', 'prediction').show(50, testData.count())

+---------------------------------------+-----------+--------+----+-------+-----+------------------+
|                               features|       city|district|area|bedroom|price|        prediction|
+---------------------------------------+-----------+--------+----+-------+-----+------------------+
|(121,[0,25,119,120],[1.0,1.0,34.0,1.0])|Hồ Chí Minh|  Quận 2|34.0|    1.0|  1.5|2.3988657017210455|
|(121,[0,25,119,120],[1.0,1.0,40.0,1.0])|Hồ Chí Minh|  Quận 2|40.0|    1.0|  2.1|2.3988657017210455|
|(121,[0,25,119,120],[1.0,1.0,44.0,1.0])|Hồ Chí Minh|  Quận 2|44.0|    1.0|  1.8|2.3988657017210455|
|(121,[0,25,119,120],[1.0,1.0,45.0,1.0])|Hồ Chí Minh|  Quận 2|45.0|    1.0| 3.05|2.3988657017210455|
|(121,[0,25,119,120],[1.0,1.0,48.5,1.0])|Hồ Chí Minh|  Quận 2|48.5|    1.0|  2.4|2.3988657017210455|
|(121,[0,25,119,120],[1.0,1.0,49.0,1.0])|Hồ Chí Minh|  Quận 2|49.0|    1.0|2.687|2.3988657017210455|
|(121,[0,25,119,120],[1.0,1.0,50.0,1.0])|Hồ Chí Minh|  Quận 2|50.0|    1.0|  7.3|2.39886570

In [18]:
from pyspark.ml.evaluation import RegressionEvaluator 

eval1 = RegressionEvaluator( 
    labelCol="price", predictionCol="prediction", metricName="rmse") 
lr_rmse = float(eval1.evaluate(lr_predictions)) 
dt_rmse = float(eval1.evaluate(dt_predictions)) 
rf_rmse = float(eval1.evaluate(rf_predictions)) 
gbt_rmse = float(eval1.evaluate(gbt_predictions)) 

eval2 = RegressionEvaluator( 
    labelCol="price", predictionCol="prediction", metricName="mae") 
lr_mae = float(eval2.evaluate(lr_predictions)) 
dt_mae = float(eval2.evaluate(dt_predictions)) 
rf_mae = float(eval2.evaluate(rf_predictions)) 
gbt_mae = float(eval2.evaluate(gbt_predictions)) 

num = float(testData.count()) 

lr_rate = lr_predictions.filter("prediction/price < 1.3 and prediction/price > 0.7").count()/num*100 
dt_rate = dt_predictions.filter("prediction/price < 1.3 and prediction/price > 0.7").count()/num*100
rf_rate = rf_predictions.filter("prediction/price < 1.3 and prediction/price > 0.7").count()/num*100  
gbt_rate = gbt_predictions.filter("prediction/price < 1.3 and prediction/price > 0.7").count()/num*100 



                                                                                

In [20]:
evaluation = spark.createDataFrame( 
    [
        ("Linear Regression", lr_rmse, lr_mae, lr_rate), 
        ("Decision Tree Regression", dt_rmse, dt_mae, dt_rate), 
        ("Random Forest Regression", rf_rmse, rf_mae, rf_rate), 
        ("Gradient-Boosted Tree Regression", gbt_rmse, gbt_mae, gbt_rate), 
    ],
    ["Algorithms", 'Root mean squared error', "Mean absolute error", "Accuracy Rate (%)"] 
)   


In [22]:
evaluation.show(1)

22/01/03 04:12:47 WARN TaskSetManager: Lost task 0.0 in stage 284.0 (TID 290) (172.18.0.6 executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 586, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 69, in read_command
    command = serializer._read_with_length(file)
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
AttributeError: Can't get attribute '_fill_function' on <module 'pyspark.cloudpickle' from '/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/cloudpick

Py4JJavaError: An error occurred while calling o992.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 284.0 failed 4 times, most recent failure: Lost task 0.3 in stage 284.0 (TID 293) (172.18.0.7 executor 1): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 586, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 69, in read_command
    command = serializer._read_with_length(file)
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
AttributeError: Can't get attribute '_fill_function' on <module 'pyspark.cloudpickle' from '/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/cloudpickle/__init__.py'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:652)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:635)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2258)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2207)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2206)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2206)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1079)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1079)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2445)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2387)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2376)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:868)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2196)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2217)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2236)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:472)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:425)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:47)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3696)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3687)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:103)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3685)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2722)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:2929)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:301)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:338)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 586, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 69, in read_command
    command = serializer._read_with_length(file)
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 160, in _read_with_length
    return self.loads(obj)
  File "/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 430, in loads
    return pickle.loads(obj, encoding=encoding)
AttributeError: Can't get attribute '_fill_function' on <module 'pyspark.cloudpickle' from '/usr/bin/spark-3.1.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/cloudpickle/__init__.py'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:517)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:652)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:635)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:470)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:489)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:458)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:755)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:345)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:898)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:898)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:497)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1439)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:500)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more
