# Chapter 4

## Referrence
- pandas udf: https://docs.microsoft.com/ko-kr/azure/databricks/spark/latest/spark-sql/udf-python-pandas

## Spark Session

In [1]:
import gc
import logging
import subprocess
from datetime import datetime
from pathlib import Path

import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Row
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
# from pytz import timezone
# from pytz import utc

In [2]:
EXECUTOR_MEMORY = "2g"
EXECUTOR_CORES = 2
EXECUTORE_INSTANCES = 3
DRIVER_MEMORY = "1g"
DRIVER_MAX_RESULT_SIZE = "1g"

In [3]:
spark = (
    SparkSession.builder.appName(f"Advanced analytics with SPARK - Chapter 4")
    .master("yarn")
    .config("spark.executor.memory", EXECUTOR_MEMORY)
    .config("spark.executor.cores", EXECUTOR_CORES)
    .config("spark.executor.instances", EXECUTORE_INSTANCES)
    .config("spark.driver.memory", DRIVER_MEMORY)
    .config("spark.driver.maxResultSize", DRIVER_MAX_RESULT_SIZE)
    .config("spark.kryoserializer.buffer.max", "1024m")
#     .config("spark.sql.warehouse.dir", "/user/bigdata/members/shyeon/advanced-spark/data")
    .enableHiveSupport()
    .getOrCreate()
)

spark.sparkContext.getConf().getAll()

[('spark.app.name', 'Advanced analytics with SPARK - Chapter 4'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS',
  'desktop'),
 ('spark.executor.instances', '3'),
 ('spark.driver.extraJavaOptions',
  '"-Dio.netty.tryReflectionSetAccessible=true"'),
 ('spark.driver.appUIAddress', 'http://192.168.0.2:4040'),
 ('spark.history.ui.port', '18081'),
 ('spark.driver.memory', '1g'),
 ('spark.driver.host', '192.168.0.2'),
 ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'),
 ('spark.executor.extraJavaOptions',
  '"-Dio.netty.tryReflectionSetAccessible=true"'),
 ('spark.history.provider',
  'org.apache.spark.deploy.history.FsHistoryProvider'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.history.fs.update.interval', '10s'),
 ('spark.submit.deployMode', 'client'),
 ('spark.history.fs.logDirectory', 'hdfs://desktop:9000/spark-logs'),
 ('spark.ui.filters',
  'org.apache.hadoop.yarn.server.webpro

## Load dataset and Preprocessing

In [4]:
!cat /home/shyeon/workspace/apache-project/advanced-spark/data/ch04/covtype.info

The Forest CoverType dataset


1.	Title of Database:

	Forest Covertype data


2.	Sources:

	(a) Original owners of database:
		Remote Sensing and GIS Program
		Department of Forest Sciences
		College of Natural Resources
		Colorado State University
		Fort Collins, CO  80523
		(contact Jock A. Blackard, jblackard 'at' fs.fed.us
		      or Dr. Denis J. Dean, denis.dean 'at' utdallas.edu)

	NOTE:	Reuse of this database is unlimited with retention of 
		copyright notice for Jock A. Blackard and Colorado 
		State University.

	(b) Donors of database:
		Jock A. Blackard (jblackard 'at' fs.fed.us)
		GIS Coordinator
		USFS - Forest Inventory & Analysis
		Rocky Mountain Research Station
		507 25th Street
		Ogden, UT 84401

		Dr. Denis J. Dean (denis.dean 'at' utdallas.edu)
		Professor
		Program in Geography and Geospatial Sciences
		School of Economic, Political and Policy Sciences
		800 West Campbell Rd
		Richardson, TX  75080-3021 
		
		Dr. Charles W. Anderson (anderson 'at' cs.colostate.edu

In [5]:
!head /home/shyeon/workspace/apache-project/advanced-spark/data/ch04/covtype.data

2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2590,56,2,212,-6,390,220,235,151,6225,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2804,139,9,268,65,3180,234,238,135,6121,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
2785,155,18,242,118,3090,238,238,122,6211,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
2595,45,2,153,-1,391,220,234,150,6172,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2579,132,6,300,-15,67,230,237,140,6031,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2
2606,45,7,270,5,633,222,225,138,6256,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5
2605,49,4,234,7,573,222,230,144,6228,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,

In [6]:
# Elevation                               quantitative    meters                       Elevation in meters
# Aspect                                  quantitative    azimuth                      Aspect in degrees azimuth
# Slope                                   quantitative    degrees                      Slope in degrees
# Horizontal_Distance_To_Hydrology        quantitative    meters                       Horz Dist to nearest surface water features
# Vertical_Distance_To_Hydrology          quantitative    meters                       Vert Dist to nearest surface water features
# Horizontal_Distance_To_Roadways         quantitative    meters                       Horz Dist to nearest roadway
# Hillshade_9am                           quantitative    0 to 255 index               Hillshade index at 9am, summer solstice
# Hillshade_Noon                          quantitative    0 to 255 index               Hillshade index at noon, summer soltice
# Hillshade_3pm                           quantitative    0 to 255 index               Hillshade index at 3pm, summer solstice
# Horizontal_Distance_To_Fire_Points      quantitative    meters                       Horz Dist to nearest wildfire ignition points
# Wilderness_Area (4 binary columns)      qualitative     0 (absence) or 1 (presence)  Wilderness area designation
# Soil_Type (40 binary columns)           qualitative     0 (absence) or 1 (presence)  Soil Type designation
# Cover_Type (7 types)                    integer         1 to 7                       Forest Cover Type designation

wilderness_area_cols = [f"wilderness_area_{i}" for i in range(4)] # 황야 지역 (4 dummy variables)
soil_type_cols = [f"soil_type_{i}" for i in range(40)] # 토양 유형 (40 dummy variables)

schema = [
    T.StructField("elevation", T.DoubleType(), True),
    T.StructField("aspect", T.DoubleType(), True),
    T.StructField("slope", T.DoubleType(), True),
    T.StructField("horz_dist_to_hydro", T.DoubleType(), True), # 가장 가까운 지표수까지 거리
    T.StructField("vert_dist_to_hydro", T.DoubleType(), True), # 가장 가까운 지표수까지 거리
    T.StructField("horz_dist_to_road", T.DoubleType(), True), # 가장 가까운 도로까지 거리
    T.StructField("hillshade_9am", T.IntegerType(), True), # 언덕 그늘
    T.StructField("hillshade_noon", T.IntegerType(), True),
    T.StructField("hillshade_3pm", T.IntegerType(), True),
    T.StructField("horz_dist_to_fire", T.DoubleType(), True),
]
wilderness_area_schema = [T.StructField(col, T.IntegerType(), True) for col in wilderness_area_cols] 
soil_type_schema = [T.StructField(col, T.IntegerType(), True) for col in soil_type_cols] 
cover_type_schema = [T.StructField("cover_type", T.IntegerType(), True)]

schema.extend(wilderness_area_schema)
schema.extend(soil_type_schema)
schema.extend(cover_type_schema)
schema = T.StructType(schema)

df = (spark
      .read.format("csv")
      .option("header", False)
      .option("sep", ",")
      .schema(schema)
      .load("/data/advanced-spark/ch04/covtype.data"))

wilderness_area_cols = [f"wilderness_area_{i}" for i in range(4)]
soil_type_cols = [f"soil_type_{i}" for i in range(40)]

df.select("cover_type").show(5) # 모든 컬럼의 스키마가 반영되었는지 확인

+----------+
|cover_type|
+----------+
|         5|
|         5|
|         2|
|         2|
|         5|
+----------+
only showing top 5 rows



## Prepare dataset

In [7]:
from pyspark.ml.feature import RFormula

print(f"the number of column: {len(df.columns)}")

transformer = RFormula(formula="cover_type ~ .").fit(df)
prepared_df = transformer.transform(df).select("features", "label")
prepared_df.show(5, False) # features와 label이 추가됨

train_df, test_df = prepared_df.randomSplit([0.7, 0.3], seed=42)

the number of column: 55
+--------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                |label|
+--------------------------------------------------------------------------------------------------------+-----+
|(54,[0,1,2,3,5,6,7,8,9,10,42],[2596.0,51.0,3.0,258.0,510.0,221.0,232.0,148.0,6279.0,1.0,1.0])           |5.0  |
|(54,[0,1,2,3,4,5,6,7,8,9,10,42],[2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,1.0,1.0])    |5.0  |
|(54,[0,1,2,3,4,5,6,7,8,9,10,25],[2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,1.0,1.0])  |2.0  |
|(54,[0,1,2,3,4,5,6,7,8,9,10,43],[2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,1.0,1.0])|2.0  |
|(54,[0,1,2,3,4,5,6,7,8,9,10,42],[2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,1.0,1.0])    |5.0  |
+----------------------------------------------------------------------

## Modeling

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

def build_pipeline(ml_model):
    # Index labels, adding metadata to the label column.
    # Fit on whole dataset to include all labels in index.
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(prepared_df)

    # Automatically identify categorical features, and index them.
    # We specify maxCategories so features with > 4 distinct values are treated as continuous.
    feature_indexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=8).fit(prepared_df)

    # Chain indexers and tree in a Pipeline
    stages = [label_indexer, feature_indexer, ml_model]
    pipeline = Pipeline(stages=stages)
    return pipeline

def predict(train_df, test_df, pipeline, summarize_result=False, summarize_model=False):
    model = pipeline.fit(train_df)
    # Make predictions.
    predictions = model.transform(test_df)
    if summarize_result:
        predictions.select("indexedLabel", "prediction", "probability").show(5, False)
    if summarize_model:
        treeModel = model.stages[2]
        print(treeModel)
    return predictions

def evaluate(predictions):
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))
    return accuracy

### Decision Tree (Spark Documents)

In [9]:
from pyspark.ml.classification import DecisionTreeClassifier

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures",
                            predictionCol="prediction",
                            seed=42)
print(dt.explainParams())

pipeline = build_pipeline(dt)
predictions = predict(train_df, test_df, pipeline, True, True)
evaluate(predictions)

cacheNodeIds: If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval. (default: False)
checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
featuresCol: features column name. (default: features, current: indexedFeatures)
impurity: Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini (default: gini)
labelCol: label column name. (default: label, current: indexedLabel)
leafCol: Leaf indices column name. Predicted leaf index of each instance in each tree by preorder. (default: )
maxBins: Max number of bin

0.7002302520139874

### Confusion Matrix
#### Spark API

In [10]:
from pyspark.mllib.evaluation import MulticlassMetrics
import pandas as pd

# important: need to cast to float type, and order by prediction, else it won't work
# select only prediction and label columns
preds_and_labels = predictions.select('prediction','indexedLabel').orderBy('prediction') # 컬럼순서 주의(pred, real 순) 
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

cm = metrics.confusionMatrix().toArray() # python list
pd.DataFrame(cm)

Unnamed: 0,0,1,2,3,4,5,6
0,65472.0,17543.0,1459.0,188.0,0.0,26.0,38.0
1,17680.0,44667.0,56.0,1230.0,0.0,0.0,0.0
2,1881.0,0.0,8690.0,0.0,0.0,0.0,213.0
3,7.0,3276.0,13.0,2788.0,0.0,0.0,0.0
4,2030.0,0.0,3016.0,0.0,0.0,0.0,183.0
5,2607.0,1.0,247.0,0.0,0.0,33.0,0.0
6,6.0,0.0,507.0,0.0,0.0,0.0,300.0


#### Custom

In [11]:
cm = (predictions.select('prediction', 'indexedLabel')
      .groupby("indexedLabel")
      .pivot("prediction", list(range(0, 7)))
      .count()
      .fillna(0)
      .sort("indexedLabel")
      .toPandas())
cm = cm.reindex(cm["indexedLabel"], axis=0).drop("indexedLabel", axis=1)
cm

Unnamed: 0_level_0,0,1,2,3,4,5,6
indexedLabel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,65472,17543,1459,188,0,26,38
1.0,17680,44667,56,1230,0,0,0
2.0,1881,0,8690,0,0,0,213
3.0,7,3276,13,2788,0,0,0
4.0,2030,0,3016,0,0,0,183
5.0,2607,1,247,0,0,33,0
6.0,6,0,507,0,0,0,300


## Hyperparameter Tuning

In [12]:
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

param_grid = (ParamGridBuilder()
              .addGrid(dt.impurity, ["entropy"])
              .addGrid(dt.maxDepth, [30]) # must <= 30
              .addGrid(dt.maxBins, [470])
              .addGrid(dt.minInfoGain, [0.0])
              .build())

evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")

validator = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator, parallelism=4, seed=42)
validated_model = validator.fit(train_df)

In [13]:
validated_model.bestModel.stages[-1].extractParamMap()

{Param(parent='DecisionTreeClassifier_3f5fdea9e4a0', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,
 Param(parent='DecisionTreeClassifier_3f5fdea9e4a0', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,
 Param(parent='DecisionTreeClassifier_3f5fdea9e4a0', name='featuresCol', doc='features column name.'): 'indexedFeatures',
 Param(parent='DecisionTreeClassifier_3f5fdea9e4a0', name='impurity', doc='Criterion used for information gain calculation (case-insensitive). Supported options: entropy, gini'

In [14]:
predictions = validated_model.bestModel.transform(test_df)
evaluate(predictions)

cm = (predictions.select('prediction', 'indexedLabel')
      .groupby("indexedLabel")
      .pivot("prediction", list(range(0, 7)))
      .count()
      .fillna(0)
      .sort("indexedLabel")
      .toPandas())
cm = cm.reindex(cm["indexedLabel"], axis=0).drop("indexedLabel", axis=1)
cm

Test Error = 0.0644591 


Unnamed: 0_level_0,0,1,2,3,4,5,6
indexedLabel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,80172,3747,224,53,174,356,0
1.0,3840,59434,7,289,6,57,0
2.0,223,2,10029,0,410,29,91
3.0,55,300,0,5726,0,3,0
4.0,200,22,453,0,4509,8,37
5.0,416,46,26,0,19,2381,0
6.0,0,0,89,0,44,0,680


## Feature Engineering
- onehot으로 분리된 열들을 한 열로 통합
- pandas udf를 사용함
- soil_type은 indexer 사용 불가(카디널리티가 30 이하여야함)

In [15]:
cols = [col for col in df.columns if (col.find("wilderness_area_") == -1)] 
new_df = df.select(*cols, F.array(*wilderness_area_cols).alias("wilderness_area"))

print(f"the number of column: {len(new_df.columns)}")
new_df.select("wilderness_area").show(5)

the number of column: 52
+---------------+
|wilderness_area|
+---------------+
|   [1, 0, 0, 0]|
|   [1, 0, 0, 0]|
|   [1, 0, 0, 0]|
|   [1, 0, 0, 0]|
|   [1, 0, 0, 0]|
+---------------+
only showing top 5 rows



In [16]:
from typing import Iterator
import numpy as np
import pandas as pd

# Declare the function and create the UDF
@F.pandas_udf(T.LongType())
def unhot_udf(arrs: pd.Series) -> pd.Series:
    return pd.Series(np.where(arr==1)[0][0] for arr in arrs)

# Execute function as a Spark vectorized UDF
new_df = new_df.select(*cols, unhot_udf(F.col("wilderness_area")).alias("wilderness_area"))
transformer = RFormula(formula="cover_type ~ .").fit(new_df)
prepared_df = transformer.transform(new_df).select("features", "label")
prepared_df.show(5, False)

train_df, test_df = prepared_df.randomSplit([0.7, 0.3], seed=42)
train_df.cache()

+-------------------------------------------------------------------------------------------------+-----+
|features                                                                                         |label|
+-------------------------------------------------------------------------------------------------+-----+
|(51,[0,1,2,3,5,6,7,8,9,38],[2596.0,51.0,3.0,258.0,510.0,221.0,232.0,148.0,6279.0,1.0])           |5.0  |
|(51,[0,1,2,3,4,5,6,7,8,9,38],[2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,1.0])    |5.0  |
|(51,[0,1,2,3,4,5,6,7,8,9,21],[2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,1.0])  |2.0  |
|(51,[0,1,2,3,4,5,6,7,8,9,39],[2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,1.0])|2.0  |
|(51,[0,1,2,3,4,5,6,7,8,9,38],[2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,1.0])    |5.0  |
+-------------------------------------------------------------------------------------------------+-----+
only showing top 5 rows



DataFrame[features: vector, label: double]

In [17]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="indexedLabel",
                            featuresCol="indexedFeatures",
                            predictionCol="prediction",
                            numTrees=100)
pipeline = build_pipeline(rf)

param_grid = (ParamGridBuilder()
              .addGrid(dt.maxDepth, [1, 10, 20]) # must <= 30
              .addGrid(dt.maxBins, [10, 20, 30, 40, 50])
              .addGrid(dt.minInfoGain, [0.0, 0.05])
              .build())

evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                              predictionCol="prediction",
                                              metricName="accuracy")

validator = TrainValidationSplit(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator, parallelism=1, seed=42)
validated_model = validator.fit(train_df)

In [18]:
validated_model.bestModel.stages[2].extractParamMap()

{Param(parent='RandomForestClassifier_ba60fe650fb8', name='bootstrap', doc='Whether bootstrap samples are used when building trees.'): True,
 Param(parent='RandomForestClassifier_ba60fe650fb8', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,
 Param(parent='RandomForestClassifier_ba60fe650fb8', name='checkpointInterval', doc='set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext.'): 10,
 Param(parent='RandomForestClassifier_ba60fe650fb8', name='featureSubsetStrategy', doc="The number of features to consider for splits at each tree node. Supp

In [19]:
predictions = validated_model.bestModel.transform(test_df)
evaluator.evaluate(predictions)

0.6724794294802965

In [20]:
cm = (predictions.select('prediction', 'indexedLabel')
      .groupby("indexedLabel")
      .pivot("prediction", list(range(0, 7)))
      .count()
      .fillna(0)
      .sort("indexedLabel")
      .toPandas())
cm = cm.reindex(cm["indexedLabel"], axis=0).drop("indexedLabel", axis=1)
cm

Unnamed: 0_level_0,0,1,2,3,4,5,6
indexedLabel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,63756,20307,837,0,0,0,0
1.0,16782,46768,0,0,0,0,0
2.0,4079,0,6593,0,0,0,0
3.0,28,6120,0,0,0,0,0
4.0,2298,0,2936,0,0,0,0
5.0,2848,0,0,0,0,0,0
6.0,0,0,805,0,0,0,0


## Feature Importance

In [21]:
best_model = validated_model.bestModel.stages[-1]
importances = [(k, v) for v, k in zip(best_model.featureImportances.toArray(), new_df.columns)] # 마지막은 wilderness_area
pd.DataFrame(importances, columns=["feature", "importance"]).sort_values("importance", ascending=False)

Unnamed: 0,feature,importance
0,elevation,0.389976
50,cover_type,0.188755
19,soil_type_9,0.068208
31,soil_type_21,0.063238
21,soil_type_11,0.04612
13,soil_type_3,0.040963
47,soil_type_37,0.030435
5,horz_dist_to_road,0.023659
32,soil_type_22,0.023002
48,soil_type_38,0.020977
