# **CatBoost**

### За основу взят ноутбук из вебинара "CatBoost на больших данных", канал Karpov.Courses, ведущий вебинара Александр Савченко

Репозиторий с исходником: https://github.com/AlexKbit/pyspark-catboost-example

In [None]:
%%capture
# Install PySpark 3.0.3; the %%capture magic above hides pip's output.
# NOTE(review): presumably this version must match the catboost-spark_3.0_2.12
# artifact requested in the session config below - confirm before upgrading.
!pip install pyspark==3.0.3

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.sql.types import StructField, StructType

In [None]:
# Local Spark session for the CatBoost demo. The catboost-spark artifact is
# requested through spark.jars.packages; the remaining options size the
# driver/executor cores and memory.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('CatBoostWithSpark')
    .config("spark.jars.packages", "ai.catboost:catboost-spark_3.0_2.12:1.0.3")
    .config("spark.executor.cores", "2")
    .config("spark.task.cpus", "2")
    .config("spark.driver.memory", "2g")
    .config("spark.driver.memoryOverhead", "2g")
    .config("spark.executor.memory", "2g")
    .config("spark.executor.memoryOverhead", "2g")
    .getOrCreate()
)

In [None]:
# Bare expression: lets the notebook display the session object.
spark

In [None]:
import catboost_spark

In [None]:
# Explicit DDL schema: two string columns, three doubles and an integer target.
schema_dataset = "col1 String, col2 String, col3 Double, col4 Double, col5 Double, target Integer"

# Read the comma-separated file (with a header row) using that schema.
df = spark.read.csv(
    '/content/data.csv',
    schema=schema_dataset,
    sep=',',
    header=True,
)

In [None]:
# Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- col1: string (nullable = true)
 |-- col2: string (nullable = true)
 |-- col3: double (nullable = true)
 |-- col4: double (nullable = true)
 |-- col5: double (nullable = true)
 |-- target: integer (nullable = true)



In [None]:
# show() prints the summary table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
df.describe().show()

+-------+----+----+------------------+------------------+-----------------+------------------+
|summary|col1|col2|              col3|              col4|             col5|            target|
+-------+----+----+------------------+------------------+-----------------+------------------+
|  count| 999| 999|               999|               999|              999|               999|
|   mean|null|null| 6.126728018535534|3091.0939393577505| 315.711484855864|0.4954954954954955|
| stddev|null|null|0.5898659455919512| 6063.724047846411|248.0598507266117|0.5002301371468432|
|    min|   a|   a|       4.605170186|       2.718281828|      0.010761534|                 0|
|    max|   z|   z|       6.907755279|       22026.46579|      984.2199136|                 1|
+-------+----+----+------------------+------------------+-----------------+------------------+

None


In [None]:
# Preview the first 7 rows. show() prints the table and returns None, so no
# print() wrapper is needed (it would only add a "None" line).
df.show(7)

+----+----+-----------+-----------+-----------+------+
|col1|col2|       col3|       col4|       col5|target|
+----+----+-----------+-----------+-----------+------+
|   o|   r|6.084499413|2980.957987|   1.737842|     0|
|   z|   x|6.553933404|8103.083928|2.884142703|     0|
|   w|   w|4.779123493|22026.46579|0.035646382|     0|
|   t|   w|6.663132696|148.4131591|4.600071127|     0|
|   p|   s|6.508769137|7.389056099|1.625782932|     1|
|   u|   r|5.860786223|54.59815003|6.712529775|     1|
|   r|   a|6.708084084|2980.957987|7.360113804|     0|
+----+----+-----------+-----------+-----------+------+
only showing top 7 rows

None


In [None]:
# Name of the label column; reused by the evaluator, the Pool and the
# classifiers further down.
TARGET_LABEL = 'target'

In [None]:
# F1 evaluator that compares the "prediction" column with the target label.
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    labelCol=TARGET_LABEL,
    predictionCol="prediction",
)

In [None]:
# 75/25 train/test split. A fixed seed keeps the split, and therefore the
# metrics reported below, reproducible between runs.
train_df, test_df = df.randomSplit([0.75, 0.25], seed=42)

### Train CatBoost with Pool

In [None]:
# Model inputs: the two indexed string columns followed by the numeric columns.
features = ["col1_index", "col2_index", "col3", "col4", "col5"]

# One StringIndexer per string column, producing a numeric index column.
col1_indexer = StringIndexer(outputCol="col1_index", inputCol='col1')
col2_indexer = StringIndexer(outputCol="col2_index", inputCol='col2')

# Packs the feature columns into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=features)

In [None]:
def prepare_vector(df: DataFrame)-> DataFrame:
  result_df = col1_indexer.fit(df).transform(df)
  result_df = col2_indexer.fit(result_df).transform(result_df)
  result_df = assembler.transform(result_df)
  return result_df

In [None]:
train = prepare_vector(train_df)
test = prepare_vector(test_df)

In [None]:
# Preview the prepared training rows. show() prints the table and returns
# None, so print() would only add a "None" line.
train.show(7)

+----+----+-----------+-----------+-----------+------+----------+----------+--------------------+
|col1|col2|       col3|       col4|       col5|target|col1_index|col2_index|            features|
+----+----+-----------+-----------+-----------+------+----------+----------+--------------------+
|   a|   a|5.129898715|20.08553692|272.5829356|     1|       8.0|      23.0|[8.0,23.0,5.12989...|
|   a|   b|4.804021045|403.4287935| 888.578495|     0|       8.0|       4.0|[8.0,4.0,4.804021...|
|   a|   b|5.446737372|54.59815003|537.8494671|     0|       8.0|       4.0|[8.0,4.0,5.446737...|
|   a|   b|6.401917197|148.4131591|101.2690374|     0|       8.0|       4.0|[8.0,4.0,6.401917...|
|   a|   b|6.883462586|1096.633158|670.3254037|     0|       8.0|       4.0|[8.0,4.0,6.883462...|
|   a|   e| 6.03787092|20.08553692|464.8122752|     0|       8.0|      12.0|[8.0,12.0,6.03787...|
|   a|   f|5.407171771|20.08553692|390.9714078|     1|       8.0|       2.0|[8.0,2.0,5.407171...|
+----+----+---------

In [None]:
# Build a CatBoost Pool from just the feature vector and the label column.
train_pool = catboost_spark.Pool(train.select(['features', TARGET_LABEL]))
# Tell the Pool which column is the label and which holds the features.
train_pool.setLabelCol(TARGET_LABEL)
train_pool.setFeaturesCol('features')

Pool_602a442374ff

In [None]:
# CatBoost classifier reading the 'features' vector and the target label.
classifier = catboost_spark.CatBoostClassifier(featuresCol='features', labelCol=TARGET_LABEL)
# Small model for the demo: 50 iterations, depth 5.
classifier.setIterations(50)
classifier.setDepth(5)

CatBoostClassifier_2d1eab184387

In [None]:
# Train on the Pool, score the prepared test DataFrame, and report F1.
model = classifier.fit(train_pool)
predict = model.transform(test)
print(f'Model F1 = {evaluator.evaluate(predict)}')

Model F1 = 0.5133384417477316


In [None]:
# Preview the scored rows. show() prints the table and returns None, so
# print() would only add a "None" line.
predict.show(7)

+----+----+-----------+-----------+-----------+------+----------+----------+--------------------+--------------------+--------------------+----------+
|col1|col2|       col3|       col4|       col5|target|col1_index|col2_index|            features|       rawPrediction|         probability|prediction|
+----+----+-----------+-----------+-----------+------+----------+----------+--------------------+--------------------+--------------------+----------+
|   a|   a|6.646390515|148.4131591|157.7424017|     1|       4.0|       2.0|[4.0,2.0,6.646390...|[-0.0039150571071...|[0.49804248144779...|       1.0|
|   a|   c|5.950642553|54.59815003| 6.21892211|     1|       4.0|       4.0|[4.0,4.0,5.950642...|[0.04040159315686...|[0.52018981257247...|       0.0|
|   a|   g|6.436150368|148.4131591|56.60413081|     0|       4.0|      18.0|[4.0,18.0,6.43615...|[0.01749729007042...|[0.50874775233026...|       0.0|
|   a|   g|6.864847778|2980.957987|389.8896251|     1|       4.0|      18.0|[4.0,18.0,6.86484.

In [None]:
# Export the trained model with saveNativeModel to the path 'catboost_native'.
model.saveNativeModel('catboost_native')

In [None]:
# Persist the model through the Spark ML writer, overwriting any existing
# 'catboost_spark' output.
model.write().overwrite().save('catboost_spark')

### Pipeline model with CatBoost

In [None]:
# Feature stages for the Pipeline variant, re-declared under the same names.
# Inputs: the two indexed string columns followed by the numeric columns.
features = ["col1_index", "col2_index", "col3", "col4", "col5"]

# One StringIndexer per string column, producing a numeric index column.
col1_indexer = StringIndexer(outputCol="col1_index", inputCol='col1')
col2_indexer = StringIndexer(outputCol="col2_index", inputCol='col2')

# Packs the feature columns into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=features)

In [None]:
# New classifier instance for the Pipeline, configured like the earlier one:
# 'features' vector in, target label out, 50 iterations, depth 5.
classifier = catboost_spark.CatBoostClassifier(featuresCol='features', labelCol=TARGET_LABEL)
classifier.setIterations(50)
classifier.setDepth(5)

CatBoostClassifier_59e51d0fc42a

In [None]:
# Chain indexing, vector assembly and the classifier into one estimator.
pipeline = Pipeline(stages=[col1_indexer, col2_indexer, assembler, classifier])

In [None]:
# Fit every pipeline stage on the raw training split.
p_model = pipeline.fit(train_df)

In [None]:
# Preview the raw test rows. show() prints the table and returns None, so
# print() would only add a "None" line.
test_df.show(7)

+----+----+-----------+-----------+-----------+------+
|col1|col2|       col3|       col4|       col5|target|
+----+----+-----------+-----------+-----------+------+
|   a|   a|6.646390515|148.4131591|157.7424017|     1|
|   a|   c|5.950642553|54.59815003| 6.21892211|     1|
|   a|   g|6.436150368|148.4131591|56.60413081|     0|
|   a|   g|6.864847778|2980.957987|389.8896251|     1|
|   a|   o|6.364750757|20.08553692|0.010761534|     1|
|   a|   p|5.505331536|7.389056099| 307.215333|     0|
|   a|   q|6.440946541|403.4287935|167.3800662|     0|
+----+----+-----------+-----------+-----------+------+
only showing top 7 rows

None


In [None]:
# Run the fitted pipeline over the raw test split.
predictions = p_model.transform(test_df)

In [None]:
# Preview the pipeline's predictions. show() prints the table and returns
# None, so print() would only add a "None" line.
predictions.show(7)

+----+----+-----------+-----------+-----------+------+----------+----------+--------------------+--------------------+--------------------+----------+
|col1|col2|       col3|       col4|       col5|target|col1_index|col2_index|            features|       rawPrediction|         probability|prediction|
+----+----+-----------+-----------+-----------+------+----------+----------+--------------------+--------------------+--------------------+----------+
|   a|   a|6.646390515|148.4131591|157.7424017|     1|       8.0|      23.0|[8.0,23.0,6.64639...|[0.22310705457457...|[0.60973872836073...|       0.0|
|   a|   c|5.950642553|54.59815003| 6.21892211|     1|       8.0|       5.0|[8.0,5.0,5.950642...|[0.28321941745872...|[0.63794104949937...|       0.0|
|   a|   g|6.436150368|148.4131591|56.60413081|     0|       8.0|      13.0|[8.0,13.0,6.43615...|[0.26712929527296...|[0.63047581171129...|       0.0|
|   a|   g|6.864847778|2980.957987|389.8896251|     1|       8.0|      13.0|[8.0,13.0,6.86484.

In [None]:
# Report F1 for the pipeline's predictions using the shared evaluator.
print(f'Model F1 = {evaluator.evaluate(predictions)}')

Model F1 = 0.521987688164286


In [None]:
# Bare expression: displays the class of the fitted pipeline object.
type(p_model)

pyspark.ml.pipeline.PipelineModel

In [None]:
# Persist the fitted pipeline through the Spark ML writer, overwriting any
# existing 'catboost_pipeline' output.
p_model.write().overwrite().save('catboost_pipeline')