In [383]:
from pyspark.sql import SparkSession, functions
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable

## Ext tools

In [384]:
class ColsFilter(Transformer, DefaultParamsReadable, DefaultParamsWritable):
    """Pipeline stage that narrows a DataFrame down to the model inputs.

    The output keeps the assembled ``features`` column and exposes the
    ``ctr`` column under the name ``label``.

    The work is done in ``_transform``, the hook that the public
    ``Transformer.transform(dataset, params=None)`` delegates to. Overriding
    ``transform`` itself (as a static method) bypassed that contract: the
    optional ``params`` argument was lost and the abstract ``_transform``
    hook stayed unimplemented.
    """

    def __init__(self, *args, **kwargs):
        # Positional/keyword arguments are accepted but intentionally unused;
        # the stage has no configurable params of its own.
        super().__init__()

    def _transform(self, dataset):
        """Return ``dataset`` reduced to ``features`` and ``label`` columns.

        :param dataset: DataFrame containing ``features`` and ``ctr`` columns.
        :return: DataFrame with ``features`` and ``ctr`` renamed to ``label``.
        """
        return dataset.select('features', functions.col('ctr').alias('label'))

In [340]:
def model_train_cycle(model, grid, train_df, test_df, params_to_show):
    """Tune ``model`` over ``grid`` and score the winner on ``test_df``.

    The estimator is wrapped in a pipeline together with the notebook-level
    ``vectorizer`` and ``cols_filter`` stages, then tuned with a
    ``TrainValidationSplit`` (70% of ``train_df`` for training, seed 42).

    :param model: estimator placed as the last pipeline stage.
    :param grid: param maps to try (e.g. built with ``ParamGridBuilder``).
    :param train_df: DataFrame used for the train/validation search.
    :param test_df: DataFrame used for the final evaluation.
    :param params_to_show: names of params of the fitted last stage to report.
    :return: dict with the fitted pipeline under ``'best model'``, one entry
        per requested param, and the rounded test metric under ``'RMSE'``.
    """
    # Default-configured evaluator, shared by the search and the final scoring.
    scorer = RegressionEvaluator()
    full_pipeline = Pipeline(stages=[vectorizer, cols_filter, model])
    tuner = TrainValidationSplit(
        estimator=full_pipeline,
        estimatorParamMaps=grid,
        evaluator=scorer,
        trainRatio=0.7,
        seed=42,
    )
    winner = tuner.fit(train_df).bestModel

    summary = {'best model': winner}
    # The tuned estimator is the last stage of the fitted pipeline.
    summary.update(
        {name: winner.stages[-1].getOrDefault(name) for name in params_to_show}
    )
    test_predictions = winner.transform(test_df)
    summary['RMSE'] = round(scorer.evaluate(test_predictions), 4)
    return summary

## Init

In [3]:
# Obtain the active Spark session, or start a new one under the app name 'research'.
spark = SparkSession.builder.appName('research').getOrCreate()

## Data load

In [369]:
# Read the pre-split train and test sets from Parquet files in the working directory.
train_df, test_df = spark.read.parquet('./train.parquet'), spark.read.parquet('./test.parquet')

In [370]:
train_df.show(5)

+-----+---------------------+---------+------+------+----------------+---------+-----------------+
|ad_id|target_audience_count|has_video|is_cpm|is_cpc|         ad_cost|day_count|              ctr|
+-----+---------------------+---------+------+------+----------------+---------+-----------------+
|    1|     10707.2440058622|        1|     1|     0|201.829292651124|       15|0.431740082807281|
|    5|     10643.3872649482|        1|     1|     0|192.577221699704|       15|0.809264519216201|
|    6|     11418.7085911347|        1|     1|     0|204.104562956739|       11|0.909738306804039|
|    7|     10109.3278687796|        1|     1|     0|194.255798599684|       12|0.941221039774456|
|    8|     10665.1119991977|        1|     1|     0|202.658042557742|       14|0.986790019690954|
+-----+---------------------+---------+------+------+----------------+---------+-----------------+
only showing top 5 rows



## Check cols for usability

In [371]:
train_df.corr('is_cpm', 'is_cpc')

-1.0

In [372]:
train_df.corr('is_cpc', 'has_video')

0.19889353323887435

In [373]:
train_df.corr('target_audience_count', 'has_video')

-0.37534598443572686

In [374]:
train_df.corr('target_audience_count', 'ad_cost')

0.3758782397440922

## Drop redundant columns

In [379]:
# is_cpm is perfectly anti-correlated with is_cpc (corr == -1.0 above), so it adds no information.
train_df = train_df.drop('is_cpm')

In [380]:
# Drop the same column from the test set so both frames keep an identical column layout.
test_df = test_df.drop('is_cpm')

In [381]:
train_df.columns[1:-1]

['target_audience_count', 'has_video', 'is_cpc', 'ad_cost', 'day_count']

In [382]:
test_df.columns[1:-1]

['target_audience_count', 'has_video', 'is_cpc', 'ad_cost', 'day_count']

## Transformers initialization

In [54]:
# Feature columns are everything except the first column (ad_id) and the last one (ctr, the target).
vectorizer = VectorAssembler(inputCols=train_df.columns[1:-1], outputCol='features')

In [385]:
cols_filter = ColsFilter()

In [386]:
cols_filter.save('./cols_filter')

AttributeError: 'ColsFilter' object has no attribute 'shouldOverwrite'

## LinearRegression train

In [56]:
reccomended_model = LinearRegression(maxIter=40, regParam=0.4, elasticNetParam=0.8)

In [57]:
reccomended_model_pipeline = Pipeline(stages=[vectorizer, cols_filter, reccomended_model])

In [58]:
mdl = reccomended_model_pipeline.fit(train_df)

In [59]:
result = mdl.transform(test_df)

In [61]:
RegressionEvaluator().evaluate(result)

0.5735646296916438

## DecisionTree train

#### stage 1

In [188]:
dc_model = DecisionTreeRegressor(seed=42)

In [200]:
# Coarse sweep: depth 5..8 crossed with bin counts 20..60 in steps of 4.
dc_model_params = (
    ParamGridBuilder()
    .addGrid(dc_model.maxDepth, list(range(5, 9)))
    .addGrid(dc_model.maxBins, list(range(20, 61, 4)))
    .build()
)

In [345]:
dc_train_result = model_train_cycle(dc_model, dc_model_params, train_df, test_df, ['maxDepth', 'maxBins'])

In [346]:
dc_train_result

{'best model': PipelineModel_4581b376b74d,
 'maxDepth': 7,
 'maxBins': 120,
 'RMSE': 0.2487}

#### stage 2

In [219]:
# Second sweep: depth 6..8 crossed with bin counts 60, 70 and 80.
dc_model_params = (
    ParamGridBuilder()
    .addGrid(dc_model.maxDepth, list(range(6, 9)))
    .addGrid(dc_model.maxBins, list(range(60, 81, 10)))
    .build()
)

In [345]:
dc_train_result = model_train_cycle(dc_model, dc_model_params, train_df, test_df, ['maxDepth', 'maxBins'])

In [346]:
dc_train_result

{'best model': PipelineModel_4581b376b74d,
 'maxDepth': 7,
 'maxBins': 120,
 'RMSE': 0.2487}

#### stage 3

In [245]:
dc_model = DecisionTreeRegressor(seed=42, maxDepth=7)

In [246]:
# Only the bin count is swept here: 100..140 in steps of 10.
dc_model_params = (
    ParamGridBuilder()
    .addGrid(dc_model.maxBins, list(range(100, 141, 10)))
    .build()
)

In [345]:
dc_train_result = model_train_cycle(dc_model, dc_model_params, train_df, test_df, ['maxDepth', 'maxBins'])

In [346]:
dc_train_result

{'best model': PipelineModel_4581b376b74d,
 'maxDepth': 7,
 'maxBins': 120,
 'RMSE': 0.2487}

#### stage 4

In [264]:
# Fine sweep of the bin count: every value from 115 to 124.
dc_model_params = (
    ParamGridBuilder()
    .addGrid(dc_model.maxBins, list(range(115, 125)))
    .build()
)

In [345]:
dc_train_result = model_train_cycle(dc_model, dc_model_params, train_df, test_df, ['maxDepth', 'maxBins'])

In [346]:
dc_train_result

{'best model': PipelineModel_4581b376b74d,
 'maxDepth': 7,
 'maxBins': 120,
 'RMSE': 0.2487}

## RandomForest train

In [272]:
rf_model = RandomForestRegressor(seed=42)

In [273]:
# Random-forest grid: trees 1/4/7/10, depth 1/4/7, bin counts 20..40 in steps of 4.
rf_params = (
    ParamGridBuilder()
    .addGrid(rf_model.numTrees, list(range(1, 11, 3)))
    .addGrid(rf_model.maxDepth, list(range(1, 8, 3)))
    .addGrid(rf_model.maxBins, list(range(20, 41, 4)))
    .build()
)

In [345]:
rf_train_result = model_train_cycle(rf_model, rf_params, train_df, test_df, ['numTrees', 'maxDepth', 'maxBins'])

In [346]:
rf_train_result

{'best model': PipelineModel_4581b376b74d,
 'maxDepth': 7,
 'maxBins': 120,
 'RMSE': 0.2487}

## GradientBoosting train

In [287]:
gb_model = GBTRegressor(seed=42)

In [288]:
gb_model.getOrDefault('lossType')

'squared'

In [289]:
gb_model.getOrDefault('maxIter')

20

In [290]:
gb_model.getOrDefault('maxDepth')

5

In [291]:
gb_model.getOrDefault('maxBins')

32

#### stage 1

In [294]:
# Coarse GBT grid: iterations 10..30 (step 5), depth 1/4/7, bin counts 24..40 (step 4).
gb_params = (
    ParamGridBuilder()
    .addGrid(gb_model.maxIter, list(range(10, 31, 5)))
    .addGrid(gb_model.maxDepth, list(range(1, 8, 3)))
    .addGrid(gb_model.maxBins, list(range(24, 41, 4)))
    .build()
)

In [345]:
gb_train_result = model_train_cycle(gb_model, gb_params, train_df, test_df, ['maxIter', 'maxDepth', 'maxBins'])

In [346]:
gb_train_result

{'best model': PipelineModel_4581b376b74d,
 'maxDepth': 7,
 'maxBins': 120,
 'RMSE': 0.2487}

#### stage 2

In [305]:
# Second GBT grid: iterations 6/8/10, depth 6/8/10, bin counts 40..60 (step 4).
gb_params = (
    ParamGridBuilder()
    .addGrid(gb_model.maxIter, list(range(6, 11, 2)))
    .addGrid(gb_model.maxDepth, list(range(6, 11, 2)))
    .addGrid(gb_model.maxBins, list(range(40, 61, 4)))
    .build()
)

In [345]:
gb_train_result = model_train_cycle(gb_model, gb_params, train_df, test_df, ['maxIter', 'maxDepth', 'maxBins'])

In [346]:
gb_train_result

{'best model': PipelineModel_4581b376b74d,
 'maxDepth': 7,
 'maxBins': 120,
 'RMSE': 0.2487}

#### stage 3

In [316]:
# Third GBT grid: iterations 7..9, depth 4..6, bin counts 60/80/100/120.
gb_params = (
    ParamGridBuilder()
    .addGrid(gb_model.maxIter, list(range(7, 10)))
    .addGrid(gb_model.maxDepth, list(range(4, 7)))
    .addGrid(gb_model.maxBins, list(range(60, 121, 20)))
    .build()
)

In [345]:
gb_train_result = model_train_cycle(gb_model, gb_params, train_df, test_df, ['maxIter', 'maxDepth', 'maxBins'])

In [346]:
gb_train_result

{'best model': PipelineModel_4581b376b74d,
 'maxDepth': 7,
 'maxBins': 120,
 'RMSE': 0.2487}

#### stage 4

In [347]:
gb_model = GBTRegressor(seed=42, lossType='squared', maxDepth=6)

In [348]:
# Fine GBT grid: iterations 8..11 crossed with bin counts 118..122.
gb_params = (
    ParamGridBuilder()
    .addGrid(gb_model.maxIter, list(range(8, 12)))
    .addGrid(gb_model.maxBins, list(range(118, 123)))
    .build()
)

In [349]:
gb_train_result = model_train_cycle(gb_model, gb_params, train_df, test_df, ['maxIter', 'maxDepth', 'maxBins'])

In [350]:
gb_train_result

{'best model': PipelineModel_db9462407026,
 'maxIter': 10,
 'maxDepth': 6,
 'maxBins': 120,
 'RMSE': 0.2491}

## Model selection

* The two models with the lowest RMSE are DT and GBT.
* DT is much simpler, but on a larger training dataset GBT could, I think, show better results.
* Since this is an educational project, I will choose the DT model, with the following parameter ranges to iterate through during training:

In [355]:
# Final decision-tree search space: depth 5..9 and even bin counts from 110 to 130.
dict(
    maxDepth=list(range(5, 10)),
    maxBins=list(range(110, 131, 2)),
)

{'maxDepth': [5, 6, 7, 8, 9],
 'maxBins': [110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130]}