In [1]:
import pandas as pd 

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error

In [2]:
# Load the raw auction dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/real-time-advertisers-auction/Dataset.csv',
)

In [3]:
def weird_division(n, d):
    """Divide ``n`` by ``d``, returning 0 instead of failing on a falsy divisor.

    Args:
        n: Numerator.
        d: Denominator. Any falsy value (0, 0.0, None, ...) short-circuits
            the division.

    Returns:
        ``n / d`` when ``d`` is truthy, otherwise the integer 0.
    """
    if not d:
        return 0
    return n / d

# CPM = (total_revenue * 100 / measurable_impressions) * 1000.
# Rows with zero measurable impressions get a CPM of 0 rather than inf/NaN from
# the division. The whole column is computed at once instead of calling a Python
# function per row via df.apply(axis=1), which is very slow on a frame this size.
# The order of operations matches the row-wise formula, so values are unchanged;
# a missing (NaN) impression count still yields a NaN CPM.
df['CPM'] = (
    ((df['total_revenue'] * 100) / df['measurable_impressions'])
    .where(df['measurable_impressions'] != 0, 0)
    * 1000
)

In [4]:
# Total number of rows currently in df.
len(df)

567291

In [5]:
# Keep only rows with a non-negative CPM. NaN never satisfies >=, so rows whose
# CPM is missing are dropped as well. The index is renumbered afterwards.
df = df.loc[df['CPM'] >= 0].reset_index(drop=True)

# Parse the date column into datetimes.
df['date'] = pd.to_datetime(df['date'])

In [6]:
# Time-based split: rows dated before 2019-06-22 form the training set, rows on
# or after that date form the test set. Within each split, rows at or above that
# split's own 95th-percentile CPM are removed, then the index is renumbered.
train = (
    df.loc[df['date'] < '2019-06-22']
    .loc[lambda part: part['CPM'] < part['CPM'].quantile(0.95)]
    .reset_index(drop=True)
)

test = (
    df.loc[df['date'] >= '2019-06-22']
    .loc[lambda part: part['CPM'] < part['CPM'].quantile(0.95)]
    .reset_index(drop=True)
)

In [7]:
# Combined number of rows kept across the train and test splits.
len(train) + len(test)

538652

In [8]:
# Names of the columns selected as model input features.
feat = [
    'site_id',
    'ad_type_id',
    'geo_id',
    'device_category_id',
    'advertiser_id',
    'order_id',
    'line_item_type_id',
    'os_id',
    'integration_type_id',
    'monetization_channel_id',
    'ad_unit_id',
]

In [10]:
# Number of trees / boosting stages used by the ensemble regressors.
n_estimator = 200

# Fixed seed passed to every estimator so that repeated runs of the notebook
# produce the same fitted models and therefore the same scores.
SEED = 42

# Base regressors, keyed by the short name used in the results table.
# Each entry is a factory so that every list item below receives its own
# fresh, unfitted estimator instance.
estimator_factories = [
    ('CART', lambda: DecisionTreeRegressor(random_state=SEED)),
    ('ET', lambda: ExtraTreesRegressor(n_estimators=n_estimator, random_state=SEED)),
    ('GB', lambda: GradientBoostingRegressor(n_estimators=n_estimator, random_state=SEED)),
]

# Preprocessing variants: (name prefix, pipeline step name, scaler class).
scaler_variants = [
    ('SS', 'Scaler', StandardScaler),
    ('MMS', 'MinMax', MinMaxScaler),
    ('N', 'Norm', Normalizer),
]

# Candidate models as (name, estimator) pairs. For each base regressor the list
# holds the bare estimator followed by one Pipeline per scaler variant, in the
# order CART, SS_CART, MMS_CART, N_CART, ET, ..., N_GB.
models = []
for est_name, make_estimator in estimator_factories:
    models.append((est_name, make_estimator()))
    for prefix, step_name, scaler_cls in scaler_variants:
        pipeline = Pipeline([(step_name, scaler_cls()), (est_name, make_estimator())])
        models.append((prefix + '_' + est_name, pipeline))

In [11]:
# Build the feature matrices and targets once; they are identical for every model.
X_train = train[feat].values
y_train = train['CPM'].values.ravel()
X_test = test[feat].values
y_test = test['CPM']

# Fit each candidate on the training split and print its mean squared error
# on the test split.
for name, model in models:
    m_fit = model.fit(X_train, y_train)
    y_predict = m_fit.predict(X_test)
    print(name, mean_squared_error(y_test, y_predict))

CART 3634.828469492683
SS_CART 3627.4713469357484
MMS_CART 3625.9236590950495
N_CART 3717.5803258233714
ET 3534.977824103481
SS_ET 3533.761390883856
MMS_ET 3534.0866354899213
N_ET 3548.318327770764
GB 4364.037342221434
SS_GB 4364.03637479731
MMS_GB 4364.031101945555
N_GB 4738.325107505117
