In [132]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

date : all data are aggregated by date

shown : số lượng quảng cáo được hiển thị vào một ngày nhất định trên toàn bộ web. Các ấn tượng là miễn phí. Nghĩa là, các công ty chỉ trả tiền nếu người dùng nhấp vào quảng cáo, không phải để hiển thị

clicked : số lần nhấp vào quảng cáo. Đây là những gì các công ty phải trả cho. Bằng cách nhấp vào quảng cáo, người dùng được đưa đến trang web.

converted : số lượng chuyển đổi trên trang web đến từ quảng cáo. Để được tính, một chuyển đổi phải xảy ra cùng ngày với lần nhấp vào quảng cáo.

avg_cost_per_click :trung bình, chi phí cho mỗi nhấp chuột đó.

total_revenue : doanh thu đến từ các chuyển đổi là bao nhiêu

ad : chúng tôi có một số nhóm quảng cáo khác nhau. Điều này cho biết nhóm quảng cáo nào chúng tôi đang xem xét

In [40]:
#Dữ liệu tháng 10 và 11
data = pd.read_csv('ad_table.csv',parse_dates=['date'])
data.head(5)

Unnamed: 0,date,shown,clicked,converted,avg_cost_per_click,total_revenue,ad
0,2015-10-01,65877,2339,43,0.9,641.62,ad_group_1
1,2015-10-02,65100,2498,38,0.94,756.37,ad_group_1
2,2015-10-03,70658,2313,49,0.86,970.9,ad_group_1
3,2015-10-04,69809,2833,51,1.01,907.39,ad_group_1
4,2015-10-05,68186,2696,41,1.0,879.45,ad_group_1


In [41]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2115 entries, 0 to 2114
Data columns (total 7 columns):
date                  2115 non-null datetime64[ns]
shown                 2115 non-null int64
clicked               2115 non-null int64
converted             2115 non-null int64
avg_cost_per_click    2115 non-null float64
total_revenue         2115 non-null float64
ad                    2115 non-null object
dtypes: datetime64[ns](1), float64(2), int64(3), object(1)
memory usage: 115.8+ KB


In [42]:
data.describe()

Unnamed: 0,shown,clicked,converted,avg_cost_per_click,total_revenue
count,2115.0,2115.0,2115.0,2115.0,2115.0
mean,68299.844444,3056.077069,126.453901,1.373749,1966.517589
std,48884.821409,3783.579969,233.420826,0.754331,3942.018757
min,0.0,0.0,0.0,0.0,-200.15
25%,28030.5,744.0,18.0,0.76,235.47
50%,54029.0,1392.0,41.0,1.4,553.3
75%,97314.5,3366.5,103.0,1.92,1611.495
max,192507.0,20848.0,1578.0,4.19,39623.71


In [43]:
data['profit'] = data['total_revenue'] - (data['avg_cost_per_click']*data['clicked'])
data['profit_per_shown'] = data['profit'] /  data['shown']


data.head()

Unnamed: 0,date,shown,clicked,converted,avg_cost_per_click,total_revenue,ad,profit,profit_per_shown
0,2015-10-01,65877,2339,43,0.9,641.62,ad_group_1,-1463.48,-0.022215
1,2015-10-02,65100,2498,38,0.94,756.37,ad_group_1,-1591.75,-0.024451
2,2015-10-03,70658,2313,49,0.86,970.9,ad_group_1,-1018.28,-0.014411
3,2015-10-04,69809,2833,51,1.01,907.39,ad_group_1,-1953.94,-0.02799
4,2015-10-05,68186,2696,41,1.0,879.45,ad_group_1,-1816.55,-0.026641


In [53]:
ad_groups_with_profit = data[['ad', 'profit', 'profit_per_shown', 'converted', 'clicked']].groupby('ad').sum().reset_index()
ad_groups_with_profit = ad_groups_with_profit.sort_values(by='profit', ascending=False)

#Tìm các nhóm có profit dương
ad_groups_with_profit_positive = ad_groups_with_profit[ad_groups_with_profit['profit'] > 0]

#Tìm các nhóm có profit âm
ad_groups_with_profit_negative = ad_groups_with_profit[ad_groups_with_profit['profit'] < 0]
#
ad_groups_with_profit_positive

Unnamed: 0,ad,profit,profit_per_shown,converted,clicked
11,ad_group_2,54456.44,1.072065,6249,61912
24,ad_group_31,40265.93,0.308827,3132,36033
7,ad_group_16,32179.55,1.046841,3702,41764
5,ad_group_14,7252.95,0.862802,1090,13167
19,ad_group_27,2528.73,0.038567,884,28656


# For each group, predict how many ads will be shown on Dec, 15 (assume each ad group keeps following its trend).

In [133]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_error(ytrue, ypred))

In [62]:
import datetime

In [66]:
def convert_str_to_date(date_as_string):
    date_time_obj = datetime.datetime.strptime(date_as_string, '%Y-%m-%d')
    return date_time_obj.date()

In [71]:
#Thêm month
data['month'] = data['date'].apply(lambda row: row.month)
data

Unnamed: 0,date,shown,clicked,converted,avg_cost_per_click,total_revenue,ad,profit,profit_per_shown,month
0,2015-10-01,65877,2339,43,0.90,641.62,ad_group_1,-1463.48,-0.022215,10
1,2015-10-02,65100,2498,38,0.94,756.37,ad_group_1,-1591.75,-0.024451,10
2,2015-10-03,70658,2313,49,0.86,970.90,ad_group_1,-1018.28,-0.014411,10
3,2015-10-04,69809,2833,51,1.01,907.39,ad_group_1,-1953.94,-0.027990,10
4,2015-10-05,68186,2696,41,1.00,879.45,ad_group_1,-1816.55,-0.026641,10
...,...,...,...,...,...,...,...,...,...,...
2110,2015-11-18,78111,1326,37,1.01,553.36,ad_group_40,-785.90,-0.010061,11
2111,2015-11-19,79857,1413,34,1.00,840.72,ad_group_40,-572.28,-0.007166,11
2112,2015-11-20,75695,1332,35,1.09,468.95,ad_group_40,-982.93,-0.012985,11
2113,2015-11-21,73761,1336,36,1.11,437.03,ad_group_40,-1045.93,-0.014180,11


In [85]:
train = data[data['month'] == 10][data['ad'] == 'ad_group_2']
valid = data[data['month'] == 11][data['ad'] == 'ad_group_2']


  """Entry point for launching an IPython kernel.
  


In [86]:
train

Unnamed: 0,date,shown,clicked,converted,avg_cost_per_click,total_revenue,ad,profit,profit_per_shown,month
53,2015-10-01,50988,1267,109,0.7,1704.28,ad_group_2,817.38,0.016031,10
54,2015-10-02,51934,1035,140,0.58,918.21,ad_group_2,317.91,0.006121,10
55,2015-10-03,53386,1041,123,0.56,1548.48,ad_group_2,965.52,0.018086,10
56,2015-10-04,52008,1521,127,0.76,1911.39,ad_group_2,755.43,0.014525,10
57,2015-10-05,48766,1153,108,0.65,1403.14,ad_group_2,653.69,0.013405,10
58,2015-10-06,53933,1331,136,0.69,1901.33,ad_group_2,982.94,0.018225,10
59,2015-10-07,51941,1235,118,0.67,1894.71,ad_group_2,1067.26,0.020548,10
60,2015-10-08,50748,1048,123,0.59,3005.56,ad_group_2,2387.24,0.047041,10
61,2015-10-09,50312,1066,125,0.61,2250.06,ad_group_2,1599.8,0.031798,10
62,2015-10-10,53533,1075,119,0.6,1992.92,ad_group_2,1347.92,0.025179,10


In [151]:
#feature_to_predict = 'avg_cost_per_click'
feature_to_predict = 'shown'
features_to_drop = [feature_to_predict, 'profit', 'date', 'ad', 'month', 'profit_per_shown']

Y_train = train[feature_to_predict]#.apply(lambda row: row if row > 0 else 0)
X_train = train.drop(features_to_drop, axis=1)

#X_train['profit_per_shown'] = X_train['profit_per_shown'].apply(lambda row: row if row > 0 else 0)
X_train

Unnamed: 0,clicked,converted,avg_cost_per_click,total_revenue
53,1267,109,0.7,1704.28
54,1035,140,0.58,918.21
55,1041,123,0.56,1548.48
56,1521,127,0.76,1911.39
57,1153,108,0.65,1403.14
58,1331,136,0.69,1901.33
59,1235,118,0.67,1894.71
60,1048,123,0.59,3005.56
61,1066,125,0.61,2250.06
62,1075,119,0.6,1992.92


In [152]:
Y_valid = valid[feature_to_predict]#.apply(lambda row: row if row > 0 else 0)
X_valid = valid.drop(features_to_drop, axis=1)

#X_valid['profit_per_shown'] = X_valid['profit_per_shown']#.apply(lambda row: row if row > 0 else 0)
X_valid

Unnamed: 0,clicked,converted,avg_cost_per_click,total_revenue
84,1217,101,0.69,2113.0
85,1306,113,0.67,778.67
86,1107,111,0.6,2167.92
87,847,103,0.51,2265.14
88,1292,113,0.67,1611.21
89,908,97,0.55,2535.77
90,1476,118,0.74,2116.45
91,1334,114,0.68,2135.71
92,888,103,0.55,1480.98
93,1022,111,0.6,662.71


In [153]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [154]:
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)


model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
model.fit(X_train_scaled, Y_train)

Y_predict = model.predict(X_valid_scaled)

error = rmsle(Y_valid, Y_predict)

print('Error %.5f' % (error))

Error 1717.60562


In [155]:
model = LGBMRegressor(n_estimators=1000, learning_rate=0.001)
model.fit(X_train, (Y_train))

Y_predict = (model.predict(X_valid))
error = rmsle(Y_valid, Y_predict)

print('Error %.5f' % (error))

Error 2022.09563


In [156]:
from xgboost import XGBRegressor 

xgb = XGBRegressor()
xgb.fit(X_train, Y_train)

Y_predict = model.predict(X_valid)

error = rmsle(Y_valid, Y_predict)
print('Error %.5f' % (error))

Error 2022.09563
