In [1]:
import pandas as pd
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib
pd.set_option('display.max_columns', None)

print("Libraries imported")

with open('Adobe Devcraft PS/user.profile.tags.txt') as f:
    tag_dict = {}
    for idx, line in enumerate(f.readlines()):
        tag_dict[line[:5]] = line[6:-1]

column_names = [
    "BidID", "Timestamp", "Logtype", "VisitorID", "User-Agent", "IP", "Region", "City",
    "Adexchange", "Domain", "URL", "AnonymousURLID", "AdslotID", "Adslotwidth",
    "Adslotheight", "Adslotvisibility", "Adslotformat", "Adslotfloorprice",
    "CreativeID", "Biddingprice", "Payingprice", "KeypageURL", "AdvertiserID", "User_tag"
]

Libraries imported


In [2]:
imp = pd.read_csv("dataset_combined/imp.csv", low_memory=False)
print(f"Imporession txt shape : {imp.shape}")

Imporession txt shape : (12237087, 24)


In [3]:
imp['Timestamp'] = (imp['Timestamp']//1000000)%1000

In [5]:
numerical_cols = imp.select_dtypes(include=['number']).columns
for col in ['Timestamp', 'Adslotwidth', 'Adslotheight']:
  print(col, imp['Payingprice'].corr(imp[col]))

Timestamp -0.036426937528005576
Adslotwidth -0.12513953622796092
Adslotheight 0.08271497268468539


In [7]:
for col in ['Adexchange', 'Adslotvisibility', 'Adslotformat']:
    temp_df = imp.groupby(col)
    print(temp_df['Payingprice'].mean())
# print(imp.groupby('Adexchange')['Payingprice'].mean())
# # print(imp.groupby('Region')['Payingprice'].mean()) no meaning
# # print(imp.groupby('City')['Payingprice'].mean()) no meaning
# print(imp.groupby('Adslotvisibility')['Payingprice'].mean())
# print(imp.groupby('Adslotformat')['Payingprice'].mean())


Adexchange
1    95.698898
2    76.994395
3    64.259066
Name: Payingprice, dtype: float64
Adslotvisibility
0      75.726989
1      99.723296
2      73.108124
255    87.272547
Name: Payingprice, dtype: float64
Adslotformat
0     70.296212
1     94.367957
5    156.994856
Name: Payingprice, dtype: float64


In [3]:
# print(str(imp['Timestamp'].max())[4:], str(imp['Timestamp'].min())[4:])
# imp['Timestamp'] = imp['Timestamp']

In [8]:
imp = imp.drop(['Logtype', 'VisitorID', 'User-Agent', 'AdslotID', 'IP', 'Domain', 'URL', 'AnonymousURLID', 'Adslotfloorprice', 'Biddingprice', 'KeypageURL', 'Timestamp'], axis=1)
print("OHE encoding user tags....")
imp['User_tag'] = imp['User_tag'].str.split(',')
imp = imp.explode('User_tag')
imp = pd.get_dummies(imp, columns=['User_tag'])
imp = imp.groupby('BidID', as_index=False).max()
column_names = list(imp.columns)
for i, col in enumerate(column_names):
    if(not col[:4] == 'User'):
        continue
    column_names[i] = tag_dict[col[-5:]]
imp.columns = column_names
for key in tag_dict:
    if(tag_dict[key] not in column_names):
        imp[tag_dict[key]] = False

OHE encoding user tags....


In [9]:
for key in tag_dict:
    temp_df = imp.groupby(tag_dict[key])
    print(temp_df['Payingprice'].mean())

Long-term interest/news
False    82.282857
True     75.719674
Name: Payingprice, dtype: float64
Long-term interest/eduation
False    77.599538
True     80.325586
Name: Payingprice, dtype: float64
Long-term interest/automobile
False    78.537932
True     77.072253
Name: Payingprice, dtype: float64
Long-term interest/real estate
False    78.131423
True     80.014302
Name: Payingprice, dtype: float64
Long-term interest/IT
False    78.590153
True     76.292012
Name: Payingprice, dtype: float64
Long-term interest/electronic game
False    78.927685
True     75.333591
Name: Payingprice, dtype: float64
Long-term interest/fashion
False    79.010009
True     75.373778
Name: Payingprice, dtype: float64
Long-term interest/entertainment
False    80.92541
True     76.76983
Name: Payingprice, dtype: float64
Long-term interest/luxury
False    78.274599
True     76.898752
Name: Payingprice, dtype: float64
Long-term interest/home and lifestyle
False    78.197099
True     80.989020
Name: Payingprice, dty

In [5]:
# label encoding
print("Label encoding categorical features...")
label_encoders = {}
imp = imp.drop(columns=['BidID'], axis=1)
categorical_features = ['Region', 'City', 'Adslotvisibility', 'Adslotformat', 'CreativeID', 'AdvertiserID', 'Adexchange']
for col in categorical_features:
    le = LabelEncoder()
    imp[col] = le.fit_transform(imp[col])
    label_encoders[col] = le

boolean_features = [col for col in imp.columns if imp[col].dtype == 'bool']
imp[boolean_features] = imp[boolean_features].astype(int)

Label encoding categorical features...


In [6]:
imp.columns

Index(['Region', 'City', 'Adexchange', 'Adslotwidth', 'Adslotheight',
       'Adslotvisibility', 'Adslotformat', 'CreativeID', 'Payingprice',
       'AdvertiserID', 'Long-term interest/news',
       'Long-term interest/eduation', 'Long-term interest/automobile',
       'Long-term interest/real estate', 'Long-term interest/IT',
       'Long-term interest/electronic game', 'Long-term interest/fashion',
       'Long-term interest/entertainment', 'Long-term interest/luxury',
       'Long-term interest/home and lifestyle', 'Long-term interest/health',
       'Long-term interest/food', 'Long-term interest/divine',
       'Long-term interest/motherhood&parenting', 'Long-term interest/sports',
       'Long-term interest/travel&outdoors', 'Long-term interest/social',
       'Demographic/gender/male', 'Demographic/gender/famale',
       'In-market/3c product', 'In-market/appliances',
       'In-market/clothing、shoes&bags', 'In-market/Beauty& Personal Care',
       'In-market/household&home impro

In [7]:
from sklearn.metrics import mean_squared_error, r2_score
X = imp.drop(columns=['Payingprice'], axis=1)
y = imp['Payingprice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
price_model = xgb.XGBClassifier(objective='reg:squarederror', use_label_encoder=False, n_estimators=30, max_depth=6, learning_rate=0.08, early_stopping_rounds=5)
price_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)



[0]	validation_0-mlogloss:4.32836
[1]	validation_0-mlogloss:4.20235
[2]	validation_0-mlogloss:4.11586
[3]	validation_0-mlogloss:4.04858
[4]	validation_0-mlogloss:3.99299
[5]	validation_0-mlogloss:3.94537
[6]	validation_0-mlogloss:3.90382
[7]	validation_0-mlogloss:3.86747
[8]	validation_0-mlogloss:3.83505
[9]	validation_0-mlogloss:3.80570
[10]	validation_0-mlogloss:3.77935
[11]	validation_0-mlogloss:3.75538
[12]	validation_0-mlogloss:3.73347
[13]	validation_0-mlogloss:3.71343
[14]	validation_0-mlogloss:3.69497
[15]	validation_0-mlogloss:3.67794
[16]	validation_0-mlogloss:3.66208
[17]	validation_0-mlogloss:3.64737
[18]	validation_0-mlogloss:3.63341
[19]	validation_0-mlogloss:3.62047
[20]	validation_0-mlogloss:3.60863
[21]	validation_0-mlogloss:3.59744
[22]	validation_0-mlogloss:3.58703
[23]	validation_0-mlogloss:3.57724
[24]	validation_0-mlogloss:3.56813
[25]	validation_0-mlogloss:3.55946
[26]	validation_0-mlogloss:3.55135
[27]	validation_0-mlogloss:3.54377
[28]	validation_0-mlogloss:3.5

In [8]:
y_pred = price_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

test = pd.concat([X_test, y_test], axis=1)
sample = test.sample(20)
print(price_model.predict(sample.drop(columns=['Payingprice'], axis=1)), '\n', list(sample['Payingprice']))


Mean Squared Error: 4637.961345600284
R-squared: -0.29701351899306316
[ 49  85  30  44  73  70  85  80 127  89  44  73  16  20  89  47  70  50
  84  70] 
 [112, 120, 50, 44, 73, 80, 65, 80, 144, 192, 186, 84, 16, 20, 119, 240, 20, 50, 84, 70]
