In [65]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
import datetime

print(f'Done at {datetime.datetime.now()}')

Done at 2024-10-16 22:47:01.023260


In [66]:
# load data into dataframe
csv_file = 'ad_click_dataset.csv'
data = pd.read_csv(csv_file)

# defining helper function to fill empty values
def fill_vals_weighted(column_name):
    entries = data[column_name].dropna().unique()
    occurences = data[column_name].value_counts()
    return random.choices(entries, weights=[occurences.get(entry) for entry in entries])[0]

def fill_vals(column_name):
    entries = data[column_name].dropna().unique()
    return random.choices(entries)[0]
        

# deal with missing values
cols_to_fill = ['gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day']
for col in cols_to_fill:
    data[col] = data[col].fillna('Unknown')
    
# imputer = KNNImputer(n_neighbors=5)
# data[['age']] = imputer.fit_transform(data['age'])
    
mean_age = data['age'].mean()
# data['age'] = data['age'].fillna(mean_age)

print(f'Done at {datetime.datetime.now()}')

Done at 2024-10-16 22:47:02.501633


In [67]:
data.head(5)

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,Unknown,Desktop,Top,Shopping,Afternoon,1
1,3044,User3044,,Male,Desktop,Top,Unknown,Unknown,1
2,5912,User5912,41.0,Non-Binary,Unknown,Side,Education,Night,1
3,5418,User5418,34.0,Male,Unknown,Unknown,Entertainment,Evening,1
4,9452,User9452,39.0,Non-Binary,Unknown,Unknown,Social Media,Morning,0


In [85]:
# preparing to create training and testing sets
features = data.drop(['id', 'click', 'full_name'], axis=1)
encoded_features = pd.get_dummies(features, columns=['gender', 'device_type', 'ad_position', 'browsing_history', 'time_of_day'])

imputer = KNNImputer(n_neighbors=5)
cols = encoded_features.select_dtypes(include=['boolean']).columns.to_list()
encoded_features[['age']] = imputer.fit_transform(encoded_features[['age']])

target = data['click']

# print(f'Done at {datetime.datetime.now()}')
encoded_features.head(20)

Unnamed: 0,age,gender_Female,gender_Male,gender_Non-Binary,gender_Unknown,device_type_Desktop,device_type_Mobile,device_type_Tablet,device_type_Unknown,ad_position_Bottom,...,browsing_history_Entertainment,browsing_history_News,browsing_history_Shopping,browsing_history_Social Media,browsing_history_Unknown,time_of_day_Afternoon,time_of_day_Evening,time_of_day_Morning,time_of_day_Night,time_of_day_Unknown
0,22.0,False,False,False,True,True,False,False,False,False,...,False,False,True,False,False,True,False,False,False,False
1,40.197363,False,True,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
2,41.0,False,False,True,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,True,False
3,34.0,False,True,False,False,False,False,False,True,False,...,True,False,False,False,False,False,True,False,False,False
4,39.0,False,False,True,False,False,False,False,True,False,...,False,False,False,True,False,False,False,True,False,False
5,40.197363,False,False,True,False,False,False,False,True,True,...,False,False,False,True,False,False,True,False,False,False
6,26.0,True,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
7,40.0,False,True,False,False,False,True,False,False,False,...,False,False,False,False,True,False,True,False,False,False
8,40.197363,False,False,True,False,False,True,False,False,True,...,False,False,False,True,False,False,False,False,False,True
9,40.197363,False,False,False,True,False,False,False,True,True,...,False,False,False,False,False,True,False,False,False,False


In [41]:
# creating training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_features, target, test_size=0.1, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f'Done at {datetime.datetime.now()}')

Done at 2024-10-16 18:32:19.954476


In [49]:
# model creation
model = Sequential([
    Input(shape=X_train_scaled.shape[1]),

    Dense(256),
    BatchNormalization(),
    Activation('relu'),
    Dropout(.2),
    
    Dense(128),
    BatchNormalization(),
    Activation('relu'),
    Dropout(.2),
    
    Dense(64),
    BatchNormalization(),
    Activation('relu'),
    Dropout(.2),
    
    Dense(64),
    BatchNormalization(),
    Activation('relu'),
    
    Dense(1, activation='sigmoid')
])

print(f'Done at {datetime.datetime.now()}')

Done at 2024-10-16 18:38:04.314506


In [50]:
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
    )

# initial_learning_rate = .001
# lr_schedule = ExponentialDecay(
#     initial_learning_rate,
#     decay_steps=10000,
#     decay_rate=0.96,
#     staircase=True)

optimizer = Adam(learning_rate=.0001)

loss = BinaryCrossentropy()

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy']
    )

model.fit(
    X_train_scaled,
    y_train,
    validation_split=.2,
    batch_size=32,
    shuffle=True,
    epochs=150,
    callbacks=[early_stopping],
    verbose=2
    )

print(f'Done at {datetime.datetime.now()}')

Epoch 1/150
225/225 - 2s - loss: 0.6947 - accuracy: 0.5932 - val_loss: 0.6575 - val_accuracy: 0.6256 - 2s/epoch - 9ms/step
Epoch 2/150
225/225 - 1s - loss: 0.6742 - accuracy: 0.6144 - val_loss: 0.6387 - val_accuracy: 0.6611 - 1s/epoch - 5ms/step
Epoch 3/150
225/225 - 1s - loss: 0.6678 - accuracy: 0.6240 - val_loss: 0.6344 - val_accuracy: 0.6600 - 1s/epoch - 5ms/step
Epoch 4/150
225/225 - 1s - loss: 0.6598 - accuracy: 0.6283 - val_loss: 0.6300 - val_accuracy: 0.6678 - 1s/epoch - 5ms/step
Epoch 5/150
225/225 - 1s - loss: 0.6511 - accuracy: 0.6374 - val_loss: 0.6298 - val_accuracy: 0.6622 - 1s/epoch - 5ms/step
Epoch 6/150
225/225 - 1s - loss: 0.6474 - accuracy: 0.6436 - val_loss: 0.6274 - val_accuracy: 0.6633 - 1s/epoch - 5ms/step
Epoch 7/150
225/225 - 1s - loss: 0.6516 - accuracy: 0.6364 - val_loss: 0.6268 - val_accuracy: 0.6589 - 1s/epoch - 5ms/step
Epoch 8/150
225/225 - 1s - loss: 0.6459 - accuracy: 0.6457 - val_loss: 0.6258 - val_accuracy: 0.6572 - 1s/epoch - 5ms/step
Epoch 9/150
225/

In [51]:
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f'Test accuracy; {round(test_accuracy, 4) * 100}%, Test loss: {round(test_loss, 3)}')

print(f'Done at {datetime.datetime.now()}')

Test accuracy; 66.5%, Test loss: 0.628
Done at 2024-10-16 18:39:36.021488
