In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.sparse.linalg import cg
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, roc_curve, roc_auc_score, classification_report

# Data Preprocessing

In [None]:
# Load the raw sample table, coerce each column to its working dtype, then
# switch to the column names used by the rest of the notebook.
raw_sample_df1 = (
    pd.read_csv("/Users/huangfuzixuan/Downloads/raw_sample.csv")
    .astype({
        'user': int,
        'time_stamp': int,
        'adgroup_id': int,
        'pid': str,
        'nonclk': int,
        'clk': int,
    })
    .rename(columns={
        'user': 'userid',
        'time_stamp': 'timestamp',
        'adgroup_id': 'adgroupId',
    })
)
print(raw_sample_df1.dtypes)
print(raw_sample_df1.head())

In [None]:
# Overall click-through rate = sum(clk) / number of rows, i.e. the mean of the
# `clk` column. Series.mean() is vectorised, whereas the builtin sum() walks the
# Series one element at a time in Python.
print("Total click through rate:", raw_sample_df1['clk'].mean())

In [None]:
# Replace the string `pid` column with one indicator column per distinct value
# (named `pid_<value>`); all other columns are carried over unchanged.
raw_sample_df1 = pd.get_dummies(data=raw_sample_df1, columns=['pid'])
print(raw_sample_df1.head(n=5))

In [None]:
# Load the ad feature table, cast its id / price columns to numeric dtypes and
# rename them to the camelCase names used when merging with the sample table.
_ad_feature_df = pd.read_csv("/Users/huangfuzixuan/Downloads/ad_feature.csv", header=0)

# Missing brands become the sentinel -1. The result is assigned back to the
# column: calling `replace(..., inplace=True)` on a column selection is a
# chained in-place update, which pandas deprecates and which does nothing under
# copy-on-write -- the integer cast below would then fail on the leftover NaN.
_ad_feature_df['brand'] = _ad_feature_df['brand'].fillna(-1)

_ad_feature_df = (
    _ad_feature_df
    .astype({
        'adgroup_id': int,
        'cate_id': int,
        'campaign_id': int,
        'customer': int,
        'brand': int,
        'price': float,
    })
    .rename(columns={
        'adgroup_id': 'adgroupId',
        'cate_id': 'cateId',
        'campaign_id': 'campaignId',
        'customer': 'customerId',
        'brand': 'brandId',
    })
)
print(_ad_feature_df.dtypes)
print(_ad_feature_df.head())

In [None]:
# Load the user profile table without the two columns that are not cast below.
# The trailing space in 'new_user_class_level ' is part of the label being
# dropped -- presumably it matches the CSV header exactly; verify against the file.
user_profile_df = (
    pd.read_csv("/Users/huangfuzixuan/Downloads/user_profile.csv")
    .drop(columns=['pvalue_level', 'new_user_class_level '])
)
print(user_profile_df.info())
print(user_profile_df.head())

# Every remaining profile field listed here is stored as an integer.
dtype_casts = dict.fromkeys(
    ['userid', 'final_gender_code', 'age_level', 'shopping_level', 'occupation'],
    'int',
)
user_profile_df = user_profile_df.astype(dtype_casts)
print(user_profile_df.info())
print(user_profile_df.head())

In [None]:
# Re-read the user profile from disk and keep only rows with no missing value.
# This rebinds `user_profile_df`, so any frame previously held under that name
# (including its dropped columns and dtype casts) is replaced by this one.
user_profile_df = pd.read_csv("/Users/huangfuzixuan/Downloads/user_profile.csv").dropna()
print(user_profile_df.info())
print(user_profile_df.head())

In [None]:
# Attach ad features to each sample row (key: adgroupId), then user profile
# fields (key: userid). Both joins are outer joins, so keys present on only one
# side are kept and show up with NaN in the other side's columns.
merged_df = raw_sample_df1.merge(_ad_feature_df, how='outer', on='adgroupId')
datasets = merged_df.merge(user_profile_df, how='outer', on='userid')
print(datasets.info())
print(len(datasets))

In [None]:
# Keep only fully populated rows and store the `pid_430539_1007` indicator as an
# integer. Doing both in one expression yields an independent frame, so this and
# later column assignments on `datasets_cleaned` do not raise
# SettingWithCopyWarning (which assigning onto a bare `dropna()` result does).
datasets_cleaned = datasets.dropna().astype({'pid_430539_1007': 'int'})
datasets_cleaned

In [None]:
# CTR calculation: for every adgroupId, the sum of `clk` divided by the number
# of rows for that ad, written back onto each row of `datasets_cleaned`.
# NOTE(review): the rate is computed over all rows before any train/test split
# is made -- confirm that is intended where `ctr` is later used as a target.
grouped = datasets_cleaned.groupby('adgroupId')['clk'].agg(['sum', 'count'])
grouped = grouped.assign(ctr=grouped['sum'].div(grouped['count']))
datasets_cleaned['ctr'] = datasets_cleaned['adgroupId'].map(grouped['ctr'])

datasets_cleaned

## Feature Engineering

In [None]:
# Interpret the numeric `timestamp` values as a count of seconds (unit='s') and
# store them as pandas datetimes.
datasets_cleaned['timestamp'] = datasets_cleaned['timestamp'].pipe(pd.to_datetime, unit='s')
print(datasets_cleaned)

In [None]:
import numpy as np
from datetime import datetime
import pickle
def get_time_of_day(hour):
    """Map an hour value to a coarse part-of-day code.

    Args:
        hour: Hour to classify; compared numerically against the bucket limits.

    Returns:
        1 if ``hour <= 12``, 2 if ``12 < hour <= 18``, otherwise 3.
    """
    # (code, inclusive upper bound) pairs, checked in ascending order.
    for code, upper_bound in ((1, 12), (2, 18)):
        if hour <= upper_bound:
            return code
    return 3
# Calendar features derived from the datetime `timestamp` column. The `.dt`
# accessor computes them column-wise instead of calling a Python lambda on
# every Timestamp object.
# time_weekday: 1 for Monday-Friday, 0 for Saturday/Sunday
# (`dayofweek` counts Monday=0 ... Sunday=6, so weekdays are < 5).
datasets_cleaned['time_weekday'] = (datasets_cleaned['timestamp'].dt.dayofweek < 5).astype(int)
# time_of_day: part-of-day code from get_time_of_day, applied to the hour only.
datasets_cleaned['time_of_day'] = datasets_cleaned['timestamp'].dt.hour.map(get_time_of_day)
datasets_cleaned

In [None]:
from sklearn.preprocessing import OneHotEncoder


# One-hot encode the two engineered time features and put the indicator columns
# into `datasets_cleaned` in place of the integer-coded originals.
time_feature_cols = ['time_weekday', 'time_of_day']

# Fit and transform with a single encoder on the frame that actually holds
# these columns: `test_df_lr` is not defined anywhere in this notebook, and an
# encoder must be fitted before `transform` can be called on it.
lr_onehot = OneHotEncoder()
test_trans = lr_onehot.fit_transform(datasets_cleaned[time_feature_cols])
test_trans = pd.DataFrame(
    test_trans.toarray(),
    columns=lr_onehot.get_feature_names_out(),
    # Reuse the source index: after dropna() it is no longer guaranteed to be
    # 0..n-1, and a default RangeIndex would misalign rows in the concat below.
    index=datasets_cleaned.index,
)
datasets_cleaned = pd.concat([datasets_cleaned.drop(columns=time_feature_cols), test_trans], axis=1)


In [None]:
#one hot encoding
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


# Columns handed to get_dummies, then swapped back into the main frame.
ctr_feature_cols = ['cateId', 'cms_segid', 'cms_group_id', 'final_gender_code', 'pid_430539_1007']

X_ctr = datasets_cleaned[ctr_feature_cols]
# NOTE(review): without `columns=`, get_dummies only expands object / string /
# category columns and passes numeric ones through unchanged, so if these
# columns are all numeric this step is a pass-through. Later cells still select
# these columns by their original names, so forcing the encoding with
# `columns=` would break them -- confirm which behaviour is intended.
X_ctr_encoded = pd.get_dummies(X_ctr)

# Replace the selected columns with their (possibly encoded) counterparts. The
# earlier form of this line had unbalanced brackets and did not parse.
datasets_cleaned_encoded = pd.concat(
    [datasets_cleaned.drop(columns=ctr_feature_cols), X_ctr_encoded],
    axis=1,
)
print(datasets_cleaned_encoded.head())
datasets_cleaned = datasets_cleaned_encoded


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Calendar date of every row, used as the grouping key below.
datasets_cleaned['date'] = pd.to_datetime(datasets_cleaned['timestamp']).dt.date

# Total clicks per calendar day, printed together with the weekday name.
daily_clicks_count = datasets_cleaned.groupby('date')['clk'].sum()
for day, clicks in daily_clicks_count.items():
    weekday = pd.to_datetime(day).strftime('%A')
    print(f"{day}: {clicks} clicks ({weekday})")

# Build the plotting frame once and keep it in date order. (Re-creating the
# frame after sorting, as was done before, throws the sort away.)
daily_clicks_df = (
    pd.DataFrame({'date': daily_clicks_count.index, 'clicks': daily_clicks_count.values})
    .sort_values(by='date')
    .reset_index(drop=True)
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(daily_clicks_df['date'], daily_clicks_df['clicks'], marker='o')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Clicks')
ax.set_title('Clicks per Day')
ax.tick_params(axis='x', labelrotation=45)  # slanted labels so dates do not overlap
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import pandas as pd

# No-op when the column is already datetime; keeps the cell usable on raw input.
datasets_cleaned['timestamp'] = pd.to_datetime(datasets_cleaned['timestamp'])

# Index of the 2-hour bucket each row falls in (0 -> hours 0-2, 1 -> 2-4, ...).
# Only this index form is stored: the tick labels below multiply it back by 2,
# so an earlier assignment of the bucket's start hour was dead code.
datasets_cleaned['time_interval'] = datasets_cleaned['timestamp'].dt.hour // 2

# Total clicks per 2-hour bucket.
clicks_per_interval = datasets_cleaned.groupby('time_interval')['clk'].sum()

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(clicks_per_interval.index, clicks_per_interval.values)
ax.set_xlabel('Time Interval')
ax.set_ylabel('Number of Clicks')
ax.set_title('Clicks per Time Interval')
# Label each bar with the hour range it covers, e.g. bucket 3 -> "6-8".
ax.set_xticks(clicks_per_interval.index)
ax.set_xticklabels(['{}-{}'.format(i*2, (i+1)*2) for i in clicks_per_interval.index])
plt.show()

In [None]:

# Per category: mean of the row-level `ctr` values and number of rows, then the
# ten categories with the most rows.
grouped = (
    datasets_cleaned
    .groupby('cateId')['ctr']
    .agg(average_ctr='mean', count='count')
    .reset_index()
)
top10_cateId_ctr_count = grouped.nlargest(n=10, columns='count')

print(top10_cateId_ctr_count)

# Random Forest

In [None]:
# Predictor columns for the click model.
features = ['cms_segid', 'cms_group_id', 'final_gender_code', 'age_level', 'shopping_level', 'occupation', 'pid_430539_1007', 'pvalue_level', 'price']

# `train_set` / `test_set` are not created anywhere earlier in this notebook, so
# the cell raised NameError on a fresh kernel. Build them here.
# NOTE(review): a 70/30 random split with the same seed as the linear-regression
# section is assumed -- replace it if a different split (e.g. by date) was intended.
train_set, test_set = train_test_split(datasets_cleaned, test_size=0.3, random_state=42)

X_train = train_set[features]
y_train = train_set['clk']
X_test = test_set[features]
y_test = test_set['clk']

# A regressor fitted on `clk` produces a numeric score per row; the next cell
# evaluates it with MSE and R-squared.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [None]:

from sklearn.metrics import mean_squared_error, r2_score

# Hold-out error of the random forest predictions against the true `clk` values.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
r2 = r2_score(y_true=y_test, y_pred=y_pred_rf)

print("Mean Squared Error:", mse)
print("R-squared:", r2)

In [None]:
#cross validation
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np


# Candidate `max_features` values: every integer from 1 to the number of
# predictor columns in X_train. Values above the column count are rejected or
# redundant (depending on the scikit-learn version), so a fixed 1..20 range
# would spend 5-fold fits of 500 trees on candidates that add nothing.
n_predictors = X_train.shape[1]
grid_values = {'max_features': list(range(1, n_predictors + 1)),
               'min_samples_leaf': [5],
               'n_estimators': [500],
               'random_state': [88]}

tic = time.time()

# 5-fold cross-validated search over the grid; only `max_features` varies.
rf = RandomForestRegressor()
rf_cv = GridSearchCV(rf, param_grid=grid_values, cv=5)
rf_cv.fit(X_train, y_train)

toc = time.time()
# Report the wall-clock cost that tic/toc measure.
print(f"Grid search time: {toc - tic:.1f} s")

# Linear Regression

In [None]:
# Ordinary least squares of the per-ad click-through rate on ad / user features.
# NOTE(review): cateId is an identifier but enters the model as a plain number
# here -- confirm that is intended.
X_ctr = datasets_cleaned[['cateId', 'price', 'cms_segid', 'cms_group_id', 'final_gender_code', 'age_level', 'shopping_level', 'pid_430539_1007', 'pvalue_level']]
y_ctr = datasets_cleaned['ctr']

X_train_ctr, X_test_ctr, y_train_ctr, y_test_ctr = train_test_split(X_ctr, y_ctr, test_size=0.3, random_state=42)

# statsmodels does not add an intercept on its own. The constant column must be
# present in BOTH design matrices: the model is fitted with it, so predicting
# on a test matrix without it fails with a column-count mismatch.
# has_constant='add' forces the column even if a feature happens to be constant
# within one of the splits.
X_train_ctr = sm.add_constant(X_train_ctr, has_constant='add')
X_test_ctr = sm.add_constant(X_test_ctr, has_constant='add')

lr_model = sm.OLS(y_train_ctr, X_train_ctr).fit()
print(lr_model.summary())

y_pred_ctr = lr_model.predict(X_test_ctr)