# Generate Dataset
using the raw data in database, we will generate dataset using DatasetGenerator

In [1]:
from datetime import datetime

from dataset.labeler import Labeler
from dataset.dataset_generator import DatasetGenerator
from dataset.feature_transformer import FeatureTransformer
from dataset.session_filterer import SessionFilterer

In [2]:
dataset_generator = DatasetGenerator(
    feature_transformer=FeatureTransformer(),
    labeler=Labeler(),
    session_filterer=SessionFilterer(),

    start_date=datetime(2023, 3, 1), end_date=datetime(2023, 3, 2)
)

X, Y = dataset_generator.generate()

generate dataset...


121546it [00:27, 4490.62it/s] 

generate dataset done.





In [3]:
X.shape, Y.shape

((87492, 38), (87492,))

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import pandas as pd

# Assuming you have arrays X and Y already defined

# Concatenate X and Y along the second axis (column-wise)
concatenated = np.concatenate((X, Y.reshape(-1, 1)), axis=1)

# Create a dataframe from the concatenated array
df = pd.DataFrame(concatenated)

# Optional: If you want to assign column names to the dataframe
column_names = [
            "session_duration",
            "num_actions",
            "num_unique_pages",
            "is_member",
            "is_App",
            "device",
            "content_types",
            "Banner001",
            "TabBar",
            "PopupAD",
            "Home",
            "CustomPage",
            "Sidebar",
            "num_unique_pages",
            "UTMSource",
            "ContentName",
            "viewcategory_count",
            "viewactivity_count",
            "viewalbumdetail_count",
            "viewvideodetail_count",
            "viewarticledetail_count",
            "viewecoupondetail_count",
            "viewpromotiondetail_count",
            "viewproduct_count",
            "search_count",
            "addtocart_count",
            "viewmainpage_count",
            "viewcategory_time",
            "viewactivity_time",
            "viewalbumdetail_time",
            "viewvideodetail_time",
            "viewarticledetail_time",
            "viewecoupondetail_time",
            "viewpromotiondetail_time",
            "viewproduct_time",
            "search_time",
            "addtocart_time",
            "viewmainpage_time"
        ] + ['Y']
df.columns = column_names

df.head()


Unnamed: 0,session_duration,num_actions,num_unique_pages,is_member,is_App,device,content_types,Banner001,TabBar,PopupAD,...,viewalbumdetail_time,viewvideodetail_time,viewarticledetail_time,viewecoupondetail_time,viewpromotiondetail_time,viewproduct_time,search_time,addtocart_time,viewmainpage_time,Y
0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.574067
1,2.157,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.157,1.538117
2,2.929,3.0,2.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.157,1.52525
3,81.492,4.0,2.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.157,0.215867
4,94.438,5.0,2.0,1.0,1.0,1.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,12.946,0.0,0.0,0.0,2.157,0.0001


# (一) 隨機切分train/test (8:2)

In [5]:
y1 = df['Y'].dropna()
X1 = df.drop(['Y'],axis=1) 
train_X1, test_X1, train_y1, test_y1 = train_test_split(X1, y1, test_size=0.2, random_state=3654)

train_X1 = train_X1.reset_index(drop=True)
test_X1 = test_X1.reset_index(drop=True)


print('train shape:', train_X1.shape)
print('test shape:', test_X1.shape)

train shape: (69993, 38)
test shape: (17499, 38)


In [10]:
# 使用 Random Forest 模型預測

model = RandomForestRegressor(n_estimators=10, random_state=3654)
model.fit(train_X1, train_y1)

y_pred1_rf = model.predict(test_X1)

mean_absolute_error(test_y1, y_pred1_rf)

2.6814590317193194

In [11]:
# 使用 Gradient Boosting 模型預測

model = GradientBoostingRegressor(n_estimators=10, random_state=3654)
model.fit(train_X1, train_y1)

y_pred1_gb = model.predict(test_X1)

mean_absolute_error(test_y1, y_pred1_gb)

9.184350507785997

# (二) 依index順序，將前80%做為train，後20%做為test

In [12]:
train_rows = int(0.8 * len(df))
train2 = df[:train_rows]
test2 = df[train_rows:]

print('train shape:', train2.shape)
print('test shape:', test2.shape)

train shape: (69993, 39)
test shape: (17499, 39)


In [13]:
train_y2 = train2['Y'].dropna()
train_X2 = train2.drop(['Y'], axis=1)
test_y2 = test2['Y'].dropna()
test_X2 = test2.drop(['Y'], axis=1)

train_X2 = train_X2.reset_index(drop=True)
test_X2 = test_X2.reset_index(drop=True)

In [14]:
# 使用 Random Forest 模型預測

model = RandomForestRegressor(n_estimators=10, random_state=3654)
model.fit(train_X2, train_y2)

y_pred2_rf = model.predict(test_X2)

mean_absolute_error(test_y2, y_pred2_rf)

7.397581718524592

In [15]:
# 使用 Gradient Boosting 模型預測

model = GradientBoostingRegressor(n_estimators=10, random_state=3654)
model.fit(train_X2, train_y2)

y_pred2_gb = model.predict(test_X2)

mean_absolute_error(test_y2, y_pred2_gb)

7.8052813246990445

# (三) 先抽樣做feature selection，再進行預測

In [16]:
import random

random.seed(6829)

sample_df = df.sample(n=1000)

sam_X = sample_df.drop(['Y'],axis=1)
sam_Y = sample_df['Y']

In [17]:
def RandomForest_evaluation(X,Y):
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size = 0.2, random_state = 3654) 

    # RandomForestClassifier訓練模型
    model = RandomForestRegressor(random_state = 98437)
    model.fit(X_train, Y_train)

    # 預測
    Y_pred = model.predict(X_test)

    # 回傳evaluation_metrics_on_test_set
    return {'MAE': mean_absolute_error(Y_test, Y_pred)}

In [18]:
res = pd.DataFrame(RandomForest_evaluation(sam_X,sam_Y), index=['ALL'])
res

Unnamed: 0,MAE
ALL,6.307934


In [19]:
#RFE
from sklearn.feature_selection import RFE

def rfe_selection( X , Y, k=20):
    X_train, X_test, Y_train, Y_test = train_test_split(
        sam_X, sam_Y, test_size = 0.2, random_state = 3654)
    
    #對特徵標準化
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    model = RandomForestRegressor(random_state=98437)
    rfe = RFE(model)
    rfe = rfe.fit(X_train_scaled, Y_train)

    feature_names = X.columns[rfe.get_support()]
    
    return feature_names

rfe_selection(sam_X,sam_Y, 20)

Index(['session_duration', 'num_actions', 'num_unique_pages', 'device',
       'content_types', 'Home', 'num_unique_pages', 'UTMSource',
       'viewcategory_count', 'viewpromotiondetail_count', 'viewproduct_count',
       'search_count', 'viewmainpage_count', 'viewcategory_time',
       'viewecoupondetail_time', 'viewpromotiondetail_time',
       'viewproduct_time', 'search_time', 'viewmainpage_time'],
      dtype='object')

In [24]:
res = pd.concat([res, pd.DataFrame(RandomForest_evaluation(
        sam_X[rfe_selection(sam_X,sam_Y)], 
        sam_Y), 
        index=['RFE'])])

res

Unnamed: 0,MAE
ALL,6.307934
RFE,6.237156


In [25]:
def feature_importance(X,Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 3654)
    
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = RandomForestRegressor(random_state=98437)
    model = model.fit(X_train_scaled,Y_train)

    return model


def select_features_from_model(model,X):
    
    model = SelectFromModel(model, prefit=True, threshold=0.0015)
    feature_idx = model.get_support()
    feature_names = X.columns[feature_idx]
        
    return feature_names

model = feature_importance(sam_X,sam_Y)
feature_imp_feature_names = select_features_from_model(model,sam_X)
feature_imp_feature_names

Index(['session_duration', 'num_actions', 'num_unique_pages', 'device',
       'content_types', 'Banner001', 'Home', 'CustomPage', 'Sidebar',
       'num_unique_pages', 'UTMSource', 'viewcategory_count',
       'viewpromotiondetail_count', 'viewproduct_count', 'search_count',
       'viewmainpage_count', 'viewcategory_time', 'viewecoupondetail_time',
       'viewpromotiondetail_time', 'viewproduct_time', 'search_time',
       'viewmainpage_time'],
      dtype='object')

In [28]:
res = pd.concat([res, pd.DataFrame(
        RandomForest_evaluation(
            sam_X[feature_imp_feature_names],
            sam_Y), 
            index=['Feature Importance'])])
res

Unnamed: 0,MAE
ALL,6.307934
RFE,6.237156
Feature Importance,6.156794
Feature Importance,6.156794


In [29]:
def feature_importance(X,Y):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 3654)
    
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = RandomForestRegressor(random_state=98437)
    model = model.fit(X_train_scaled,Y_train)

    return model


def select_features_from_model(model,X):
    
    model = SelectFromModel(model, prefit=True, threshold=0.0015)
    feature_idx = model.get_support()
    feature_names = X.columns[feature_idx]
        
    return feature_names

model = feature_importance(sam_X,sam_Y)
feature_imp_feature_names = select_features_from_model(model,sam_X)
feature_imp_feature_names

Index(['session_duration', 'num_actions', 'num_unique_pages', 'device',
       'content_types', 'Banner001', 'Home', 'CustomPage', 'Sidebar',
       'num_unique_pages', 'UTMSource', 'viewcategory_count',
       'viewpromotiondetail_count', 'viewproduct_count', 'search_count',
       'viewmainpage_count', 'viewcategory_time', 'viewecoupondetail_time',
       'viewpromotiondetail_time', 'viewproduct_time', 'search_time',
       'viewmainpage_time'],
      dtype='object')

In [30]:
res = pd.concat([res, pd.DataFrame(
        RandomForest_evaluation(
            sam_X[feature_imp_feature_names],
            sam_Y), 
            index=['Feature Importance'])])
res

Unnamed: 0,MAE
ALL,6.307934
RFE,6.237156
Feature Importance,6.156794
Feature Importance,6.156794
Feature Importance,6.156794


In [31]:
from sklearn.decomposition import PCA

def pca_selection( X , Y, k):
    
    # Split train and test sets
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, 
        Y, 
        test_size = 0.2,  
        random_state = 3654)
    
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    pca = PCA(n_components=k)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    model = RandomForestRegressor(random_state=3654)
    model.fit(X_train_pca, Y_train)

    # Get feature importances from RandomForestRegressor
    feature_importances = model.feature_importances_
    # Sort feature importances in descending order
    sorted_indices = feature_importances.argsort()[::-1]
    # Get the feature names based on sorted indices
    feature_names = X.columns[sorted_indices][:k]

    return feature_names

pca_selected_features = pca_selection(sam_X, sam_Y, 20)
print(pca_selected_features)

Index(['is_App', 'Banner001', 'session_duration', 'ContentName', 'TabBar',
       'num_unique_pages', 'viewactivity_count', 'content_types', 'device',
       'is_member', 'num_actions', 'viewcategory_count',
       'viewalbumdetail_count', 'UTMSource', 'viewvideodetail_count',
       'num_unique_pages', 'Sidebar', 'PopupAD', 'Home', 'CustomPage'],
      dtype='object')


In [32]:
res = pd.concat([res, pd.DataFrame(
        RandomForest_evaluation(
            sam_X[pca_selected_features],
            sam_Y), 
            index=['PCA'])])
res

Unnamed: 0,MAE
ALL,6.307934
RFE,6.237156
Feature Importance,6.156794
Feature Importance,6.156794
Feature Importance,6.156794
PCA,7.071534


In [33]:
# 使用RFE篩選出的變數

y3 = df['Y'].dropna()
X3 = df.drop(['is_member',
              'is_App',        
              'Banner001',               
              'TabBar',
              'PopupAD',                       
              'CustomPage',                
              'Sidebar',                                  
              'ContentName',                            
              'viewactivity_count',           
              'viewalbumdetail_count',        
              'viewvideodetail_count',       
              'viewarticledetail_count',         
              'viewpromotiondetail_count',           
              'search_count',             
              'addtocart_count',                   
              'viewactivity_time',         
              'viewalbumdetail_time',        
              'viewvideodetail_time',
              'viewarticledetail_time',
              'addtocart_time',            
              'Y'],axis=1) 
  

In [34]:
train_X3, test_X3, train_y3, test_y3 = train_test_split(X3, y3, test_size=0.2, random_state=3654)

train_X3 = train_X3.reset_index(drop=True)
test_X3 = test_X3.reset_index(drop=True)


print('train shape:', train_X3.shape)
print('test shape:', test_X3.shape)

train shape: (69993, 18)
test shape: (17499, 18)


In [35]:
# 使用 Random Forest 模型進行預測

model = RandomForestRegressor(n_estimators=10, random_state=3654)
model.fit(train_X3, train_y3)

y_pred3_rf = model.predict(test_X3)

mean_absolute_error(test_y3, y_pred3_rf)

2.9862905842755025

In [36]:
# 使用 Gradient Boosting 模型進行預測

model = GradientBoostingRegressor(n_estimators=10, random_state=3654)
model.fit(train_X3, train_y3)

y_pred3_rf = model.predict(test_X3)

mean_absolute_error(test_y3, y_pred3_rf)

9.383679727473499