## Import training libraries

In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense
from keras.models import model_from_json
from keras.optimizers import SGD
from keras.callbacks import LearningRateScheduler, EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

## Load training and test data along with user attributes

In [4]:
df_train = pd.read_csv('../input/student-shopee-code-league-marketing-analytics/train.csv')
df_test = pd.read_csv('../input/student-shopee-code-league-marketing-analytics/test.csv')
df_users = pd.read_csv('../input/student-shopee-code-league-marketing-analytics/users.csv')

Fill in missing value with NAN

In [5]:
df_train = df_train.fillna(-1)
df_users = df_users.fillna(-1)
df_test = df_test.fillna(-1)

In [9]:
df_users.head()

Unnamed: 0,user_id,attr_1,attr_2,attr_3,age,domain
0,0,-1.0,1.0,0.0,-1.0,@gmail.com
1,1,1.0,1.0,2.0,50.0,@gmail.com
2,2,-1.0,1.0,0.0,-1.0,other
3,3,-1.0,1.0,0.0,-1.0,@gmail.com
4,4,1.0,1.0,2.0,33.0,@gmail.com


Load user attributes in a dictionary

In [8]:
user_dict = {}
for row in df_users.itertuples():
    user_dict[row.user_id] = (row.attr_1,row.attr_2,row.attr_3,row.age,row.domain)

In [11]:
user_dict

{0: (-1.0, 1.0, 0.0, -1.0, '@gmail.com'),
 1: (1.0, 1.0, 2.0, 50.0, '@gmail.com'),
 2: (-1.0, 1.0, 0.0, -1.0, 'other'),
 3: (-1.0, 1.0, 0.0, -1.0, '@gmail.com'),
 4: (1.0, 1.0, 2.0, 33.0, '@gmail.com'),
 5: (1.0, 1.0, 1.0, 30.0, '@gmail.com'),
 6: (1.0, 1.0, 2.0, 32.0, '@gmail.com'),
 7: (1.0, 1.0, 1.0, 36.0, '@gmail.com'),
 8: (1.0, 1.0, 1.0, 43.0, '@yahoo.com'),
 9: (-1.0, 1.0, 0.0, -1.0, '@gmail.com'),
 10: (1.0, 1.0, 2.0, 33.0, '@gmail.com'),
 11: (1.0, 1.0, 2.0, 32.0, '@gmail.com'),
 12: (1.0, 1.0, 2.0, 25.0, '@hotmail.com'),
 13: (-1.0, 1.0, 0.0, -1.0, '@hotmail.com'),
 14: (1.0, 1.0, 4.0, 36.0, '@gmail.com'),
 15: (1.0, 1.0, 3.0, 38.0, '@gmail.com'),
 16: (-1.0, 1.0, 0.0, -1.0, '@yahoo.com'),
 17: (1.0, 1.0, 2.0, 45.0, '@gmail.com'),
 18: (1.0, 1.0, 2.0, 28.0, '@gmail.com'),
 19: (-1.0, 1.0, 0.0, -1.0, '@gmail.com'),
 20: (1.0, 1.0, 1.0, 43.0, '@gmail.com'),
 21: (1.0, 1.0, 2.0, 44.0, '@hotmail.com'),
 22: (1.0, 1.0, 2.0, 39.0, '@yahoo.com'),
 23: (1.0, 1.0, 3.0, 22.0, '@gmail.c

Get user feature in user dictionary

In [13]:
def get_user_feature(user_id,i):
    if user_id in user_dict:
        return user_dict[user_id][i]
    else:
        return -2

In [14]:
def get_user_feature_age(user_id,i=3):
    if user_id in user_dict:
        if int(user_dict[user_id][i])<0:
            return 31.4
        return user_dict[user_id][i]
    else:
        return -2

Populate dataframe with valid integer function, if "never login" -> -1

In [15]:
def fill_ints(data):
    if isinstance(data,int):
        return data
    if data.isnumeric():
        return data
    else:
        return -1

Convert time frame to categorical data function

In [16]:
def time_to_categorical_series(df,type="hour"):
    if type == "hour":
        return df['date_time'].dt.hour.astype('category')
    elif type == "dayofweek":
        return df['date_time'].dt.dayofweek.astype('category')
    elif type == "month":
        return df['date_time'].dt.month.astype('category')
    else:
        return None
    
def time_to_categorical(df):
    hour_series = time_to_categorical_series(df,type='hour')
    dayofweek_series = time_to_categorical_series(df,type='dayofweek')
    month_series = time_to_categorical_series(df,type='month')

    df['hour'] = hour_series
    df['dayofweek'] = dayofweek_series
    df['month'] = month_series

List of categorical features and numerical features

In [17]:
cat_features = ['country_code','dayofweek','month','domain','attr3']
numerical_features = [ 'subject_line_length',
       'last_open_day', 'last_login_day', 'last_checkout_day',
       'open_count_last_10_days', 'open_count_last_30_days',
       'open_count_last_60_days', 'login_count_last_10_days',
       'login_count_last_30_days', 'login_count_last_60_days',
       'checkout_count_last_10_days', 'checkout_count_last_30_days',
       'checkout_count_last_60_days','attr1', 'attr2', 'age']

## Transform dataframe to feature function

> Tune the feature here

> With feature scaling

In [21]:
# def make_df_features(df,train=None,encoder=None,scaler=None):
#     df['attr1'] = df['user_id'].apply(lambda x: get_user_feature(x,0))
#     df['attr2'] = df['user_id'].apply(lambda x: get_user_feature(x,1))
#     df['attr3'] = df['user_id'].apply(lambda x: get_user_feature(x,2))
#     df['age'] = df['user_id'].apply(lambda x: get_user_feature_age(x))
#     df['domain'] = df['user_id'].apply(lambda x: get_user_feature(x,4))
#     df['date_time'] = pd.to_datetime(df['grass_date'])
#     df['last_open_day'] = df['last_open_day'].apply(fill_ints)
#     df['last_login_day'] = df['last_login_day'].apply(fill_ints)
#     df['last_checkout_day'] = df['last_checkout_day'].apply(fill_ints)
#     time_to_categorical(df)
#     cat = df.loc[:,cat_features].values
#     val = df.loc[:,numerical_features].values
# #   Encode data and normalise

#     if train:
#         encoder = OneHotEncoder(handle_unknown='ignore',sparse=False) # Categorical
#         scaler = MinMaxScaler() # Numerical [0,1]
#         cat = encoder.fit_transform(cat).astype(np.float64)
#         val = scaler.fit_transform(val).astype(np.float64)
#     else:
#         cat = encoder.transform(cat).astype(np.float64)
#         val = scaler.transform(val).astype(np.float64)
    
#     return np.concatenate([cat,val],axis=1),encoder,scaler

> Without feature scaling

In [35]:
def make_df_features(df,train=None,encoder=None):
    df['attr1'] = df['user_id'].apply(lambda x: get_user_feature(x,0))
    df['attr2'] = df['user_id'].apply(lambda x: get_user_feature(x,1))
    df['attr3'] = df['user_id'].apply(lambda x: get_user_feature(x,2))
    df['age'] = df['user_id'].apply(lambda x: get_user_feature(x,3))
    df['domain'] = df['user_id'].apply(lambda x: get_user_feature(x,4))
    df['date_time'] = pd.to_datetime(df['grass_date'])
    df['last_open_day'] = df['last_open_day'].apply(fill_ints)
    df['last_login_day'] = df['last_login_day'].apply(fill_ints)
    df['last_checkout_day'] = df['last_checkout_day'].apply(fill_ints)
    time_to_categorical(df)
    cat = df.loc[:,cat_features].values
    if train:
        encoder = OneHotEncoder(handle_unknown='ignore',sparse=False)
        cat = encoder.fit_transform(cat).astype(np.float64)
    else:
        cat = encoder.transform(cat).astype(np.float64)
    val = df.loc[:,numerical_features].values.astype(np.float64)
    return np.concatenate([cat,val],axis=1),encoder

## Data preparation for training

In [36]:
train, validation = train_test_split(df_train, test_size=0.2)
df_train.count

<bound method DataFrame.count of        country_code                 grass_date  user_id  subject_line_length  \
0                 4  2019-07-16 00:00:00+08:00       43                   44   
1                 4  2019-07-16 00:00:00+08:00      102                   44   
2                 6  2019-07-16 00:00:00+08:00      177                   49   
3                 1  2019-07-16 00:00:00+08:00      184                   49   
4                 6  2019-07-16 00:00:00+08:00      221                   49   
...             ...                        ...      ...                  ...   
73534             6  2019-09-02 00:00:00+08:00   127613                   39   
73535             2  2019-09-02 00:00:00+08:00   127620                   38   
73536             2  2019-09-02 00:00:00+08:00   127696                   32   
73537             2  2019-09-02 00:00:00+08:00   127807                   38   
73538             6  2019-09-02 00:00:00+08:00   127880                   39   

      

> With feature normalisation (numerical)

In [37]:
train_features,encoder,scaler = make_df_features(train,True)
validation_features,_,_ = make_df_features(validation,False,encoder=encoder,scaler=scaler)

train_labels = train['open_flag'].values
validation_labels = validation['open_flag'].values

test_features,_,_ = make_df_features(df_test,False,encoder=encoder,scaler=scaler)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

ValueError: not enough values to unpack (expected 3, got 2)

> Without feature normalisation (numerical)

In [42]:
train_features,encoder = make_df_features(train,True)
validation_features,_ = make_df_features(validation,False,encoder=encoder)

train_labels = train['open_flag'].values
validation_labels = validation['open_flag'].values

test_features,_ = make_df_features(df_test,False,encoder=encoder)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

In [43]:
train_features[0]

array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0., 39., 17.,  7., 10.,  0.,  3.,
        5.,  6., 18., 27.,  1.,  1.,  1.,  1.,  1., 35.])

## Tune Hyperparameter here

In [46]:
epochs_num=50
initial_learning_rate=0.001
momentum=0.9

## Load saved best model from dataset

In [None]:
# Load Latest Model
# load json and create model
json_file = open('../input/marketing-analysis-shopee-league-code/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("../input/marketing-analysis-shopee-league-code/best_model.hdf5")
print("Loaded model from disk")
model=loaded_model

## Callback function and optimiser

In [47]:
checkpoint_filepath = 'best_model.hdf5'
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_best_only=True,
    monitor='val_loss',
    mode='min')
schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate,
    100,
    end_learning_rate=0.0001,
    power=2,
    cycle=False,
    name=None,
)
opt = SGD(lr=initial_learning_rate, momentum=momentum)
early_stopping = EarlyStopping(monitor='val_accuracy',mode='max',patience=5, verbose=1)

callbacks = [LearningRateScheduler(schedule),model_checkpoint_callback,early_stopping]

## Training Model

In [48]:
model = Sequential()
model.add(Dense(32, input_dim=train_features.shape[1], activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
# Compile model
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
round(model.optimizer.lr.numpy(), 5)


0.001

In [None]:
# # Fit the model
history = model.fit(train_features, train_labels, validation_data=(validation_features, validation_labels), epochs=epochs_num, batch_size=32, callbacks=callbacks,shuffle= True)

## Plot validation vs accuracy graph

In [49]:
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

loss=history.history['loss']
val_loss=history.history['val_loss']

epochs_range = range(epochs_num) #Based on early stopping epochs

plt.figure(figsize=(8, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

NameError: name 'history' is not defined

In [50]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


## Making Prediction on test data

In [None]:
predictions = model.predict(test_features)
yhat = [round(x[0]).astype(np.int) for x in predictions]

## Save predictions to CSV file

In [None]:
df_test = df_test.drop([col for col in df_test.columns if col!='row_id'],axis=1)
df_test['open_flag'] = yhat
df_test.to_csv('solution.csv',index=False)