In [1]:
import os
os.chdir('./..')

In [2]:
import data
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle
pd.options.display.max_columns = None
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from sklearn.utils import class_weight
from sklearn.metrics import classification_report

Using TensorFlow backend.


In [95]:
import keras
from keras import metrics
from keras import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten, Conv1D
from keras import regularizers
from keras.metrics import top_k_categorical_accuracy

In [82]:
def mrr_metric(y_true, y_pred):
    mrr = 0
    current_percentage = 0
    for i in range(1, 26, 1):
        if i == 1:
            mrr = metrics.top_k_categorical_accuracy(y_true, y_pred, k=i)
            current_percentage = metrics.top_k_categorical_accuracy(y_true, y_pred, k=i)
        else:
            t = metrics.top_k_categorical_accuracy(y_true, y_pred, k=i)
            mrr += (t - current_percentage) * (1 / i)
            current_percentage = t
    return mrr

In [7]:
def _extract_features(df, submission_mode=False):
    # get the rows where the action is 'clickout item'
    if submission_mode:
        clickout_rows_df = df[(df['action_type'] == 'clickout item') & df['reference'].isnull()]
    else:
        clickout_rows_df = df[df['action_type'] == 'clickout item']

    if len(clickout_rows_df) > 0:

        # features
        features = {
            # impressions features
            'times_impression_appeared': [],
            'time_elapsed_from_last_time_impression_appeared': [],
            'steps_from_last_time_impression_appeared': [],
            'kind_action_reference_appeared': [],
            'impression_position': [],
            'label': [],
            'price': [],
            'price_position': [],

            'delta_position': [],

            # session features
            'session_length': [],
            'session_steps': [],
            'time_from_last_action': [],
            'reference_position_last_action': [],

            'index': []}

        clk = clickout_rows_df.tail(1)

        head_index = df.head(1).index

        # considering only the past!
        # mantain of the df only the actions before the last clickout
        df = df.loc[head_index.values[0]:clk.index.values[0] - 1]

        if len(df) > 0:
            session_length = clk['timestamp'].values[0] - df.head(1)['timestamp'].values[0]
            time_from_last_action = clk['timestamp'].values[0] - df.tail(1)['timestamp'].values[0]
            if df.tail(1)['reference'].values[0].isdigit():
                last_ref = int(df.tail(1)['reference'])
            else:
                last_ref = 0
        else:
            session_length = -0.5
            time_from_last_action = -0.5
            last_ref = -0.5
        session_steps = clk['step'].values[0]

        # get the impression
        impr = list(map(int, clk['impressions'].values[0].split('|')))

        if last_ref in impr:
            reference_position_last_action = impr.index(last_ref)
        else:
            reference_position_last_action = -0.5

        prices = list(map(int, clk['prices'].values[0].split('|')))
        sorted_prices = prices.copy()
        sorted_prices.sort()

        references = df['reference'].values

        count = 0
        for i in impr:
            if reference_position_last_action >= 0:
                delta_pos = count - reference_position_last_action
            else:
                delta_pos = count
            indices = np.where(references == str(i))[0]

            features['index'].append(clk.index[0])
            features['impression_position'].append(count + 1)
            features['price'].append(prices[count])
            features['price_position'].append(sorted_prices.index(prices[count]))
            if len(indices) > 0:
                row_reference = df.head(indices[-1] + 1).tail(1)
                features['steps_from_last_time_impression_appeared'].append(len(df) - indices[-1])
                features['time_elapsed_from_last_time_impression_appeared'].append(
                    int(clk['timestamp'].values[0] - row_reference['timestamp'].values[0]))
                features['kind_action_reference_appeared'].append(row_reference['action_type'].values[0])
            else:
                features['steps_from_last_time_impression_appeared'].append(-0.5)
                features['time_elapsed_from_last_time_impression_appeared'].append(-0.5)
                features['kind_action_reference_appeared'].append('no_action')
            features['times_impression_appeared'].append(len(indices))
            features['delta_position'].append(delta_pos)
            features['session_length'].append(session_length)
            features['session_steps'].append(session_steps)
            features['time_from_last_action'].append(time_from_last_action)
            features['reference_position_last_action'].append(reference_position_last_action)

            if submission_mode:
                features['label'].append(0)
            else:
                if int(clk['reference'].values[0]) == i:
                    features['label'].append(1)
                else:
                    features['label'].append(0)

            count += 1

        # zero padd the impressions with 0 feature values
        missing_impr_count = 25 - len(impr)
        if missing_impr_count > 0:
            for k in features.keys():
                if k == 'label':
                    features[k].extend(np.zeros(missing_impr_count))
                elif k == 'delta_position':
                    features[k].extend(np.ones(missing_impr_count) * 25)
                else:
                    features[k].extend(np.ones(missing_impr_count) * -1)
        return pd.DataFrame(features)


### CREATE FEATURES DATAFRAMES

In [8]:
train_df = data.train_df('small', 'no_cluster')
test_df = data.test_df('small', 'no_cluster')

print('extracting features from TRAIN...')
train_features_dataframe = train_df.groupby(['user_id', 'session_id']).progress_apply(_extract_features)

print('extracting features from TEST...')
test_features_dataframe = test_df.groupby(['user_id', 'session_id']).progress_apply(_extract_features, submission_mode=True)

extracting features from TRAIN...


HBox(children=(IntProgress(value=0, max=45790), HTML(value='')))


extracting features from TEST...


HBox(children=(IntProgress(value=0, max=11448), HTML(value='')))




### CREATE DATASET

In [11]:
X, Y = train_features_dataframe.iloc[:, [0, 1, 2, 4, 6, 7, 8]], train_features_dataframe.iloc[:, 5]
X_session_features = train_features_dataframe.iloc[:, [9, 10, 11, 12]]
scaler1 = MinMaxScaler()
scaler2 = MinMaxScaler()

print('scaling...')
# normalize the values
X_session_features_norm = scaler1.fit_transform(X_session_features)
X_norm = scaler2.fit_transform(X)
Y_norm = Y.values

# removing duplicates from session featureS
X_session_features = []
for i in range(0, X_session_features_norm.shape[0], 25):
    X_session_features.append(X_session_features_norm[i])
X_session_features = np.array(X_session_features)

# shuffle the data
print('shuffling...')
X_norm_shuffled = []
Y_norm_shuffled = []
for i in tqdm(range(0, X_norm.shape[0], 25)):
    x, y = shuffle(X_norm[i:i + 25], Y_norm[i:i + 25])
    X_norm_shuffled.append(x)
    Y_norm_shuffled.append(y)

scaling...
shuffling...


HBox(children=(IntProgress(value=0, max=41654), HTML(value='')))




In [28]:
np.array(X_norm_shuffled).shape

(41654, 25, 7)

In [62]:
np.array(Y_norm_shuffled).shape

(41654, 25)

In [168]:
# create the train and test data to be saved
data_train = np.array(X_norm_shuffled)
labels = np.array(Y_norm_shuffled)

# add the session features to the samples
#data_train = np.concatenate((data_train, X_session_features), axis=1)

X_train, X_val, Y_train, Y_val = train_test_split(data_train, labels, shuffle=True, test_size=0.2)

In [173]:
X_train.shape

(33323, 25, 7)

In [147]:
np.resize(X_train, (-1, X_train.shape))

ValueError: setting an array element with a sequence.

In [162]:
X_train=X_train.reshape((33323, 25, 7,1))

In [171]:
model = create_model()
model.fit(X_train, Y_train, validation_data=(X_val, Y_val),
                       epochs=1000,
                       shuffle=False)

ValueError: Error when checking input: expected conv2d_29_input to have 4 dimensions, but got array with shape (33323, 25, 7)

In [170]:
def create_model():
    model = Sequential()

    model.add(Conv2D(25,(25,1), input_shape=(25,7,1), activation='relu'))
    model.add(Flatten())
    model.add(Dense(50, activation='relu'))
    model.add(Dense(50, activation='relu'))

    #model.add(Conv2D(64, (5, 5), activation='relu'))

    
    model.add(Dense(25, activation='softmax'))

    # compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[metrics.categorical_accuracy, mrr_metric])
    return model