## Data Sampling

This module samples the data into a subset for resource limitations.

1. Check for valid session which contains clickout reference in its impression list
2. Add new fields to the data for different actions and reference types
3. Sample the data into a smaller dataset which considers first 50K sessions

#### Sample the Data

##### Reading Data

In [23]:
import os
import pandas as pd
import numpy as np

In [26]:
TAKE_N_SESSIONS = 50000
DATA_PATH = os.path.join("data")  # change it with your data folder
WRITE_PATH = os.path.join("data", "sampled")

try:
    os.mkdir(WRITE_PATH)
except FileExistsError:
    pass

train_filepath = os.path.join(DATA_PATH, "train.csv")
test_filepath = os.path.join(DATA_PATH, "test.csv")

print("Reading data...")
raw_train_data = pd.read_csv(train_filepath)
raw_train_data["session_valid"] = 1

all_session_ids = raw_train_data['session_id'].drop_duplicates()

print("Number of sessions: ", len(all_session_ids))

sampled_session_ids = all_session_ids[:TAKE_N_SESSIONS]  # pick the session id's that will be used
sampled_raw_train_data = raw_train_data.loc[raw_train_data['session_id'].isin(sampled_session_ids)]  # apply filtering

print("Labeling invalid sessions...")
invalid_sess=[]
sessions = []

for i,sess in sampled_raw_train_data.groupby("session_id"):
    sessions.append(i)
    session_clickouts = sess[sess["action_type"] == "clickout item"]
    if len(session_clickouts) == 0:
        invalid_sess.append(i)
        continue

    for sess_id, co in session_clickouts.iterrows():
        if co["reference"] not in co["impressions"].split("|"):
            invalid_sess.append(i)
            break

sampled_raw_train_data.loc[sampled_raw_train_data["session_id"].isin(invalid_sess), "session_valid"] = 0

sampled_raw_train_data.to_csv(os.path.join(WRITE_PATH, "train_sample.csv"))  # write selected sessions to a csv

Reading data...
Number of sessions:  910683
Labeling invalid sessions...


#### Understand the Data 

In [9]:
sampled_raw_train_data.head()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
0,00RL8Z82B2Z1,aff3928535f48,1541037460,1,search for poi,Newtown,AU,"Sydney, Australia",mobile,,,
1,00RL8Z82B2Z1,aff3928535f48,1541037522,2,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
2,00RL8Z82B2Z1,aff3928535f48,1541037522,3,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
3,00RL8Z82B2Z1,aff3928535f48,1541037532,4,interaction item image,666856,AU,"Sydney, Australia",mobile,,,
4,00RL8Z82B2Z1,aff3928535f48,1541037532,5,interaction item image,109038,AU,"Sydney, Australia",mobile,,,


In [10]:
sampled_raw_train_data.tail()

Unnamed: 0,user_id,session_id,timestamp,step,action_type,reference,platform,city,device,current_filters,impressions,prices
15932987,ZYNMLE3MV3LK,62728015bec05,1541544490,15,interaction item image,6617798,PT,"Paris, France",desktop,,,
15932988,ZYNMLE3MV3LK,62728015bec05,1541544491,16,clickout item,6617798,PT,"Paris, France",desktop,Focus on Distance,6617798|1263420|9567886|1161323|149768|1890735...,58|96|55|75|90|60|233|104|150|145|328|207|150|...
15932989,ZYNMLE3MV3LK,62728015bec05,1541544540,17,clickout item,2712342,PT,"Paris, France",desktop,Focus on Distance,6617798|1263420|9567886|1161323|149768|1890735...,58|96|55|75|90|60|233|104|150|145|328|207|150|...
15932990,ZYNMLE3MV3LK,62728015bec05,1541544967,18,change of sort order,interaction sort button,PT,"Paris, France",desktop,,,
15932991,ZYNMLE3MV3LK,62728015bec05,1541544973,19,clickout item,1161323,PT,"Paris, France",desktop,Focus on Distance,6617798|1263420|9567886|1161323|149768|1890735...,58|96|55|75|90|60|233|104|150|145|328|207|150|...


#### Read Sampled Data

In [5]:
sampled_raw_train_data = pd.read_csv(os.path.join("data", "sampled", "train_sample.csv"), index_col=0)

#### Check Valid Sessions

In [6]:
actions = sampled_raw_train_data['action_type'].drop_duplicates()
print(actions)  # first occurence and name of the action

0               search for poi
1       interaction item image
13               clickout item
16       interaction item info
62      interaction item deals
116     search for destination
125           filter selection
178    interaction item rating
179            search for item
185       change of sort order
Name: action_type, dtype: object


In [None]:
click_actions = sampled_raw_train_data[sampled_raw_train_data['action_type'] == "clickout item"]

invalid_click_actions = click_actions[click_actions["reference"].isin(click_actions["impressions"].apply(lambda x: x.split("|")))]
invalid_click_actions

In [43]:
invalid_session_ids = list()

for action in click_actions.to_numpy():
    session_id = action[2]
    reference = action[6]
    impressions = action[-1]
    
    if reference not in impressions:
        invalid_session_ids.append(session_id)
        
print(invalid_session_ids)

['641a409c90fac', 'eeaed44183a45', '661c65d405426', 'f5033775f3c82', '05010f12ea5d3', '08b88abc663e9', 'fe080f62523b5', '6762189ecabb6', '806fdfd02e112', '70efab42b3841', '70efab42b3841', '3249bf623f5ca', '3249bf623f5ca', '3249bf623f5ca', '3249bf623f5ca', '3249bf623f5ca', '3249bf623f5ca', '3249bf623f5ca', '3249bf623f5ca', '3249bf623f5ca', '3249bf623f5ca', '3249bf623f5ca', '3249bf623f5ca', 'afc0c14fc2882', '2bafb1904f934', '2bafb1904f934', '8d384825cf756', '8d384825cf756', '8d384825cf756', '8d384825cf756', '71730a9777f97', 'cabd8159fa194', '147d869f28713', 'd423b27bd6679', '8e3591d8fb22d', '8e3591d8fb22d', '8e3591d8fb22d', '8e3591d8fb22d', '8e3591d8fb22d', 'cb0086dfcd935', '943422579a32a', 'f8728b92e54c3', 'f8728b92e54c3', '9771f9ffc139d', '9771f9ffc139d', 'd3a73f5bf054d', '1b18332b0a514', '0798a2b759892', '88e1613844b19', '72adeafe395b0']


In [46]:
print(sampled_raw_train_data.shape)
for sess_id in invalid_session_ids:
    sampled_raw_train_data.drop(sampled_raw_train_data[sampled_raw_train_data['session_id'] == sess_id].index, inplace=True)
print(sampled_raw_train_data.shape)

(879579, 13)
(878532, 13)


#### Check Valid Sessions 

In [23]:

sampled_raw_train_data[sampled_raw_train_data["session_id"] in invalid_session_ids]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [27]:
print(type(sampled_raw_train_data['session_id'][5]))


<class 'str'>


In [34]:
print(sampled_raw_train_data[sampled_raw_train_data['session_id'] == "641a409c90fac"])

Empty DataFrame
Columns: [Unnamed: 0, user_id, session_id, timestamp, step, action_type, reference, platform, city, device, current_filters, impressions, prices]
Index: []


In [37]:
sampled_raw_train_data['session_id'] == "641a409c90fac"

0         False
1         False
2         False
3         False
4         False
          ...  
879574    False
879575    False
879576    False
879577    False
879578    False
Name: session_id, Length: 878532, dtype: bool

In [None]:
click_actions["impressions"].apply(lambda x: x.split("|"))

In [20]:
invalid_sess=[]
sessions = []
for i,sess in sampled_raw_train_data.groupby("session_id"):
    sessions.append(i)
    session_clickouts = sess[sess["action_type"] == "clickout item"]
    if len(session_clickouts) == 0:
        invalid_sess.append(i)
        continue
    for sess_id, co in session_clickouts.iterrows():
        if co["reference"] not in co["impressions"].split("|"):
            invalid_sess.append(i)
            break