In [5]:
import pandas as pd


In [6]:
from web_tracking.pre_processing.trajectories.binned_non_stationary_trajectory import BinnedNonStationaryTrajectory
from web_tracking.pre_processing.trajectories.sequential_non_stationary_trajectory import SequentialNonStationaryTrajectory
from web_tracking.pre_processing.trajectories.binned_stationary_trajectory import BinnedStationaryTrajectory

# Import Pre-processed Data

- First, we import the pre-processed data.
- The methods in this notebook require pre-processed data.
- We create three different trajectories by using these methods:
  - Binned Non-stationary Trajectory
  - Sequential Non-stationary Trajectory
  - Binned Stationary Trajectory

In [7]:
# Read the pre-processed visits; the `starts` and `ends` columns are parsed
# as datetimes at load time. Preview the first two rows as the cell output.
df_pre_processed = pd.read_csv(
    './data/[2]urls_pre_processed.csv',
    parse_dates=['starts', 'ends'],
)
df_pre_processed.head(2)

Unnamed: 0,uid,user,url,starts,active_seconds,domain,category,ends,gap_seconds
0,2686,354471f893df6b8e,8f155e77c8af923f,2013-05-22 01:19:56,176,8f155e77c8af923f,05175ca42d439154,2013-05-22 01:22:52,0
1,2698,354471f893df6b8e,d54549e0cd7183c5,2013-05-22 01:22:52,6,8f155e77c8af923f,05175ca42d439154,2013-05-22 01:22:58,0


# Apply Non-stationarization

- The first method creates a `Binned Non-stationary Trajectory`.
- It is a trajectory in which the same domain never appears twice in a row.
- To build it, we create an instance of `BinnedNonStationaryTrajectory` from the pre-processed data.
- Then we call the `create` method.
- Next, we inspect the constructed trajectory.
- Finally, we export the result.

In [8]:
# Instantiate the binned non-stationary trajectory object from the pre-processed visits.
BNST = BinnedNonStationaryTrajectory(df_pre_processed)

In [13]:
# Build the trajectory for the 'domain' feature only. The result of `create`
# exposes `.values()`, which is unpacked into a single name here, so the
# assignment raises if it does not yield exactly one item.
# bin_size=60 is presumably in seconds -- TODO confirm against `create`.
(BNST_domain,) = BNST.create(features=['domain'], bin_size=60).values()

In [15]:
# Show the constructed domain trajectory as the cell output.
BNST_domain

Unnamed: 0,feat_group,uids,user,domain
0,1,"[2686, 2686, 2686, 2686, 2706, 2695, 2702, 270...",354471f893df6b8e,8f155e77c8af923f
1,2,"[4118, 4120, 4120, 4120, 4120, 4113]",354471f893df6b8e,975af2aeb582ca21
2,3,[16117],354471f893df6b8e,de005655d95fe208
3,4,[4115],354471f893df6b8e,975af2aeb582ca21
4,5,"[4579, 4579, 4579, 4579, 4580, 4580, 4580, 458...",354471f893df6b8e,3837f5e3ea677a7d
...,...,...,...,...
2121,2122,"[10589, 10591]",dd3cbdd6641b85d9,29326257d3ff35ea
2122,2123,"[11754, 11756, 11774, 11774, 11774, 11774, 117...",dd3cbdd6641b85d9,5ff45759b5f7a4ec
2123,2124,"[10015, 10015, 10015, 10015, 10015]",dd3cbdd6641b85d9,9c5cd16538db7e31
2124,2125,[16645],dd3cbdd6641b85d9,d369ec04f7e84097


In [17]:
# Persist the domain trajectory; index=False keeps the row index out of the CSV.
BNST_domain.to_csv('./data/[3]trajectory_binned_non_stationary_domain.csv', index=False)
# Category export is left disabled: `BNST.create` was only asked for the
# 'domain' feature, so no `BNST_category` is defined in this notebook.
# BNST_category.to_csv('./data/[3]trajectory_binned_non_stationary_category.csv', index=False)

# Apply Sequential-Aggregation

- The second method creates a `Sequential Non-stationary Trajectory`.
- It is a trajectory in which consecutive domains are aggregated based on certain conditions.
- We create an instance of `SequentialNonStationaryTrajectory`.
- Then, we call the `create` method with a `threshold` parameter.
- After finishing processing, we export the result.

In [18]:
# Instantiate the sequential non-stationary trajectory object from the pre-processed visits.
SNST = SequentialNonStationaryTrajectory(df_pre_processed)

In [19]:
# Build one trajectory per requested feature; the two items yielded by
# `.values()` are unpacked in the same order as `features`.
# threshold=180 is presumably in seconds -- TODO confirm against `create`.
SNST_domain, SNST_category = SNST.create(
    threshold=180,
    features=['domain', 'category'],
).values()

In [20]:
# Preview the first rows of the domain trajectory.
SNST_domain.head()

Unnamed: 0,feat_group,uids,user,domain,starts,ends,cumulative_active_seconds,gap_seconds
0,1,"[2686, 2698, 2700, 2704, 2706, 2684, 2695, 270...",354471f893df6b8e,8f155e77c8af923f,2013-05-22 01:19:56,2013-05-22 01:38:00,1084,0
1,2,"[4118, 4114, 4120, 4117, 4121, 4113]",354471f893df6b8e,975af2aeb582ca21,2013-05-22 01:38:04,2013-05-22 01:45:06,422,0
2,3,[16117],354471f893df6b8e,de005655d95fe208,2013-05-22 01:45:06,2013-05-22 01:45:32,26,0
3,4,"[4116, 4115, 4119]",354471f893df6b8e,975af2aeb582ca21,2013-05-22 01:46:46,2013-05-22 01:47:32,46,0
4,5,"[4579, 4580, 4577, 4578]",354471f893df6b8e,3837f5e3ea677a7d,2013-05-22 01:47:34,2013-05-22 02:19:23,1909,0


In [21]:
# Persist both sequential trajectories (domain and category) to their own
# CSV files; index=False keeps the row index out of the output.
SNST_domain.to_csv('./data/[4]trajectory_sequential_non_stationary_domain.csv', index=False)
SNST_category.to_csv('./data/[4]trajectory_sequential_non_stationary_category.csv', index=False)

# Apply Binning

- The last method creates a `Binned Stationary Trajectory`.
- It is a trajectory in which visits are divided into equal-sized bins, so it is a kind of expansion of the visit data.
- The processing works in a similar way: we create an instance of `BinnedStationaryTrajectory` and call its `create` method with a `bin_size` parameter.
- Lastly, we export the result for the next step.

In [22]:
# Instantiate the binned stationary trajectory object from the pre-processed visits.
BST = BinnedStationaryTrajectory(df_pre_processed)

In [23]:
# Build the binned trajectory. Unlike the two `create` calls above, the
# return value is used directly (no `.values()` unpacking).
# bin_size=60 is presumably in seconds -- TODO confirm against `create`.
BST_traj = BST.create(bin_size=60)

In [14]:
# Preview the first rows of the frame held on the trajectory object.
# NOTE(review): this inspects `BST.df`, not the `BST_traj` returned by
# `create` -- confirm both refer to the same data.
BST.df.head()

Unnamed: 0,user,uid,bin,seconds,share,domain,category
0,354471f893df6b8e,2686,2959,10,0.166667,8f155e77c8af923f,05175ca42d439154
1,354471f893df6b8e,2686,2960,60,1.0,8f155e77c8af923f,05175ca42d439154
2,354471f893df6b8e,2686,2961,60,1.0,8f155e77c8af923f,05175ca42d439154
3,354471f893df6b8e,2686,2962,46,0.766667,8f155e77c8af923f,05175ca42d439154
4,354471f893df6b8e,2706,2963,34,0.566667,8f155e77c8af923f,05175ca42d439154


In [15]:
# Persist the binned stationary trajectory; index=False keeps the row index out of the CSV.
BST_traj.to_csv('./data/[5]trajectory_binned_stationary.csv', index=False)