In [2]:
import pandas as pd
import sys
# Extend the module search path with the parent directory. Presumably this is
# what makes the local `web_tracking` package importable in the next cell --
# verify against the repository layout.
sys.path.append('../')

In [3]:
from web_tracking.measures.entropy import Entropy
from web_tracking.measures.entropy_rate import EntropyRate
from web_tracking.measures.predictability import Predictability

# Import Trajectory

- First, we import the trajectories.
- Then, we create instances of the `Entropy`, `EntropyRate`, and `Predictability` measures.
- Every measure exposes the same method, `calculate`, which performs its computation on a trajectory.
- The `calculate` method accepts options, such as `features`, which selects the feature (e.g. `category`) the measure is computed on.
- In this notebook, we calculate the entropy and predictability values for each trajectory type.

In [4]:
# Binned non-stationary trajectories, one file per feature.
df_binned_non_stationary_domain = pd.read_csv(
    './data/[3]trajectory_binned_non_stationary_domain.csv'
)
df_binned_non_stationary_category = pd.read_csv(
    './data/[3]trajectory_binned_non_stationary_category.csv'
)

# Sequential non-stationary trajectories; the `starts` and `ends` columns are
# parsed as datetimes while loading.
df_sequential_non_stationary_domain = pd.read_csv(
    './data/[4]trajectory_sequential_non_stationary_domain.csv',
    parse_dates=['starts', 'ends'],
)
df_sequential_non_stationary_category = pd.read_csv(
    './data/[4]trajectory_sequential_non_stationary_category.csv',
    parse_dates=['starts', 'ends'],
)

# Binned stationary trajectory: a single file that carries both the `domain`
# and the `category` column.
df_binned_stationary = pd.read_csv(
    './data/[5]trajectory_binned_stationary.csv'
)

In [5]:
# Preview the first two rows of every trajectory frame. A single `display`
# call renders each argument in the order given.
display(
    df_binned_non_stationary_domain.head(2),
    df_binned_non_stationary_category.head(2),
    df_sequential_non_stationary_domain.head(2),
    df_sequential_non_stationary_category.head(2),
    df_binned_stationary.head(2),
)

Unnamed: 0,feat_group,uids,user,domain
0,1,"[2686, 2686, 2686, 2686, 2706, 2695, 2702, 270...",354471f893df6b8e,8f155e77c8af923f
1,2,"[4118, 4120, 4120, 4120, 4120, 4113]",354471f893df6b8e,975af2aeb582ca21


Unnamed: 0,feat_group,uids,user,category
0,1,"[2686, 2686, 2686, 2686, 2706, 2695, 2702, 270...",354471f893df6b8e,05175ca42d439154
1,2,[16117],354471f893df6b8e,234f5b297d040799


Unnamed: 0,feat_group,uids,user,domain,starts,ends,cumulative_active_seconds,gap_seconds
0,1,"[2686, 2698, 2700, 2704, 2706, 2684, 2695, 270...",354471f893df6b8e,8f155e77c8af923f,2013-05-22 01:19:56,2013-05-22 01:38:00,1084,0
1,2,"[4118, 4114, 4120, 4117, 4121, 4113]",354471f893df6b8e,975af2aeb582ca21,2013-05-22 01:38:04,2013-05-22 01:45:06,422,0


Unnamed: 0,feat_group,uids,user,category,starts,ends,cumulative_active_seconds,gap_seconds
0,1,"[2686, 2698, 2700, 2704, 2706, 2684, 2695, 270...",354471f893df6b8e,05175ca42d439154,2013-05-22 01:19:56,2013-05-22 01:45:06,1510,0
1,2,[16117],354471f893df6b8e,234f5b297d040799,2013-05-22 01:45:06,2013-05-22 01:45:32,26,0


Unnamed: 0,user,uid,bin,seconds,share,domain,category
0,354471f893df6b8e,2686,2959,10,0.166667,8f155e77c8af923f,05175ca42d439154
1,354471f893df6b8e,2686,2960,60,1.0,8f155e77c8af923f,05175ca42d439154


In [6]:
# One instance per measure, created in the order Entropy, EntropyRate,
# Predictability; the same three objects are reused for every trajectory below.
E, ER, P = Entropy(), EntropyRate(), Predictability()

# Calculate Entropy & Predictability on Binned Non-stationary

- First, we calculate the random and Shannon entropy with the `Entropy` instance.
- Second, we compute the Lempel-Ziv entropy with the `EntropyRate` instance.
- Third, we merge the results so that all entropy values are in a single data frame.
- Finally, we calculate the predictability from the merged entropy results with the `Predictability` instance.
- These are the steps for the `Non-stationary` trajectory; we apply the same steps to the `Domain-aggregated` and `Binned` trajectories.

In [7]:
# Run the Entropy measure on the binned non-stationary domain trajectory with
# no extra options, then show the table kept on the instance: one row per user
# with `domain_unique`, `domain_count`, and the Shannon and random entropies.
E.calculate(df_binned_non_stationary_domain)
E.df

Unnamed: 0,user,domain_unique,domain_count,domain_entropy_shannon,domain_entropy_random
0,354471f893df6b8e,54,209,4.169659,5.754888
1,3739ea7393e94fe5,36,153,4.07284,5.169925
2,4e251a0d5d4ca48e,46,205,3.748571,5.523562
3,7ff09f10fa9835cf,96,294,5.686221,6.584963
4,a01f41af54998313,32,211,3.42683,5.0
5,b1dcb5d717657d39,32,253,2.612988,5.0
6,b8981c28b9aeeb75,84,208,5.049784,6.392317
7,c03e08a1fdcf4b97,41,201,4.156978,5.357552
8,d7810ac0f8207be5,50,177,4.45742,5.643856
9,dd3cbdd6641b85d9,54,215,4.429999,5.754888


In [8]:
# Same measure on the category trajectory. `features=['category']` selects the
# `category` column, so the resulting columns are prefixed `category_`.
E.calculate(df_binned_non_stationary_category, features=['category'])
E.df

Unnamed: 0,user,category_unique,category_count,category_entropy_shannon,category_entropy_random
0,354471f893df6b8e,24,152,3.191419,4.584963
1,3739ea7393e94fe5,19,122,3.205964,4.247928
2,4e251a0d5d4ca48e,21,184,3.052992,4.392317
3,7ff09f10fa9835cf,34,270,3.952412,5.087463
4,a01f41af54998313,18,205,2.937281,4.169925
5,b1dcb5d717657d39,18,246,2.270987,4.169925
6,b8981c28b9aeeb75,32,186,3.791186,5.0
7,c03e08a1fdcf4b97,16,177,3.086058,4.0
8,d7810ac0f8207be5,21,154,3.348222,4.392317
9,dd3cbdd6641b85d9,22,195,3.334473,4.459432


In [9]:
# Run the EntropyRate measure on the binned non-stationary domain trajectory
# and show the per-user result (`domain_entropy_lz`, the Lempel-Ziv entropy).
ER.calculate(df_binned_non_stationary_domain)
ER.df

Unnamed: 0,user,domain_entropy_lz
0,354471f893df6b8e,3.208327
1,3739ea7393e94fe5,3.225506
2,4e251a0d5d4ca48e,2.659722
3,7ff09f10fa9835cf,4.350847
4,a01f41af54998313,2.781059
5,b1dcb5d717657d39,1.735433
6,b8981c28b9aeeb75,3.998305
7,c03e08a1fdcf4b97,3.311649
8,d7810ac0f8207be5,3.543948
9,dd3cbdd6641b85d9,3.171353


In [10]:
# Lempel-Ziv entropy on the category trajectory; `features=['category']`
# yields the `category_entropy_lz` column.
ER.calculate(df_binned_non_stationary_category, features=['category'])
ER.df

Unnamed: 0,user,category_entropy_lz
0,354471f893df6b8e,2.43187
1,3739ea7393e94fe5,2.55642
2,4e251a0d5d4ca48e,2.248891
3,7ff09f10fa9835cf,3.170898
4,a01f41af54998313,2.360035
5,b1dcb5d717657d39,1.452899
6,b8981c28b9aeeb75,2.837318
7,c03e08a1fdcf4b97,2.362632
8,d7810ac0f8207be5,2.831216
9,dd3cbdd6641b85d9,2.418741


In [11]:
# Collect every entropy value for the binned non-stationary trajectories in a
# single per-user table: Entropy and EntropyRate results for the domain
# feature first, then for the category feature, each joined on `user`.
binned_non_stationary_entropy = (
    E.calculate(df_binned_non_stationary_domain)
    .merge(ER.calculate(df_binned_non_stationary_domain), on='user')
    .merge(E.calculate(df_binned_non_stationary_category, features=['category']), on='user')
    .merge(ER.calculate(df_binned_non_stationary_category, features=['category']), on='user')
)
binned_non_stationary_entropy

Unnamed: 0,user,domain_unique,domain_count,domain_entropy_shannon,domain_entropy_random,domain_entropy_lz,category_unique,category_count,category_entropy_shannon,category_entropy_random,category_entropy_lz
0,354471f893df6b8e,54,209,4.169659,5.754888,3.208327,24,152,3.191419,4.584963,2.43187
1,3739ea7393e94fe5,36,153,4.07284,5.169925,3.225506,19,122,3.205964,4.247928,2.55642
2,4e251a0d5d4ca48e,46,205,3.748571,5.523562,2.659722,21,184,3.052992,4.392317,2.248891
3,7ff09f10fa9835cf,96,294,5.686221,6.584963,4.350847,34,270,3.952412,5.087463,3.170898
4,a01f41af54998313,32,211,3.42683,5.0,2.781059,18,205,2.937281,4.169925,2.360035
5,b1dcb5d717657d39,32,253,2.612988,5.0,1.735433,18,246,2.270987,4.169925,1.452899
6,b8981c28b9aeeb75,84,208,5.049784,6.392317,3.998305,32,186,3.791186,5.0,2.837318
7,c03e08a1fdcf4b97,41,201,4.156978,5.357552,3.311649,16,177,3.086058,4.0,2.362632
8,d7810ac0f8207be5,50,177,4.45742,5.643856,3.543948,21,154,3.348222,4.392317,2.831216
9,dd3cbdd6641b85d9,54,215,4.429999,5.754888,3.171353,22,195,3.334473,4.459432,2.418741


In [12]:
# Compute predictability for both features from the merged entropy table and
# show the result; the output adds `*_pi_unc`, `*_pi_rand` and `*_pi_max`
# columns next to the entropy columns.
# NOTE(review): `workers=1` presumably limits the computation to a single
# worker -- confirm against the Predictability implementation.
# NOTE(review): the recorded output of this cell contains fragments of
# numerical warnings (solver progress / `np.log2` expression); confirm the
# affected `pi_*` values are still trustworthy.
P.calculate(binned_non_stationary_entropy, features=['domain', 'category'], workers=1)
P.df

  solution is possible.
  improvement from the last ten iterations.
  return -x*np.log2(x) - (1 - x)*(np.log2(1-x)) + (1 - x)*(np.log2(n-1)) - s


Unnamed: 0,user,domain_unique,domain_count,domain_entropy_shannon,domain_entropy_random,domain_entropy_lz,category_unique,category_count,category_entropy_shannon,category_entropy_random,category_entropy_lz,domain_pi_unc,domain_pi_rand,domain_pi_max,category_pi_unc,category_pi_rand,category_pi_max
0,354471f893df6b8e,54,209,4.169659,5.754888,3.208327,24,152,3.191419,4.584963,2.43187,0.445109,0.018519,0.608487,0.515403,0.041667,0.66563
1,3739ea7393e94fe5,36,153,4.07284,5.169925,3.225506,19,122,3.205964,4.247928,2.55642,0.394629,0.027778,0.56382,0.470375,0.052632,0.617164
2,4e251a0d5d4ca48e,46,205,3.748571,5.523562,2.659722,21,184,3.052992,4.392317,2.248891,0.499518,0.021739,0.680312,0.524579,0.047619,0.687088
3,7ff09f10fa9835cf,96,294,5.686221,6.584963,4.350847,34,270,3.952412,5.087463,3.170898,0.260438,0.010417,0.489922,0.410062,0.029412,0.567061
4,a01f41af54998313,32,211,3.42683,5.0,2.781059,18,205,2.937281,4.169925,2.360035,0.510087,0.03125,0.630466,0.525581,0.055556,0.650932
5,b1dcb5d717657d39,32,253,2.612988,5.0,1.735433,18,246,2.270987,4.169925,1.452899,0.659367,0.03125,0.796734,0.66859,0.055556,0.814064
6,b8981c28b9aeeb75,84,208,5.049784,6.392317,3.998305,32,186,3.791186,5.0,2.837318,0.355103,0.011905,0.529292,0.434062,0.03125,0.620586
7,c03e08a1fdcf4b97,41,201,4.156978,5.357552,3.311649,16,177,3.086058,4.0,2.362632,0.401504,0.02439,0.563448,0.46516,0.0625,0.637151
8,d7810ac0f8207be5,50,177,4.45742,5.643856,3.543948,21,154,3.348222,4.392317,2.831216,0.376272,0.02,0.545833,0.455339,0.047619,0.572751
9,dd3cbdd6641b85d9,54,215,4.429999,5.754888,3.171353,22,195,3.334473,4.459432,2.418741,0.395654,0.018519,0.61428,0.46783,0.045455,0.6599


# Calculate Entropy & Predictability on Domain-aggregated

In [13]:
# Collect every entropy value for the sequential non-stationary trajectories
# in a single per-user table: Entropy and EntropyRate results for the domain
# feature first, then for the category feature, each joined on `user`.
sequential_non_stationary_entropy = (
    E.calculate(df_sequential_non_stationary_domain)
    .merge(ER.calculate(df_sequential_non_stationary_domain), on='user')
    .merge(E.calculate(df_sequential_non_stationary_category, features=['category']), on='user')
    .merge(ER.calculate(df_sequential_non_stationary_category, features=['category']), on='user')
)
sequential_non_stationary_entropy

Unnamed: 0,user,domain_unique,domain_count,domain_entropy_shannon,domain_entropy_random,domain_entropy_lz,category_unique,category_count,category_entropy_shannon,category_entropy_random,category_entropy_lz
0,354471f893df6b8e,68,423,4.123983,6.087463,3.093799,25,312,2.833464,4.643856,2.230685
1,3739ea7393e94fe5,43,324,3.928548,5.426265,2.80569,21,271,2.998075,4.392317,2.147287
2,4e251a0d5d4ca48e,62,377,3.878672,5.954196,2.587957,26,336,3.235228,4.70044,2.222965
3,7ff09f10fa9835cf,113,672,5.510473,6.820179,3.759722,35,611,3.842215,5.129283,2.832554
4,a01f41af54998313,47,679,3.483761,5.554589,2.590809,25,672,3.06582,4.643856,2.306149
5,b1dcb5d717657d39,39,617,2.611196,5.285402,1.983022,21,600,2.272989,4.392317,1.784238
6,b8981c28b9aeeb75,116,562,4.839743,6.857981,2.94187,39,481,3.618995,5.285402,2.352508
7,c03e08a1fdcf4b97,62,503,4.193638,5.954196,3.050944,22,447,3.073447,4.459432,2.186261
8,d7810ac0f8207be5,66,432,4.552196,6.044394,3.343881,23,391,3.291024,4.523562,2.664793
9,dd3cbdd6641b85d9,99,659,4.746581,6.629357,2.969809,33,594,3.497235,5.044394,2.233774


In [14]:
# Compute predictability for both features from the merged sequential
# non-stationary entropy table and show the result (`*_pi_unc`, `*_pi_rand`,
# `*_pi_max` columns are added).
# NOTE(review): unlike the other predictability cells in this notebook,
# `workers` is not passed here, so the method's default applies -- confirm
# this is intentional.
P.calculate(sequential_non_stationary_entropy, features=['domain', 'category'])
P.df

Unnamed: 0,user,domain_unique,domain_count,domain_entropy_shannon,domain_entropy_random,domain_entropy_lz,category_unique,category_count,category_entropy_shannon,category_entropy_random,category_entropy_lz,domain_pi_unc,domain_pi_rand,domain_pi_max,category_pi_unc,category_pi_rand,category_pi_max
0,354471f893df6b8e,68,423,4.123983,6.087463,3.093799,25,312,2.833464,4.643856,2.230685,0.4849,0.014706,0.644728,0.594464,0.04,0.704481
1,3739ea7393e94fe5,43,324,3.928548,5.426265,2.80569,21,271,2.998075,4.392317,2.147287,0.45586,0.023256,0.652493,0.536785,0.047619,0.705497
2,4e251a0d5d4ca48e,62,377,3.878672,5.954196,2.587957,26,336,3.235228,4.70044,2.222965,0.514516,0.016129,0.710094,0.518458,0.038462,0.708731
3,7ff09f10fa9835cf,113,672,5.510473,6.820179,3.759722,35,611,3.842215,5.129283,2.832554,0.323996,0.00885,0.591062,0.43923,0.028571,0.630082
4,a01f41af54998313,47,679,3.483761,5.554589,2.590809,25,672,3.06582,4.643856,2.306149,0.549073,0.021277,0.692194,0.547985,0.04,0.691456
5,b1dcb5d717657d39,39,617,2.611196,5.285402,1.983022,21,600,2.272989,4.392317,1.784238,0.675653,0.025641,0.770288,0.68266,0.047619,0.767989
6,b8981c28b9aeeb75,116,562,4.839743,6.857981,2.94187,39,481,3.618995,5.285402,2.352508,0.43743,0.008621,0.69914,0.500946,0.025641,0.71581
7,c03e08a1fdcf4b97,62,503,4.193638,5.954196,3.050944,22,447,3.073447,4.459432,2.186261,0.460761,0.016129,0.643957,0.527443,0.045455,0.702263
8,d7810ac0f8207be5,66,432,4.552196,6.044394,3.343881,23,391,3.291024,4.523562,2.664793,0.405898,0.015152,0.605438,0.486127,0.043478,0.617641
9,dd3cbdd6641b85d9,99,659,4.746581,6.629357,2.969809,33,594,3.497235,5.044394,2.233774,0.431548,0.010101,0.686639,0.500553,0.030303,0.7234


# Calculate Entropy & Predictability on Binned

In [15]:
# The binned stationary frame holds both features, so one Entropy call and one
# EntropyRate call with `features=['domain', 'category']` cover everything;
# the two results are joined on `user`.
binned_stationary_entropy = (
    E.calculate(df_binned_stationary, features=['domain', 'category'])
    .merge(ER.calculate(df_binned_stationary, features=['domain', 'category']), on='user')
)
binned_stationary_entropy

Unnamed: 0,user,domain_unique,domain_count,category_unique,category_count,domain_entropy_shannon,domain_entropy_random,category_entropy_shannon,category_entropy_random,domain_entropy_lz,category_entropy_lz
0,354471f893df6b8e,54,1220,24,1220,3.712884,5.754888,2.753438,4.584963,1.285785,0.907613
1,3739ea7393e94fe5,36,732,19,732,3.59159,5.169925,2.659319,4.247928,1.507952,1.091966
2,4e251a0d5d4ca48e,46,1789,21,1789,1.3784,5.523562,1.197814,4.392317,0.712679,0.622084
3,7ff09f10fa9835cf,96,1261,34,1261,5.390734,6.584963,4.012563,5.087463,1.960416,1.568333
4,a01f41af54998313,32,1842,18,1842,2.362642,5.0,2.242425,4.169925,0.836201,0.736092
5,b1dcb5d717657d39,32,2703,18,2703,2.009052,5.0,1.903267,4.169925,0.607344,0.573602
6,b8981c28b9aeeb75,84,811,32,811,4.35381,6.392317,2.97705,5.0,1.775426,1.298801
7,c03e08a1fdcf4b97,41,2176,16,2176,2.706539,5.357552,2.086197,4.0,0.764301,0.575774
8,d7810ac0f8207be5,50,1193,21,1193,3.635833,5.643856,2.875296,4.392317,1.190807,0.942365
9,dd3cbdd6641b85d9,54,1253,22,1253,3.369442,5.754888,2.704187,4.459432,1.240197,1.04308


In [16]:
# Compute predictability for both features from the binned stationary entropy
# table and show the result (`*_pi_unc`, `*_pi_rand`, `*_pi_max` columns are
# added).
# NOTE(review): `workers=1` presumably limits the computation to a single
# worker -- confirm against the Predictability implementation.
P.calculate(binned_stationary_entropy, features=['domain', 'category'], workers=1)
P.df

Unnamed: 0,user,domain_unique,domain_count,category_unique,category_count,domain_entropy_shannon,domain_entropy_random,category_entropy_shannon,category_entropy_random,domain_entropy_lz,category_entropy_lz,domain_pi_unc,domain_pi_rand,domain_pi_max,category_pi_unc,category_pi_rand,category_pi_max
0,354471f893df6b8e,54,1220,24,1220,3.712884,5.754888,2.753438,4.584963,1.285785,0.907613,0.526034,0.018519,0.87192,0.605257,0.041667,0.901782
1,3739ea7393e94fe5,36,732,19,732,3.59159,5.169925,2.659319,4.247928,1.507952,1.091966,0.49473,0.027778,0.832924,0.595698,0.052632,0.871093
2,4e251a0d5d4ca48e,46,1789,21,1789,1.3784,5.523562,1.197814,4.392317,0.712679,0.622084,0.856873,0.021739,0.934064,0.858773,0.047619,0.935712
3,7ff09f10fa9835cf,96,1261,34,1261,5.390734,6.584963,4.012563,5.087463,1.960416,1.568333,0.316554,0.010417,0.808767,0.396634,0.029412,0.822724
4,a01f41af54998313,32,1842,18,1842,2.362642,5.0,2.242425,4.169925,0.836201,0.736092,0.700794,0.03125,0.915532,0.67417,0.055556,0.91909
5,b1dcb5d717657d39,32,2703,18,2703,2.009052,5.0,1.903267,4.169925,0.607344,0.573602,0.756213,0.03125,0.941939,0.737534,0.055556,0.939887
6,b8981c28b9aeeb75,84,811,32,811,4.35381,6.392317,2.97705,5.0,1.775426,1.298801,0.4736,0.011905,0.82607,0.595581,0.03125,0.857224
7,c03e08a1fdcf4b97,41,2176,16,2176,2.706539,5.357552,2.086197,4.0,0.764301,0.575774,0.664407,0.02439,0.927136,0.69357,0.0625,0.938228
8,d7810ac0f8207be5,50,1193,21,1193,3.635833,5.643856,2.875296,4.392317,1.190807,0.942365,0.530083,0.02,0.881449,0.563406,0.047619,0.894472
9,dd3cbdd6641b85d9,54,1253,22,1253,3.369442,5.754888,2.704187,4.459432,1.240197,1.04308,0.58286,0.018519,0.877264,0.604746,0.045455,0.881835
