In [3]:
import numpy as np
import pandas
from scipy.optimize import linprog

In [66]:
dataset = pandas.read_csv('../datasets/yow_userstudy_raw.csv')
dataset.head()

Unnamed: 0,EventOnScroll,MSecForDownArrow,TimeOnVScroll,relevant,NumOfPageUp,TimeOnMouse,NumOfPageDown,ClickOnWindow,log_id,serverTimeVisit,...,readability,TimeOnHScroll,novelty,user_like,DOC_ID,TimeOnPage,TimeVisit,user_id,NumOfDownArrow,authority
0,0.0,0.0,0.0,4.0,0,1455,0.0,0.0,2401,2004-04-23 15:23,...,1.0,0.0,5.0,4,279282,6109.0,2004-04-23 15:23,51,0.0,1.0
1,0.0,0.0,0.0,4.0,0,9673,0.0,0.0,2405,2004-04-23 15:25,...,1.0,0.0,4.0,4,278354,68265.0,2004-04-23 15:25,51,0.0,1.0
2,0.0,0.0,0.0,4.0,0,1424,0.0,0.0,2409,2004-04-23 15:34,...,1.0,0.0,5.0,4,276597,12610.0,2004-04-23 15:34,51,0.0,1.0
3,0.0,0.0,0.0,4.0,0,876,0.0,0.0,2411,2004-04-23 15:35,...,1.0,0.0,4.0,4,278339,6641.0,2004-04-23 15:35,51,0.0,1.0
4,0.0,0.0,0.0,5.0,0,4613,0.0,0.0,2412,2004-04-23 15:37,...,1.0,0.0,5.0,4,278386,77563.0,2004-04-23 15:37,51,0.0,1.0


In [67]:
# Removes NaN values
clds = dataset[dataset['classes'].notna()]

In [68]:
# Gets 'people' topics
txds = clds[clds['classes'].str.contains("people")]

In [69]:
dataset.loc[dataset['RSS_ID'] == 2].classes

0                                  NaN
7                                  NaN
8                             |theatre
9                                  NaN
10                                 NaN
                     ...              
9773                        |misc|wow!
9922    |familar topic|politics|people
9945      |familar topic|people|health
9960            |bad news|economy|wow!
9967             |movies|entertainment
Name: classes, Length: 976, dtype: object

In [70]:
txds.RSS_ID.value_counts()

8157    24
9       13
8186    11
8154     8
8201     8
2        6
11       5
3        5
8153     5
8156     4
8216     3
14       3
8221     3
8202     2
8183     2
10       2
8209     2
3661     2
8184     2
1302     2
5        2
8203     1
12       1
8195     1
15       1
2324     1
8199     1
8217     1
8205     1
1579     1
3621     1
1839     1
8242     1
8272     1
8181     1
2905     1
6747     1
2909     1
606      1
6499     1
2685     1
501      1
686      1
4293     1
8152     1
2113     1
Name: RSS_ID, dtype: int64

In [64]:
txds.loc[txds['RSS_ID'] == 9 or txds['RSS_ID'] == 8157]

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [79]:
# Select two most populous RSS IDs
nines = txds.loc[txds['RSS_ID'] == 9]
eights = txds.loc[txds['RSS_ID'] == 8157]
frames = [nines, eights]
final_ds = pandas.concat(frames)
final_ds.count()

EventOnScroll       37
MSecForDownArrow    37
TimeOnVScroll       37
relevant            37
NumOfPageUp         37
TimeOnMouse         37
NumOfPageDown       37
ClickOnWindow       37
log_id              37
serverTimeVisit     37
MSecForPageUp       37
MSecForUpArrow      37
NumOfUpArrow        37
classes             37
MSecForPageDown     37
RSS_ID              37
readability         37
TimeOnHScroll       37
novelty             37
user_like           37
DOC_ID              37
TimeOnPage          37
TimeVisit           37
user_id             37
NumOfDownArrow      37
authority           37
dtype: int64

In [80]:
# Now we need to normalize relevance to between 0 and 1, then add Gaussian noise, then clip.
final_ds['relevant'] = final_ds['relevant'] / 5

In [81]:
final_ds['relevant'] = final_ds['relevant'] + np.random.normal(0, 0.05, final_ds['relevant'].count())

In [82]:
final_ds.loc[final_ds['relevant'] < 0] = 0
final_ds.loc[final_ds['relevant'] > 1] = 1

In [83]:
final_ds.head()

Unnamed: 0,EventOnScroll,MSecForDownArrow,TimeOnVScroll,relevant,NumOfPageUp,TimeOnMouse,NumOfPageDown,ClickOnWindow,log_id,serverTimeVisit,...,readability,TimeOnHScroll,novelty,user_like,DOC_ID,TimeOnPage,TimeVisit,user_id,NumOfDownArrow,authority
9716,0.0,0.0,0.0,0.627253,0,1847,0.0,0.0,5030,2004-04-29 23:43,...,1.0,0.0,3.0,3,292655,82735.0,2004-04-29 23:43,92,0.0,1.0
9777,0.0,0.0,0.0,0.395508,0,1312,0.0,0.0,5769,2004-05-02 01:44,...,1.0,0.0,2.0,3,271918,24078.0,2004-05-02 01:44,92,0.0,1.0
9932,0.0,0.0,0.0,0.712238,0,310,0.0,0.0,19695,2004-05-11 18:46,...,1.0,0.0,2.0,3,312319,26750.0,2004-05-11 18:46,92,0.0,1.0
9933,0.0,0.0,0.0,0.465007,0,1173,0.0,0.0,19696,2004-05-11 18:48,...,1.0,0.0,2.0,3,312321,66984.0,2004-05-11 18:48,92,0.0,1.0
9934,0.0,0.0,0.0,0.615199,0,2296,0.0,0.0,19697,2004-05-11 18:50,...,1.0,0.0,3.0,3,312318,116375.0,2004-05-11 18:50,92,0.0,1.0


In [84]:
final_ds.loc[final_ds['relevant'] == 1].count()

EventOnScroll       0
MSecForDownArrow    0
TimeOnVScroll       0
relevant            0
NumOfPageUp         0
TimeOnMouse         0
NumOfPageDown       0
ClickOnWindow       0
log_id              0
serverTimeVisit     0
MSecForPageUp       0
MSecForUpArrow      0
NumOfUpArrow        0
classes             0
MSecForPageDown     0
RSS_ID              0
readability         0
TimeOnHScroll       0
novelty             0
user_like           0
DOC_ID              0
TimeOnPage          0
TimeVisit           0
user_id             0
NumOfDownArrow      0
authority           0
dtype: int64

In [None]:
"""
    inputs:
        d: document = article from an authority
        u: user from authority
        q: query =
        u: expected utility u(d|q) = \sum_{u \in users} \lambda(rel(d|u, q)) P(u|q)
            a vector length of the number of documents
            = (dataset.relevance / 5) + Gaussian(0.05)
        P:
        v: position bias (how much attention an article gets based on rank)
            a vector length of the number of documents
            v(j) = \frac{1}{log(1 + j)} where j is the rank of document d
        fair constraint: f^T P g
            f: (DPC) f_i = \frac{1}{|G_0|} \cdot (1_{d_i \in G_0}) - \frac{1}{|G_1|} \cdot (1_{d_i \in G_1}) where 1 is the indicator
            g: g = v
"""
linprog