In [62]:
import os
import sys

module_path = os.path.abspath(os.path.join('../src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from prefix_span import PrefixSpan
from js_distance import JS
from sequence_generator import SequenceGenerator

# Descriptive Database
The tabel `data` contains the following content: 

| column            | content explaination                                          |
|:----------------: | :----------------------------------------------------------: |
| item_id           | edited item page ID                                          |
| item_name         | respective item page name                                    |
| label             | English label of the item page                               |
| category          | classified content category based on label and description   |
| user_id           | editor ID                                                    |
| user_name         | editer name                                                  |
| user_group        | editor's user group and their corresponding user rights      |
| user_editcount    | rough number of edits and edit-like actions the user has performed |
| user_registration | editor registration timestamp                                |
| rev_id            | revision(edit) ID                                            | 
| rev_timestamp     | revision timestamp                                           |
| comment           | original comment information for this edit                   |
| edit_summary      | comment information simplified with regular expression       |
| edit_type         | schematized and classified edit summary for ease of use      |
| paraphrase        | paraphrase of edit summary according to Wikibase API         |
| prediction        | quality prediction of this revision ID, chosen as the one with the biggest probability |
|itemquality_A, itemquality_B, itemquality_C, itemquality_D, itemquality_E | concrete quality level probability distribution of this revision                                          |
| js_distance       | Jensen-Shannon divergence value based on given quality distribution |

# Sequence Analysis
## Generate Sequence Database

An event is a list of continuous activities contributed by the same editor. (list of strings)

A sequence is a list of events occurred on the same article. (list)

A sequence database is a list of sequences. (list)

Thus, a sequence database is a list of lists of lists of strings.

A sequence database ready to be mined is determined by setting up the js-distance constraint.

In [63]:
seq = SequenceGenerator(csvfile='../db/data.csv', jsThreshold=0.8)
seq_db = seq.generate_sequence()
for sequence in seq_db:
    print(sequence)

[['set reference', 'remove reference']]
[['set item', 'set item', 'set item', 'set item', 'set item']]
[['revert edits', 'revert edits', 'revert edits', 'revert edits', 'revert edits']]
[['set reference']]
[['set reference']]
[['revert edits']]
[['add reference']]
[['add reference']]
[['set claim', 'set reference']]
[['set reference']]
[['add reference']]
[['set reference']]
[['set description']]
[['set label']]
[['set claim']]
[['set reference']]
[['set reference']]
[['remove claim']]
[['set reference']]
[['set reference']]
[['set reference']]
[['revert edits']]
[['set reference']]
[['set reference']]
[['update item']]
[['set reference']]
[['set reference', 'revert edits']]
[['revert edits']]
[['revert edits']]
[['add reference', 'add reference']]
[['revert edits']]
[['revert edits']]
[['revert edits']]
[['remove claim']]
[['add reference']]
[['set reference']]
[['set reference']]
[['add reference']]
[['set reference']]
[['update item']]
[['add reference']]
[['revert edits']]
[['set c

## Mine Sequential Patterns

The sequential patterns within the sequence database are discovered with PrefixSpan algorithm by setting up the minimum support threshold.

In [64]:
prex = PrefixSpan()
result = prex.prefix_span(dataset=seq_db, minSupport=0.1)
df = prex.display(result)
print(df)

                   0    1
0      [[set claim]]  102
1  [[set reference]]   57
2    [[update item]]   39


## Representative Patterns 

Following metrics are used for mining patterns from different perspectives, this can be archieved by adjusting the jsThreshold and minSupport constraints:

* high quality + high frequency  
* high quality + middle frequency
* high quality + low frequency 
____________________________________
* middle quality + high frequency
* middle quality + middle frequency
* middle quality + low frequency
____________________________________

* low quality + high frequency
* low quality + middle frequency
* low quality + low frequency
____________________________________

* no quality constraint + high frequency
* no quality constraint + middle frequency
* no quality constraint + low frequency 
____________________________________


| Grading       | Range         |
| :-------------| :-------------|
| Q_high        | \[0.7, 1)     |
| Q_middle      | \[0.3, 0.7)   |
| Q_low         | (0, 0.3)      |
| F_high        | \[0.2, 1)     |
| F_middle      | \[0.05, 0.2)  |
| F_low         | (0, 0.05)     |

### High Quality Constraint 

In [65]:
seq_high = SequenceGenerator(csvfile='../db/data.csv', jsThreshold=0.75)
db = seq_high.generate_sequence()
highF = prex.prefix_span(dataset=db, minSupport=0.25)
midF = prex.prefix_span(dataset=db, minSupport=0.1)
lowF = prex.prefix_span(dataset=db, minSupport=0.01)
prex.display(highF)

Unnamed: 0,0,1
0,[[set claim]],130
1,[[set reference]],101


In [66]:
prex.display(midF)

Unnamed: 0,0,1
0,[[set claim]],130
1,[[set reference]],101
2,[[update item]],40


In [67]:
prex.display(lowF)

Unnamed: 0,0,1
6,[[set claim]],130
10,[[set reference]],101
12,[[update item]],40
1,[[add reference]],35
0,[[add description]],32
5,[[revert edits]],15
4,[[remove claim]],12
8,[[set description]],10
3,[[merge item]],9
9,[[set label]],9


### Middle Quality Constraint

In [68]:
seq_mid = SequenceGenerator(csvfile='../db/data.csv', jsThreshold=0.35)
db = seq_mid.generate_sequence()
highF = prex.prefix_span(dataset=db, minSupport=0.25)
midF = prex.prefix_span(dataset=db, minSupport=0.1)
lowF = prex.prefix_span(dataset=db, minSupport=0.01)
prex.display(highF)

Unnamed: 0,0,1
0,[[set claim]],209
1,[[set reference]],173


In [69]:
prex.display(midF)

Unnamed: 0,0,1
0,[[set claim]],209
1,[[set reference]],173
2,[[update item]],77


In [70]:
prex.display(lowF)

Unnamed: 0,0,1
7,[[set claim]],209
11,[[set reference]],173
12,[[update item]],77
0,[[add description]],65
2,[[add reference]],49
6,[[revert edits]],26
4,[[remove claim]],21
10,[[set label]],17
9,[[set description]],16
3,[[merge item]],15


### Low Quality Constraint

In [71]:
seq_low = SequenceGenerator(csvfile='../db/data.csv', jsThreshold=0.01)
db = seq_low.generate_sequence()
highF = prex.prefix_span(dataset=db, minSupport=0.25)
midF = prex.prefix_span(dataset=db, minSupport=0.1)
lowF = prex.prefix_span(dataset=db, minSupport=0.01)
prex.display(highF)

Unnamed: 0,0,1
0,[[set claim]],3616


In [72]:
prex.display(midF)

Unnamed: 0,0,1
1,[[set claim]],3616
0,[[add reference]],1910
2,[[set description]],1786


In [73]:
prex.display(lowF)

Unnamed: 0,0,1
10,[[set claim]],3616
4,[[add reference]],1910
13,[[set description]],1786
1,[[add description]],1092
21,[[update item]],903
5,"[[add reference, set claim]]",827
16,[[set label]],757
7,[[remove claim]],555
18,[[set sitelink]],522
17,[[set reference]],479


### No Quality Constraint

In [74]:
seq_low = SequenceGenerator(csvfile='../db/data.csv', jsThreshold=0)
db = seq_low.generate_sequence()
highF = prex.prefix_span(dataset=db, minSupport=0.25)
midF = prex.prefix_span(dataset=db, minSupport=0.1)
lowF = prex.prefix_span(dataset=db, minSupport=0.01)
prex.display(highF)

Unnamed: 0,0,1
0,[[set claim]],12279


In [75]:
prex.display(midF)

Unnamed: 0,0,1
2,[[set claim]],12279
3,[[set description]],4929
1,[[add sitelink]],4452
0,[[add description]],3980


In [76]:
prex.display(lowF)

Unnamed: 0,0,1
20,[[set claim]],12279
25,[[set description]],4929
12,[[add sitelink]],4452
3,[[add description]],3980
30,[[set label]],3581
36,[[update item]],3436
10,[[add reference]],3190
11,"[[add reference, set claim]]",2923
35,[[update claim]],2860
33,[[set sitelink]],2653


In [77]:
maximal = prex.filterMaximal(lowF)
prex.display(maximal)

Unnamed: 0,0,1
4,[[add sitelink]],4452
19,[[update sitelink]],2050
12,"[[set claim, update claim]]",1428
14,"[[set description, set label]]",1363
6,"[[remove claim, set claim]]",1275
17,[[set item]],1006
0,"[[add alias, add description]]",857
8,[[remove sitelink]],756
18,"[[set label, set sitelink]]",712
13,"[[set claim, update item]]",590
