In [1]:
import os
import sys

# Make the sibling ``src`` directory importable so the project modules
# imported below can be resolved; the path is added at most once.
module_path = os.path.abspath('../src')
already_registered = any(entry == module_path for entry in sys.path)
if not already_registered:
    sys.path += [module_path]
    
from prefix_span import PrefixSpan
from js_distance import JS
from sequence_generator import SequenceGenerator

# Descriptive Database
The table `data` contains the following columns:

| column            | content explanation                                           |
|:----------------: | :----------------------------------------------------------: |
| item_id           | edited item page ID                                          |
| item_name         | respective item page name                                    |
| label             | English label of the item page                               |
| category          | classified content category based on label and description   |
| user_id           | editor ID                                                    |
| user_name         | editor name                                                  |
| user_group        | editor's user group and their corresponding user rights      |
| user_editcount    | rough number of edits and edit-like actions the user has performed |
| user_registration | editor registration timestamp                                |
| rev_id            | revision(edit) ID                                            | 
| rev_timestamp     | revision timestamp                                           |
| comment           | original comment information for this edit                   |
| edit_summary      | comment information simplified with regular expression       |
| edit_type         | schematized and classified edit summary for ease of use      |
| paraphrase        | paraphrase of edit summary according to Wikibase API         |
| prediction        | quality prediction of this revision ID, chosen as the one with the biggest probability |
|itemquality_A, itemquality_B, itemquality_C, itemquality_D, itemquality_E | concrete quality level probability distribution of this revision                                          |
| js_distance       | Jensen-Shannon divergence value based on given quality distribution |

# Sequence Analysis
## Generate Sequence Database

An event is a list of consecutive activities contributed by the same editor (a list of strings).

A sequence is a list of events that occurred on the same article (a list of events).

A sequence database is a list of sequences. (list)

Thus, a sequence database is a list of lists of lists of strings.

A sequence database ready to be mined is determined by setting up the js-distance constraint.

In [3]:
# Build the sequence database from the CSV export, passing 0.8 as the
# js-distance constraint described in the section above.
seq = SequenceGenerator(
    csvfile='../db/data.csv',
    jsThreshold=0.8,
)
seq_db = seq.generate_sequence()

# Print one sequence per line so each article's list of events is readable.
for sequence in seq_db:
    print(sequence)

[['set reference', 'remove reference']]
[['set item', 'set item', 'set item', 'set item', 'set item'], ['revert edits', 'revert edits', 'revert edits', 'revert edits', 'revert edits']]
[['set reference']]
[['set reference']]
[['revert edits'], ['add reference']]
[['add reference']]
[['set claim', 'set reference']]
[['set reference']]
[['add reference']]
[['set reference']]
[['set description']]
[['set label'], ['set claim']]
[['set reference'], ['set reference'], ['remove claim']]
[['set reference']]
[['set reference']]
[['set reference']]
[['revert edits']]
[['set reference']]
[['set reference']]
[['update item']]
[['set reference']]
[['set reference', 'revert edits'], ['revert edits'], ['revert edits'], ['add reference', 'add reference'], ['revert edits'], ['revert edits'], ['revert edits'], ['remove claim']]
[['add reference']]
[['set reference']]
[['set reference']]
[['add reference']]
[['set reference'], ['update item']]
[['add reference'], ['revert edits']]
[['set claim']]
[['set

## Mine Sequential Patterns

The sequential patterns within the sequence database are discovered with the PrefixSpan algorithm, given a minimum support threshold.

In [4]:
# Mine `seq_db` (built in the previous cell) with PrefixSpan at minSupport=0.1.
# `prex` is reused by every later mining cell in this notebook.
prex = PrefixSpan()
result = prex.prefix_span(
    dataset=seq_db,
    minSupport=0.1,
)

# `display` returns the table that is printed below (pattern, count).
df = prex.display(result)
print(df)

                     0   1
2        [[set claim]]  99
3    [[set reference]]  56
0  [[add description]]  25
1    [[add reference]]  25
4      [[update item]]  25


## Representative Patterns 

The following combinations of quality and frequency are used to mine patterns from different perspectives. Each combination is achieved by adjusting the `jsThreshold` and `minSupport` constraints:

* high quality + high frequency  
* high quality + middle frequency
* high quality + low frequency 
____________________________________
* middle quality + high frequency
* middle quality + middle frequency
* middle quality + low frequency
____________________________________

* low quality + high frequency
* low quality + middle frequency
* low quality + low frequency
____________________________________

* no quality constraint + high frequency
* no quality constraint + middle frequency
* no quality constraint + low frequency 
____________________________________


| Grading       | Range         |
| :-------------| :-------------|
| Q_high        | \[0.7, 1)     |
| Q_middle      | \[0.3, 0.7)   |
| Q_low         | (0, 0.3)      |
| F_high        | \[0.2, 1)     |
| F_middle      | \[0.05, 0.2)  |
| F_low         | (0, 0.05)     |

### High Quality Constraint 

In [5]:
# High quality: jsThreshold=0.75 falls in the Q_high range [0.7, 1) of the
# grading table above.
seq_high = SequenceGenerator(
    csvfile='../db/data.csv',
    jsThreshold=0.75,
)
db = seq_high.generate_sequence()

# Mine the same database once per frequency grade (F_high, F_middle, F_low).
# The generator is consumed left to right, so the calls run in that order.
highF, midF, lowF = (
    prex.prefix_span(dataset=db, minSupport=support)
    for support in (0.25, 0.1, 0.01)
)
prex.display(highF)

Unnamed: 0,0,1
0,[[set claim]],125
1,[[set reference]],97


In [6]:
# Show the patterns mined at minSupport=0.1 from the jsThreshold=0.75 database.
# NOTE: `midF` is rebound by the other quality sections of this notebook, so
# this cell must run right after the High Quality mining cell to show its result.
prex.display(midF)

Unnamed: 0,0,1
1,[[set claim]],125
2,[[set reference]],97
0,[[add reference]],34


In [7]:
# Show the patterns mined at minSupport=0.01 from the jsThreshold=0.75 database.
# NOTE: `lowF` is rebound by the other quality sections of this notebook, so
# this cell must run right after the High Quality mining cell to show its result.
prex.display(lowF)

Unnamed: 0,0,1
12,[[set claim]],125
19,[[set reference]],97
3,[[add reference]],34
0,[[add description]],27
25,[[update item]],26
26,"[[update item], [update item]]",14
8,[[remove claim]],11
16,[[set description]],10
11,[[revert edits]],9
18,[[set label]],8


### Middle Quality Constraint

In [8]:
# Middle quality: jsThreshold=0.35 falls in the Q_middle range [0.3, 0.7) of
# the grading table above.
seq_mid = SequenceGenerator(
    csvfile='../db/data.csv',
    jsThreshold=0.35,
)
db = seq_mid.generate_sequence()

# Mine the same database once per frequency grade (F_high, F_middle, F_low).
# The generator is consumed left to right, so the calls run in that order.
highF, midF, lowF = (
    prex.prefix_span(dataset=db, minSupport=support)
    for support in (0.25, 0.1, 0.01)
)
prex.display(highF)

Unnamed: 0,0,1
0,[[set claim]],193
1,[[set reference]],169


In [9]:
# Show the patterns mined at minSupport=0.1 from the jsThreshold=0.35 database.
# NOTE: `midF` is rebound by the other quality sections of this notebook, so
# this cell must run right after the Middle Quality mining cell to show its result.
prex.display(midF)

Unnamed: 0,0,1
2,[[set claim]],193
3,[[set reference]],169
4,[[update item]],59
0,[[add description]],58
1,[[add reference]],47


In [10]:
# Show the patterns mined at minSupport=0.01 from the jsThreshold=0.35 database.
# NOTE: `lowF` is rebound by the other quality sections of this notebook, so
# this cell must run right after the Middle Quality mining cell to show its result.
prex.display(lowF)

Unnamed: 0,0,1
18,[[set claim]],193
31,[[set reference]],169
41,[[update item]],59
0,[[add description]],58
5,[[add reference]],47
37,"[[set reference], [set claim]]",34
12,[[remove claim]],19
39,"[[set reference], [update item]]",18
32,"[[set reference], [add description]]",18
45,"[[update item], [update item]]",18


### Low Quality Constraint

In [13]:
# Low quality: jsThreshold=0.05 falls in the Q_low range (0, 0.3) of the
# grading table above.
seq_low = SequenceGenerator(
    csvfile='../db/data.csv',
    jsThreshold=0.05,
)
db = seq_low.generate_sequence()

# Mine the same database once per frequency grade (F_high, F_middle, F_low).
# The generator is consumed left to right, so the calls run in that order.
highF, midF, lowF = (
    prex.prefix_span(dataset=db, minSupport=support)
    for support in (0.25, 0.1, 0.01)
)
prex.display(highF)

Unnamed: 0,0,1
15,[[set claim]],406
20,"[[set claim], [set claim]]",294
2,[[add reference]],272
29,[[set description]],226
17,"[[set claim], [add reference]]",225
7,"[[add reference], [set claim]]",220
33,[[set reference]],219
4,"[[add reference], [add reference]]",210
31,[[set label]],202
42,[[update item]],197


In [14]:
# Show the patterns mined at minSupport=0.1 from the jsThreshold=0.05 database.
# NOTE: `midF` is also assigned by the other quality sections of this notebook,
# so this cell must run right after the Low Quality mining cell.
prex.display(midF)

Unnamed: 0,0,1
104,[[set claim]],406
143,"[[set claim], [set claim]]",294
13,[[add reference]],272
188,[[set description]],226
113,"[[set claim], [add reference]]",225
50,"[[add reference], [set claim]]",220
205,[[set reference]],219
22,"[[add reference], [add reference]]",210
199,[[set label]],202
254,[[update item]],197


In [15]:
# Show the patterns mined at minSupport=0.01 from the jsThreshold=0.05 database.
# NOTE: `lowF` is also assigned by the other quality sections of this notebook,
# so this cell must run right after the Low Quality mining cell.
# NOTE(review): the recorded output lists the same ten patterns as the highF and
# midF cells of this section (only the index differs); presumably `display`
# shows just the most frequent entries. Confirm against PrefixSpan.display.
prex.display(lowF)

Unnamed: 0,0,1
7616,[[set claim]],406
10525,"[[set claim], [set claim]]",294
1349,[[add reference]],272
12769,[[set description]],226
8412,"[[set claim], [add reference]]",225
3879,"[[add reference], [set claim]]",220
13727,[[set reference]],219
2092,"[[add reference], [add reference]]",210
13499,[[set label]],202
17195,[[update item]],197
