In [8]:
import os
import sys

module_path = os.path.abspath(os.path.join('../src'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from prefix_span import PrefixSpan
from js_distance import JS
from sequence_generator import SequenceGenerator

# Descriptive Database
The tabel `data` contains the following content: 

| column            | content explanation                                          |
|:----------------: | :----------------------------------------------------------: |
| item_id           | edited item page ID                                          |
| item_name         | respective item page name                                    |
| label             | English label of the item page                               |
| category          | classified content category based on label and description   |
| user_id           | editor ID                                                    |
| user_name         | editer name                                                  |
| user_group        | editor's user group and their corresponding user rights      |
| user_editcount    | rough number of edits and edit-like actions the user has performed |
| user_registration | editor registration timestamp                                |
| rev_id            | revision(edit) ID                                            | 
| rev_timestamp     | revision timestamp                                           |
| comment           | original comment information for this edit                   |
| edit_summary      | comment information simplified with regular expression       |
| edit_type         | schematized and classified edit summary for ease of use      |
| paraphrase        | paraphrase of edit summary according to Wikibase API         |
| prediction        | quality prediction of this revision ID, chosen as the one with the biggest probability |
|itemquality_A, itemquality_B, itemquality_C, itemquality_D, itemquality_E | concrete quality level probability distribution of this revision                                          |
| js_distance       | Jensen-Shannon divergence value based on given quality distribution |

# Sequence Analysis
## Sequence Database

An event is a list of continuous activities contributed by the same editor. (list of strings)

A sequence is a list of events occurred on the same article. (list)

A sequence database is a list of sequences. (list)

Thus, a sequence database is a list of lists of lists of strings.

In [17]:
seq = SequenceGenerator(csvfile='../db/data.csv', jsThreshold=0.5)
data = seq.generate_sequence()
for sequence in data:
    print(sequence)

[['set reference']]
[['set reference']]
[['remove claim']]
[['set reference']]
[['set term']]
[['add reference']]
[['set reference']]
[['revert edits']]
[['set reference']]
[['add reference']]
[['add reference']]
[['remove claim']]
[['set reference']]
[['set reference', 'remove reference']]
[['set reference']]
[['add reference']]
[['remove reference', 'revert edits', 'unrevert edits']]
[['set reference']]
[['set reference']]
[['remove claim']]
[['set reference']]
[['set item', 'set item', 'set item', 'set item', 'set item']]
[['revert edits', 'revert edits', 'revert edits', 'revert edits', 'revert edits']]
[['set reference']]
[['set reference']]
[['set reference']]
[['set claim']]
[['set reference']]
[['set claim']]
[['add reference']]
[['set reference']]
[['set reference']]
[['revert edits']]
[['set claim']]
[['add reference']]
[['remove claim', 'set claim']]
[['add reference']]
[['set reference']]
[['set claim', 'set reference']]
[['add description']]
[['set reference']]
[['set refer

In [12]:
prex = PrefixSpan()
result_df = prex.prefix_span_display(dataset=data, minSupport=5)
print(result_df)

                                                     0     1
114                                      [[set claim]]  3616
40                                   [[add reference]]  1910
127                                [[set description]]  1786
12                                 [[add description]]  1092
161                                    [[update item]]   903
64                        [[add reference, set claim]]   827
144                                      [[set label]]   757
79                                    [[remove claim]]   555
153                                   [[set sitelink]]   522
149                                  [[set reference]]   479
159                                   [[update claim]]   402
139                                       [[set item]]   394
0                                        [[add alias]]   368
75                                    [[add sitelink]]   368
131                     [[set description, set label]]   345
107                     