## Wordpiece Model

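Inspired by "Google's Neural Machine Translation System: Bridging the Gap between Human and Machine Translation" (https://arxiv.org/pdf/1609.08144.pdf)
and "Neural Machine Translation of Rare Words with Subword Units" (https://arxiv.org/abs/1508.07909).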



In [26]:
import os
import csv

from wordpieces import wordpieces as wp
from wordpieces.wordpieces import WPDict, WPDictBuilder

# Directory that holds the input corpora used throughout this notebook.
data_dir = "c-files"

# Corpus the comments file is derived from; "memmgr.c" is a smaller
# alternative that can be swapped in here.
input_file_name = "linux_kernel_concat.txt"

input_file = os.path.join(data_dir, input_file_name)

In [27]:
def readCommentsFileTsv(input_file):
    """Read a tab-separated file and return all of its records.

    Args:
        input_file: Path of the UTF-8 encoded, tab-delimited file to read.

    Returns:
        A list with one entry per record; each entry is the list of string
        fields produced by ``csv.reader``.
    """
    # newline='' leaves line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(input_file, encoding='utf-8', newline='') as infile:
        return list(csv.reader(infile, delimiter="\t"))

## Build Small Dictionary

In [28]:
# Load the extracted comments; the third column of every row is fed to the
# builder as the text to learn from.
comments = readCommentsFileTsv(input_file + ".comments.csv")

wp_dict_builder = WPDictBuilder()
for row in comments:
    wp_dict_builder.learn_sentense(row[2])

# Number of collected entries, followed by the first 20 sorted wordpieces.
print(len(wp_dict_builder.stats))
for piece in wp_dict_builder.sorted_wordpieces()[:20]:
    print(piece)

14900
(' ', 196647)
('e', 69617)
('t', 43929)
('s', 36596)
('n', 29462)
('o', 28536)
('r', 27889)
('a', 26353)
('i', 22927)
('d', 21475)
('l', 19138)
('\n', 17064)
('h', 15964)
('c', 14355)
('u', 12532)
('he', 11454)
('f', 11346)
('_t', 10367)
('.', 9680)
('p', 9467)


#### Keep only the top 1 000 wordpieces

In [29]:
# Destination of the small dictionary, relative to the notebook directory.
small_dict_path = os.path.join("wordpieces", "wordpieces.dict.small.csv")

# Build a dictionary limited to 1000 entries, report its size, then save it.
wp_dict = wp_dict_builder.build(1000)
print("dictionary size is %d " % len(wp_dict.stats))
wp_dict.save_tsv(small_dict_path)


building WPDict of size 1000
dictionary size is 1000 
total sum is 1305372


## Test splitting sentences into wordpieces from the dictionary

In [30]:
# find_longest_chunk
print(wp_dict.find_longest_chunk("_hema"))

# break_word: split individual words and show the pieces space-separated
for word in ("wordpieces", "artem", "sentenses"):
    print(" ".join(wp_dict.break_word(word)))

# break_sentence: first a hand-written sentence ...
print(" ".join(wp_dict.break_sentence("test splitting sentenses into wordpieces from dictionary")))

# ... then the text column of the first three comment rows; the last one uses
# "] [" as separator so the boundaries between pieces stand out.
for separator, row_index in ((" ", 0), (" ", 1), ("] [", 2)):
    print(separator.join(wp_dict.break_sentence(comments[row_index][2])))



('_he', 3)
_wo rd pi ec es
_ar tem
_se nte nse s
_t est _sp li tt ing _se nte nse s _in to _wo rd pi ec es _fr om _di cti on ar y
_li nu x / ker nel / acc t. c 

 B S D _P roc ess _A cc oun tin g _fo r _L in u x 

 A ut ho r: _M arc o _va n _W ier ing en _ < m v w @ pl an ets . el m . ne t > 

 So me _co de _ba sed _on _id eas _an d _co de _fr om : 
Th oma s _ K . _D y as _ < t dy as @ ed en . ru t ge rs . edu > 

T his _fi le _im ple men ts _ B S D -s ty le _pr oce ss _ac cou nti ng . _W hen eve r _an y
 pro ces s _ex its , _an _ac cou nti ng _re co rd _of _t y pe _ " str uct _ac ct " _is 
w rit ten _to _th e _fi le _sp ec ifi ed _wi th _th e _ac ct () _sy ste m _ca ll . _I t _is 
 up _to _us er - le ve l _pr og ram s _to _do _us ef ul _th ing s _wi th _th e _ac cou nti ng
 lo g. _Th e _ke rne l _j ust _pr ov ide s _th e _ra w _ac cou nti ng _in for mat ion .

 ( C ) _C op y ri ght _1 9 9 <UNK> _- _1 9 9 <UNK> _M arc o _va n _W ier ing en _- _ E L M _C ons ult anc y _ B . <UNK> .

 P 

#### Test restoring sentences from a list of wordpieces

In [31]:
# Round trip with the small dictionary: show the text of the first comment,
# break it into wordpieces, then join the pieces back together.
sentence = comments[0][2]
print("original:---------------------", sentence, sep="\n")

breaks = list(wp_dict.break_sentence(sentence))

print()
print("restored:---------------------")
print(wp_dict.joinSentence(breaks))

original:---------------------
linux/kernel/acct.c

BSD Process Accounting for Linux

Author: Marco van Wieringen <mvw@planets.elm.net>

Some code based on ideas and code from:
Thomas K. Dyas <tdyas@eden.rutgers.edu>

This file implements BSD-style process accounting. Whenever any
process exits, an accounting record of type "struct acct" is
written to the file specified with the acct() system call. It is
up to user-level programs to do useful things with the accounting
log. The kernel just provides the raw accounting information.

(C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.

Plugged two leaks. 1) It didn't return acct_file into the free_filps if
the file happened to be read-only. 2) If the accounting was suspended
due to the lack of space it happily allowed to reopen it and completely
lost the old acct_file. 3/10/98, Al Viro.

Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
XTerms and EMACS are manifestations of pure evil. 21/10/98, AV

In [None]:
# Drop the small dictionary and the loaded comment rows (presumably to free
# memory before the larger dictionary is built); `sentence` stays defined.
del wp_dict, comments

## Build Larger Dictionary

In [37]:


dict_builder = WPDictBuilder()

# (file name, line filter) pairs, learned in this order.
corpus_sources = (
    # redis dump: skip lines starting with "#" and lines that are blank.
    ("redis_concat.pp.c", lambda text: not text.startswith("#") and len(text.strip()) != 0),
    # linux kernel dump: every line is learned.
    ("linux_kernel_concat.txt", lambda text: True),
)

for source_name, keep_line in corpus_sources:
    with open(os.path.join(data_dir, source_name), "r", encoding="utf8") as source_file:
        # splitlines(True) keeps the line terminators, so they reach the builder too.
        for source_line in source_file.read().splitlines(True):
            if keep_line(source_line):
                dict_builder.learn_sentense(source_line)

# Number of collected entries, followed by the first 120 sorted wordpieces.
print(len(dict_builder.stats))
for piece in dict_builder.sorted_wordpieces()[:120]:
    print(piece)

# Build a dictionary limited to 2000 entries, show the stat stored for the
# space character and the final size, then save it.
wp_dict_larger = dict_builder.build(2000)
print(wp_dict_larger.stats[" "])
print("dictionary size is %d " % len(wp_dict_larger.stats))

wp_dict_larger.save_tsv(os.path.join("wordpieces", "wordpieces.dict.larger.csv"))


42460
(' ', 4279833)
('\n', 1713342)
('_', 1444482)
('t', 1154191)
('e', 684648)
('r', 521527)
('i', 501193)
('n', 463421)
('s', 437049)
('d', 407529)
(';\n', 394208)
('a', 377875)
('o', 370539)
('c', 311786)
(',', 252147)
('_t', 242778)
('l', 242522)
('u', 224233)
('f', 223871)
(')', 214487)
(';', 206429)
('nt', 198732)
('p', 191255)
('m', 175791)
('in', 172076)
('*', 160441)
('g', 160030)
('(', 157318)
('st', 142448)
('__', 130272)
('h', 124403)
('_*', 124023)
('_s', 121427)
('_i', 110398)
('y', 108125)
('int', 104624)
('ct', 101554)
('v', 99952)
('ed', 97718)
('re', 94123)
('ar', 91569)
('_in', 89253)
('tr', 88436)
('id', 88320)
('=', 86648)
('b', 86059)
('\t', 77225)
(');', 76911)
(');\n', 75653)
('te', 73601)
('on', 73117)
('_c', 71636)
('er', 69455)
('at', 66943)
('ef', 63415)
('k', 63030)
('_r', 60563)
('ch', 59705)
('e_', 58196)
('x', 55567)
('de', 54659)
('si', 53764)
('___', 53483)
('str', 51326)
('_st', 51140)
('en', 47956)
('lo', 47014)
('se', 46786)
('pe', 46778)
('"', 466

In [39]:
# Same round trip as before, this time with the larger dictionary. `sentence`
# must still be defined by the earlier round-trip cell.
print("original:---------------------", sentence, sep="\n")

breaks = list(wp_dict_larger.break_sentence(sentence))

print()
print("restored:---------------------")
print(wp_dict_larger.joinSentence(breaks))

original:---------------------
linux/kernel/acct.c

BSD Process Accounting for Linux

Author: Marco van Wieringen <mvw@planets.elm.net>

Some code based on ideas and code from:
Thomas K. Dyas <tdyas@eden.rutgers.edu>

This file implements BSD-style process accounting. Whenever any
process exits, an accounting record of type "struct acct" is
written to the file specified with the acct() system call. It is
up to user-level programs to do useful things with the accounting
log. The kernel just provides the raw accounting information.

(C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V.

Plugged two leaks. 1) It didn't return acct_file into the free_filps if
the file happened to be read-only. 2) If the accounting was suspended
due to the lack of space it happily allowed to reopen it and completely
lost the old acct_file. 3/10/98, Al Viro.

Now we silently close acct_file on attempt to reopen. Cleaned sys_acct().
XTerms and EMACS are manifestations of pure evil. 21/10/98, AV