In [9]:
import sys
sys.path.append("../")

In [10]:
from src.tokenizer import CommitBucket

In [11]:
import spacy
from string import punctuation
from spacy.tokenizer import Tokenizer
import re
nlp = spacy.load("en")

In [12]:
bucket = CommitBucket("../tmp/plotly@plotly.py.jsonl")

In [13]:
commits = (bucket.iterate_commits())
diff = [next(commits) for i in range(100)][32].modifications[0].diff
print(diff)

@@ -87,6 +87,14 @@ class plotly:
 				pass
 		return res
 
+	def fig2plotly(self, fig):
+		try:
+			import matplotlylib
+		except as e:
+			print("Uh oh! matplotlylib not installed. Install with pip (depends on matplotlib):\n$ sudo pip install matplotlylib")
+			raise e
+		matplotlylib.fig2plotly(fig, username=self.username, key=self.api_key)
+
 	def __callplot(self, *args, **kwargs):
 		''' Make a plot in plotly.
 		Two interfaces:



In [14]:
diff

'@@ -87,6 +87,14 @@ class plotly:\n \t\t\t\tpass\n \t\treturn res\n \n+\tdef fig2plotly(self, fig):\n+\t\ttry:\n+\t\t\timport matplotlylib\n+\t\texcept as e:\n+\t\t\tprint("Uh oh! matplotlylib not installed. Install with pip (depends on matplotlib):\\n$ sudo pip install matplotlylib")\n+\t\t\traise e\n+\t\tmatplotlylib.fig2plotly(fig, username=self.username, key=self.api_key)\n+\n \tdef __callplot(self, *args, **kwargs):\n \t\t\'\'\' Make a plot in plotly.\n \t\tTwo interfaces:\n'

In [51]:
def create_custom_tokenizer(nlp):

    token_match = re.compile(r"(<ADD>|<DEL>|<space>|<tab>)")

    prefix_re = re.compile(r"[(]")
    infix_re = re.compile(r"[^\w]")
    suffix_re = re.compile(r"[)]")
    
    return Tokenizer(nlp.vocab, 
                    None,
                     prefix_search = prefix_re.search, 
                     infix_finditer = infix_re.finditer, 
                     suffix_search = None,
                     token_match=token_match.search)

In [52]:
nlp.tokenizer = create_custom_tokenizer(nlp)

In [53]:
[token for token in nlp("(for tok in https://google.com range(100): return tok)")]

[(,
 for,
 tok,
 in,
 https,
 :,
 /,
 /,
 google,
 .,
 com,
 r,
 a,
 n,
 g,
 e,
 (,
 100,
 ),
 :,
 return,
 tok,
 )]

In [95]:
import typing 
import string

class PunctTokenizer:
    def __init__(self, punctuation: str = string.punctuation, ignore_punctuation: bool = False):
        self.punctuation = punctuation
        self._punctuation_tokens = list(punctuation) + ["\n", "\t", " "]
        self.ignore_punctuation = ignore_punctuation
        self.add_token = "<ADD>"
        self.del_token = "<DEL>"
        self.space_token = "<SPACE>"
        self.tab_token = "<TAB>"
        self.new_line_token = "<NL>"
    
    def tokenize(self, text: str) -> typing.List[str]:
        text = self.preprocess(text)
        tokens = []
        identation_found = False

        for token in nlp(text):
            if self.ignore_punctuation:
                if not self.is_punctuation(token.text):
                    tokens.append(token.text)
                continue
                
            if identation_found:
                if token.text == " ":
                    tokens.append(self.space_token)
                elif token.text == "\t":
                    tokens.append(self.tab_token)
                elif token.text == "\n":
                    tokens.append(self.new_line_token)
                else:
                    tokens.append(token.text)
                    identation_found = False
            else:
                if token.text == "\n":
                    tokens.append(self.new_line_token)
                    identation_found = True
                elif token.text not in [" ", "\t"]:
                    tokens.append(token.text)

        return tokens
    
    def preprocess(self, text: str) -> str:
        text = re.sub(r"\@@.*?\@@", "", text)
        text = re.sub(f"([{punctuation}\\\])", r" \1 ", text)
        text = text.replace("\n +", f"\n{self.add_token}").replace("\n -", f"\n{self.del_token}")
        return text
    
    def is_punctuation(self, token: str) -> bool:
        return token in self._punctuation_tokens

In [96]:
PunctTokenizer().preprocess(diff)

' class plotly : \n \t\t\t\tpass\n \t\treturn res\n \n<ADD> \tdef fig2plotly ( self ,  fig )  : \n<ADD> \t\ttry : \n<ADD> \t\t\timport matplotlylib\n<ADD> \t\texcept as e : \n<ADD> \t\t\tprint (  " Uh oh !  matplotlylib not installed .  Install with pip  ( depends on matplotlib )  :  \\ n $  sudo pip install matplotlylib "  ) \n<ADD> \t\t\traise e\n<ADD> \t\tmatplotlylib . fig2plotly ( fig ,  username = self . username ,  key = self . api _ key ) \n<ADD> \n \tdef  _  _ callplot ( self ,   * args ,   *  * kwargs )  : \n \t\t \'  \'  \'  Make a plot in plotly . \n \t\tTwo interfaces : \n'

In [99]:
len(PunctTokenizer(ignore_punctuation=True).tokenize(diff))

64

In [1089]:
subword_toks = """@@ -@@ 174,1@@ 0 +@@ 174,1@@ 1 @@ de@@ f de@@ pend@@ a@@ n@@ c@@ ies@@ _@@ in@@ _@@ d@@ o@@ t@@ _@@ f@@ or@@ m@@ at@@ (@@ p@@ at@@ h@@ )@@ :\n f@@ o@@ r (k, k2@@ ) i@@ n t@@ w@@ o@@ _@@ w@@ a@@ y@@ s@@ :\n l.append("\\t\\"@@ %s@@ \\" -@@ > \\"@@ %s\\"@@ ;@@ " % (k, k2@@ )@@ )\n \n@@ - l.append("\\t")\n@@ - l.append("\\tedge [color=black];")\n@@ - l.append("\\tnode [shape=plaintext];")\n@@ - l.append("\\t\\"Categories\\" [label=\\"@@ Categories@@ :@@ \\\\n@@ \\\\n@@ %s\\"];" % "\\\\n".join(category_list)@@ )\n@@ + i@@ f category_list@@ :@@ \n@@ + l.append("\\t")\n@@ + l.append("\\tedge [color=black];")\n@@ + l.append("\\tnode [shape=plaintext];")\n@@ + l.append("\\t\\"Categories\\" [label=\\"@@ %s\\"];" % "\\\\n".join(category_list)@@ )\n \n l.append(@@ "@@ }@@ \\n"@@ )\n r@@ e@@ t@@ u@@ r@@ n \'@@ \\n@@ \'@@ .join(@@ l@@ )\n"""

In [1091]:
len(subword_toks.split(" "))

123

In [14]:
import sentencepiece as spm

# train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# `m.vocab` is just a reference. not used in the segmentation.
# spm.SentencePieceTrainer.train('--input=/workspace/tmp/openshift@openshift-ansible.jsonl --model_prefix=sentpiece --vocab_size=8000 --character_coverage=1.0 --model_type=bpe')

In [19]:
%%time
spm.SentencePieceTrainer.train('--input=/workspace/tmp/diffs.train --user_defined_symbols=<FILE>,<CHUNK>,<NL>,<ADD>,<DEL>,<URL> --model_prefix=model1 --vocab_size=16000 --hard_vocab_limit=false --input_sentence_size=1000 --model_type=bpe')

CPU times: user 1.88 s, sys: 110 ms, total: 1.99 s
Wall time: 2.07 s


True

In [20]:
diffs = open("/workspace/tmp/diffs.train").readlines()
len(diffs)

10000

In [33]:
diff = diffs[9111]
diff, len(diff.split(" "))

('<FILE> tutorial_02.rst <CHUNK> url this view will respond to: <NL>   <NL>  .. code-block:: python <NL>   <NL> <ADD>     from django.conf.urls import patterns, url <NL> <ADD>     from oauth2_provider import views <NL> <ADD>     from django.conf import settings <NL>      from .views import apiendpoint <NL>   <NL>      urlpatterns = patterns( <NL>          \'\', <NL>          url(r\'^admin/\', include(admin.site.urls)), <NL> <DEL>         url(r\'^o/\', include(\'oauth2_provider.urls\', namespace=\'oauth2_provider\')),  # look ma, i\'m a provider! <NL> <DEL>         url(r\'^api/hello\', apiendpoint.as_view()),  # and also a resource server! <NL> <ADD>  <NL> <ADD>         # oauth2 provider endpoints <NL> <ADD>         url(r\'^o/authorize/$\', views.authorizationview.as_view(), name="authorize"), <NL> <ADD>         url(r\'^o/token/$\', views.tokenview.as_view(), name="token"), <NL> <ADD>         url(r\'^o/revoke-token/$\', views.revoketokenview.as_view(), name="revoke-token"), <NL> <ADD>  

In [34]:
# makes segmenter instance and loads the model file (m.model)
sp = spm.SentencePieceProcessor()
sp.load('model1.model')

# encode: text => id
print(sp.encode_as_pieces(diff))
print(sp.encode_as_ids(diff))

['▁', '<FILE>', '▁tutorial', '_02.', 'rst', '▁', '<CHUNK>', '▁url', '▁this', '▁view', '▁will', '▁respon', 'd', '▁to', ':', '▁', '<NL>', '▁', '<NL>', '▁..', '▁code', '-', 'block', '::', '▁python', '▁', '<NL>', '▁', '<NL>', '▁', '<ADD>', '▁from', '▁django', '.', 'conf', '.', 'urls', '▁import', '▁patterns', ',', '▁url', '▁', '<NL>', '▁', '<ADD>', '▁from', '▁oauth', '2_', 'provider', '▁import', '▁views', '▁', '<NL>', '▁', '<ADD>', '▁from', '▁django', '.', 'conf', '▁import', '▁settings', '▁', '<NL>', '▁from', '▁.', 'views', '▁import', '▁api', 'endpoint', '▁', '<NL>', '▁', '<NL>', '▁urlpatterns', '▁=', '▁patterns', '(', '▁', '<NL>', "▁'',", '▁', '<NL>', '▁url', '(', 'r', "'", '^', 'admin', "/',", '▁include', '(', 'admin', '.', 'site', '.', 'urls', ')),', '▁', '<NL>', '▁', '<DEL>', '▁url', '(', 'r', "'", '^', 'o', "/',", '▁include', "('", 'oauth', '2_', 'provider', '.', 'urls', "',", '▁namespace', "='", 'oauth', '2_', 'provider', "')),", '▁#', '▁look', '▁ma', ',', '▁i', "'", 'm', '▁a', '▁prov

In [35]:
len(sp.encode_as_pieces(diff))

584