In [None]:
import pandas as pd
import numpy as np
import string
import re
import os
import glob
import json

from typing import List, Optional, Dict

In [None]:
# Data root: use the local directory when '/Users/dev/' exists on this
# machine, otherwise fall back to the shared workspace location.
path_root = (
    '/Users/dev/work/data/smc/'
    if os.path.exists('/Users/dev/')
    else '/opt/workspace/data/smc/'
)

print(path_root)

In [None]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order, and the
# position of a path in this list becomes the text id (`num` / `n` column)
# when the texts are loaded, so sort to keep ids stable across runs/machines.
train_pathes = sorted(glob.glob(path_root+"b_*"))
for n, p in enumerate(train_pathes):
    print('{}: {}'.format(n, p))

In [None]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order, and the
# position of a path in this list becomes the text id (`num` / `n` column)
# when the texts are loaded, so sort to keep ids stable across runs/machines.
test_pathes = sorted(glob.glob(path_root+"z_*"))
for n, p in enumerate(test_pathes):
    print('{}: {}'.format(n, p))

In [None]:
class OpenTextProcessor:
    """Turn plain-text files into k-gram context tables and open/closed samples.

    Every text is lower-cased, reduced to the letters ``a-z``, filtered for
    stop words and concatenated into one separator-free string.

    * ``fit`` builds, for every ``k`` in ``min_occurrence``, a k-gram -> id
      encoder from the training texts and returns one DataFrame per ``k``.
    * ``transform`` produces the same kind of DataFrames for other texts using
      the fitted encoders.
    * ``get_openclosed_text_df`` produces fixed-length "open" samples (slices
      of real text, optionally corrupted) and "closed" samples (uniformly
      random letters).
    """

    def __init__(self,
            min_occurrence: Dict[int,int],
            left_cnt: int, right_cnt: int,
            min_error_len: int, max_error_len: int,
            sw='internal'
        ):
        """Store the configuration and resolve the stop-word collection.

        Args:
            min_occurrence: maps k (k-gram length) to the minimum corpus count
                a k-gram needs to get its own id; rarer ones map to '<OTHER>'.
            left_cnt: number of k-grams used as left context.
            right_cnt: number of k-grams used as right context.
            min_error_len: lower bound (inclusive) for the number of random
                edits applied to an "open" sample.
            max_error_len: upper bound (EXCLUSIVE, as in ``np.random.randint``)
                for the number of random edits.
            sw: stop words. ``'internal'`` selects the built-in list; a dict,
                set, frozenset, list or tuple is used as given; anything else
                means "no stop words".
        """
        self.ks = list(min_occurrence.keys())
        self.min_occurrence = min_occurrence
        self.left_cnt = left_cnt
        self.right_cnt = right_cnt
        self.min_error_len = min_error_len
        self.max_error_len = max_error_len

        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'
        self.alen = len(self.alphabet)

        self.sw = {}
        if isinstance(sw, str):
            if sw=='internal':
                self.sw = {
                    'the', 'what', 'is', 'a', 'in', 'i', 'to', 'how', 'of', 'do', 'are', 'and', 'for', 'can', 'you', 't',
                    'why', 'it', 'my', 'does', 'on', 'or', 'which', 's', 'with', 'if', 'have', 'be', 'an', 'that',
                    'some', 'get', 'should', 'from', 'your', 'at', 'when', 'like', 'who', 'there', 'will', 'as',
                    'would', 'not', 'one', 'about', 'where', 'any', 'by', 'me', 'did', 'was', 'we', 'after',
                    'so', 'they', 'this', 'am', 'has', 'their', 'many', 'than', 'more', 'other', 'but', 'out',
                    'into', 'm'
                }
        elif isinstance(sw, dict):
            self.sw = sw
        elif isinstance(sw, (set, frozenset, list, tuple)):
            # Previously only dicts were honoured and any other collection was
            # silently dropped, even though the built-in list itself is a set.
            self.sw = set(sw)
        print('len(sw)={}'.format(len(sw)))

        self.train_texts = None
        # Set by fit(); None means "not fitted yet".
        self.kgramm_enc = None

        print('self.alphabet={}'.format(self.alphabet))
        print('len(self.alphabet)={}'.format(len(self.alphabet)))
        print('len(self.sw)={}'.format(len(self.sw)))

    def _get_texts_dicts(self, pathes: List[str]):
        """Read and preprocess every path; unreadable files are skipped.

        Returns a list of ``{'num', 'path', 'text'}`` dicts where ``num`` is
        the position of the path in ``pathes``.
        """
        texts = []
        for n, p in enumerate(pathes):
            text_p = self._read_txt(p)
            if text_p is None:
                continue

            text_p = self._preproc_txt(text_p)
            print('{}: len={}'.format(p, len(text_p)))

            texts.append({'num':n, 'path':p, 'text':text_p})
        return texts

    def fit(self, train_pathes: List[str]):
        """Load the training texts, build the encoders and the training tables.

        Returns a dict mapping each k to the DataFrame produced by
        ``create_kgramm_df`` for the training texts.
        """
        self.train_texts = self._get_texts_dicts(train_pathes)
        print('len(self.train_texts)={}\n'.format(len(self.train_texts)))

        self.kgramm_enc = self._get_kgramm_encoder()
        print('len(self.kgramm_enc)={}\n'.format(len(self.kgramm_enc)))

        dfs = {}
        for k in self.ks:
            print('k={}'.format(k))
            dfs[k] = self.create_kgramm_df(k=k, txts=self.train_texts)
            print('\n')

        return dfs

    def transform(self, test_pathes: List[str]):
        """Build the per-k tables for ``test_pathes`` with the fitted encoders.

        Raises:
            RuntimeError: if ``fit`` has not been called yet.
        """
        test_texts = self._get_texts_dicts(test_pathes)
        print('len(test_texts)={}\n'.format(len(test_texts)))

        dfs = {}
        for k in self.ks:
            print('k={}'.format(k))
            dfs[k] = self.create_kgramm_df(k=k, txts=test_texts)
            print('\n')

        return dfs

    def _read_txt(self, path: str) -> Optional[str]:
        """Return the UTF-8 content of ``path`` or None (after logging) on error."""
        try:
            with open(path, 'r', encoding='utf-8') as fd:
                return fd.read()
        except Exception as e:
            # Best effort: a broken file must not abort the whole batch.
            print('exception reading path {}: {}'.format(path, e))
        return None

    def _preproc_txt(self, txt: str) -> str:
        """Lower-case, keep only alphabet letters, drop stop words, remove spaces.

        Characters outside the alphabet act as word separators; the surviving
        words are concatenated without any separator.
        """
        allowed = set(self.alphabet)
        cleaned = ''.join(c if c in allowed else ' ' for c in txt.lower())
        # str.split() with no argument collapses runs of spaces and ignores
        # leading/trailing ones, so no regex is needed here.
        words = [w for w in cleaned.split() if w not in self.sw]
        return ''.join(words)

    def _collect_kgramm_stat_text(self, txt: str, k: int) -> Dict[str, int]:
        """Count every k-gram (sliding window, step 1) in ``txt``."""
        counts = {}
        # +1 so the window ending at the last character is counted as well.
        for i in range(len(txt)-k+1):
            kgr = txt[i:i+k]
            counts[kgr] = counts.get(kgr, 0) + 1

        return counts

    def _collect_kgramm_stat(self, k: int) -> Dict[str, int]:
        """Sum the k-gram counts over all training texts."""
        stat = {}
        for text_d in self.train_texts:
            print('processing text {}'.format(text_d['path']))

            text_stat = self._collect_kgramm_stat_text(text_d['text'], k)

            for kgr, v in text_stat.items():
                stat[kgr] = stat.get(kgr, 0) + v

        return stat

    def _get_kgramm_encoder(self) -> Dict[int, dict]:
        """Build, per k, the id<->k-gram maps and the k-gram probabilities.

        Id 1 is reserved for '<OTHER>'; k-grams with at least
        ``min_occurrence[k]`` occurrences get ids 2, 3, ... in order of
        decreasing count. Rarer k-grams are pooled into '<OTHER>'.
        """
        enc_dict = {}
        for k in self.ks:
            print('processing k={}'.format(k))
            d = self._collect_kgramm_stat(k)

            enc_id_to_kgr = {1: '<OTHER>'}
            enc_kgr_to_id = {'<OTHER>': 1}
            kgr_prob = {'<OTHER>': 0}

            cnt_sum = sum(d.values())

            n = 2
            min_prob = None
            other_cnt = 0
            for kgr, cnt in sorted(d.items(), key=lambda x: -x[1]):
                if cnt<self.min_occurrence[k]:
                    other_cnt += cnt
                    continue
                enc_id_to_kgr[n] = kgr
                enc_kgr_to_id[kgr] = n
                kgr_prob[kgr] = cnt/cnt_sum
                if min_prob is None or min_prob>kgr_prob[kgr]:
                    min_prob = kgr_prob[kgr]
                n += 1

            # No k-grams at all (empty / too short corpus): avoid dividing by 0.
            kgr_prob['<OTHER>'] = other_cnt/cnt_sum if cnt_sum else 0.0

            enc_dict[k] = {'id_to_kgr':enc_id_to_kgr, 'kgr_to_id':enc_kgr_to_id, 'kgr_prob':kgr_prob}
            # min_prob stays None when no k-gram passed the threshold; show nan
            # instead of crashing in the float format.
            print('len(k)={}/{} other_prob={:0.6f}, min_prob={:0.6f}\n'.format(
                    len(enc_id_to_kgr), self.alen**k, kgr_prob['<OTHER>'],
                    float('nan') if min_prob is None else min_prob
                )
            )

        return enc_dict

    def create_kgramm_df(self, k: int, txts: Optional[List] = None) -> Optional[pd.DataFrame]:
        """Build the (context, candidate k-gram, target) table for ``txts``.

        For every window position the text is cut into ``left_cnt`` k-grams,
        one target k-gram and ``right_cnt`` k-grams (all adjacent, encoded as
        ids, unknown -> 1). Two kinds of rows are emitted:

        * target=1 with the real target k-gram (skipped when the path contains
          'rnd'),
        * target=0 with a random id different from the real one.

        Args:
            k: k-gram length; must be a key of the fitted encoder.
            txts: list of ``{'num', 'path', 'text'}`` dicts; defaults to the
                training texts.

        Raises:
            RuntimeError: if the encoder has not been fitted.
            ValueError: if the encoder for ``k`` holds fewer than two ids, so
                no differing negative id can be drawn.
        """
        if getattr(self, 'kgramm_enc', None) is None:
            raise RuntimeError('kgramm encoder is not fitted; call fit() first')
        if txts is None:
            txts = self.train_texts or []

        data = []
        encoder = self.kgramm_enc[k]['kgr_to_id']
        allids = list(encoder.values())
        len_allids = len(allids)

        for d in txts:
            n = d['num']
            p = d['path']
            t = d['text']
            print('processing text {}'.format(p))

            tlen = len(t)
            # NOTE(review): a window needs (left_cnt+1+right_cnt)*k characters,
            # so this bound stops k+1 positions earlier than strictly
            # necessary; kept as is to leave the produced row set unchanged.
            for i in range(tlen-(k+self.left_cnt*k+self.right_cnt*k+k)):
                lefts = [
                    encoder.get(t[i+j*k: i+(j+1)*k], 1)
                    for j in range(self.left_cnt)
                ]

                # Offsets are derived from left_cnt directly, so left_cnt == 0
                # works (it used to rely on a leftover loop variable).
                target_pos = i + self.left_cnt*k
                target_kgr = encoder.get(t[target_pos: target_pos+k], 1)

                right_pos = target_pos + k
                rights = [
                    encoder.get(t[right_pos+j*k: right_pos+(j+1)*k], 1)
                    for j in range(self.right_cnt)
                ]

                if 'rnd' not in p:
                    data.append([n, i] + lefts + [target_kgr] + rights + [1])

                if len_allids < 2:
                    # With a single id the rejection loop below never ends.
                    raise ValueError(
                        'encoder for k={} has fewer than 2 ids; cannot draw a negative sample'.format(k)
                    )
                while True:
                    rnd_kgr = allids[np.random.randint(len_allids)]
                    if rnd_kgr != target_kgr:
                        data.append([n, i] + lefts + [rnd_kgr] + rights + [0])
                        break

        left_cols = ['l{}'.format(n) for n in range(self.left_cnt)]
        right_cols = ['r{}'.format(n) for n in range(self.right_cnt)]
        return pd.DataFrame(data=data, columns=['n', 'i']+left_cols+['tkgr']+right_cols+['target'])

    def _rnd_txt(self, tlen):
        """Return ``tlen`` letters drawn uniformly from the alphabet."""
        return ''.join(self.alphabet[i] for i in np.random.randint(self.alen, size=tlen))

    def _insert_rnd(self, txt, nins):
        """Insert ``nins`` random letters at random positions, keep the length."""
        tlen = len(txt)
        ntxt = str(txt)
        if tlen == 0:
            # np.random.randint(0) would raise; nothing to corrupt anyway.
            return ntxt
        for n in range(nins):
            ins_idx = np.random.randint(tlen)
            ntxt = ntxt[:ins_idx]+self.alphabet[np.random.randint(self.alen)]+ntxt[ins_idx:]
        return ntxt[:tlen]

    def _insert_rnd_seq(self, txt, nins):
        """Insert one run of ``nins`` random letters at a random position, keep the length."""
        tlen = len(txt)
        ntxt = str(txt)
        if tlen == 0:
            # np.random.randint(0) would raise; nothing to corrupt anyway.
            return ntxt
        ins_chars = []
        for n in range(nins):
            ins_chars.append(self.alphabet[np.random.randint(self.alen)])

        ins_txt = ''.join(ins_chars)
        ins_idx = np.random.randint(tlen)
        ntxt = ntxt[:ins_idx]+ins_txt+ntxt[ins_idx:]
        return ntxt[:tlen]

    def _replace_rnd(self, txt, nins):
        """Overwrite ``nins`` randomly chosen positions with random letters."""
        tlen = len(txt)
        ntxt = str(txt)
        if tlen == 0:
            # np.random.randint(0) would raise; nothing to corrupt anyway.
            return ntxt
        for n in range(nins):
            ins_idx = np.random.randint(tlen)
            ntxt = ntxt[:ins_idx]+self.alphabet[np.random.randint(self.alen)]+ntxt[ins_idx+1:]
        return ntxt[:tlen]

    def get_openclosed_text_df(self, test_pathes: List[str], clen: int, keep_prob: float) -> pd.DataFrame:
        """Sample fixed-length "open" and "closed" strings from the given files.

        For every start index that is kept (probability ``keep_prob``) two rows
        are produced: one "open" row (open=1) holding the ``clen``-character
        slice of the text after ``num_reps`` random edits of kind ``bad_type``
        (0: scattered inserts, 1: one inserted run, 2: replacements), and one
        "closed" row (open=0) holding ``clen`` uniformly random letters.

        Returns a DataFrame with columns
        ``['n_txt', 'idx', 'bad_type', 'txt', 'num_reps', 'open']``.
        """
        data = []

        test_texts = self._get_texts_dicts(test_pathes)
        print('len(test_texts)={}'.format(len(test_texts)))

        for d in test_texts:
            n = d['num']
            p = d['path']
            t = d['text']
            print('processing text {}'.format(p))
            maxlen = len(t)-clen
            for i in range(maxlen):
                if np.random.rand()>keep_prob:
                    continue

                if i%12345 == 0:
                    print('{}/{} done;'.format(i, maxlen))

                txt = t[i:i+clen]
                if len(txt)!=clen:
                    # Only reachable for a negative clen; skip such slices.
                    continue

                # open text
                dtype = np.random.randint(low=0, high=3)
                if self.max_error_len > self.min_error_len:
                    # high is exclusive: edits are in [min_error_len, max_error_len).
                    sr = np.random.randint(low=self.min_error_len, high=self.max_error_len)
                else:
                    # randint raises on an empty range; use the fixed length.
                    sr = self.min_error_len
                # elif matters: with a plain `if` the dtype==0 case also fell
                # into the else branch and emitted a second, mislabelled row.
                if dtype==0:
                    data.append([n, i, dtype, self._insert_rnd(txt, sr), sr, 1])
                elif dtype==1:
                    data.append([n, i, dtype, self._insert_rnd_seq(txt, sr), sr, 1])
                else:
                    data.append([n, i, dtype, self._replace_rnd(txt, sr), sr, 1])

                # closed text
                data.append([n, i, dtype, self._rnd_txt(clen), sr, 0])

        return pd.DataFrame(data=data, columns=['n_txt', 'idx', 'bad_type', 'txt', 'num_reps', 'open'])


In [None]:
otp = OpenTextProcessor(
    # k-gram length -> minimum corpus count for a k-gram to get its own id
    min_occurrence={2: 80, 3: 120},
    # context size, counted in k-grams on each side of the target
    left_cnt=6,
    right_cnt=3,
    # number of random edits is drawn with np.random.randint(low=0, high=1),
    # whose upper bound is exclusive, so it is always 0 with these values
    min_error_len=0,
    max_error_len=1,
    # empty dict: no stop words are removed
    sw={},
)

In [None]:
# Fit the encoders on the training texts and store one CSV per k.
train_dfs = otp.fit(train_pathes)

for k, df_k in train_dfs.items():
    print('k={}'.format(k))
    path_save = path_root+'train_k2v_{}.csv'.format(k)
    df_k.to_csv(path_save, index=False)
    print('saved to {}'.format(path_save))

In [None]:
# Encode the test texts with the fitted encoders and store one CSV per k.
test_dfs = otp.transform(test_pathes)

for k, df_k in test_dfs.items():
    print('k={}'.format(k))
    path_save = path_root+'test_k2v_{}.csv'.format(k)
    df_k.to_csv(path_save, index=False)
    print('saved to {}'.format(path_save))

In [None]:
# Serialize before opening the target: opening with 'w' truncates the file,
# so a failure inside json.dumps would otherwise leave an empty encoder.json.
# NOTE: JSON object keys are always strings, so the integer keys (k and the
# ids of 'id_to_kgr') come back as str when this file is loaded again.
encjs = json.dumps(otp.kgramm_enc)
with open(path_root + 'encoder.json', 'w', encoding='utf-8') as fd:
    fd.write(encjs)

In [None]:
# open/cipher text nn data
df_data = otp.get_openclosed_text_df(test_pathes, clen=44, keep_prob=0.4)
path_train_data = path_root + 'open_closed_nosw.csv'
df_data.to_csv(path_train_data, index=False)
print('df_data.shape={}'.format(df_data.shape))

In [None]:
# Show the first rows of a random 10% sample of the generated data.
df_sample = df_data.sample(frac=0.1)
df_sample.head()