# 1. A Brief View

In [None]:
import numpy as np

# Toy corpus used to illustrate the pyramid layout: a list of texts, where each
# text is a list of sentence-level strings (three texts with 3, 2 and 5 sentences).
corpus = [['1.病史：患者为63岁女性，慢性病程，急性加重。',
'既往有“高脂血症”病史。',
'2.因“反复脐周疼痛2年余，再发并加重1周”入院。'],
['3.体查：血压128/63mmHg，神志清楚，浅表淋巴结无肿大，口唇无苍白。',
'双侧扁桃体无肿大、充血。咽无充血。颈静脉无怒张。'],
['双肺呼吸音清晰，未闻及干湿性啰音，无胸膜摩擦音。',
'心率62次/分，律齐，各瓣膜区未闻及病理性杂音。',
'腹部平坦，未见胃肠型及蠕动波，腹壁柔软，脐周压痛，无反跳痛，未扪及包块，肝脾肋下未扪及，Murphy征（-）',
'肝肾区无叩击痛，移动性浊音（-），肠鸣音4次/分。',
'双下肢无浮肿。四肢肌力、肌张力正常，生理反射存在，病理反射未引出。']]
# print(len(corpus))
# Last expression of the cell: display the nested list.
corpus


In [None]:
# Pyramid bookkeeping for the toy corpus. Every level stores, per item, how
# many children it has ('NUM*') and the cumulative, exclusive end offset of
# those children in the flat list of the level below ('EndIDX*').
TEXT_DICT = {'NUMSents': [], 'EndIDXSents': []}
SENT_DICT = {'NUMTokens': [], 'EndIDXTokens': []}
TOKEN_DICT = {'DATAToken': []}

for text in corpus:
    # text: a list of sentence-level strings
    lenText = len(text)
    TEXT_DICT['NUMSents'].append(lenText)
    # The running sentence offset must continue from the previous *sentence*
    # end offset of the text level, not from the token offsets in SENT_DICT.
    prevSentEnd = TEXT_DICT['EndIDXSents'][-1] if TEXT_DICT['EndIDXSents'] else 0
    TEXT_DICT['EndIDXSents'].append(prevSentEnd + lenText)

    for sent in text:
        # sent: a sentence-level string; every character is one token
        lenSent = len(sent)
        SENT_DICT['NUMTokens'].append(lenSent)
        prevTokenEnd = SENT_DICT['EndIDXTokens'][-1] if SENT_DICT['EndIDXTokens'] else 0
        SENT_DICT['EndIDXTokens'].append(prevTokenEnd + lenSent)

        # Iterating a string yields its characters, so extend() flattens it.
        TOKEN_DICT['DATAToken'].extend(sent)


print('Text  Level Dictionary')
print(TEXT_DICT)
print()
print('Sent  Level Dictionary')
print(SENT_DICT)
print()
print('Token Level Dictionary')
print(TOKEN_DICT)

In [None]:
sentId = 0
# A sentence starts where the previous one ended; the first starts at 0.
if sentId == 0:
    StartIdx = 0
else:
    StartIdx = SENT_DICT['EndIDXTokens'][sentId - 1]
EndIdx = SENT_DICT['EndIDXTokens'][sentId]

print(StartIdx, EndIdx)
# Rebuild the sentence from the flat token list and compare it with the source.
sentTokens = TOKEN_DICT['DATAToken'][StartIdx:EndIdx]
print(''.join(sentTokens))
print(corpus[0][0])

# 2. Deal with the Folder Corpus `(deprecated)`

## 2.1 Generate Text File and Its Paths `(deprecated)`

In [None]:
import os
import numpy as np
# Important One
def geneTextFilePaths(corpusPath, orig_iden = '.txt', anno_iden = None):
    """Map every sub-folder of ``corpusPath`` to its text and annotation files.

    Parameters
    ----------
    corpusPath : str
        Directory whose immediate sub-folders hold the corpus files. A trailing
        path separator is optional.
    orig_iden : str
        Substring that marks a file name as an original text file.
    anno_iden : str or None
        Replacement for ``orig_iden`` used to derive the annotation file name.
        When falsy, no annotation files are looked up.

    Returns
    -------
    dict
        ``{foldername: (OrigFileList, AnnoFileList)}``. Both lists are aligned;
        an ``AnnoFileList`` entry is ``''`` when the expected annotation file
        does not exist or when ``anno_iden`` is falsy. Hidden entries (leading
        '.') and plain files directly under ``corpusPath`` are skipped.
    """
    FolderDict = {}

    # sorted() keeps the keys as plain ``str`` in lexicographic order.
    for foldername in sorted(os.listdir(corpusPath)):
        path = os.path.join(corpusPath, foldername)
        # Only real, non-hidden directories are corpus folders; listing a
        # stray file such as a README would raise NotADirectoryError.
        if foldername.startswith('.') or not os.path.isdir(path):
            continue

        OrigFileList = [i for i in os.listdir(path) if orig_iden in i]
        if anno_iden:
            candidates = [i.replace(orig_iden, anno_iden) for i in OrigFileList]
            AnnoFileList = [i if os.path.isfile(os.path.join(path, i)) else ''
                            for i in candidates]
        else:
            AnnoFileList = [''] * len(OrigFileList)

        FolderDict[foldername] = OrigFileList, AnnoFileList
    return FolderDict

# NER corpus: original texts end in '.txt', annotation files use '.Entity'.
corpusPath = 'dataset/ner/'
anno_iden = '.Entity'
# FolderDict maps each folder name to (original file names, annotation file names).
FolderDict = geneTextFilePaths(corpusPath,  orig_iden = '.txt', anno_iden = anno_iden)
print(FolderDict)

In [None]:
# Same lookup for the MedPOS corpus, whose annotation files use '.UMLSTag'.
# Note that corpusPath and anno_iden from the previous cell are overwritten here.
corpusPath = 'dataset/medpos/'
anno_iden = '.UMLSTag'
CorpusFolderDict = geneTextFilePaths(corpusPath, orig_iden = '.txt', anno_iden = anno_iden)
print(CorpusFolderDict)

## 2.2 Read Text from a File Path `(deprecated)`

In [None]:
def strQ2B(ustring):
    """Convert full-width characters in ``ustring`` to their half-width form.

    The ideographic space U+3000 becomes an ASCII space, and code points
    U+FF01..U+FF5E are shifted down by 0xFEE0 onto the ASCII range. Every
    other character is returned unchanged.
    """
    def toHalfWidth(uchar):
        code = ord(uchar)
        if code == 0x3000:
            return chr(0x20)
        if 0xFF01 <= code <= 0xFF5E:
            return chr(code - 0xFEE0)
        return uchar

    return ''.join(toHalfWidth(uchar) for uchar in ustring)


def fileReader(fullfilepath):
    """Read a UTF-8 text file and return its content normalized by strQ2B."""
    with open(fullfilepath, mode = 'r', encoding = 'utf-8') as handle:
        rawText = handle.read()
    normalized = strQ2B(rawText)
    return normalized
    


# Sample file from the MedPOS corpus; fullfilepath is reused by later cells.
filename = 'patient5212.txt'
fullfilepath = 'dataset/medpos/'+ 'batch2/'+ filename


# Read the file and normalize full-width characters (see fileReader).
text = fileReader(fullfilepath)
# Last expression of the cell: display the normalized text.
text

## 2.3 Segment Text to Sentences

### 2.3.1 Using RegEx

In [None]:
import re

def reCutText2Sent(text):
    """Split a text-level string into sentence-level strings with regexes.

    Steps, in order:
      1. collapse runs of spaces into one space and strip the text;
      2. insert a newline after a sentence terminator (``。``, ``！``, ``？``
         or ``?``), after six ASCII dots, or after two ``…`` characters,
         unless the next character is a closing quote ``”``;
      3. remove every newline inside spans delimited by ASCII double quotes
         (the odd-numbered pieces of a split on ``"``);
      4. collapse consecutive newlines and split on them.

    Returns the list of sentences; an empty text yields ``['']``.
    """
    # Raw strings keep the regex escapes away from Python's own string escape
    # processing (``'\?'`` and ``'\.'`` are invalid string escapes).
    text = re.sub(r' +', ' ', text).strip()
    text = re.sub(r'([。！？?])([^”])', r"\1\n\2", text)
    text = re.sub(r'(\.{6})([^”])', r"\1\n\2", text)
    text = re.sub(r'(…{2})([^”])', r"\1\n\2", text)
    pieces = text.split('"')
    text = '"'.join([x if i % 2 == 0 else x.replace('\n', '')
                     for i, x in enumerate(pieces)])
    text = re.sub(r'\n+', '\n', text).strip()  # replace '\n+' with '\n'
    return text.split("\n")


# Sample clinical note: a diagnosis line followed by two numbered paragraphs.
text = '1、急性咽炎    2、I型糖尿病\n1、患儿男,11岁3月,因“发现血糖高半年余,发热、间断头晕头痛6小时”入院。查体:双侧扁桃体Ⅰ度肿大,有充血,咽部中度充血,血常规提示白细胞增高,以中性粒细胞为主,可诊断。\n2、患儿男,11岁3月,因“发现血糖高半年余,发热、间断头晕头痛6小时”入院。2014-08-22我院胰岛素释放试验:73.46-495.3-309.8-432.4-293.5pmol/L;糖尿病分型三项:胰岛素细胞抗体阳性;糖化血红蛋白 6.3%;2014-10-04指尖血糖11.6mmol/L。可诊断。'

# The step-by-step regex pipeline that used to be kept here as an inert string
# literal is exactly the body of reCutText2Sent, so it is called directly.
# The returned list of sentences is the cell output.
reCutText2Sent(text)

### 2.3.2 Old Method (Stupid) `(deprecated)`

In [None]:
# # deprecated sentence segmentation
# def filterSeps(sentSep, quota):
#     if len(quota) == 0:
#         return sentSep

#     elif len(quota) % 2:
#         return sentSep
        
#     a = int(len(quota)/2)
#     quotas = [[quota[2*i], quota[2*i+1] ] for i in range(a)]
#     # quotas
#     newSeps = []
#     for sep in sentSep:
#         flag = 0
#         for a, b in quotas:
#             if a < sep and sep < b:
#                 flag = 1
#                 break
#         if flag == 0:
#             newSeps.append(sep)
#     return newSeps

# def segmentText2Sent(text):
#     #0# Clean the Whole Text
#     textAheadSpace = 0
#     while text[textAheadSpace]    in [' ', '\n']:
#         textAheadSpace = textAheadSpace + 1
    
#     textBehindSpace = len(text)
#     while text[textBehindSpace-1] in [' ', '\n']:
#         # print(text[: textBehindSpace])
#         textBehindSpace = textBehindSpace - 1
        
#     # print(textAheadSpace, textBehindSpace)
#     origtext = text
#     text = text[textAheadSpace:textBehindSpace]
#     # print(text)

#     #1# For Spliting Sentence Based on '\n' and '。'
    
#     sents = text.splitlines(True)
#     L = []
#     for sent in sents:
#         periodIndex = [-1] + [i for i in range(len(sent)) if sent[i] == '。']
#         #print(periodIndex)
#         quota = [i for i in range(len(sent)) if sent[i] in '“”']
        
#         #print(quota)
#         periodIndex = filterSeps(periodIndex, quota)
#         #print(periodIndex)
#         #print('---')
#         l = [sent[periodIndex[ind]+1:periodIndex[ind+1]+1] 
#              for ind in range(len(periodIndex)-1)]
#         L.extend( l+ [sent[periodIndex[-1]+1:]])
#     # pprint(L)
    
#     newL = []
    
#     for ind in range(len(L)):
#         currentL = ''.join(list(set(L[ind])))
#         if len(L[ind]) >= 4 and currentL not in [' ', '', '\n', '\n ', ' \n'] :
#             newL.append(L[ind])
#         else:
#             newL[-1] = newL[-1] + L[ind]
#     #1# Spliting End
#     # pprint(newL)
    
    
#     #2# For Calculating Cum Len
#     cumLens = [0] + list(np.cumsum([len(i) for i in newL]))
#     #2# Calculating End
    
#     #3# For Spliting Head and Tail Space and Adding Start and End Index
#     newLStartEnd = []
#     for ind_s in range(len(newL)):
#         sent = newL[ind_s]
        
        
#         ## For SentAheadSpace
#         sentAheadSpace = 0
#         while sent[:sentAheadSpace + 1] == ' ' * (sentAheadSpace+1):
#             sentAheadSpace = sentAheadSpace + 1
#         ## SentAheadSpace End
            
#         ## For sentBehindSpace
#         sentBehindSpace = len(sent)
        
#         try:
#             while sent[sentBehindSpace-1] in [' ', '\n']:
#                 # print(sent[: sentBehindSpace])
#                 sentBehindSpace = sentBehindSpace - 1
#         ## sentBehindSpace End
#         except:
#             print(L)
#             print(newL)
#             exit(0)

#         sent_start = cumLens[ind_s] + sentAheadSpace  + textAheadSpace
#         sent_end   = cumLens[ind_s] + sentBehindSpace + textAheadSpace
#         new_sent   = sent[sentAheadSpace:sentBehindSpace]
#         # print(new_sent)
#         assert new_sent == origtext[sent_start: sent_end]

#         # newLStartEnd.append([new_sent , sent_start, sent_end])
#         newLStartEnd.append(new_sent)
    
#     return newLStartEnd

### 2.3.3 Cut Big Text to Sent by Each Line

In [None]:
def lineCutText2Sent(fullfilepath):
    """Lazily yield one sentence per line of a UTF-8 text file.

    Each line is normalized with strQ2B and has its newline characters
    removed before being yielded.
    """
    with open(fullfilepath, mode = 'r', encoding = 'utf-8') as handle:
        for rawLine in handle:
            normalized = strQ2B(rawLine)
            yield normalized.replace('\n', '')

### Combined Function

In [None]:
def segText2Sents(text, method = 'whole'):
    '''
    Segment a text into sentence-level strings.

    text:
        1. the path of an existing text file, or
        2. a text-level string.
    method:
        1. 'whole': use the text-level string directly as the only sentence
                    and return [sent-level string]. A file path is read with
                    fileReader first.
        2. `funct`: a callable whose input is a text-level string;
                    returns funct(text) = [..., sent-level string, ...].
                    A file path is read with fileReader first.
        3. 'line' : only valid when text is a file path where each line is a
                    sentence; returns a generator of sent-level strings.

    Raises TypeError when method is neither 'whole', a callable, nor 'line'
    applied to an existing file.
    '''
    if os.path.isfile(text):
        # text is a file path
        if method == 'line':
            return lineCutText2Sent(text)
        text = fileReader(text)

    if method == 'whole':
        return [text]

    if not callable(method):
        # Previously this fell through to method(text) and failed with an
        # opaque "'str' object is not callable"; keep the exception type but
        # say what went wrong.
        raise TypeError(
            "unsupported segmentation method %r: expected 'whole', a callable, "
            "or 'line' together with the path of an existing file" % (method,))

    return method(text)

In [None]:
# Feed the sample file path through the line-based segmenter and materialize
# the generator so the sentences show up as the cell output.
text = fullfilepath
print(text)
a = segText2Sents(text, method = 'line')
list(a)

## 2.4 Segment Sentence to Tokens

### Combined Function

In [None]:
def segSent2Tokens(sent, method = 'iter'):
    """Return the items obtained by iterating ``sent`` as a list.

    For a string this is one token per character. ``method`` is accepted but
    not consulted by this implementation.
    """
    return list(sent)

## 2.5 Initializing a Folder Type Corpus `(deprecated)`

In [None]:

# ---------------------------- configuration ----------------------------
IDXOrient = 1              # 1: annotation start offsets are 1-based and get shifted to 0-based
SET_ANNO  = True           # read annotation files and build TOKEN['ANNOToken']
TAG_SCHEME = "BIO"         # 'BIO' or 'BIOE'
CORPUSPath = 'dataset/ner/'
ORIGIden = '.txt'
ANNOIden = '.Entity'
Text2SentMethod = 'whole' # reCutText2Sent
Sent2TokenMethod = 'iter'
TOKENLevel = 'char'


# ------------------------- pyramid containers --------------------------
# 'NUM*' lists hold the number of children per item; 'EndIDX*' lists hold the
# cumulative, exclusive end offset of those children in the level below.
CORPUS = {}

CORPUS['CORPUSPath'] = CORPUSPath
CORPUS['ORIGIden']   = ORIGIden
CORPUS['SET_ANNO']   = SET_ANNO
CORPUS['ANNOIden']   = ANNOIden if SET_ANNO else None
CORPUS['IDXOrient'] = IDXOrient if SET_ANNO else None
CORPUS['TAG_SCHEME'] = TAG_SCHEME if SET_ANNO else None

CORPUS['NUMFolders']    = []
CORPUS['EndIDXFolders'] = []

FOLDER = {'FolderName': [], 'NUMTexts': [], 'EndIDXTexts': []}

TEXT = {'NUMSents': [], 'EndIDXSents': [], 'ORIGFileName': []}
if SET_ANNO:
    TEXT['ANNOFileName'] = []

SENT = {'NUMTokens': [], 'EndIDXTokens': []}

TOKEN = {'ORIGToken': []}
if SET_ANNO:
    TOKEN['ANNOToken'] = []


# corpus: dict {foldername: (original file names, annotation file names)}
corpus = geneTextFilePaths(CORPUSPath,
                           orig_iden = ORIGIden,
                           anno_iden = ANNOIden)

lenCorpus = len(corpus)
CORPUS['NUMFolders'] = [lenCorpus]
CORPUS['EndIDXFolders'] = [lenCorpus]


for folderIdx, foldername in enumerate(corpus):
    FOLDER['FolderName'].append(foldername)
    # folder: list of original file names; AnnoFilePath: aligned annotation names
    folder, AnnoFilePath = corpus[foldername]

    lenFolder = len(folder)
    FOLDER['NUMTexts'].append(lenFolder)
    prevTextEnd = FOLDER['EndIDXTexts'][-1] if FOLDER['EndIDXTexts'] else 0
    FOLDER['EndIDXTexts'].append(prevTextEnd + lenFolder)

    for textIdx, text in enumerate(folder):

        # text: text file name
        TEXT['ORIGFileName'].append(text)

        text = CORPUSPath + foldername + '/' + text
        # text: full file path of this original text

        text = segText2Sents(text, method = Text2SentMethod) ### KEY
        # text: a list of sentence-level strings
        lenText = len(text)

        TEXT['NUMSents'].append(lenText)
        prevSentEnd = TEXT['EndIDXSents'][-1] if TEXT['EndIDXSents'] else 0
        TEXT['EndIDXSents'].append(prevSentEnd + lenText)

        for sentIdx, sent in enumerate(text):
            # sent: a sentence-level string -> list of token-level strings
            sent = segSent2Tokens(sent, method=Sent2TokenMethod)

            lenSent = len(sent)
            SENT['NUMTokens'].append(lenSent)
            prevTokenEnd = SENT['EndIDXTokens'][-1] if SENT['EndIDXTokens'] else 0
            SENT['EndIDXTokens'].append(prevTokenEnd + lenSent)

            # The tokens are appended as a whole; no per-token loop is needed.
            TOKEN['ORIGToken'].extend(sent)

        # TEXT LEVEL ANNOTATION
        if SET_ANNO:
            # TEXT['EndIDXSents'] is corpus-global, while textIdx restarts at 0
            # in every folder. Index it with the global position of this text,
            # which is the entry that was appended just above.
            globalTextIdx = len(TEXT['EndIDXSents']) - 1

            S_sentIdx = TEXT['EndIDXSents' ][globalTextIdx - 1] if globalTextIdx != 0 else 0
            S_tokenIdx= SENT['EndIDXTokens'][S_sentIdx - 1]     if S_sentIdx != 0 else 0

            E_sentIdx = TEXT['EndIDXSents' ][globalTextIdx]
            # End offsets are exclusive, hence the "- 1" to reach the last sentence.
            E_tokenIdx= SENT['EndIDXTokens'][E_sentIdx - 1]     if E_sentIdx != 0 else 0
            numTokenInText = E_tokenIdx - S_tokenIdx

            ORIGTokenInText = TOKEN['ORIGToken'][S_tokenIdx:E_tokenIdx]

            print('\n--------------- textIdx is', textIdx, numTokenInText)
            print(S_tokenIdx, E_tokenIdx)
            ANNOTokenInText = ['O'] * numTokenInText
            # The annotation name list is per folder, so textIdx is correct here.
            annofilepath = AnnoFilePath[textIdx]
            TEXT['ANNOFileName'].append(annofilepath)
            annofilepath = CORPUSPath + foldername + '/' + annofilepath
            print(annofilepath)
            if os.path.isfile(annofilepath):
                annotext = fileReader(annofilepath)
                # Each annotation line: string \t start \t end \t tag
                SSET = [sset.split('\t') for sset in annotext.split('\n') if '\t' in sset]
                for sset in SSET:
                    string, start, end, tag = sset[0], int(sset[1]), int(sset[2]), sset[3]
                    if IDXOrient == 1:
                        start = start - 1
                    assert string == ''.join(ORIGTokenInText[start:end]), \
                        'annotation %r does not match the text at [%d:%d]' % (string, start, end)

                    taglist = [tag + '-I' ] * (end - start)
                    if taglist:  # an empty span has nothing to tag
                        if TAG_SCHEME == 'BIO':
                            taglist[0] = tag + '-B'

                        elif TAG_SCHEME == 'BIOE':
                            # For a one-token span the 'B' tag wins, as before.
                            taglist[-1] = tag + '-E'
                            taglist[0] = tag + '-B'

                    ANNOTokenInText[start: end] = taglist

                print(ANNOTokenInText)

            TOKEN['ANNOToken'].extend(ANNOTokenInText)

In [None]:
# Dump the five pyramid levels from the corpus down to the tokens.
for level in (CORPUS, FOLDER, TEXT, SENT, TOKEN):
    print(level)

In [None]:

from bisect import bisect

tokenIdx = 3
# SENT['EndIDXTokens'] holds cumulative, exclusive end offsets, so bisect
# (bisect_right) returns the index of the sentence that contains tokenIdx.
# The lookup has to use the same SENT structure that is sliced below, not the
# SENT_DICT of the toy corpus from section 1.
sentIdx = bisect(SENT['EndIDXTokens'], tokenIdx)

print('sentIdx is', sentIdx)

#######################

s = SENT['EndIDXTokens'][sentIdx-1] if sentIdx != 0 else 0
e = SENT['EndIDXTokens'][sentIdx]
#######################

print('start and end are', s,e)
token = TOKEN['ORIGToken'][tokenIdx]
print(token)
# Position of the token inside its sentence.
idx = tokenIdx - s
print('idx in the sent', idx)
# Must print the same token as above.
print(TOKEN['ORIGToken'][s:e][idx])

In [None]:
# Print the lengths of the annotation-tag list and the token list so they can
# be compared by eye (the build loop adds one tag per token of each text).
print(len(TOKEN['ANNOToken']))
print(len(TOKEN['ORIGToken']))

## 2.6 Build Token String to Index Dict

In [None]:
import collections

PAD   = '</pad>'
UNK   = '</unk>'
START = '</start>'
END   = '</end>'

# Reserved indices of the special tokens; corpus tokens are numbered after them.
init_dict = {PAD: 0, START: 1, END: 2, UNK : 3, }

def buildTokens(tokens, init_dict = init_dict):
    """
        Build a token -> index dictionary and index the token sequence.

        tokens:    an iterable of the token strings of the whole corpus.
        init_dict: indices reserved for special tokens. It is copied, never
                   modified, so repeated calls start from the same state.

        Tokens that are not already in init_dict receive consecutive indices in
        order of decreasing frequency. Tokens equal to a special token keep the
        reserved index.

        Returns (data, dictionary): data is the list of indices, one per input
        token; dictionary is the complete token -> index mapping.
    """
    tokens = list(tokens)           # allow one-shot iterables; they are read twice
    dictionary = dict(init_dict)    # copy, so the shared default is left untouched

    for token, _ in collections.Counter(tokens).most_common():
        if token not in dictionary:
            dictionary[token] = len(dictionary)

    # Every token was registered above, so a direct lookup cannot miss.
    data = [dictionary[token] for token in tokens]
    return data, dictionary

# Index the whole token list; DictToken maps token -> index and ListToken
# lists the tokens in dictionary order.
indexedTokens, DictToken = buildTokens(TOKEN['ORIGToken'])
TOKEN['ORIGTokenIndex'] = indexedTokens
print(DictToken)
ListToken = list(DictToken)
print(ListToken)

# 3. New Pyramid Structure

`utils.getCorpusFolders`

In [None]:
import os

def getCorpusFolders(CORPUSPath):
    """Describe how the corpus under ``CORPUSPath`` is stored.

    Returns a pair ``(folders, corpusType)``:

    * When exactly one regular file with a '.' in its name lies directly in
      ``CORPUSPath``, the corpus is that single file and the result is
      ``({<path of the file>: ''}, 'File')``.
    * Otherwise the tree is walked and the result is
      ``({<directory path>: [file names]}, 'Dir')`` for every directory that
      contains at least one file.
    """
    # A dotted name alone is not enough: a directory such as 'batch.1' must
    # not be mistaken for the single corpus file.
    corpusFile = [i for i in os.listdir(CORPUSPath)
                  if '.' in i and os.path.isfile(os.path.join(CORPUSPath, i))]
    if len(corpusFile) == 1:
        return {os.path.join(CORPUSPath, corpusFile[0]): ''}, 'File'

    # os.walk yields (dirpath, dirnames, filenames); keep directories with files.
    results = [x for x in os.walk(CORPUSPath) if x[2]]
    return {i[0]: i[2] for i in results}, 'Dir'



`utils.getCITText`

In [None]:


strText = '''结肠多发息肉。\n患中老年男性,慢性病程。 因“体检发现大肠多发息肉3月余”入院。查体:无阳性体征。'''

print(strText)

strAnnoText = '''标注文本名称:/Users/zhangling/Documents/新标的数据530/529李选-已检查/Entity/patient4378.txt\n标注文本字数统计:87\n多发息肉\t3\t6\t疾病\n慢性\t16\t17\t修饰\n多发息肉\t30\t33\t疾病\n3月余\t34\t36\t修饰\n无阳性体征\t44\t48\t不确定\n'''
print(strAnnoText)

# BIOES

# SSET = [string, start, end, tag]; only lines containing the separator are
# annotation records (the two header lines have none).
sep = '\t'
SSETText = [sset.split(sep) for sset in strAnnoText.split('\n') if sep in sset]

notZeroIndex = 1  # 1 when the start offsets in the annotation file are 1-based


def ssetToCIT(sset, notZeroIndex = 1):
    """Expand one SSET record into a list of [char, text index, tag] items.

    sset:         [string, start, end, tag]; only string, start and tag are used.
    notZeroIndex: amount subtracted from start to obtain a 0-based index.

    Tags follow BIOES: a one-character entity is tagged '<tag>-S'; otherwise
    the first character gets '-B', the last '-E' and the ones between '-I'.
    An empty string yields an empty list.
    """
    strAnno = sset[0]
    s       = int(sset[1]) - notZeroIndex
    tag     = sset[3]
    CIT = [[c, s + idx, tag + '-I'] for idx, c in enumerate(strAnno)]
    if not CIT:
        return CIT
    if len(CIT) == 1:
        # The single item sits at position 0; its tag is field 2 of that item.
        CIT[0][2] = tag + '-S'
    else:
        CIT[-1][2] = tag + '-E'
        CIT[0][2]  = tag + '-B'
    return CIT


# One record as an example ...
CIT = ssetToCIT(SSETText[0], notZeroIndex)
print(CIT)

# ... and all records of the annotation file.
CITAnnoText = []
for sset in SSETText:
    CITAnnoText.extend(ssetToCIT(sset, notZeroIndex))

# Start with every character tagged 'O', then overwrite the annotated ones.
CITText = [[char, idx, 'O'] for idx, char in enumerate(strText)]

for citAnno in CITAnnoText:
    c, idx, t = citAnno
    assert CITText[idx][0] == c
    CITText[idx] = citAnno

CITText[:10]

`utils.getCITSents`

In [None]:

# Two sentences cut from the text; the separator characters between them
# ('\n' and ' ') exist in CITText but not in the sentences.
strSents = ['结肠多发息肉。', '患中老年男性,慢性病程。']

# [char, index in text, tag] for every character of the text.
CITText = [
    ['结', 0, 'O'], ['肠', 1, 'O'],
    ['多', 2, '疾病-B'], ['发', 3, '疾病-I'], ['息', 4, '疾病-I'], ['肉', 5, '疾病-E'],
    ['。', 6, 'O'], ['\n', 7, 'O'],
    ['患', 8, 'O'], ['中', 9, 'O'], ['老', 10, 'O'], ['年', 11, 'O'],
    ['男', 12, 'O'], ['性', 13, 'O'], [',', 14, 'O'],
    ['慢', 15, '修饰-B'], ['性', 16, '修饰-E'],
    ['病', 17, 'O'], ['程', 18, 'O'], ['。', 19, 'O'], [' ', 20, 'O'],
]

lenLastSent = 0   # total length of the sentences handled so far
collapse    = 0   # number of text characters skipped because no sentence contains them

CITSents = []
for strSent in strSents:
    CITSent = []
    for sentTokenIdx, c in enumerate(strSent):
        # sentTokenIdx = txtTokenIdx - lenLastSent - collapse
        # Advance in the text until its character matches the sentence one.
        while True:
            txtTokenIdx = sentTokenIdx + lenLastSent + collapse
            cT, _, tT = CITText[txtTokenIdx]
            if c == cT:
                break
            collapse += 1

        # Keep the tag, but re-index the character relative to its sentence.
        CITSent.append([c, sentTokenIdx, tT])
    lenLastSent += len(strSent)
    CITSents.append(CITSent)
CITSents

`utils.textLineReader` with `anno == 'embed'`

In [None]:
from pprint import pprint

string = 'This is an annotated entity {{2018-12-22:date}}, try to extract it out!'


def splitEmbedAnno(string):
    """Split a string with embedded ``{{text:tag}}`` marks into (text, tag) pairs.

    Pieces outside the double braces are tagged 'O'. Inside the braces the tag
    is whatever follows the LAST ':' so that entity text containing colons
    (e.g. ``{{12:30:time}}``) is kept whole. A marked piece without any ':' is
    returned as (piece, piece).
    """
    ST = []
    # Turning '}}' into '{{' makes one split alternate plain / annotated pieces.
    for idx, block in enumerate(string.replace("}}", '{{').split('{{')):
        if idx % 2 == 0:
            ST.append((block, 'O'))
        elif ':' in block:
            text, _, tag = block.rpartition(':')
            ST.append((text, tag))
        else:
            ST.append((block, block))
    return ST


ST = splitEmbedAnno(string)

pprint(ST)
# SSET, Str, S(char), E(char), Tag.

# Rebuild the plain text and record the character span of every piece.
txtCharIdx = 0
SSET = []
strText = ''
for strBlock, tag in ST:
    sset = [strBlock, txtCharIdx, txtCharIdx + len(strBlock), tag]
    SSET.append(sset)
    txtCharIdx = sset[2]
    strText = strText + strBlock

pprint(SSET)
pprint(strText)

# Only Way to Check a SSET
for sset in SSET:
    assert sset[0] == strText[sset[1]: sset[2]]


# Part 1 From Corpus to Folders and From Folder to Texts


There are three methods

1. textFile

2. textLine

3. textBlock

In [None]:
# STAGE 1
from pprint import pprint

from nlptext.utils.pyramid import CorpusFoldersReader, FolderTextsReaders

# Catalogue of dataset descriptions. Every section assigns the same set of
# variables, so each one overrides the previous; only the LAST section (BOSON)
# is in effect when the cell runs. Move the wanted section to the end, or
# comment out the ones after it, to select another dataset.
#
#   CORPUSPath       root directory of the corpus
#   corpusFileIden   passed as `iden` to CorpusFoldersReader
#   textType         key into FolderTextsReaders: 'file', 'line' or 'block'
#   Text2SentMethod  text -> sentences splitter
#   Sent2TokenMethod sentence -> tokens splitter
#   TOKENLevel       'char' or 'word'
#   anno             False, 'embed', or an annotation file identifier containing '.'
#   annoKW           extra keyword arguments forwarded to the folder reader

########### NER ###########

CORPUSPath = 'dataset/ner/'
corpusFileIden = None
textType   = 'file'
Text2SentMethod  = reCutText2Sent
Sent2TokenMethod = 'iter'
TOKENLevel = 'char'
anno = '.Entity'
annoKW = {
    'sep': '\t',
    'notZeroIndex': 1,
}



########### MedPOS ###########

CORPUSPath = 'dataset/medpos/'
textType   = 'file'
corpusFileIden = None
Text2SentMethod  = reCutText2Sent
Sent2TokenMethod = 'iter'
TOKENLevel = 'char'
anno = '.UMLSTag'
annoKW = {
    'sep': '\t',
    'notZeroIndex': 0,
}

########### Weibo Test ###########
CORPUSPath = 'dataset/weibotest/'
corpusFileIden = None
textType   = 'file'
Text2SentMethod  = reCutText2Sent
Sent2TokenMethod = 'sep'
TOKENLevel = 'word'
anno = False
annoKW = {}


########### Wiki ###########
CORPUSPath = 'dataset/wiki/'
corpusFileIden = '.txt'

textType   = 'line'

Text2SentMethod  = reCutText2Sent
Sent2TokenMethod = 'sep'
TOKENLevel = 'word'

anno = False
annoKW = {}



########### ResumeNER ###########
CORPUSPath = 'dataset/ResumeNER/'
corpusFileIden = '.bmes'
textType   = 'block'
Text2SentMethod  = reCutText2Sent
Sent2TokenMethod = 'iter'
TOKENLevel = 'char'
anno = 'embed'
annoKW = {}

########### BOSON ###########
# Effective configuration of this cell (last assignment wins).
CORPUSPath = 'dataset/boson/'
corpusFileIden = '.txt'
textType   = 'line'
Text2SentMethod  = reCutText2Sent
Sent2TokenMethod = 'iter'
TOKENLevel = 'char'
anno = 'embed'
annoKW = {}



# anno must be False (no annotation), a file identifier containing '.', or 'embed'.
assert anno == False or '.' in anno or anno == 'embed'
########################################################


# Stop after the text with this index in every folder (so index 0..10 is shown).
MaxTextIdx = 10

Folders, CORPUSType = CorpusFoldersReader(CORPUSPath, iden = corpusFileIden)

pprint(Folders) # all possible files in this directory
pprint(CORPUSType)

for folderPath, fileNames in Folders.items():
    print(folderPath)

    # Pick the reader that matches the text type and iterate over its texts.
    readTexts = FolderTextsReaders[textType]
    FolderTexts = readTexts(folderPath, fileNames, anno, **annoKW)

    for textIdx, strText_SSET_O_A in enumerate(FolderTexts):
        # we need to add some constraints to filter the textStrs
        strText, SSETText, origTextName, annoTextName = strText_SSET_O_A
        print(textIdx, '--', strText)
        print(SSETText, '\n')
        if textIdx == MaxTextIdx:
            break

# PART 2 Text to Sentences and Sentence to Tokens

# PART 3 Saved Information

In [None]:
# STAGE 1
from pprint import pprint
from nlptext.utils.pyramid import CorpusFoldersReader, FolderTextsReaders

# STAGE 2
from nlptext.utils.pyramid import reCutText2Sent
from nlptext.utils.pyramid import segText2Sents, segSent2Tokens# (text, method = 'whole')


########################################################
################## Dataset Description #################
########################################################


# Catalogue of dataset descriptions. Every section assigns the same set of
# variables, so each one overrides the previous; only the LAST section (BOSON)
# is in effect when the cell runs. Here the splitters are given by name
# ('re', 'iter', 'sep-<separator>') instead of as function objects.

########### NER ###########

CORPUSPath = 'dataset/ner/'
corpusFileIden = None
textType   = 'file'
Text2SentMethod  = 're'
Sent2TokenMethod = 'iter'
TOKENLevel = 'char'
anno = '.Entity'
annoKW = {
    'sep': '\t',
    'notZeroIndex': 1,
}


########### MedPOS ###########

CORPUSPath = 'dataset/medpos/'
textType   = 'file'
corpusFileIden = None
Text2SentMethod  = 're'
Sent2TokenMethod = 'iter'
TOKENLevel = 'char'
anno = '.UMLSTag'
annoKW = {
    'sep': '\t',
    'notZeroIndex': 0,
}

########### Weibo Test ###########
CORPUSPath = 'dataset/weibotest/'
corpusFileIden = None
textType   = 'file'
Text2SentMethod  = 're'
Sent2TokenMethod = 'sep-\t'
TOKENLevel = 'word'
anno = False
annoKW = {}


########### Wiki ###########
CORPUSPath = 'dataset/wiki/'
corpusFileIden = '.txt'

textType   = 'line'

Text2SentMethod  = 're'
Sent2TokenMethod = 'sep- '
TOKENLevel = 'word'

anno = False
annoKW = {}



########### ResumeNER ###########
CORPUSPath = 'dataset/ResumeNER/'
corpusFileIden = '.bmes'
textType   = 'block'
Text2SentMethod  = 're'
Sent2TokenMethod = 'iter'
TOKENLevel = 'char'
anno = 'embed' # TODO
annoKW = {}


########### BOSON ###########
# Effective configuration of this cell (last assignment wins).
CORPUSPath = 'dataset/boson/'
corpusFileIden = '.txt'
textType   = 'line'
Text2SentMethod  = 're'
Sent2TokenMethod = 'iter'
TOKENLevel = 'char'
anno = 'embed'
annoKW = {}



# anno must be False (no annotation), a file identifier containing '.', or 'embed'.
assert anno == False or '.' in anno or anno == 'embed'
########################################################
########################################################

MaxTextIdx = 10


########################################################
################   Things to Save   ####################
########################################################

# One dictionary per pyramid level. The list-valued entries are filled by the
# chain loop; the scalar entries record the settings used to build them.
CORPUS = {
    'CORPUSPath': CORPUSPath,
    'corpusFileIden': corpusFileIden,  # None if Dir else
    'CORPUSType': 'File' if corpusFileIden else 'Dir',
    'textType': textType,
}

FOLDER = {'folderPaths': [], 'NUMTexts': [], 'EndIDXTexts': []}

TEXT = {'NUMSents': [], 'EndIDXSents': [], 'Text2SentMethod': Text2SentMethod}
if textType == 'file':
    TEXT['ORIGFileName'] = []
if anno:
    TEXT['ANNOFileName'] = []

SENT = {'NUMTokens': [], 'EndIDXTokens': [], 'Sent2TokenMethod': Sent2TokenMethod}

TOKEN = {'ORIGToken': [], 'TOKENLevel': TOKENLevel}
if anno:
    TOKEN['ANNOToken'] = []

ANNO = {'anno': anno, 'annoKW': annoKW}

    
    
########################################################
##################     CHAINES      ####################
########################################################



###--> CHAIN: from Corpus to Folders <--###

# Annotation helpers, imported once instead of on every loop iteration.
from nlptext.utils.pyramid import getCITText, getCITSents

CorpusFolders, CORPUSType = CorpusFoldersReader(CORPUSPath, iden = corpusFileIden)
assert CORPUS['CORPUSType'] == CORPUSType
pprint(CorpusFolders) # all possible files in this directory
pprint(CORPUSType)

for folderIdx, folderPath in enumerate(CorpusFolders):
    print(folderPath)
    fileNames = CorpusFolders[folderPath]

    ###--> CHAIN: from Folder to Texts <--###
    FolderTexts = FolderTextsReaders[textType](folderPath, fileNames, anno, **annoKW)

    # Number of texts consumed from this folder. Counting explicitly (instead
    # of reusing the last textIdx) is correct for an empty folder and is not
    # one short for a non-empty one.
    lenFolder = 0
    for textIdx, strText_SSET_O_A in enumerate(FolderTexts):
        lenFolder = textIdx + 1

        # we need to add some constraints to filter the textStrs
        strText, SSETText, origTextName, annoTextName = strText_SSET_O_A

        print('\n', textIdx, '--', strText)
        print(SSETText, '\n')

        ###--> CHAIN: from strText to strSents <--###
        strSents = segText2Sents(strText, method = Text2SentMethod) # fixed

        for strSent in strSents:
            ###--> CHAIN: from strSent to strTokens <--###
            strTokens = segSent2Tokens(strSent, method = Sent2TokenMethod)

            ###--> CHAIN's End: Token itself <--###
            TOKEN['ORIGToken'].extend(strTokens)

            lenSent = len(strTokens)
            SENT['NUMTokens'].append(lenSent)
            prevTokenEnd = SENT['EndIDXTokens'][-1] if SENT['EndIDXTokens'] else 0
            SENT['EndIDXTokens'].append(prevTokenEnd + lenSent)

        lenText = len(strSents)
        TEXT['NUMSents'].append(lenText)
        prevSentEnd = TEXT['EndIDXSents'][-1] if TEXT['EndIDXSents'] else 0
        TEXT['EndIDXSents'].append(prevSentEnd + lenText)

        if origTextName:
            # The key is only pre-created for textType == 'file'; setdefault
            # avoids a KeyError if another reader also reports a name.
            TEXT.setdefault('ORIGFileName', []).append(origTextName)

        ########################################
        if anno:
            # Every SSET is [string, start, end, tag] with 0-based, end-exclusive
            # character offsets into strText.
            for sset in SSETText:
                assert sset[0] == strText[sset[1]: sset[2]]
            if SSETText == []:
                print('\nThe SSET of this Text is Empty!!!')
                print(strText, '\n') # to check what happen

            ############### PART One: Get CITText ###########
            # [char, index in text, tag] for each character of the text
            # (see the `utils.getCITText` demo cell for the algorithm).
            CITText = getCITText(strText, SSETText)

            ############### PART TWO: Get CITSents ###########
            # The same items regrouped per sentence and re-indexed relative to
            # the sentence (see the `utils.getCITSents` demo cell).
            CITSents = getCITSents(strSents, CITText)

            ############### PART THREE: Get TOKEN['ANNOToken'] ###########
            for sentIdx, CITSent in enumerate(CITSents):
                # pay attention here, CIT is char-based, but TOKEN may be word-based.
                if TOKENLevel == 'char':
                    TOKEN['ANNOToken'].extend([CITToken[2] for CITToken in CITSent])
                else:
                    # TODO: map the char-level tags onto word-level tokens
                    pass

            if annoTextName:
                # Store the annotation file name (not the original text name).
                TEXT['ANNOFileName'].append(annoTextName)

        if textIdx == MaxTextIdx:
            break

    # Back to Folder
    FOLDER['folderPaths'].append(folderPath)

    FOLDER['NUMTexts'].append(lenFolder) # to remove
    prevTextEnd = FOLDER['EndIDXTexts'][-1] if FOLDER['EndIDXTexts'] else 0
    FOLDER['EndIDXTexts'].append(prevTextEnd + lenFolder)

# End here
# Number of folders visited; the last loop index would be one short and would
# be undefined for an empty corpus.
lenCorpus = len(FOLDER['folderPaths'])
CORPUS['NUMFolders'] = [lenCorpus]
CORPUS['EndIDXFolders'] = [lenCorpus]

# 4. Test

In [None]:
from pprint import pprint
from nlptext.base import BasicObject


########### BOSON ###########
# Same BOSON description as in the cells above, now handed to the nlptext
# package instead of being processed inline.
CORPUSPath = 'dataset/boson/'
corpusFileIden = '.txt'
textType   = 'line'
Text2SentMethod  = 're'
Sent2TokenMethod = 'iter'
TOKENLevel = 'char'
anno = 'embed'
annoKW = {}


# NOTE(review): presumably False means "no limit on the number of texts".
# If INIT compares `textIdx == MaxTextIdx` the way the notebook loop does,
# False equals 0 and the first text would trigger the stop; confirm in nlptext.
MaxTextIdx = False

# Arguments are positional; their order must match BasicObject.INIT's signature.
BasicObject.INIT(CORPUSPath, corpusFileIden, textType, 
                 Text2SentMethod, Sent2TokenMethod, TOKENLevel, 
                 anno, annoKW, MaxTextIdx)

from nlptext.corpus import Corpus
# Corpus() takes no arguments here; it is created after BasicObject.INIT has run.
corpus = Corpus()
# corpus.IdxFolderStartEnd

# DictToken = corpus.DictToken
# print(DictToken)

In [None]:
# Show the corpus-level dictionary held by the Corpus object.
print(corpus.CORPUS)

In [None]:
# Show the folder-level dictionary held by the Corpus object.
print(corpus.FOLDER)

In [None]:
# Show the text-level dictionary held by the Corpus object.
print(corpus.TEXT)

In [None]:
# Show the sentence-level dictionary held by the Corpus object.
print(corpus.SENT)

In [None]:
# Show the token-level dictionary held by the Corpus object.
# NOTE(review): this prints the whole structure; if it holds per-token data for
# the full corpus the output may be very large -- consider printing a slice.
print(corpus.TOKEN)

In [None]:

import numpy as np
# Sample 10 random text indices in [0, corpus.TEXT['length']) and drop repeats,
# so the resulting list may hold fewer than 10 entries and is in set order.
# NOTE(review): no random seed is set, so the selection differs on every run.
txtIdxes = list(set(np.random.randint(corpus.TEXT['length'], size=10)))
txtIdxes

In [None]:
# List every sentence of each sampled text, numbering the sentences with one
# counter that keeps running across texts.
from nlptext.text import Text

sentIdx = 0
for txtIdx in txtIdxes:
    txt = Text(txtIdx)
    # Header line for the text, padded by blank lines above and below.
    print('\n', txt, '\n')
    for st in txt.Sentences:
        print(sentIdx, '-->', st.sentence)
        sentIdx += 1

In [None]:
# Take one sentence by position and display its tokens (the bare last
# expression is the cell output).
# NOTE(review): 31 is a hard-coded index and must be in range for the loaded corpus.
st = corpus.Sentences[31]
st.Tokens

In [None]:
# Display the TokenNum_Dir attribute of BasicObject.
# NOTE(review): presumably populated by BasicObject.INIT in the setup cell -- confirm.
BasicObject.TokenNum_Dir

In [None]:
# NOTE(review): Text is already imported by an earlier cell of this section;
# the repeated import is harmless but redundant.
from nlptext.text import Text

# Build the Text with index 9 and display its tokens.
txt = Text(9)
txt.Tokens

In [None]:

# def readFile2GrainList(channel_name_path):
#     ListGrainUnique = []
#     with open(channel_name_path, 'r', encoding = 'utf-8') as f:
#         for gr in f.readlines():
#             gr = '\n' if  '\\n' in gr[:-1] else gr[:-1]
#             ListGrainUnique.append(gr)
#     return ListGrainUnique


# LTU = readFile2GrainList('token.tsv')
# a = [i for i in LTU if i >= '\u4e00' and i <= '\u9fff'][:6000]
# a_dict = dict(zip(a, range(len(a))))




# # # def modify(line):
# # #     L = []
# # #     for char in line:
        
# #         if char >= '\u4e00' and char <= '\u9fff':
# #             L.append(char)
        
# #         else:
# #             inside_code = ord(char)
# #             if inside_code == 12288:
# #                 inside_code = 32
# #             elif (inside_code >= 65281 and inside_code <= 65374):
# #                 inside_code -= 65248
# #             char = chr(inside_code)
            
# #             if char in selected_non_cn_char:
# #                 L.append(char)
            
# #     return ''.join(L)
            

# def modify(line):
#     L = []
#     strange = 0
#     for char in line:
#         if char >= '\u4e00' and char <= '\u9fff':
#             if char not in a_dict:
#                 # print(char)
#                 char = '𐩧'
#                 strange = strange + 1
                
#         L.append(char)
#     return ''.join(L), strange
                

# from datetime import datetime
# # BasicObject.BUILD_LIST_GRAIN_UNIQUE_AND_LOOKUP(CHANNEL_SETTINGS_TEMPLATE)
# CORPUSPath = 'dataset/WikiTotal/'

# total_strange = 0
# with open('dataset/WikiTotal/WikiTotal2.txt', 'r') as f1:
#     with open('dataset/WikiTotal/WikiTotal6k.txt', 'w') as f2:
#         lastkey = ''
#         i = 0
#         count = 1
#         for line in f1.readlines():
#             # line = strQ2B(line)# .decode()

#             line, strange = modify(line)
#             total_strange = total_strange + strange
#             # line = ''.join([i for i in line if i in pre_given_list])
#             # key  = line.replace('\n', '').split('\t')[1]
#             # line = line.replace('\n', '\t' + str(count) + '\n') 
#             f2.write(line+'\n')
#             if i % 500000 == 0:
#                 print(i, total_strange, datetime.now())
#             i  = i + 1
            
# print('Total Strange:', total_strange)

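* 7k — log printed by the (now commented-out) filtering code above, presumably with the vocabulary limit raised to the first 7,000 CJK characters. Columns: lines processed, cumulative out-of-vocabulary characters replaced, timestamp.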
```
0 0 2019-04-01 20:17:44.575479
500000 6872 2019-04-01 20:18:00.339529
1000000 13107 2019-04-01 20:18:13.829641
1500000 18364 2019-04-01 20:18:26.433165
2000000 24764 2019-04-01 20:18:38.503569
2500000 31154 2019-04-01 20:18:50.281386
3000000 34702 2019-04-01 20:18:58.808097
3500000 39756 2019-04-01 20:19:09.941478
4000000 43891 2019-04-01 20:19:21.889133
4500000 49958 2019-04-01 20:19:33.441578
Total Strange: 52514
```

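* 6k — log printed by the (now commented-out) filtering code above, with the vocabulary limited to the first 6,000 CJK characters read from `token.tsv`. Columns: lines processed, cumulative out-of-vocabulary characters replaced, timestamp.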
```
0 0 2019-04-01 20:22:29.925298
500000 16943 2019-04-01 20:22:45.594240
1000000 32007 2019-04-01 20:22:58.979631
1500000 44940 2019-04-01 20:23:11.898943
2000000 60453 2019-04-01 20:23:23.856685
2500000 75492 2019-04-01 20:23:36.061031
3000000 83949 2019-04-01 20:23:44.742128
3500000 96150 2019-04-01 20:23:55.557540
4000000 106365 2019-04-01 20:24:07.696283
4500000 119614 2019-04-01 20:24:19.378625
Total Strange: 125211
```


In [None]:
# Locations of previously saved artifacts for the BOSON corpus; both paths are
# relative to the notebook's working directory.
Path2Pyramid = 'data/boson/char/Token3870/Pyramid'
Path2LGUnique = 'data/boson/char/Token3870/GrainUnique/'

# NOTE(review): pprint and BasicObject are already imported by the setup cell
# of section "4. Test"; they are repeated here, presumably so this cell can be
# run on its own.
from pprint import pprint
from nlptext.base import BasicObject

# Initialise BasicObject from the saved artifacts instead of re-reading the corpus.
# NOTE(review): the method name suggests pickle deserialisation, which can run
# arbitrary code -- only point these paths at trusted files.
BasicObject.INIT_FROM_PICKLE(Path2Pyramid, Path2LGUnique)