In [212]:
import re
import emoji
from text_cleaner import remove
from text_cleaner.processor.common import ASCII, SYMBOLS_AND_PUNCTUATION_EXTENSION, GENERAL_PUNCTUATION
from text_cleaner.processor.chinese import CHINESE, CHINESE_SYMBOLS_AND_PUNCTUATION
import math
import sys

In [213]:
# Regex applied by preprocess() after its per-character emoji filter.
# NOTE: the adjacent string literals below are concatenated into one pattern,
# so the trailing "+" quantifies only the last group (emoticons), not the
# whole alternation.
emoji_pattern = re.compile(
        u'(\U0001F1F2\U0001F1F4)|'       # Macau flag
        u'([\U0001F1E6-\U0001F1FF]{2})|' # flags: any two code points in U+1F1E6..U+1F1FF
        u'([\U0001F600-\U0001F64F])'     # emoticons
        "+", flags=re.UNICODE)
# Runs of ASCII letters and digits; preprocess() deletes every match.
en_pattern = "[a-zA-Z0-9]+"

In [214]:
def preprocess(s):
    """Strip emoji, ASCII alphanumerics and punctuation from ``s``.

    Steps, in order:
      1. drop every character that is a key of ``emoji.UNICODE_EMOJI``;
      2. delete matches of ``emoji_pattern`` and then of ``en_pattern``;
      3. pass the text through the three ``text_cleaner`` removers;
      4. collapse every run of whitespace into a single space and trim the ends.

    Args:
        s: Raw input text.

    Returns:
        The cleaned text.
    """
    # NOTE(review): the contents of emoji.UNICODE_EMOJI depend on the installed
    # `emoji` package version -- confirm it maps single emoji characters here.
    cleaned = ''.join(ch for ch in s if ch not in emoji.UNICODE_EMOJI)
    cleaned = emoji_pattern.sub('', cleaned)
    cleaned = re.sub(en_pattern, '', cleaned)
    removers = (
        SYMBOLS_AND_PUNCTUATION_EXTENSION,
        GENERAL_PUNCTUATION,
        CHINESE_SYMBOLS_AND_PUNCTUATION,
    )
    for remover in removers:
        cleaned = remover.remove(cleaned)
    return ' '.join(cleaned.split())

In [137]:
# Sample text mixing a flag, emoji, ASCII letters, punctuation and Chinese text.
s = "🇩🇪-消防车🚒//@app菌:哈哈哈哈哈哈哈哈 谈个恋爱还不够生气的//@一个阿呆仔:学猪叫那个真笑出猪声[允悲]//@梨 园西池水: 哈哈哈哈哈哈哈哈哈哈//@太皇太后您有喜啦:哈哈哈哈哈哈哈哈哈怎么这么好笑ˊ_>ˋ"
print(s)

🇩🇪-消防车🚒//@app菌:哈哈哈哈哈哈哈哈 谈个恋爱还不够生气的//@一个阿呆仔:学猪叫那个真笑出猪声[允悲]//@梨 园西池水: 哈哈哈哈哈哈哈哈哈哈//@太皇太后您有喜啦:哈哈哈哈哈哈哈哈哈怎么这么好笑ˊ_>ˋ


In [138]:
# Clean the sample text. `s` is rebound, so re-running this cell feeds the
# already-cleaned text back into preprocess().
s = preprocess(s)
# Notebook-level cap on n-gram length; get_segment() reads this global.
maxlen = 5
# Length of the cleaned text (not referenced again in the visible cells).
seqlen = len(s)
print(s)

消防车 菌 哈哈哈哈哈哈哈哈 谈个恋爱还不够生气的 一个阿呆仔 学猪叫那个真笑出猪声 允悲 梨 园西池水 哈哈哈哈哈哈哈哈哈哈 太皇太后您有喜啦 哈哈哈哈哈哈哈哈哈怎么这么好笑ˊ ˋ


In [193]:
def get_segment(s, max_len=None):
    """List the n-gram that starts at every character of every segment of ``s``.

    ``s`` is split on single spaces. Each segment is terminated with a ``"$"``
    end marker, and for every character position in the segment the substring
    of at most ``max_len`` characters starting there is collected. The marker
    counts towards the length, so n-grams that reach the end of a segment end
    in ``"$"``.

    Args:
        s: Text whose segments are separated by single spaces.
        max_len: Maximum n-gram length. When omitted, the notebook-level
            ``maxlen`` global is used, which is what this function originally
            always did.

    Returns:
        The n-grams in order of appearance; duplicates are kept. Empty
        segments (e.g. from consecutive spaces) contribute nothing.

    Raises:
        ValueError: If the effective maximum length is smaller than 1, which
            would otherwise yield empty n-grams.
    """
    if max_len is None:
        max_len = maxlen  # fall back to the notebook-level setting
    if max_len < 1:
        raise ValueError("max_len must be at least 1")

    words = []
    for segment in s.split(' '):
        marked = segment + "$"
        # Start only on real characters, never on the end marker itself;
        # slicing clamps at the end of the string, so no explicit min() is needed.
        words.extend(marked[start:start + max_len] for start in range(len(segment)))
    return words

In [194]:
# Build the '$'-terminated n-grams for the cleaned sample text.
segmentList = get_segment(s)
print(segmentList)

['消防车$', '防车$', '车$', '菌$', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈$', '哈哈哈$', '哈哈$', '哈$', '谈个恋爱还', '个恋爱还不', '恋爱还不够', '爱还不够生', '还不够生气', '不够生气的', '够生气的$', '生气的$', '气的$', '的$', '一个阿呆仔', '个阿呆仔$', '阿呆仔$', '呆仔$', '仔$', '学猪叫那个', '猪叫那个真', '叫那个真笑', '那个真笑出', '个真笑出猪', '真笑出猪声', '笑出猪声$', '出猪声$', '猪声$', '声$', '允悲$', '悲$', '梨$', '园西池水$', '西池水$', '池水$', '水$', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈$', '哈哈哈$', '哈哈$', '哈$', '太皇太后您', '皇太后您有', '太后您有喜', '后您有喜啦', '您有喜啦$', '有喜啦$', '喜啦$', '啦$', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈怎', '哈哈哈怎么', '哈哈怎么这', '哈怎么这么', '怎么这么好', '么这么好笑', '这么好笑ˊ', '么好笑ˊ$', '好笑ˊ$', '笑ˊ$', 'ˊ$', 'ˋ$']


In [195]:
# counterMap = {}
# for seg in segmentList:
#     if seg in counterMap.keys():
#         counterMap[seg] += 1
#     else:
#         counterMap[seg] = 1
# print(counterMap)

In [196]:
# Sort the n-grams; getRight() scans this list in order and groups
# consecutive entries by prefix.
ngramsort = sorted(segmentList)
print(ngramsort)

['ˊ$', 'ˋ$', '一个阿呆仔', '不够生气的', '个恋爱还不', '个真笑出猪', '个阿呆仔$', '么好笑ˊ$', '么这么好笑', '仔$', '允悲$', '出猪声$', '叫那个真笑', '后您有喜啦', '呆仔$', '哈$', '哈$', '哈哈$', '哈哈$', '哈哈哈$', '哈哈哈$', '哈哈哈哈$', '哈哈哈哈$', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈怎', '哈哈哈怎么', '哈哈怎么这', '哈怎么这么', '啦$', '喜啦$', '园西池水$', '声$', '够生气的$', '太后您有喜', '太皇太后您', '好笑ˊ$', '学猪叫那个', '怎么这么好', '恋爱还不够', '您有喜啦$', '悲$', '有喜啦$', '梨$', '气的$', '水$', '池水$', '消防车$', '爱还不够生', '猪叫那个真', '猪声$', '生气的$', '的$', '皇太后您有', '真笑出猪声', '笑ˊ$', '笑出猪声$', '菌$', '西池水$', '谈个恋爱还', '车$', '还不够生气', '这么好笑ˊ', '那个真笑出', '防车$', '阿呆仔$']


In [215]:
def getRight(ngramsort):
    """Write ``ngram freq entropy`` lines for a sorted n-gram list to ``freqright.data``.

    The list is scanned in order. Consecutive entries that start with the
    current n-gram are collected; when an entry no longer matches, one
    space-separated line is written for the current n-gram, where ``freq`` is
    the number of collected entries that extend it (exact duplicates are not
    counted) and ``entropy`` is the Shannon entropy (natural log) of the keys
    derived from those entries.

    Output goes through an explicit file handle that is closed on return;
    ``sys.stdout`` is left untouched, so later notebook cells keep printing
    to the notebook.

    Args:
        ngramsort: Iterable of n-gram strings, expected in sorted order.

    Returns:
        None. The result is the file ``freqright.data`` in the working directory.
    """
    outpath = "freqright.data"
    ngram = ""
    pause = False
    sameWord = []
    with open(outpath, 'wt') as out:
        for curr in ngramsort:
            if pause:
                break
            if ngram == "":
                # First entry (or an empty n-gram): start a new group.
                sameWord.append(curr)
                ngram = curr
                continue
            if curr.startswith(ngram):
                sameWord.append(curr)
                continue

            # `curr` no longer extends `ngram`: score the collected group.
            right = {}
            freq = 0
            for w in sameWord:
                if not w.startswith(ngram):
                    break
                if w == ngram:
                    continue
                freq += 1
                # NOTE(review): this key is the n-gram minus its last character,
                # identical for every `w`, so the entropy below is always 0.
                # The following character, w[len(ngram)], may have been
                # intended -- confirm before relying on the values.
                neww = w[:len(ngram)-1]
                right[neww] = right.get(neww, 0) + 1
            res = 0.0
            for count in right.values():
                p = count * 1.0 / freq
                res += -1 * p * math.log(p)
            print(ngram, freq, res, file=out)

            # Drop the scored n-gram (and its duplicates) from the group.
            sameWord = [w for w in sameWord if w != ngram]
            if not sameWord:
                sameWord.append(curr)
                ngram = curr
                continue
            ngram = sameWord[0]
            if curr.startswith(ngram):
                sameWord.append(curr)
            else:
                # NOTE(review): this stops the whole scan at the next iteration,
                # so the remaining entries are never scored -- confirm intent.
                pause = True
    # NOTE(review): a group is only written when a later entry arrives, so the
    # final group of the list is never emitted -- confirm whether intended.
# Run the right-side pass; the results are written to freqright.data.
getRight(ngramsort)

In [201]:
# Reverse each n-gram character by character. The list keeps ngramsort's
# order; it is not re-sorted here.
# NOTE(review): getLeft() groups consecutive entries by prefix, so this
# probably needs sorted(...) after reversing -- confirm.
revngramsort = [c[::-1] for c in ngramsort]
print(revngramsort)

['$ˊ', '$ˋ', '仔呆阿个一', '的气生够不', '不还爱恋个', '猪出笑真个', '$仔呆阿个', '$ˊ笑好么', '笑好么这么', '$仔', '$悲允', '$声猪出', '笑真个那叫', '啦喜有您后', '$仔呆', '$哈', '$哈', '$哈哈', '$哈哈', '$哈哈哈', '$哈哈哈', '$哈哈哈哈', '$哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '哈哈哈哈哈', '怎哈哈哈哈', '么怎哈哈哈', '这么怎哈哈', '么这么怎哈', '$啦', '$啦喜', '$水池西园', '$声', '$的气生够', '喜有您后太', '您后太皇太', '$ˊ笑好', '个那叫猪学', '好么这么怎', '够不还爱恋', '$啦喜有您', '$悲', '$啦喜有', '$梨', '$的气', '$水', '$水池', '$车防消', '生够不还爱', '真个那叫猪', '$声猪', '$的气生', '$的', '有您后太皇', '声猪出笑真', '$ˊ笑', '$声猪出笑', '$菌', '$水池西', '还爱恋个谈', '$车', '气生够不还', 'ˊ笑好么这', '出笑真个那', '$车防', '$仔呆阿']


In [216]:
def getLeft(revngramsort):
    """Write ``ngram freq entropy`` lines for a reversed n-gram list to ``freqleft.data``.

    The list is scanned in order. Consecutive entries that start with the
    current n-gram are collected; when an entry no longer matches, one
    space-separated line is written for the current n-gram, where ``freq`` is
    the number of collected entries that extend it (exact duplicates are not
    counted) and ``entropy`` is the Shannon entropy (natural log) of the keys
    derived from those entries. N-grams are written as given, i.e. still
    reversed.

    Output goes through an explicit file handle that is closed on return;
    ``sys.stdout`` is left untouched, so later notebook cells keep printing
    to the notebook.

    Args:
        revngramsort: Iterable of reversed n-gram strings. The prefix grouping
            only works on consecutive entries, so sorted order is expected.

    Returns:
        None. The result is the file ``freqleft.data`` in the working directory.
    """
    outpath = "freqleft.data"
    ngram = ""
    pause = False
    sameWord = []
    with open(outpath, 'wt') as out:
        for curr in revngramsort:
            if pause:
                break
            if ngram == "":
                # First entry (or an empty n-gram): start a new group.
                sameWord.append(curr)
                ngram = curr
                continue
            if curr.startswith(ngram):
                sameWord.append(curr)
                continue

            # `curr` no longer extends `ngram`: score the collected group.
            left = {}
            freq = 0
            for w in sameWord:
                if not w.startswith(ngram):
                    break
                if w == ngram:
                    continue
                freq += 1
                # NOTE(review): this key is the n-gram minus its last character,
                # identical for every `w`, so the entropy below is always 0.
                # The following character, w[len(ngram)], may have been
                # intended -- confirm before relying on the values.
                neww = w[:len(ngram)-1]
                left[neww] = left.get(neww, 0) + 1
            res = 0.0
            for count in left.values():
                p = count * 1.0 / freq
                res += -1 * p * math.log(p)
            print(ngram, freq, res, file=out)

            # Drop the scored n-gram (and its duplicates) from the group.
            sameWord = [w for w in sameWord if w != ngram]
            if not sameWord:
                sameWord.append(curr)
                ngram = curr
                continue
            ngram = sameWord[0]
            if curr.startswith(ngram):
                sameWord.append(curr)
            else:
                # NOTE(review): this stops the whole scan at the next iteration,
                # so the remaining entries are never scored -- confirm intent.
                pause = True
    # NOTE(review): a group is only written when a later entry arrives, so the
    # final group of the list is never emitted -- confirm whether intended.
# Run the left-side pass; the results are written to freqleft.data.
getLeft(revngramsort)

In [218]:
def mergeEntropy(rightpath, leftpath):
    
    with open(rightpath, 'r') as rf:
        rightline = rf.readlines() 
    with open(leftpath, 'r') as lf:
        leftline = lf.readlines() 
        
    outpath = "merge.tmp"
    sys.stdout = open(outpath, 'wt')
    newlines = rightline + leftline
    for line in rightline:
        print(line)
    for line in leftline:
        print(line)
        
    sortFile(mergetmp, mergetmp2)
    
    outpath = "merge_entropy.data"
    sys.stdout = open(outpath, 'wt')
    f = open(mergetmp2, 'r')
    line1 = ""
    line2 = ""
    line1 = f.readline()
    line2 = f.readline()
    while True:
        if line1 == "" or line2 == "":
            break
        seg1 = line1.split("\t")
        seg2 = line2.split("\t")
        if seg1[0] != seg2[0]
            line1 = line2
            line2 = f.readline()
            continue
        if len(seg1) < 2:
            line1 = line2
            line2 = f.readline()
            continue
            
        le = float(seg1[1]) if len(seg1) == 2 else float(seg2[1])
        re = float(seg1[2]) if len(seg1) == 3 else float(seg2[2])
        freq = int(seg1[1]) if len(seg1) == 3 else int(seg2[1])
        print(seg1[0],freq, e)
        line1 = f.readline()
        line2 = f.readline()
            
        
# Merge the right-side and left-side statistics files produced above.
mergeEntropy("freqright.data", "freqleft.data")

In [238]:
# Scratch experiment with chained comparisons.
t = False
i = 1
j = 1
# Parenthesised form: compares j with the boolean (t == True); the result is discarded.
j != (t == True)
# Without parentheses Python chains this as (j != t) and (t == True).
# With t starting as False the second half is False, so the loop body never runs.
while j != t == True:
    if i == 10: 
        t = True
    i += 1
    print(i)

In [242]:
# Scratch check: float() parses a numeric string.
float("1")

1.0

In [243]:
def extractWords(freqFile, entropyFile):
    """Filter candidate words by a frequency ratio and an entropy threshold.

    Both files are read as tab-separated lines with at least three fields:
    ``word, frequency, value``. Shorter lines are skipped. For every word in
    ``entropyFile`` all two-way splits ``left + right`` are tried; among the
    splits whose two parts both appear in ``freqFile`` the largest product of
    the parts' frequencies is taken. The word is written to ``words.data`` as
    a space-separated line ``word pf entropy`` when
    ``pf = frequency * 200000.0 / best_product`` is at least 10 and the
    entropy is at least 2.

    Fixes relative to the previous draft of this cell:
      * ``freq.keyw()`` (AttributeError) is replaced by a plain membership test;
      * the field-count guard was inverted (it skipped exactly the 3-field
        lines and then indexed ``seg[2]`` on the others);
      * the builtin ``max`` is no longer shadowed;
      * words with no usable split, or a best product of 0, are skipped
        explicitly instead of relying on a negative ratio or dividing by zero;
      * ``sys.stdout`` is no longer replaced and the output file is closed.

    NOTE(review): mergeEntropy() in this notebook prints space-separated
    fields, so its output would not parse here without a format change.

    Args:
        freqFile: Path of the tab-separated frequency file.
        entropyFile: Path of the tab-separated ``word, frequency, entropy`` file.

    Returns:
        None. The result is the file ``words.data`` in the working directory.
    """
    with open(freqFile, 'r') as fh:
        freq_lines = fh.readlines()
    with open(entropyFile, 'r') as fh:
        entropy_lines = fh.readlines()

    freq = {}
    for line in freq_lines:
        seg = line.split("\t")
        if len(seg) < 3:
            continue
        freq[seg[0]] = int(seg[1])

    outpath = "words.data"
    with open(outpath, 'wt') as out:
        for line in entropy_lines:
            seg = line.split("\t")
            if len(seg) < 3:
                continue
            w = seg[0]
            count = int(seg[1])
            e = float(seg[2])

            # Largest frequency product over all splits with both parts known.
            best = -1
            for cut in range(1, len(w)):
                lw, rw = w[:cut], w[cut:]
                if lw not in freq or rw not in freq:
                    continue
                best = max(best, freq[lw] * freq[rw])
            if best <= 0:
                # No usable split; the draft dropped these via a negative ratio.
                continue

            pf = count * 200000.0 / best
            if pf < 10 or e < 2:
                continue
            print(w, pf, e, file=out)

In [256]:
import re
# Raw string: "\d" and "\." are regex escapes, not Python string escapes. In a
# normal string literal they are invalid escape sequences and trigger a warning
# on current Python versions. The pattern text itself is unchanged.
numeric_const_pattern = r"[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?"
# Strip the numeric fields from a sample line, then remove all whitespace.
one_str = re.sub(numeric_const_pattern, '', "-1.7605603	无 法 从 三	-0.08918797")
one_str = ''.join(one_str.split())
print(one_str)

In [257]:
# Display the cleaned string computed in the previous cell.
one_str

'无法从三'

In [1]:
# Scratch check: int() parses a decimal string.
int("1")

1

In [2]:
# Scratch check: round-trip a decimal string through int() and back to str().
str(int("34"))

'34'