In [67]:
# -*- coding: utf-8 -*-
"""包导入"""
from simhash import Simhash
import jieba
import jieba.posseg as pseg
from jieba import analyse
import numpy as np
import os
import matplotlib.pyplot as plt 
import matplotlib as mpl
import json
from math import log
import re
import datetime
import logging

In [68]:
"""文本相似度计算类"""
class TextSimilarity(object):
    """Collection of similarity measures between two text strings.

    Character-level measures (``lcs``, ``minimumEditDistance``,
    ``levenshteinDistance``) work directly on the strings.  Word-level
    measures (``JaccardSim``, ``splitWordSimlaryty``) tokenize with jieba
    first.  ``cos_sim``, ``eucl_sim`` and ``pers_sim`` compare two numeric
    vectors of equal length.
    """

    def __init__(self):
        """Create an instance with empty ``str_a`` / ``str_b`` attributes.

        The constructor takes no arguments (no file-based initialization).
        """
        self.str_a = ''
        self.str_b = ''

    def lcs(self, str_a, str_b):
        """Return the LCS ratio of two strings.

        The ratio is ``len(longest common subsequence) / min(len(str_a),
        len(str_b))``.  Two empty strings are treated as identical (1.0);
        exactly one empty string yields 0.0 instead of a division by zero.
        """
        if not str_a or not str_b:
            return 1.0 if not str_a and not str_b else 0.0

        # Classic dynamic programme, keeping only the previous table row:
        # previous[j] is the LCS length of the processed prefix of str_a
        # and str_b[:j].  Only the length is needed, so no backtracking.
        previous = [0] * (len(str_b) + 1)
        for char_a in str_a:
            current = [0]
            for col, char_b in enumerate(str_b):
                if char_a == char_b:
                    current.append(previous[col] + 1)
                else:
                    current.append(max(current[col], previous[col + 1]))
            previous = current

        return previous[-1] / min(len(str_a), len(str_b))

    def minimumEditDistance(self, str_a, str_b):
        """Return a similarity ratio based on the minimum edit distance.

        Substitution, insertion and deletion each cost 1.  The result is
        ``(len(str_a) + len(str_b) - distance) / (len(str_a) + len(str_b))``.
        Two empty strings are treated as identical and give 1.0.
        """
        total_length = len(str_a) + len(str_b)
        if total_length == 0:
            return 1.0

        # Make str_a the shorter string so the rolling row stays small.
        if len(str_a) > len(str_b):
            str_a, str_b = str_b, str_a

        distances = list(range(len(str_a) + 1))
        for row, char_b in enumerate(str_b, start=1):
            new_distances = [row]
            for col, char_a in enumerate(str_a):
                if char_a == char_b:
                    # Matching characters: carry the diagonal value over.
                    new_distances.append(distances[col])
                else:
                    new_distances.append(1 + min(
                        distances[col],      # substitute char_a by char_b
                        distances[col + 1],  # skip char_b
                        new_distances[-1],   # skip char_a
                    ))
            distances = new_distances

        return (total_length - distances[-1]) / total_length

    def levenshteinDistance(self, str1, str2):
        """Return a similarity ratio from a weighted Levenshtein distance.

        Insertions and deletions cost 1, substitutions cost 2.  The result
        is ``(len(str1) + len(str2) - distance) / (len(str1) + len(str2))``.
        Two empty strings are treated as identical and give 1.0.
        """
        total_length = len(str1) + len(str2)
        if total_length == 0:
            return 1.0

        # previous[j] is the distance between the processed prefix of str1
        # and str2[:j]; the first row is the cost of inserting j characters.
        previous = list(range(len(str2) + 1))
        for i, char1 in enumerate(str1, start=1):
            current = [i]
            for j, char2 in enumerate(str2, start=1):
                if char1 == char2:
                    current.append(previous[j - 1])
                else:
                    current.append(min(previous[j] + 1,
                                       current[j - 1] + 1,
                                       previous[j - 1] + 2))
            previous = current

        return (total_length - previous[-1]) / total_length

    @classmethod
    def splitWords(cls, str_a):
        """Tokenize ``str_a`` with ``jieba.posseg``.

        Returns a two-element list: the tokens joined into one string with
        a space after every token, and the set of distinct tokens.
        """
        words = [token.word for token in pseg.cut(str_a)]
        return ["".join(word + " " for word in words), set(words)]

    def JaccardSim(self, str_a, str_b):
        """Return the Jaccard coefficient of the two strings' token sets.

        Computed as ``len(A & B) / len(A | B)``.  When both token sets are
        empty the strings are treated as identical and 1.0 is returned
        instead of dividing by zero.
        """
        seta = self.splitWords(str_a)[1]
        setb = self.splitWords(str_b)[1]

        union = seta | setb
        if not union:
            return 1.0
        return 1.0 * len(seta & setb) / len(union)

    def countIDF(self, text, topK):
        """Return the in-text frequencies of the top TF-IDF keywords.

        ``text`` is tokenized with ``jieba.cut`` to count how often every
        token occurs; ``jieba.analyse.extract_tags`` then selects up to
        ``topK`` keywords.  The returned list holds the occurrence count
        of each selected keyword, in the order ``extract_tags`` returned
        them.
        """
        frequencies = {}
        for word in jieba.cut(text):
            frequencies[word] = frequencies.get(word, 0) + 1

        keywords = analyse.extract_tags(text, topK, withWeight=True)
        return [frequencies[word] for word, _weight in keywords]

    @staticmethod
    def _normalized_dot(a, b):
        """Return ``a.b / (|a| * |b|)``, or 0.0 when either norm is zero."""
        denominator = np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2))
        if denominator == 0:
            # The quotient is undefined here; report "no similarity"
            # rather than NaN.
            return 0.0
        return np.sum(a * b) / denominator

    @staticmethod
    def cos_sim(a, b):
        """Return the cosine similarity of two equal-length vectors.

        A zero vector on either side yields 0.0.
        """
        return TextSimilarity._normalized_dot(np.array(a), np.array(b))

    @staticmethod
    def eucl_sim(a, b):
        """Return ``1 / (1 + d)`` where ``d`` is the Euclidean distance.

        The distance is the square root of the sum of squared differences,
        so differences of opposite sign no longer cancel each other out.
        """
        a = np.array(a)
        b = np.array(b)
        return 1 / (1 + np.sqrt(np.sum((a - b) ** 2)))

    @staticmethod
    def pers_sim(a, b):
        """Return the Pearson correlation of two equal-length vectors.

        Both vectors are centered on their mean and then compared like in
        ``cos_sim``.  A constant vector on either side yields 0.0.
        """
        a = np.array(a)
        b = np.array(b)
        return TextSimilarity._normalized_dot(a - np.average(a),
                                              b - np.average(b))

    def splitWordSimlaryty(self, str_a, str_b, topK=20, sim=None):
        """Compare two texts through their keyword-frequency vectors.

        ``countIDF`` builds one frequency vector per text (up to ``topK``
        entries each) and ``sim`` scores the pair.  ``sim`` defaults to
        ``cos_sim``; it is resolved at call time because a ``staticmethod``
        object captured as a default inside the class body is not callable
        on Python versions before 3.10.

        When the texts yield a different number of keywords the shorter
        vector is padded with zeros so the vector operations stay valid.

        NOTE(review): position ``k`` of each vector refers to that text's
        own k-th keyword, which need not be the same word in both texts --
        confirm this is the intended comparison.
        """
        if sim is None:
            sim = self.cos_sim

        vec_a = self.countIDF(str_a, topK)
        vec_b = self.countIDF(str_b, topK)

        width = max(len(vec_a), len(vec_b))
        vec_a = vec_a + [0] * (width - len(vec_a))
        vec_b = vec_b + [0] * (width - len(vec_b))

        return sim(vec_a, vec_b)

    @staticmethod
    def string_hash(self, source):
        """Hash ``source`` into a 64-character string of '0'/'1'.

        Returns the integer ``0`` (not a string) for the empty string.

        NOTE(review): the method is a ``staticmethod`` that still declares
        a leading ``self`` parameter, so every caller has to pass a dummy
        first argument.  The signature and the int-vs-str return are kept
        unchanged for compatibility; confirm with callers before cleaning
        them up.
        """
        if source == "":
            return 0

        multiplier = 1000003      # large prime
        mask = 2 ** 128 - 1       # keep the running value within 128 bits
        x = ord(source[0]) << 7
        for c in source:
            x = ((x * multiplier) ^ ord(c)) & mask
        x ^= len(source)
        # x is never negative after masking, so no sign fix-up is needed.

        # Keep the lowest 64 bits, left-padded with zeros.
        return bin(x).replace('0b', '').zfill(64)[-64:]

    def simhash(self, str_a, str_b):
        """Placeholder for a simhash-based similarity; not implemented.

        Currently does nothing and returns ``None``.
        """
        pass

In [42]:
"""具体逻辑"""
def isJson(jsonstr):
    """Return True when ``jsonstr`` can be parsed as JSON, else False.

    Only parsing failures are treated as "not JSON": ``ValueError`` (which
    includes ``json.JSONDecodeError``), ``TypeError`` for inputs that are
    not str/bytes, and ``RecursionError`` for pathologically nested input.
    Other exceptions such as ``KeyboardInterrupt`` propagate.
    """
    try:
        json.loads(jsonstr)
    except (ValueError, TypeError, RecursionError):
        return False
    return True
    
# Collect the "xianbingshi" field of every record whose "subject_name"
# contains "呼吸", reading at most the first 100000 lines of the file
# (one JSON document per line).
with open("all_result_break.json", 'r', encoding="utf8") as load_f:
    xianbingshi = []
    for line_number, line in enumerate(load_f):
        if line_number >= 100000:
            break
        # Parse once; lines that are not valid JSON are skipped.
        try:
            json_to_dic = json.loads(line)
        except (ValueError, RecursionError):
            continue
        # Only JSON objects can carry the fields checked below.
        if not isinstance(json_to_dic, dict):
            continue

        if "subject_name" not in json_to_dic:
            continue
        if "呼吸" not in json_to_dic["subject_name"]:
            continue
        if "xianbingshi" not in json_to_dic:
            continue
        xianbingshi.append(json_to_dic["xianbingshi"])

# The file is closed by the ``with`` block above; the comparison below only
# needs the collected strings.
text_similarity_manager = TextSimilarity()
if len(xianbingshi) > 2:
    # The third collected record is the reference every other record is
    # compared against.
    reference = xianbingshi[2]
    reference_simhash = Simhash(reference)
    print(reference)
    print("-"*100)
    for item in xianbingshi:
        temp_similarity_JaccardSim = text_similarity_manager.JaccardSim(reference, item)
        # Report records that overlap noticeably with the reference but are
        # not scored as identical to it.
        if temp_similarity_JaccardSim >= 0.3 and temp_similarity_JaccardSim != 1.0:
            print(item)
            print("JaccardSim", ":", temp_similarity_JaccardSim)
            # The edit distance is only computed for records that are
            # actually reported.
            print("minimumEditDistance", ":",
                  text_similarity_manager.minimumEditDistance(reference, item))
            print("simhash_distance", ":", reference_simhash.distance(Simhash(item)))
            print("-"*100)
else:
    print("fewer than 3 matching records; nothing to compare")

患儿1周前无诱因下发热，热峰39.0℃，无咳嗽流涕，无吐泻，当地诊所予静脉输液3天，发烧间隔稍有延长，于人民就诊，予静滴头孢呋辛1天，体温仍有反复升高，精神欠佳，发热时有头痛，无呕吐，热退后无头痛，昨日于我院就诊，查血常规白细胞12.09×109/L，中性粒细胞%77.3%，静滴头孢唑肟、补液1次，拟“发热精神差”入院。病程中，患儿精神进食差，睡眠一般，大小便正常。
----------------------------------------------------------------------------------------------------
患儿1周前在无明显诱因下出现发热，最高体温达39.5，偶有咳嗽，咳痰，呈白色粘痰，无腹泻、呕吐，自行服用“头孢克肟、小葵花颗粒”3天，患儿体温正常，咳嗽咳痰减轻，2天前早晨患儿出现头痛伴有呕吐，呕吐物为胃内容物，进食差，无腹泻，伴发热，精神差，就诊于省立儿童医院，血检提示血象正常，中性粒细胞比例升高，予静脉用药“喜炎平、奥美拉唑”1次，患儿仍有发热，呕吐，昨日就诊于我院门诊，予以、补液等治疗后，患儿呕吐消失，体温正常，进食可，门诊拟“发热伴呕吐待查”，病程中患儿精神差，进食差，睡眠可，大小便减少。
JaccardSim : 0.33064516129032256
minimumEditDistance : 0.5609195402298851
simhash_distance : 32
----------------------------------------------------------------------------------------------------
患儿2周前无明显诱因下发热，最高达38.5℃，每次晚上发热1次，具体不清，自行可退热，无咳嗽流涕，无鼻塞，近2天再次发热，于就诊，查血常规白细胞15.7×109/L，血红蛋白72g/L，血小板45×109/L，中性粒细胞(N)%4.9%，淋巴细胞比率%84.70%，遂来我院就诊，查血常规白细胞12.45×109/L，血红蛋白63g/L，血小板51×109/L，中性粒细胞(N)%5.5%，淋巴细胞比率%85.5%，拟“呼吸道感染三系减少原因待查”收住我科。病程中，患儿精神睡眠可，进食一般，大小便正常，近期体重无明显减轻。
Ja

In [71]:
"""使用simhash的方法计算相似度"""
def isJson(jsonstr):
    """Return True when ``jsonstr`` can be parsed as JSON, else False.

    Only parsing failures are treated as "not JSON": ``ValueError`` (which
    includes ``json.JSONDecodeError``), ``TypeError`` for inputs that are
    not str/bytes, and ``RecursionError`` for pathologically nested input.
    Other exceptions such as ``KeyboardInterrupt`` propagate.
    """
    try:
        json.loads(jsonstr)
    except (ValueError, TypeError, RecursionError):
        return False
    return True
    
# Collect the "xianbingshi" field of every record whose "subject_name"
# contains "呼吸", reading at most the first 30000 lines of the file
# (one JSON document per line).
with open("all_result_break.json", 'r', encoding="utf8") as load_f:
    xianbingshi = []
    for line_number, line in enumerate(load_f):
        if line_number >= 30000:
            break
        # Parse once; lines that are not valid JSON are skipped.
        try:
            json_to_dic = json.loads(line)
        except (ValueError, RecursionError):
            continue
        # Only JSON objects can carry the fields checked below.
        if not isinstance(json_to_dic, dict):
            continue

        if "subject_name" not in json_to_dic:
            continue
        if "呼吸" not in json_to_dic["subject_name"]:
            continue
        if "xianbingshi" not in json_to_dic:
            continue
        xianbingshi.append(json_to_dic["xianbingshi"])

# The file is closed by the ``with`` block above.  The timestamps printed
# below bracket the two phases: building the fingerprints and the pairwise
# scan.
print("现病史数据个数", ":", len(xianbingshi))
print(datetime.datetime.now())
# One Simhash object per collected text, in the same order as xianbingshi.
simhash_list = [Simhash(item) for item in xianbingshi]
print(datetime.datetime.now())

# Compare every unordered pair exactly once (i_y > i_x).
# NOTE(review): a simhash distance counts differing fingerprint bits, so
# ">= 28" selects the LESS similar pairs -- confirm the direction of this
# threshold is intended.
# NOTE(review): logging is not configured in the code shown here; with the
# logging module's defaults INFO records are not emitted -- confirm a
# handler/level is set up elsewhere if this output is expected.
for i_x, x in enumerate(simhash_list):
    for i_y in range(i_x + 1, len(simhash_list)):
        distance = x.distance(simhash_list[i_y])
        if distance >= 28:
            logging.info("#"*50)
            logging.info(xianbingshi[i_x])
            logging.info(xianbingshi[i_y])
            logging.info(distance)
            logging.info("-"*50)
print(datetime.datetime.now())

现病史数据个数 : 1634
2018-06-27 16:32:19.222858
2018-06-27 16:32:26.482752
2018-06-27 16:32:49.860203
