In [1]:
#! python3
# -*- coding: utf-8 -*-

In [2]:
import os, codecs
import jieba
from collections import Counter
import re
import sys
from xlutils.copy import copy 
from xlrd import open_workbook 
from tqdm import tqdm

In [3]:
# preprocessing

def filter (s):
    """Reduce raw text to the characters that matter for word segmentation.

    Three passes are applied in order:
      1. every run of whitespace is deleted (not replaced by a space);
      2. ASCII digits and a fixed set of ASCII / full-width punctuation
         characters are deleted;
      3. ASCII letters are deleted.

    Args:
        s: the text to clean.

    Returns:
        The cleaned string (possibly empty).

    Note: the name shadows the built-in ``filter``; it is kept because other
    cells call it by this name.
    """
    without_space = re.sub(r"\s{1,}", "", s)
    # Raw string: the pattern reaches the regex engine exactly as before, but
    # Python no longer has to guess at unknown escape sequences in the literal.
    without_punct = re.sub(r"[0-9\□\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\！\@\#\\\&\*\%\（\）\、\- \- \，\。\？\！\；\：\‘\’\“\”\{\}\【\】\￥\=\-\——|《\》\`\~]",
                "", without_space)
    # A character class is required here; without the brackets the pattern
    # only matches the literal six-character text and letters are never removed.
    return re.sub(r"[A-Za-z]", "", without_punct)

def get_stop_list (path):
    """Read a UTF-8 stop-word file into a flat list of strings.

    Each line has its newline characters stripped and is then split on
    ``,``; the pieces of all lines are concatenated in file order. Empty
    lines therefore contribute an empty string.

    Args:
        path: location of the stop-word file.

    Returns:
        list of str, one entry per comma-separated piece.
    """
    with open (path, 'r', encoding='UTF-8') as handle:
        return [word
                for raw_line in handle
                for word in raw_line.strip('\n').split(',')]

def delete_stop_words (seg, stop_list):
    """Remove every stop word from ``seg`` in place.

    The list is rebuilt in a single pass instead of calling ``remove`` while
    iterating: removing during iteration shifts the remaining items left, so
    the element following each removed one was skipped and runs of
    consecutive stop words were only partially deleted.

    Args:
        seg: list of tokens; it is modified in place.
        stop_list: iterable of stop-word strings.

    Returns:
        The same list object ``seg``, now without any token that occurs in
        ``stop_list``. Order of the remaining tokens is preserved.
    """
    # Set lookup keeps the pass linear in len(seg).
    stop_words = set(stop_list)
    # Slice assignment keeps the caller's list object (in-place contract).
    seg[:] = [token for token in seg if token not in stop_words]
    return seg

In [4]:
# get word lists

def get_disease_list(path):
    """Build the disease word list by segmenting the whole file at ``path``.

    The file is read as UTF-8, all whitespace (including line breaks) is
    removed, and the remaining text is passed to ``get_words``. The first
    resulting token is discarded.

    Args:
        path: location of the disease-term file.

    Returns:
        list of tokens produced by ``get_words`` minus the first one.

    Raises:
        IndexError: if segmentation yields no tokens.
    """
    # NOTE(review): the first token is presumably a header or BOM artefact -
    # confirm against the actual file.
    # NOTE(review): segmentation depends on jieba's dictionary at call time -
    # confirm whether user dictionaries are meant to be loaded first.
    with codecs.open (path, 'r', encoding='UTF-8') as handle:
        raw_text = handle.read()
    compact_text = re.sub(r"\s{1,}", "", raw_text)
    tokens = get_words(compact_text)[0]
    del tokens[0]
    return tokens

def get_uncertain_list (path):
    """Load the uncertainty word list from a comma-separated UTF-8 file.

    Every line is stripped of newline characters and split on ``,``; the
    pieces are flattened in file order and the very first piece is dropped.

    Args:
        path: location of the word-list file.

    Returns:
        list of str without the first entry of the file.

    Raises:
        IndexError: if the file contains no lines.
    """
    with open (path, 'r', encoding = 'UTF-8') as handle:
        entries = [piece
                   for raw_line in handle
                   for piece in raw_line.strip('\n').split(',')]
    # NOTE(review): first entry is presumably a header - confirm.
    del entries[0]
    return entries

def get_positive_list (path):
    """Load the positive-sentiment word list from a comma-separated UTF-8 file.

    Lines are stripped of newline characters, split on ``,`` and appended to
    one flat list in file order; the first entry of that list is removed.

    Args:
        path: location of the word-list file.

    Returns:
        list of str without the first entry of the file.

    Raises:
        IndexError: if the file contains no lines.
    """
    entries = []
    with open (path, 'r', encoding = 'UTF-8') as handle:
        for raw_line in handle:
            entries.extend(raw_line.strip('\n').split(','))
    # NOTE(review): first entry is presumably a header - confirm.
    del entries[0]
    return entries

def get_negative_list (path):
    """Load the negative-sentiment word list from a comma-separated UTF-8 file.

    Each line (newline characters stripped) is split on ``,``; all pieces are
    concatenated in file order and the first one is deleted.

    Args:
        path: location of the word-list file.

    Returns:
        list of str without the first entry of the file.

    Raises:
        IndexError: if the file contains no lines.
    """
    with open (path, 'r', encoding = 'UTF-8') as handle:
        split_lines = [raw_line.strip('\n').split(',') for raw_line in handle]
    entries = []
    for pieces in split_lines:
        entries += pieces
    # NOTE(review): first entry is presumably a header - confirm.
    del entries[0]
    return entries

def get_word_lists (d_path, u_path, p_path, n_path):
    """Load the four word lists used by the analysis.

    Args:
        d_path: file read by ``get_disease_list``.
        u_path: file read by ``get_uncertain_list``.
        p_path: file read by ``get_positive_list``.
        n_path: file read by ``get_negative_list``.

    Returns:
        Tuple ``(d_list, u_list, p_list, n_list)``, loaded in that order.
    """
    return (
        get_disease_list(d_path),
        get_uncertain_list(u_path),
        get_positive_list(p_path),
        get_negative_list(n_path),
    )

In [5]:
# word segmentation

def get_words(txt):
    """Segment ``txt`` with ``jieba.cut``.

    Args:
        txt: text to segment.

    Returns:
        Tuple ``(seg_list, seg)``: the tokens as a list, and the object
        returned by ``jieba.cut``. That object has already been iterated to
        build the list, so callers should rely on ``seg_list``.
    """
    token_stream = jieba.cut(txt)
    tokens = [token for token in token_stream]
    return tokens, token_stream

def total_words(seg_list, top, name):
    """Count the tokens of ``seg_list`` that are longer than one character.

    Tokens of length 0 or 1 and the token ``'\\r\\n'`` are ignored.

    Args:
        seg_list: iterable of token strings.
        top: not used by the computation; kept for call compatibility.
        name: not used by the computation; kept for call compatibility.

    Returns:
        Tuple ``(distinct, counts, total)`` where ``distinct`` is the number
        of different counted tokens, ``counts`` is the ``Counter`` itself and
        ``total`` is the sum of all counts.
    """
    counts = Counter(
        token
        for token in seg_list
        if len(token) > 1 and token != '\r\n'
    )
    total = sum(counts.values())
    return len(counts), counts, total

def show_frequency (seg_list,top):
    """Print a small frequency report for ``seg_list``.

    Prints the number of distinct counted tokens, the total count, and the
    ``top`` most common tokens, each followed by a bar of ``*`` whose length
    is the count divided by three (truncated).

    Args:
        seg_list: iterable of token strings, as accepted by ``total_words``.
        top: how many of the most common tokens to list.

    Returns:
        None.
    """
    # total_words requires three positional arguments; its last one is not
    # used in the computation, so None is passed. Calling it with only
    # seg_list raised TypeError.
    le, c, total = total_words (seg_list, top, None)
    print ("本文总计出现 ", le, "个不同的词\n")
    print ("总词数为", total, "\n")
    print ('常用词频度统计结果')
    for (k,v) in c.most_common(top):
        print('%s%s %s  %d' % ('  '*(5-len(k)), k, '*'*int(v/3), v))

In [6]:
# dictionary augmentation

def dic_augmentation (path_d, path_u, path_p, path_n):
    """Register four files with jieba as user dictionaries.

    ``jieba.load_userdict`` is called once per path, in argument order.

    Args:
        path_d: first dictionary file to load.
        path_u: second dictionary file to load.
        path_p: third dictionary file to load.
        path_n: fourth dictionary file to load.

    Returns:
        None.
    """
    for dict_path in (path_d, path_u, path_p, path_n):
        jieba.load_userdict(dict_path)

In [7]:
# calculation

def get_d_exposure (seg_list, d_list, total):
    """Locate disease words in ``seg_list`` and compute their share.

    Args:
        seg_list: sequence of tokens.
        d_list: collection of disease words; membership is tested with ``in``.
        total: denominator for the ratio.

    Returns:
        Tuple ``(d_total, d_exposure, d_index)``: the number of matching
        tokens, that number divided by ``total``, and the positions of the
        matches in ``seg_list`` in ascending order.

    Raises:
        ZeroDivisionError: if ``total`` is 0.
    """
    d_index = [pos for pos, token in enumerate(seg_list) if token in d_list]
    d_total = len(d_index)
    return d_total, d_total / total, d_index

def get_d_risk (d_index, seg_list, d_list, u_list, ran, total):
    """Count uncertainty words near each disease-word position.

    For every position in ``d_index`` the window of ``ran`` tokens on each
    side (clipped to the bounds of ``seg_list``) is scanned, and every token
    found in ``u_list`` adds one. Overlapping windows count the same token
    once per window.

    Args:
        d_index: positions in ``seg_list`` to centre the windows on.
        seg_list: sequence of tokens.
        d_list: not used; kept for call compatibility.
        u_list: collection of uncertainty words.
        ran: window radius in tokens.
        total: denominator for the ratio.

    Returns:
        Tuple ``(d_risk_total, d_risk)``: the hit count and the hit count
        divided by ``total``.

    Raises:
        ZeroDivisionError: if ``total`` is 0.
    """
    seg_len = len(seg_list)
    hits = sum(
        1
        for center in d_index
        for pos in range(max(center - ran, 0), min(seg_len, center + ran + 1))
        if seg_list[pos] in u_list
    )
    return hits, hits / total

def get_p_sentiment (d_index, seg_list, d_list, p_list, ran, total):
    """Count positive words near each disease-word position.

    Each position in ``d_index`` defines a window reaching ``ran`` tokens to
    either side, clipped to ``seg_list``; every token in a window that occurs
    in ``p_list`` adds one. A token covered by several windows is counted
    once per window.

    Args:
        d_index: positions in ``seg_list`` to centre the windows on.
        seg_list: sequence of tokens.
        d_list: not used; kept for call compatibility.
        p_list: collection of positive words.
        ran: window radius in tokens.
        total: denominator for the ratio.

    Returns:
        Tuple ``(p_sentiment_total, p_sentiment)``: the hit count and the hit
        count divided by ``total``.

    Raises:
        ZeroDivisionError: if ``total`` is 0.
    """
    hits = 0
    seg_len = len(seg_list)
    for center in d_index:
        window_start = max(center - ran, 0)
        window_stop = min(seg_len, center + ran + 1)
        for pos in range(window_start, window_stop):
            # A bool adds as 0 or 1, so this increments only on a match.
            hits += (seg_list[pos] in p_list)
    return hits, hits / total

def get_n_sentiment (d_index, seg_list, d_list, n_list, ran, total):
    """Count negative words near each disease-word position.

    A window of ``ran`` tokens on each side of every position in ``d_index``
    (clipped to ``seg_list``) is inspected; each token found in ``n_list``
    adds one, once per window that contains it.

    Args:
        d_index: positions in ``seg_list`` to centre the windows on.
        seg_list: sequence of tokens.
        d_list: not used; kept for call compatibility.
        n_list: collection of negative words.
        ran: window radius in tokens.
        total: denominator for the ratio.

    Returns:
        Tuple ``(n_sentiment_total, n_sentiment)``: the hit count and the hit
        count divided by ``total``.

    Raises:
        ZeroDivisionError: if ``total`` is 0.
    """
    seg_len = len(seg_list)

    def window_hits(center):
        # Number of negative words inside the clipped window around center.
        positions = range(max(center - ran, 0), min(seg_len, center + ran + 1))
        return len([pos for pos in positions if seg_list[pos] in n_list])

    hits = sum(window_hits(center) for center in d_index)
    return hits, hits / total

def get_d_sentiment (p_sentiment_total, n_sentiment_total, total):
    """Compute the net sentiment: positive hits minus negative hits.

    Args:
        p_sentiment_total: positive hit count.
        n_sentiment_total: negative hit count.
        total: denominator for the ratio.

    Returns:
        Tuple ``(d_sentiment_total, d_sentiment)``: the difference and the
        difference divided by ``total``.

    Raises:
        ZeroDivisionError: if ``total`` is 0.
    """
    net = p_sentiment_total - n_sentiment_total
    return net, net / total


In [8]:
# write to .xls excel file

def write_to_excel (path, start_row, name, words, total, d_exposure_total,
                    d_exposure, d_risk_total, d_risk, p_sentiment_total, p_sentiment, n_sentiment_total, n_sentiment, d_sentiment_total,
                    d_sentiment):
    """Write one result row into the first sheet of the workbook at ``path``.

    The workbook is opened with ``open_workbook``, duplicated with ``copy``,
    the thirteen values are written to row ``start_row`` in columns 1 to 13
    (in parameter order, ``name`` first and ``d_sentiment`` last), and the
    copy is saved back to ``path``.

    Args:
        path: workbook file to read and overwrite.
        start_row: row index to write to.
        name .. d_sentiment: the values for columns 1 to 13.

    Returns:
        None.
    """
    rb = open_workbook (path)
    # Kept from the original flow; the returned sheet itself is not used.
    rb.sheet_by_index(0)
    wb = copy(rb)
    w_sheet = wb.get_sheet(0)

    row_values = (
        name,               # column 1
        words,              # column 2
        total,              # column 3
        d_exposure_total,   # column 4
        d_exposure,         # column 5
        d_risk_total,       # column 6
        d_risk,             # column 7
        p_sentiment_total,  # column 8
        p_sentiment,        # column 9
        n_sentiment_total,  # column 10
        n_sentiment,        # column 11
        d_sentiment_total,  # column 12
        d_sentiment,        # column 13
    )
    for column, value in enumerate(row_values, start=1):
        w_sheet.write(start_row, column, value)

    wb.save(path)

In [9]:
# file name processing

def get_name (path):
    """Split a file path into its file name and the name field inside it.

    The name field is the second ``_``-separated piece of the file name.

    Args:
        path: path to an input file.

    Returns:
        Tuple ``(tail, name)``: the final path component and its second
        underscore-separated piece.

    Raises:
        IndexError: if the file name contains no ``_``.
    """
    tail = os.path.basename(path)
    pieces = tail.split("_")
    return tail, pieces[1]

In [10]:
# load necessary lists

def load_lists (d_path="D:\\Activities\\NLP_RA\\COVID.txt",
                u_path="D:\\Activities\\NLP_RA\\Unc.txt",
                p_path="D:\\Activities\\NLP_RA\\Pos.txt",
                n_path="D:\\Activities\\NLP_RA\\Neg.txt",
                stop_path="D:\\Activities\\NLP_RA\\Stopwords\\stopwords-master\\cn_stopwords.txt"):
    """Load all word lists and register the dictionaries with jieba.

    The file locations are parameters so the notebook can run on another
    machine without editing this function; called without arguments it uses
    the same locations as before.

    Args:
        d_path: disease-term file.
        u_path: uncertainty word file.
        p_path: positive word file.
        n_path: negative word file.
        stop_path: stop-word file.

    Returns:
        Tuple ``(d_list, u_list, p_list, n_list, s_list)``.
    """
    d_list, u_list, p_list, n_list = get_word_lists(d_path, u_path, p_path, n_path)

    s_list = get_stop_list(stop_path)

    # NOTE(review): the user dictionaries are loaded after the word lists have
    # been built, so the disease list is segmented without them - confirm this
    # order is intended before changing it, since it affects the results.
    dic_augmentation (d_path, u_path, p_path, n_path)

    return d_list, u_list, p_list, n_list, s_list

In [19]:
def run(path, start_row, excel_path, d_list, u_list, p_list, n_list, s_list):
    """Analyse one text file and record its metrics.

    The file is read as UTF-8, cleaned with ``filter``, segmented with
    ``get_words`` and stripped of stop words. Exposure, risk and sentiment
    figures are then computed with a window radius of 10 tokens, printed,
    and written to row ``start_row`` of the workbook at ``excel_path``.

    Args:
        path: input text file.
        start_row: workbook row index for this file's results.
        excel_path: workbook passed to ``write_to_excel``.
        d_list: disease words.
        u_list: uncertainty words.
        p_list: positive words.
        n_list: negative words.
        s_list: stop words.

    Returns:
        None.

    Raises:
        ZeroDivisionError: propagated from the metric helpers when the
            counted word total is 0; nothing is printed or written then.
    """
    _, name = get_name(path)

    with codecs.open(path, 'r', encoding = 'UTF-8') as source:
        cleaned_text = filter (source.read())

    tokens = delete_stop_words(get_words(cleaned_text)[0], s_list)

    top = 10   # forwarded to total_words, which does not use it
    words, _, total = total_words(tokens, top, name)

    ran = 10   # window radius, in tokens, around each disease word
    d_exposure_total, d_exposure, d_index = get_d_exposure (tokens, d_list, total)
    d_risk_total, d_risk = get_d_risk (d_index, tokens, d_list, u_list, ran, total)
    p_sentiment_total, p_sentiment = get_p_sentiment (d_index, tokens, d_list, p_list, ran, total)
    n_sentiment_total, n_sentiment = get_n_sentiment (d_index, tokens, d_list, n_list, ran, total)
    d_sentiment_total, d_sentiment = get_d_sentiment (p_sentiment_total, n_sentiment_total, total)

    print ("\n")
    print (name, " ", words, " ", total, " ", d_exposure_total, " ",
           d_exposure, " ", d_risk_total, " ", d_risk, " ", p_sentiment_total, " ", p_sentiment, " ",
           n_sentiment_total, " ", n_sentiment, " ", d_sentiment_total, " ", d_sentiment)

    write_to_excel (excel_path, start_row, name, words, total, d_exposure_total,
                    d_exposure, d_risk_total, d_risk, p_sentiment_total, p_sentiment, n_sentiment_total, n_sentiment, d_sentiment_total,
                    d_sentiment)

In [20]:
def main(file_name_path, file_name_list, excel_path):
    """Run the analysis for every file and show a progress bar.

    The word lists are loaded once with ``load_lists``. Each entry of
    ``file_name_list`` is joined to ``file_name_path`` and handed to ``run``;
    the first file is written to workbook row 2, the next to row 3, and so
    on.

    A file whose counted word total is 0 makes ``run`` raise
    ``ZeroDivisionError``; such a file is reported and skipped so the batch
    continues. Its workbook row is left untouched.

    Args:
        file_name_path: directory containing the input files.
        file_name_list: file names inside ``file_name_path``.
        excel_path: workbook that receives the results.

    Returns:
        None.
    """
    d_list, u_list, p_list, n_list, s_list = load_lists()
    # The with-block closes the bar on normal exit and on any exception
    # (including KeyboardInterrupt), so no explicit close() calls are needed.
    with tqdm (file_name_list, ncols = 80, desc = "分析进度") as progress:
        for row, file_name in enumerate(progress, start=2):
            try:
                # os.path.join instead of a hard-coded backslash separator.
                run(os.path.join(file_name_path, file_name), row, excel_path,
                    d_list, u_list, p_list, n_list, s_list)
            except ZeroDivisionError:
                # tqdm.write prints without breaking the progress bar.
                tqdm.write("Skipped %s: no countable words (row %d left empty)"
                           % (file_name, row))

In [None]:
if __name__ == '__main__':
    # Workbook that receives the results; it is read and overwritten in place
    # once per analysed file.
    excel_path = "E:\\NLP_Dataset\\2019年报\\轻工制造\\轻工制造_Results - 副本.xls"
    # Directory holding the input text files.
    file_name_path = "E:\\NLP_Dataset\\2019年报\\轻工制造\\轻工制造txt"
    # os.listdir returns every entry of the directory, in arbitrary order; the
    # position in this list decides which workbook row a file is written to.
    # NOTE(review): entries that are not "<prefix>_<name>..." text files are
    # not filtered out here - confirm the directory only contains such files.
    file_name_list = os.listdir(file_name_path)
    main (file_name_path, file_name_list, excel_path)