In [1]:
import sqlite3
import pandas as pd

import re
import jieba
import jieba.analyse as anls

# Plotting setup: matplotlib is configured first, then seaborn.
import matplotlib .pyplot as plt
# Apply matplotlib's 'ggplot' style sheet.
plt.style.use('ggplot')
# Make SimHei the sans-serif font — presumably so Chinese labels render;
# the font has to be installed on the machine running the notebook.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Draw minus signs with an ASCII hyphen instead of the Unicode minus,
# presumably because SimHei has no glyph for the latter — TODO confirm.
plt.rcParams['axes.unicode_minus'] = False
import seaborn as sns
# NOTE(review): sns.set applies seaborn's own theme after the ggplot style
# above and may override parts of it — confirm which look is intended.
sns.set(font='SimHei')
from wordcloud import WordCloud

In [2]:
def contentToStr(cond, col='period', data=None):
    '''
        Join the ``content`` values of all rows matching a filter into one string.

        @param cond: value that column ``col`` must equal for a row to be selected
        @param col: column name to filter on, str, default 'period'
        @param data: DataFrame to read from; when None the notebook-level ``df``
                     is used, which keeps existing two-argument calls working
        @return text: selected ``content`` values joined by single spaces, str
                      (empty string when no row matches)
    '''
    if data is None:
        # Fall back to the global frame; raises NameError if it was never loaded.
        data = df

    selected = data.loc[data[col] == cond, 'content']
    # str.join raises TypeError on NaN/None or other non-string values, so
    # missing entries are skipped and the remainder coerced to str.
    text = ' '.join(selected.dropna().astype(str))
    return text

In [3]:
# Stop-word list: one entry per line of stopwords.txt, surrounding whitespace stripped.
# The context manager closes the file handle as soon as the list is built.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches how stopwords.txt was saved.
with open('stopwords.txt', 'r') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

def cutText(raw_text):
    '''
        Segment Chinese text with jieba, drop stop words, and strip numbers.

        Tokens found in the module-level ``stopwords`` and tab tokens are
        discarded. Each remaining token is followed by one space. Afterwards
        numbers in ASCII or full-width parentheses (including empty pairs)
        and any remaining digit runs are deleted; the spaces around a deleted
        number are left in place, so the result can contain consecutive spaces.

        @param raw_text: raw text, str
        @return output: space-separated tokens, str
    '''
    # Build the set once per call so each token costs a hash lookup rather
    # than a scan over the whole stop-word list.
    stop_set = set(stopwords)

    kept = [
        word for word in jieba.cut(raw_text)
        if word != '\t' and word not in stop_set
    ]

    # Single join instead of repeated string concatenation; every token,
    # including the last one, keeps its trailing space.
    output = ''.join(word + ' ' for word in kept)

    # Remove numbers and numbered markers such as "(1)" or "（2）".
    output = re.sub(r'\(\d*\)|\（\d*\）|\d*', '', output)

    return output

In [4]:
def wordCount(text, top_k=20):
    '''
        Extract the top keywords of a text and draw them as a horizontal bar chart.

        The bar length is the weight returned by jieba's ``extract_tags``
        (a TF-IDF score according to jieba's documentation), not a raw
        occurrence count; the column is still called 'frequency'.

        @param text: split text, str
        @param top_k: number of keywords to show, int, default 20
        @return: None; the chart is drawn on a new matplotlib figure. Nothing
                 is drawn when no keyword can be extracted.
    '''
    word_freq = anls.extract_tags(text, topK=top_k, withWeight=True)
    # Naming the columns at construction also works for an empty result,
    # where assigning ``data.columns`` afterwards raises ValueError.
    data = pd.DataFrame(word_freq, columns=['word', 'frequency'])
    if data.empty:
        return

    _, ax = plt.subplots(figsize=(6, 4), dpi=120)

    # Draw on the axes created above rather than relying on pyplot's current axes.
    sns.barplot(x="frequency", y="word", data=data, label="word", color="#1890ff", ax=ax)
    sns.despine(ax=ax, bottom=True)

In [5]:
def wordCloud(text, font_path=None):
    '''
        Render a word cloud for a text on a new 12x12 matplotlib figure.

        @param text: split text, str
        @param font_path: path to the TrueType font used for the words; when
                          None, FZBYSK.ttf inside the relative ``fonts``
                          directory is used
        @return: None; the image is drawn with the axes hidden
    '''
    import os

    if font_path is None:
        # os.path.join avoids the invalid '\F' escape of a hand-written
        # backslash path and picks the right separator on every platform.
        font_path = os.path.join('fonts', 'FZBYSK.ttf')

    cloud = WordCloud(
        font_path=font_path,
        background_color='white',
        max_words=1024,
        max_font_size=100
    )

    word_cloud = cloud.generate(text)

    plt.figure(figsize=(12, 12))
    plt.imshow(word_cloud)
    plt.axis('off')