In [4]:
import pandas as pd
import numpy as np
import jieba
import torch
from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings, Sentence, BertEmbeddings,WordEmbeddings
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import euclidean
from scipy.spatial import distance_matrix
from collections import Counter
from collections import defaultdict
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager
import random
import seaborn as sns
import chartify
import os
matplotlib.use('AGG')

In [5]:
class doument_analysis:
    def __init__(self,path_csv):
        self.path = path_csv
        self.df = pd.read_csv(path_csv)
        self.filepath,self.fullname=os.path.split(path_csv)
        self.fname,self.ext=os.path.splitext(self.fullname)
        self.N_CLUSTERS = 10
    def csv_token(self):#中文分词
        stopword =[]
        with open('./stopwords.txt','r') as st:
            for line in st:
                stopword.append(line.strip())
        def cut(x):
            segs = []
            for word in jieba.cut(x):
                if word not in stopword:
                    segs.append(word)
            return ' '.join(segs)
        self.df['正文']= self.df['正文'].apply(cut)
        self.df.to_csv(self.filepath+'/'+self.fname+"_token"+'.csv')
    
    def document_embedding(self):
        #self.df=pd.read_csv(self.filepath+'/'+self.fname+"_token"+".csv") #有分好词的csv的时候直接取消注释
        flair_embedding_forward = FlairEmbeddings('./news-forward-0.4.1.pt')
        flair_embedding_backward = FlairEmbeddings('./news-backward-0.4.1.pt')
        #bert_embedding = BertEmbeddings('bert-base-chinese')
        glove_embedding = WordEmbeddings('./glove.gensim')
        # combine word embedding models
        document_embeddings = DocumentPoolEmbeddings([glove_embedding, flair_embedding_backward, flair_embedding_forward])
        CUDA_LAUNCH_BLOCKING=1 
        # set up empty tensor
        X = torch.empty(size=(len(self.df.index),4196)).cuda()
        #X = torch.empty(size=(len(self.df.index)//3,4196)).cuda()
        #X = torch.empty(size=(len(self.df.index)-len(self.df.index)//3,4196)).cuda()
        #X = torch.empty(size=(len(self.df.index)-len(self.df.index)//3*2,4196)).cuda()
        #torch.backends.cudnn.enabled = False
        # fill tensor with embeddings
        i=0
        #for text in tqdm(self.df['正文'][:len(self.df.index)//3]):
        #for text in tqdm(self.df['正文'][len(self.df.index)//3:len(self.df.index)//3*2]):
        #for text in tqdm(self.df['正文'][len(self.df.index)//3*2:]):
        for text in tqdm(self.df['正文']):
            sentence = Sentence(text)
            document_embeddings.embed(sentence)
            embedding = sentence.get_embedding()
            X[i] = embedding
            i += 1
            
        self.X = X.cpu().detach().numpy()
        np.save(self.filepath+'/'+self.fname+"_doc_emd.npy",self.X)
        del(X)
        
    def PCA_cluster(self):
        self.X=np.load(self.filepath+'/'+self.fname+"_doc_emd.npy")#有词向量打开
        self.df=pd.read_csv(self.filepath+'/'+self.fname+"_token"+".csv")#有分好词的csv打开
        pca =PCA(n_components=768)
        self.X_red=pca.fit_transform(self.X)
        #self.N_CLUSTERS = 10
        ward = AgglomerativeClustering(n_clusters=self.N_CLUSTERS,affinity='euclidean',linkage='ward')
        self.pred_ward = ward.fit_predict(self.X_red)
        self.df['topic'] = self.pred_ward
    def ploting_higram(self):
        matplotlib.rcParams['axes.unicode_minus']=False     
        plt.hist(self.df['topic'], bins=10, facecolor="blue", edgecolor="black", alpha=0.7)
        # 显示横轴标签
        plt.xlabel("Interval")
        # 显示纵轴标签
        plt.ylabel("Frequency/frequency")
        # 显示图标题
        plt.title("Frequency/frequency distribution histogram")
        plt.savefig(self.filepath+'/'+self.fname+'_freout.png',dpi=300)
        plt.tight_layout()
        plt.cla()
    
    def ploting_scatter(self):
        pca = PCA(n_components=2)
        X_two = pca.fit_transform(self.X_red)
        centers = defaultdict(list)
        for label, location in zip(self.pred_ward,X_two):
            centers[label].append(location)
        color = ['red','green','grey','black','yellow','orange','pink','blue','brown','violet']
        for i,c in enumerate(centers):
            for location in centers[c]:
                plt.scatter(*location,c=color[i])
        plt.savefig(self.filepath+'/'+self.fname+'_scatter.png',dpi=300)
        plt.tight_layout()
        plt.cla()
    def get_top_ten(self):

        def get_top_words(documents, top_n):
            '''
            function to get top tf-idf words and phrases
            '''
            vectoriser = TfidfVectorizer(ngram_range=(1, 2),max_df=0.5)
            tfidf_matrix = vectoriser.fit_transform(documents)
            feature_names = vectoriser.get_feature_names()
            df_tfidf = pd.DataFrame()
            for doc in range(len(documents)):
                words = []
                scores = []
                feature_index = tfidf_matrix[doc,:].nonzero()[1]
                tfidf_scores = zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index])
                for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
                    words.append(w)
                    scores.append(s)
                df_temp = pd.DataFrame(data={'word':words, 'score':scores})
                df_temp = df_temp.sort_values('score',ascending=False).head(top_n)
                df_temp['topic'] = doc
                df_tfidf = df_tfidf.append(df_temp)
            return df_tfidf

        topic_docs = []
        # group text into topic-documents
        for topic in range(self.N_CLUSTERS):
            topic_docs.append(' '.join(self.df[self.df['topic']==topic]['正文'].values))
        # apply function
        self.df_tfidf = get_top_words(topic_docs, 10)
        self.df_tfidf.to_csv(self.filepath+'/'+self.fname+"_top_ten"+'.csv')
    def ploting_ten(self):       
        for i in range(10):
            # 中文乱码和坐标轴负号处理。
            #matplotlib.rc('font', family='SimHei', weight='bold')
            """ matplotlib.rcdefaults()
            plt.rcParams['axes.unicode_minus'] = False
            plt.rcParams.update({'figure.autolayout': True})
            # 实例字体对象
            font = font_manager.FontProperties(fname=r'./simhei.ttf')
            #城市数据。 """
            word = self.df_tfidf['word'].tolist()[i*10:(i+1)*10]
            data = self.df_tfidf['score'].tolist()[i*10:(i+1)*10]
            dic = {
            'keywords':word,
            'rate':data
            }
            topic=pd.DataFrame(dic)
            ch = chartify.Chart(blank_labels=True, y_axis_type='categorical')
            ch.set_title("Theme Horizontal bar plot")
            ch.plot.bar(
                data_frame=topic,
                categorical_columns='keywords',
                numeric_column='rate',
                categorical_order_by='values',
                # color_column='rate'
            )
            #ch.show(self.filepath+'/'+self.fname+"_top_ten"+str(i)+".png")
            ch.save(self.filepath+'/'+self.fname+"_top_ten"+str(i)+"."+"html")
            
    def ploting_gauss(self):
        self.topic_centroids=[]
        dic ={0:'first',1:'second',2:'third',3:'forth',4:'fifth',5:'sixth',6:'seventh',7:'eighth',8:'ninth',9:'tenth'}
        for topic in tqdm(range(self.N_CLUSTERS)):
            X_topic = self.X_red[self.df.index[self.df['topic']==topic]]
            X_mean = np.mean(X_topic, axis=0)
            self.topic_centroids.append(X_mean)
        topic_distances = []
        for row in tqdm(self.df.index):
            topic_centroid = self.topic_centroids[self.df.iloc[row]['topic']]
            X_row = self.X_red[row]
            topic_distance = euclidean(topic_centroid, X_row)
            topic_distances.append(topic_distance)
            
        self.df['topic_distance'] = topic_distances
        fig, ax = plt.subplots(figsize = (10,10))
        sns.kdeplot(self.df[self.df['topic']==0]['topic_distance'], shade=True,label="the zero theme")
        sns.kdeplot(self.df[self.df['topic']==1]['topic_distance'], shade=True,label="the first theme")
        sns.kdeplot(self.df[self.df['topic']==2]['topic_distance'], shade=True,label="the second theme")
        sns.kdeplot(self.df[self.df['topic']==3]['topic_distance'], shade=True,label="the third theme")
        sns.kdeplot(self.df[self.df['topic']==4]['topic_distance'], shade=True,label="the forth theme")
        sns.kdeplot(self.df[self.df['topic']==5]['topic_distance'], shade=True,label="the fifth theme")
        sns.kdeplot(self.df[self.df['topic']==6]['topic_distance'], shade=True,label="the sixth theme")
        sns.kdeplot(self.df[self.df['topic']==7]['topic_distance'], shade=True,label="the seven theme")
        sns.kdeplot(self.df[self.df['topic']==8]['topic_distance'], shade=True,label="the eight theme")
        kd=sns.kdeplot(self.df[self.df['topic']==9]['topic_distance'], shade=True,label="the ninth theme")
        fig =kd.get_figure()
        plt.tight_layout()
        fig.savefig(self.filepath+'/'+self.fname+'_gauss.png',dpi=300)
        plt.cla()
        for i in range(10):
            fig,ax=plt.subplots(figsize=(10,10))
            kd=sns.kdeplot(self.df[self.df['topic']==i]['topic_distance'],shade=True,label="the "+dic[i]+" theme")
            fig=kd.get_figure()
            plt.tight_layout()
            fig.savefig(self.filepath+'/'+self.fname+'_gauss'+str(i)+'.png',dpi=300)
            plt.cla()
    def ploting_matrix(self):

        df_dist_matrix = pd.DataFrame(distance_matrix(self.topic_centroids,self.topic_centroids),index=range(self.N_CLUSTERS),columns=range(self.N_CLUSTERS))

        a = np.array(df_dist_matrix)

        fig, ax = plt.subplots(figsize = (20,20))
        #二维的数组的热力图，横轴和数轴的ticklabels要加上去的话，既可以通过将array转换成有column
        #和index的DataFrame直接绘图生成，也可以后续再加上去。后面加上去的话，更灵活，包括可设置labels大小方向等。
        sns.heatmap(pd.DataFrame(np.round(a,2), columns = ['theme0', 'theme1', 'theme2','theme3','theme4','theme5','theme6','theme7','theme8','theme9'], index = ['theme0', 'theme1', 'theme2','theme3','theme4','theme5','theme6','theme7','theme8','theme9']),
                        annot=True, vmax=1,vmin = 0, xticklabels= True, yticklabels= True, square=True, cmap="Blues")
        #sns.heatmap(np.round(a,2), annot=True, vmax=1,vmin = 0, xticklabels= True, yticklabels= True,
        #            square=True, cmap="YlGnBu")
        # ax.set_title('二维数组热力图', fontsize = 18)
        ax.set_ylabel('image', fontsize = 18)
        ax.set_xlabel('iamge', fontsize = 18) #横变成y轴，跟矩阵原始的布局情况是一样的
        plt.savefig(self.filepath+'/'+self.fname+'_matrix.png',dpi=300)
        print(self.filepath+'/'+self.fname+'_matrix.png')
        plt.tight_layout()
        plt.cla()

In [10]:
da=doument_analysis('./sample/news_202002.csv')
da.csv_token()#分词
# da.document_embedding()#embedding
da.PCA_cluster()#PCA聚类
da.ploting_higram()
da.ploting_scatter()
da.get_top_ten()
da.ploting_ten()
da.ploting_gauss()
da.ploting_matrix()

Saved to ./sample/news_202002_top_ten0.html
Saved to ./sample/news_202002_top_ten1.html
Saved to ./sample/news_202002_top_ten2.html
Saved to ./sample/news_202002_top_ten3.html
Saved to ./sample/news_202002_top_ten4.html
Saved to ./sample/news_202002_top_ten5.html
Saved to ./sample/news_202002_top_ten6.html
Saved to ./sample/news_202002_top_ten7.html


100%|██████████| 10/10 [00:00<00:00, 421.23it/s]
  0%|          | 0/10291 [00:00<?, ?it/s]

Saved to ./sample/news_202002_top_ten8.html
Saved to ./sample/news_202002_top_ten9.html


100%|██████████| 10291/10291 [00:04<00:00, 2223.61it/s]


./sample/news_202002_matrix.png
