In [None]:
import pandas as pd
import clear
import time
import numpy as np
import jieba
import re
import multiprocessing
import collections
from gensim.models import Word2Vec
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
# Matplotlib defaults for this notebook:
#   - 'font.sans-serif': SimHei, so that Chinese labels are displayed properly
#   - 'axes.unicode_minus': False, so that the minus sign is displayed properly
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

In [None]:
# Load the raw posts (timestamp + text only) and drop exact duplicates.
weibo_text = pd.read_csv('微博大V数据.csv', usecols=['created_at', 'text_gsub'])
weibo_text.drop_duplicates(['created_at', 'text_gsub'], inplace=True)
# Missing values become a single '.' placeholder so every row is a string.
weibo_text.fillna('.', inplace=True)

# Clean the text column in one chain and assign the result back.
# The previous `weibo_text['text_gsub'].replace('', '.', inplace=True)` was a
# chained in-place call on a column selection; with pandas Copy-on-Write it
# does not modify `weibo_text`, so empty strings would survive.
weibo_text['text_gsub'] = (
    weibo_text['text_gsub']
    .apply(lambda x: clear.filter_tags(str(x)))                  # project helper `clear.filter_tags`
    .apply(lambda x: x.replace(' ', '.').replace('\xa0', '.'))  # spaces / non-breaking spaces -> '.'
    .replace('', '.')                                            # texts that became empty -> '.'
)

# Cleaning can make previously distinct rows identical, so deduplicate again.
weibo_text.drop_duplicates(['created_at', 'text_gsub'], inplace=True)
# Persist the surviving index labels alongside the text; `drop_duplicates`
# does not renumber, so these are the labels assigned by `read_csv`.
weibo_text['index'] = weibo_text.index
weibo_text.to_csv('weibo_text.csv', index=False)

In [None]:
# Tokenize every cleaned text with jieba and write one space-separated line
# per text to 'weibo.tok.txt'. Progress is reported every 10000 texts.
btime = time.time()
with open('weibo.tok.txt', 'w', encoding='utf-8') as output_file:
    count = 0
    for line in weibo_text['text_gsub']:
        # Only the part before the first newline is kept, with spaces removed,
        # so each text occupies exactly one output line.
        first_line = line.split('\n')[0]
        tokens = jieba.cut(first_line.replace(' ', ''))
        output_file.write(' '.join(tokens) + '\n')
        count = count + 1
        if count % 10000 != 0:
            continue
        print(f"#{count} of texts have been tokenized.", time.time()-btime)
print('Tokenization finished.')

In [None]:
# Filter the tokenized corpus down to tokens made only of characters in the
# range \u4e00-\u9fa5 and write the result, one text per line, to
# 'weibo.data.txt'. Line N of the output corresponds to line N of the input
# (a text with no matching token yields an empty line).

# Start a fresh timer for this step. Previously the progress output reused
# `btime` from the tokenization cell, which reported elapsed time since that
# cell started and raised NameError if that cell had not been run.
btime = time.time()
with open('weibo.tok.txt', 'r', encoding='utf-8') as input_file:
    print('data reading...')
    lines = input_file.readlines()
    print('data reading finishes.')
print('Remove Non-zh begins...')
with open('weibo.data.txt', 'w', encoding='utf-8') as output_file:
    count = 0
    # Despite the name, this pattern selects the tokens to KEEP.
    remove = r'^[\u4e00-\u9fa5]+$'
    # Compile once instead of re-resolving the pattern for every token.
    keep_pattern = re.compile(remove)
    for count, line in enumerate(lines, start=1):
        line_list = line.split('\n')[0].split(' ')
        new_line = [word for word in line_list if keep_pattern.search(word)]
        output_file.write(' '.join(new_line) + '\n')
        if count % 10000 == 0:
            print(f"#{count} of texts have been processed.", time.time()-btime)
print('Remove Non-zh finishes.')

In [None]:
# Train a Word2Vec model on the filtered corpus, then persist both the full
# model and its word vectors in text format.
print('Word2Vec Generation begin...')
corpus = LineSentence('weibo.data.txt')
# size=100 is the value avg_feature_vector uses as its num_features default.
w2v_settings = {'size': 100, 'window': 5, 'min_count': 5, 'workers': 2}
model = Word2Vec(corpus, **w2v_settings)
print('Word2Vec Generation finishes.')

print('Model Saving...')
model.save('weibo.model')
print('Model Saved.')

keyed_vectors = model.wv
keyed_vectors.save_word2vec_format('weibo.model.vector', binary=False)
# Vocabulary as a set, used for membership tests when averaging vectors.
index2word_set = set(keyed_vectors.index2word)

In [None]:
def avg_feature_vector(line, model=model, num_features=100, index2word_set=index2word_set):
    """Average the word vectors of the in-vocabulary tokens of one text.

    Parameters
    ----------
    line : iterable of str
        Tokens of a single text.
    model : object
        Either a trained model exposing its vectors as ``model.wv`` or an
        object that itself supports ``obj[word]`` lookup. The default is the
        notebook's ``model`` as it exists when this cell is executed.
    num_features : int
        Length of the accumulator vector; it must equal the dimensionality
        of the word vectors or the addition will fail.
    index2word_set : set of str
        Vocabulary; tokens that are not members are ignored.

    Returns
    -------
    list of str
        The mean vector with every component formatted as ``'.4f'``.
        If no token is in the vocabulary the result is all ``'0.0000'``.
    """
    # Look vectors up through `.wv` when available: indexing the Word2Vec
    # model directly (`model[word]`) is deprecated in gensim 3.x and removed
    # in gensim 4. Falling back to `model` keeps plain vector stores working.
    vectors = getattr(model, 'wv', model)

    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in line:
        if word in index2word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, vectors[word])
    # Guard the division so texts without known tokens stay a zero vector.
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return [format(x, '.4f') for x in feature_vec]

In [None]:
# Load the tokenized corpus (one text per line, trailing newlines kept) into
# `lines` for the text-vector computation.
with open('weibo.tok.txt', mode='r', encoding='utf-8') as input_file:
    print('data reading...')
    lines = list(input_file)
    print('data reading finishes.')



In [None]:
# Write one averaged word vector per text to 'weibo.text.vector' as a
# comma-separated row; row order follows `lines`.
btime = time.time()
with open('weibo.text.vector', 'w', encoding='utf-8') as output_file:
    for i, line in enumerate(lines):
        tokens = line.split('\n')[0].split(' ')
        components = avg_feature_vector(tokens)
        row = ','.join(str(x) for x in components)
        output_file.write(row + '\n')
        # Progress report every 10000 texts.
        if (i+1) % 10000 != 0:
            continue
        print(f"#{(i+1)} of texts have been processed.", time.time()-btime)

In [None]:
# Cluster the text vectors with KMeans and save one cluster label per text.
weibo_vec = pd.read_csv('weibo.text.vector', header=None, dtype=np.float32)

# Drop rows whose vector is entirely zero (e.g. texts without any
# in-vocabulary token). The mask is computed with vectorized column
# operations instead of a row-wise Python `apply` and a temporary column.
nonzero_rows = weibo_vec.abs().sum(axis=1) != 0
weibo_vec = weibo_vec[nonzero_rows].copy()
weibo_vec.to_csv('weibo_nonzero.vec.csv')

# L2-normalize each row before clustering.
X = np.array(weibo_vec)
X_normalized = normalize(X, norm='l2')
kmeans = KMeans(n_clusters=10, random_state=43).fit(X_normalized)

# `weibo_vec.index` is the row position in 'weibo.text.vector' (read_csv
# assigns a default integer index), which is the same position as the
# corresponding line in the tokenized files.
weibo_vec['index'] = weibo_vec.index
weibo_vec['label'] = kmeans.labels_
weibo_vec[['index', 'label']].to_csv('weibo_cluster_label.csv', index=False)


In [None]:
# Load the filtered corpus, the stop-word list and the cluster labels.
with open('weibo.data.txt', 'r', encoding='utf-8') as input_file:
    print('data reading...')
    lines = input_file.readlines()
    print('data reading finishes.')

# Read the stop words inside a context manager so the file handle is closed
# (it was previously opened without ever being closed).
with open('stop_words.txt', encoding='utf-8') as stop_words_file:
    stop_words = [i.strip() for i in stop_words_file]

weibo_label = pd.read_csv('weibo_cluster_label.csv')

# Deduplicate: keep only the labels whose 'index' also appears in
# 'weibo_text.csv'.
# NOTE(review): 'index' in weibo_cluster_label.csv is a row position in
# 'weibo.text.vector', whereas 'index' in weibo_text.csv is the DataFrame
# index kept after drop_duplicates. These presumably coincide only when no
# duplicate rows were dropped -- confirm before relying on this join.
weibo_text = pd.read_csv('weibo_text.csv', usecols=['index'])
weibo_label = pd.merge(weibo_label, weibo_text[['index']], how='inner', left_on='index', right_on='index')


In [None]:
# Count token frequencies over the texts assigned to cluster label 6.
# The 'index' values are used as positions into `lines`.
cluster_rows = weibo_label.loc[weibo_label['label'] == 6, 'index']
tok_label = [lines[row] for row in list(cluster_rows)]

# Flatten every selected line into its space-separated tokens.
words = [
    token
    for text in tok_label
    for token in text.split('\n')[0].split(' ')
]

word_count = collections.Counter(words)


In [None]:
# Display the 500 most frequent tokens of `word_count` with their counts.
word_count.most_common(500)

In [None]:
# Sentiment scores are stored in two files, one for posts matching the COVID
# keywords and one for the rest. Neither file has a row identifier, so the
# raw data is re-split with the same keyword pattern and the resulting row
# labels are attached positionally.
weibo_covid_senti = pd.read_csv('weibo_covid_senti.csv', usecols=['created_at', 'senti'])
weibo_non_covid_senti = pd.read_csv('weibo_non_covid_senti.csv', usecols=['created_at', 'senti'])

# Note: this rebinds `weibo_text` to a frame holding only 'text_gsub'.
weibo_text = pd.read_csv('微博大V数据.csv', usecols=['text_gsub']).fillna('None')
covid_str = '口罩|肺炎|疫情|传染|病毒|冠状|隔离|防控'
is_covid = weibo_text['text_gsub'].str.contains(covid_str)

weibo_covid = weibo_text[is_covid]
weibo_non_covid = weibo_text[~is_covid]
# Each assignment requires the sentiment file to have exactly as many rows
# as the matching subset of the raw data.
weibo_covid_senti['index'] = weibo_covid.index
weibo_non_covid_senti['index'] = weibo_non_covid.index

# Recombine both halves in raw-data row order and persist them.
weibo_senti = pd.concat([weibo_covid_senti, weibo_non_covid_senti]).sort_values('index')
weibo_senti.to_csv('weibo_senti.csv', index=False)

In [None]:
# Attach each post's cluster label to its sentiment score via the shared
# 'index' column; posts without a label are dropped by the inner join.
weibo_senti_label = pd.merge(weibo_senti, weibo_label, left_on='index', right_on='index', how='inner')

# Reduce timestamps to 'YYYY-MM-DD' strings for per-day aggregation.
# The vectorized `.dt.strftime` replaces a per-row Python lambda and yields a
# missing value for a missing timestamp instead of raising on NaT.
weibo_senti_label['created_at_string'] = (
    pd.to_datetime(weibo_senti_label['created_at']).dt.strftime('%Y-%m-%d')
)
# Only the day string is kept; the original timestamp column is removed.
weibo_senti_label = weibo_senti_label.drop(columns='created_at')
weibo_senti_label.to_csv('weibo_senti_label.csv', index=False)

In [None]:
weibo_senti_label = pd.read_csv('weibo_senti_label.csv')
# Deduplicate: keep only rows whose 'index' is present in 'weibo_text.csv'.
# The index column is read from the file rather than taken from the
# `weibo_text` variable, because an earlier cell rebinds `weibo_text` to a
# frame that contains only 'text_gsub', which made `weibo_text[['index']]`
# raise KeyError when the notebook is run top to bottom.
deduplicated_index = pd.read_csv('weibo_text.csv', usecols=['index'])
weibo_senti_label = pd.merge(weibo_senti_label, deduplicated_index, how='inner', left_on='index', right_on='index')

In [None]:
# Mean sentiment per (cluster label, day), pivoted so that each cluster label
# becomes one column and each day one row.
daily_senti = (
    weibo_senti_label
    .groupby(['label', 'created_at_string'])['senti']
    .mean()
    .sort_index()
    .unstack('label')
)
# One line per cluster label over time.
daily_senti.plot(figsize=(15,9), style='.-', grid=True, title='微博情绪指数')