In [1]:
import jieba
import re
import csv
import wordcloud
import matplotlib.pyplot as plt

In [2]:
csv_name = 'yyqx_csv.csv'  # 微博信息文件

In [3]:
with open(csv_name, 'r', encoding='utf-8') as f:
    """
        读取微博信息到info列表
        info: type: list
    """
    reader = csv.reader(f)
    info = [row[1:]for row in reader][1:]

In [4]:
info[:20]

[['[置顶]补作业 7000W请查收 TFBOYS-易烊千玺的微博视频 \xa0', 'True'],
 ['好久不见，哥@胡歌 ，一起去看场电影？ #LongTimeNoSee# FIRST青年电影展的秒拍视频 \xa0', 'True'],
 ['今晚八点，@阿玛尼 星品线上发布会，不见不散。 \xa0原图\xa0', 'True'],
 ['转发理由:坚持自我主张，展现自信态度，和我一起成为#无可T代#的自己。@TiffanyAndCo蒂芙尼\xa0\xa0', 'False'],
 ['谢谢倾听和陪伴，今日收官，我们不说告别。#朋友请听好# \xa0[组图共4张]\xa0原图\xa0', 'True'],
 ['转发理由:今晚七点，@可爱多 线上发布会见。#吃在一起 可爱多了#\xa0\xa0', 'False'],
 ['转发理由:小彤哥新剧明天开播，看起来\xa0\xa0', 'False'],
 ['#朋友请听好#“湖南地方代表队”上线🤗朋友请听好第10期：千玺李俊濠变湖南代表队！ 谢娜被张颜齐猜崩溃？ \xa0[组图共3张]\xa0原图\xa0',
  'True'],
 ['#相信未来义演# “乐”抚人心，希望这首《南屏晚钟》能给大家带来温暖。 TFBOYS-易烊千玺的微博视频 \xa0', 'True'],
 ['#朋友请听好# “在水一方器乐团”直播首秀朋友请听好第9期：“在水一方乐器团”限定营业 何炅假期过后考虑退休？ \xa0[组图共4张]\xa0原图\xa0',
  'True'],
 ['转发理由:#因为热爱，尽善而行# 和麦当劳一起，遇见更多美好。\xa0\xa0', 'False'],
 ['#奋斗吧青春# 《奋斗的青春最美丽——2020年五·四青年节特别节目》，今晚在CCTV1、CCTV3播出，一起奋斗一起追梦！ \xa0[组图共2张]\xa0原图\xa0',
  'True'],
 ['转发理由:#五四致敬战疫青年# 致敬战疫青年，致敬最可爱的人！#谢谢你保护了我们#//@TFBOYS组合:#五四致敬战疫青年#五四青年节到来，让我们说一声，#谢谢你保护了我们#\xa0\xa0',
  'False'],
 ['#朋友请听好# 一起来听小站音乐会 朋友请听好第8期：千玺又被谢娜套路要跳拉丁？ 杨迪沉浸式读信扮演蟑螂笑Skr人 \

In [5]:
with open(csv_name, 'r', encoding='utf-8') as f:
    """
        读取微博内容到content列表
        content: type: list
    """
    reader = csv.reader(f)
    content = [row[1]for row in reader][1:]

In [6]:
with open(csv_name, 'r', encoding='utf-8') as f:
    """
        读取微博内容到content列表
        is_original: type: list
    """
    reader = csv.reader(f)
    is_original = [row[2]for row in reader][1:]

In [7]:
weibo_num = len(info)  # 数据集中微博总数
original_num = is_original.count('True')  # 原创微博数量
repost_num = is_original.count('False')  # 转发微博数量

In [8]:
print(weibo_num,original_num,repost_num)

828 516 312


In [9]:
original_list = [i[0] for i in info if i[1] == 'True']  # 原创微博列表
repost_list = [i[0] for i in info if i[1] == 'False']  # 转发微博列表

In [10]:
def read_list_file(filename):
    """
        读取文件中内容为列表的txt文件(每行以'\n'分隔)
        如: 停用词，否定词，程度副词
        filename: 文件路径
        return:file_list type: list
    """
    file_list = [k.strip() for k in open(
        filename, encoding='utf-8').readlines() if k.strip() != '\n']
    return file_list


In [11]:
def read_dict_file(filename):
    """
        读取文件中内容为键值对的txt文件
        如: BosonNLP情感词典
        filename: 文件路径
        return: classify_dict tyoe: dict
    """
    classify_list = [k.strip() for k in open(
        filename, encoding='utf-8').readlines() if k.strip() != '\n']
    classify_dict = dict()
    for i in classify_list:
        if len(i.split(' ')) == 2:
            classify_dict[i.split(' ')[0]] = i.split(' ')[1]
    return classify_dict

In [12]:
def jieba_cut(string, stopwords, emoji_dict):
    """
        对每一条微博进行jieba分词并去除停用词
        string: 微博 type: str
        stopwords: 停用词列表 type: list
        return: 去除停用词后的分词列表
    """
    string = string.replace(u'\xa0', u' ')  # 将\xa0字符替换为' '
    word_list = jieba.lcut(string)
    word_list = [i for i in word_list if i not in stopwords and i != ' ']
    for i in range(0, len(word_list)):
        if word_list[i] in emoji_dict.keys():
            word_list[i] = str(emoji_dict[word_list[i]])
    return word_list

In [13]:
# BosonNLP情感词典
BosonNLP = 'senti_dict/BosonNLP_sentiment_score/BosonNLP_sentiment_score/BosonNLP_sentiment_score.txt'
BosonNLP_dict = read_dict_file(BosonNLP)  # 读取情感词典内容到dcit中

# 否定词列表
Nega = 'senti_dict/polar_dict/nega_dict.txt'
Nega_list = read_list_file(Nega)  # 读取否定词内容到list中

# 程度副词
degree = 'senti_dict/polar_dict/degree_dict.txt'
degree_dict = read_dict_file(degree)  # 读取程度副词词典内容到dict中

# 停用词
stop_file = 'senti_dict/stopwords/stopwords.txt'
stopwords = read_list_file(stop_file)  # 读取程度副词词典内容到list中

# emoji词典
emoji_file = 'senti_dict/emoji_dict/emoji.txt'
emoji_dict = read_dict_file(emoji_file)   # 读取emoji词典内容到dict中

In [14]:
emoji_dict

{'⚽': "{'足球'}",
 '🎆': "{'焰火'}",
 '🎇': "{'烟花'}",
 '🎁': "{'礼物'}",
 '🎖': "{'勋章'}",
 '🇨🇳': "{'中国'}",
 '🏁': "{'终点'}",
 '🇦🇩': "{'安道尔'}",
 '🇦🇪': "{'阿拉伯联合大公国'}",
 '🇦🇫': "{'阿富汗'}",
 '🇦🇱': "{'阿尔巴尼亚'}",
 '🇦🇲': "{'亚美尼亚'}",
 '🇦🇷': "{'阿根廷共和国'}",
 '🇦🇶': "{'南极洲'}",
 '🇦🇴': "{'安哥拉'}",
 '🇦🇹': "{'奥地利'}",
 '🇦🇺': "{'澳大利亚'}",
 '🇧🇪': "{'比利时'}",
 '🇧🇩': "{'孟加拉国'}",
 '🇧🇭': "{'巴林'}",
 '🇧🇬': "{'保加利亚'}",
 '🇧🇳': "{'文莱'}",
 '🇧🇷': "{'巴西'}",
 '🇧🇴': "{'玻利维亚'}",
 '🇨🇦': "{'加拿大'}",
 '🥪': "{'三明治'}",
 '☢': "{'辐射'}",
 '♈': "{'白羊座'}",
 '♌': "{'狮子座'}",
 '♉': "{'金牛座'}",
 '♊': "{'双子座'}",
 '♑': "{'摩羯座'}",
 '♋': "{'巨蟹座'}",
 '♎': "{'天秤座'}",
 '♍': "{'处女座'}",
 '♏': "{'天蝎座'}",
 '♓': "{'双鱼座'}",
 '🚾': "{'厕所'}",
 '♿': "{'轮椅'}",
 '🛂': "{'护照'}",
 '🛃': "{'海关'}",
 '🛅': "{'寄存'}",
 '㊗': "{'祝贺'}",
 '㊙': "{'秘密'}",
 '🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f': "{'英格兰'}",
 '🏴\U000e0067\U000e0062\U000e0073\U000e0063\U000e0074\U000e007f': "{'苏格兰'}",
 '🇿🇲': "{'赞比亚'}",
 '🇿🇼': "{'津巴布韦'}",
 '🇿🇦': "{'南非'}",
 '🇾🇪': "{'也门'}",
 '🇻🇳': "{'越

In [15]:
# 处理emoji_dict词典
pattern = re.compile('{(.*?)}')
for k, v in emoji_dict.items():
    emoji_dict[k] = re.findall(pattern, v)

In [16]:
def senti_ana(string, stopwords, BosonNLP_dict, Nega_list, degree_dict):
    """
        对文本内容进行情感分析, 返回情感得分
        string: 微博文本内容 str
        stopwords: 停用词 list
        BosonNLP_dict: 情感词典 dict
        Nega_list: 否定词词典 list
        degree_dict: 程度副词词典 dict
        return: score 情感得分
    """
    word_list = jieba_cut(string, stopwords, emoji_dict)  # 对文本内容进行分词

    sen_word = dict()  # 情感词词典 索引为分词结果在word_list中的下标
    nega_word = dict()  # 否定词词典 同上
    degree_word = dict()  # 程度副词词典 同上

    # 对分词结果进行分类
    for i in range(len(word_list)):
        word = word_list[i]
        if word in BosonNLP_dict and word not in Nega_list and word not in degree_dict.keys():
            sen_word[i] = BosonNLP_dict[word]
        elif word in Nega_list and word not in degree_dict.keys():
            nega_word[i] = -1
        elif word in degree_dict.keys():
            degree_word[i] = degree_dict[word]

    # 计算得分
    score = 0  # 初始得分为0
    weight = 1  # 初始权重为1
    sen_index = -1  # 初始情感词索引为-1
    sen_index_list = list(sen_word.keys())  # 情感词索引列表

    for i in range(len(word_list)):
        if i in sen_word.keys():
            score += weight*float(sen_word[i])
            sen_index += 1
            if sen_index < len(sen_index_list)-1:
                for j in range(sen_index_list[sen_index], sen_index_list[sen_index+1]):
                    # 如果两相邻情感词之间有否定词
                    if j in nega_word.keys():
                        weight = -weight
                    # 如果两相邻情感词之间有程度副词
                    elif j in degree_word.keys():
                        weight *= float(degree_word[j])
            if sen_index < len(sen_index_list)-1:
                i = sen_index_list[sen_index+1]

    return score  # 返回情感得分

In [18]:
scorelist=[]

In [19]:
for i in content:
    score=senti_ana(i,stopwords,BosonNLP_dict,Nega_list,degree_dict)
    scorelist.append(score)

In [20]:
scorelist

[3.327957989256,
 7.041072768816001,
 1.751244102216,
 7.4055712036948,
 10.257853563626398,
 -1.5835186884039998,
 2.4998980278248,
 10.691034156097299,
 13.214040636815,
 11.666422477182307,
 4.869280964226,
 18.757999070974,
 27.79602031190401,
 13.0113564837478,
 0.8157435622978,
 14.054463902222,
 6.054915806592001,
 20.057459078763994,
 9.460532395609,
 8.818706169143,
 0,
 13.907916380992798,
 -0.774714979278,
 7.900647849628,
 0.936928577561,
 12.6009140727118,
 2.621344163859,
 14.084339295864,
 5.53995301691,
 10.7021951593924,
 3.926346595606,
 2.1652620641599998,
 10.607268389990999,
 14.3550810478461,
 9.2525486982034,
 1.82635453446387,
 12.9795878320004,
 1.208576322805,
 8.5693786927574,
 3.7407899477799997,
 3.3006182172365,
 11.979659612730503,
 8.484153183450399,
 5.4202767833294,
 11.522728448043598,
 3.5335667733174008,
 2.637977876389,
 6.951356877991401,
 15.253599960120601,
 9.715882269659401,
 24.0947757276622,
 -1.8490212101484,
 2.7408664743213005,
 14.307936

In [21]:
len(scorelist)

828

In [22]:
for i in range(0,len(scorelist)):
    if scorelist[i] == 0:
        print(scorelist[i])
        print(content[i])

0
😆  
0
转发理由:好  
0
转发理由:益起来，帮助更多需要帮助的人  
0
成年……你  原图 
0
  [组图共4张] 原图 
0
转发理由:先看起来～   
0
转发理由:我们五岁啦  
0
❤️  原图 
0
🔥🔥🔥🔥🔥🔥🔥🔥🔥🔥  [组图共3张] 原图 
0
  [组图共4张] 原图 
0
喏  [组图共9张] 原图 
0
..............  原图 
0
今天是白色的⬜️⬜️⬜️  [组图共6张] 原图 
0
马上了………  [组图共3张] 原图 
0
微博之夜  
0
转发理由:😊😊😊  
0
就不  [组图共4张] 原图 
0
2，1。  [组图共2张] 原图 
0
  原图 
0
今天🔆🔆🔆  [组图共4张] 原图 
0
👇🏿👇🏿👇🏿  原图 
0
  原图 
0
  [组图共4张] 原图 
0
转发理由:大家觉得怎么样呢  
0
转发理由:#TFBOYS益起来#是  
0
转发理由:#TFBOYS益起来#不会  
0
转发理由:#TFBOYS益起来#一定  
0
转发理由:#TFBOYS益起来#会啊  
0
转发理由:#TFBOYS益起来#能感受到阿  
0
转发理由:#TFBOYS益起来#有  
0
转发理由:#TFBOYS益起来#当然会  
0
转发理由:有我  
0
转发理由:[呵呵]嗯！谢谢你们！  


In [24]:
result_headers = ['微博id', '微博正文/转发理由', '是否原创', '情感得分']
with open('yyqx_csv.csv', 'r', encoding='utf-8') as f:
    reader=csv.reader(f)
    content = [row[:]for row in reader][1:]

In [25]:
content

[['I9Va8w9Ab', '[置顶]补作业 7000W请查收 TFBOYS-易烊千玺的微博视频 \xa0', 'True'],
 ['J5Fx4aSdp',
  '好久不见，哥@胡歌 ，一起去看场电影？ #LongTimeNoSee# FIRST青年电影展的秒拍视频 \xa0',
  'True'],
 ['J5dhzs7rm', '今晚八点，@阿玛尼 星品线上发布会，不见不散。 \xa0原图\xa0', 'True'],
 ['J4TDnCTGL',
  '转发理由:坚持自我主张，展现自信态度，和我一起成为#无可T代#的自己。@TiffanyAndCo蒂芙尼\xa0\xa0',
  'False'],
 ['J3RLPvOlZ', '谢谢倾听和陪伴，今日收官，我们不说告别。#朋友请听好# \xa0[组图共4张]\xa0原图\xa0', 'True'],
 ['J2k3RgU82', '转发理由:今晚七点，@可爱多 线上发布会见。#吃在一起 可爱多了#\xa0\xa0', 'False'],
 ['J1UlTqZwI', '转发理由:小彤哥新剧明天开播，看起来\xa0\xa0', 'False'],
 ['J1JsH6NZ0',
  '#朋友请听好#“湖南地方代表队”上线🤗朋友请听好第10期：千玺李俊濠变湖南代表队！ 谢娜被张颜齐猜崩溃？ \xa0[组图共3张]\xa0原图\xa0',
  'True'],
 ['J1axn9v32',
  '#相信未来义演# “乐”抚人心，希望这首《南屏晚钟》能给大家带来温暖。 TFBOYS-易烊千玺的微博视频 \xa0',
  'True'],
 ['J0EKPioaW',
  '#朋友请听好# “在水一方器乐团”直播首秀朋友请听好第9期：“在水一方乐器团”限定营业 何炅假期过后考虑退休？ \xa0[组图共4张]\xa0原图\xa0',
  'True'],
 ['J0tVnqd5A', '转发理由:#因为热爱，尽善而行# 和麦当劳一起，遇见更多美好。\xa0\xa0', 'False'],
 ['J0oMQcPVn',
  '#奋斗吧青春# 《奋斗的青春最美丽——2020年五·四青年节特别节目》，今晚在CCTV1、CCTV3播出，一起奋斗一起追梦！ \xa0[组图共2张]\xa0原图\xa0',
  'True'],
 [

In [55]:
scorelist

[-1.135763976946,
 30.602560582269806,
 4.5173920744908,
 3.8686835700503006,
 7.414674054577999,
 2.25384130479,
 8.660106955145,
 1.462636277784,
 -1.0454766696545004,
 -6.6631419541664005,
 -13.527011818393001,
 9.796449948599,
 14.915809818248997,
 1.145660809559,
 8.88966867245,
 -2.29324480176777,
 -3.09685785465035,
 3.73739167021,
 4.8042289509554,
 -1.162493158958,
 2.4286209497855005,
 -0.471869645084,
 -3.8297721235413,
 -7.5982470021345,
 -3.7644088901362,
 0.16561660161299985,
 2.749019770253,
 -2.9141022840368,
 0.7674950737590001,
 -0.6381741959236,
 0,
 1.7565519562695,
 -6.021281124096,
 -4.517876423582,
 0.238138860445,
 -0.9884442060366698,
 -0.9491350766440001,
 0.475396905266,
 1.182219424647,
 -0.19383707691500018,
 0.3399235669108,
 -3.9181656053269998,
 0.297887116711,
 6.442784656269998,
 1.3791769032240002,
 3.84296452259,
 0,
 0,
 -1.26012141149,
 3.73739167021,
 -1.3100386763126002,
 18.989660783101,
 20.939836826729998,
 2.9393605944,
 3.602202084702,
 -0.4

In [26]:
all_filename = 'yyqx_with_score.csv'

def write_csv(filename):
    try:
        result_headers = ['微博id', '微博正文/转发理由', '是否原创', '情感得分']
        with open('lfq_csv.csv', 'r', encoding='utf-8') as f:
            reader=csv.reader(f)
            content = [row[:]for row in reader][1:]
        
        for i in range(0,len(content)):
            content[i].append(scorelist[i])

        with open(filename, 'a', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerows([result_headers])
            writer.writerows(content)
        
    except Exception as e:
        print(e)


In [27]:
write_csv(all_filename)