In [1]:
import jieba
import re
import csv
import wordcloud
import matplotlib.pyplot as plt

In [2]:
csv_name = 'lfq_csv.csv'  # CSV file with the Weibo post data

In [3]:
# Load the Weibo rows into `info` (type: list of lists).
# The header row and the first column of every row are dropped, so each
# entry keeps the remaining columns (post text, is-original flag).
# newline='' is what the csv module expects from the file object so that
# line breaks inside quoted fields are parsed correctly.
with open(csv_name, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (no error on an empty file)
    info = [row[1:] for row in reader]

In [4]:
# Preview the first 20 entries of info.
info[:20]

[['复习期末的这几天，我从钻五到了钻一。 \xa0', 'True'],
 ['转发理由:恭喜@-席瑾- 1名用户获得【1k】。微博官方唯一抽奖工具@微博抽奖平台 对本次抽奖进行监督，结果公正有效。公示链接：微博抽奖平台 //@_老番茄_:庆祝吃鸡，转发抽1k！明天20点开奖！ 查看图片\xa0\xa0',
  'False'],
 ['转发理由:庆祝吃鸡，转发抽1k！明天20点开奖！ 查看图片\xa0\xa0', 'False'],
 ['又到了一年两度，万众期待的考前临时抱佛脚时间 \xa0', 'True'],
 ['默契挑战！看看我和BOY谁才是真正的好兄弟#YSL212#  _老番茄_的微博视频 \xa0', 'True'],
 ['生日快乐，我的偶像 _老番茄_的微博视频 \xa0', 'True'],
 ['儿童节快乐！希望我的脸能变回小时候那么圆润！ \xa0原图\xa0', 'True'],
 ['失去梦想变成大鸡腿 \xa0原图\xa0', 'True'],
 ['决定利用每天上课开小差的时间学一点蒙语。如果下次回内蒙能看懂蒙文了，说明我这学期一定开了很多小差。 \xa0原图\xa0', 'True'],
 ['为什么b站在我要交作业之前上线柯南剧场版。可恶，已经看了一天柯南了。 \xa0', 'True'],
 ['昨晚失眠睡不着，翻来覆去。四点钟的时候，突然悟到，好像是热得睡不着。于是毅然打开了空调，立马睡着了。 \xa0', 'True'],
 ['转发理由:给我的好弟弟打call 查看图片 //@最强大脑陆英泽:#最强大脑开播##最强大脑#还有半小时就要开播拉，看不了电视的也可以访问http://t.cn/heFOA收看网络直播\xa0\xa0',
  'False'],
 ['完美还原世界名画！你说我画挑战。天猫搜索OPPO官方旗舰店get同款耳机  _老番茄_的微博视频 \xa0', 'True'],
 ['藏在黑影中的小怪兽 _老番茄_的微博视频 \xa0', 'True'],
 ['感谢@腾讯NintendoSwitch 送的国行switch和李维斯联名小夹克儿！耀西真可爱 \xa0[组图共3张]\xa0原图\xa0',
  'True'],
 ['打算做视频之前休息一会→看一眼直播→看到有人在玩奥特曼游戏→勾起回忆→看了一整晚

In [5]:
# Load the post text (second CSV column) of every data row into `content`
# (type: list of str). The header row is skipped.
# newline='' is what the csv module expects from the file object so that
# line breaks inside quoted fields are parsed correctly.
with open(csv_name, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (no error on an empty file)
    content = [row[1] for row in reader]

In [6]:
# Load the is-original flag (third CSV column) of every data row into
# `is_original` (type: list of str holding 'True' / 'False').
# The header row is skipped.
# newline='' is what the csv module expects from the file object so that
# line breaks inside quoted fields are parsed correctly.
with open(csv_name, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (no error on an empty file)
    is_original = [row[2] for row in reader]

In [7]:
# Summary counts for the data set.
weibo_num = len(info)  # total number of posts loaded
original_num = sum(1 for flag in is_original if flag == 'True')  # posts flagged as original
repost_num = sum(1 for flag in is_original if flag == 'False')  # posts flagged as reposts

In [9]:
# Print total, original and repost counts on one line.
print(weibo_num,original_num,repost_num)

71 64 7


In [10]:
# Split the post texts into originals and reposts using the flag stored
# next to each text in info.
original_list = []  # texts whose flag is 'True'
repost_list = []  # texts whose flag is 'False'
for row in info:
    if row[1] == 'True':
        original_list.append(row[0])
    elif row[1] == 'False':
        repost_list.append(row[0])

In [18]:
def read_list_file(filename):
    """
        Read a UTF-8 text file that holds one entry per line into a list.
        e.g. the stop-word list and the negation-word list.
        Surrounding whitespace is stripped from every line and blank
        lines are skipped.
        filename: path of the text file
        return: file_list type: list
    """
    # `with` guarantees the file handle is closed once the lines are read.
    with open(filename, encoding='utf-8') as f:
        # Test the stripped text itself: a stripped line can never equal a
        # newline character, so comparing against one would let empty
        # entries through.
        file_list = [line.strip() for line in f if line.strip()]
    return file_list


In [19]:
def read_dict_file(filename):
    """
        Read a UTF-8 text file of key/value lines into a dict.
        e.g. the BosonNLP sentiment lexicon.
        A line is used only when, after stripping surrounding whitespace,
        it splits on a single space into exactly two parts; every other
        line (blank lines included) is ignored. Values are kept as text.
        filename: path of the text file
        return: classify_dict type: dict
    """
    classify_dict = dict()
    # `with` guarantees the file handle is closed once the lines are read.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(' ')  # split once and reuse the parts
            if len(parts) == 2:
                classify_dict[parts[0]] = parts[1]
    return classify_dict

In [43]:
def _emoji_to_word(value):
    """
        Turn one emoji_dict value into the plain word it stands for.
        Handles the raw text read from the emoji file (a quoted name inside
        braces), the list that re.findall produces from that text, and a
        string that is already clean.
        When several quoted names are present the first one is used.
    """
    if isinstance(value, (list, tuple)):
        value = value[0] if value else ''
    text = str(value)
    quoted = re.search(r"'([^']*)'", text)
    if quoted:
        return quoted.group(1)
    return text


def jieba_cut(string, stopwords, emoji_dict):
    """
        Tokenise one Weibo post with jieba and drop stop words.
        string: Weibo text type: str
        stopwords: stop-word list type: list
        emoji_dict: maps an emoji token to its replacement text type: dict
        return: list of tokens without stop words and single-space tokens;
                tokens found in emoji_dict are replaced by their plain word
    """
    string = string.replace(u'\xa0', u' ')  # turn non-breaking spaces into ' '
    stopword_set = set(stopwords)  # constant-time membership tests
    word_list = []
    for word in jieba.lcut(string):
        if word == ' ' or word in stopword_set:
            continue
        if word in emoji_dict:
            # Using str() on the stored value directly would emit the repr of
            # a list once the dict values have been turned into lists.
            # Fall back to the emoji itself if no word can be extracted.
            word = _emoji_to_word(emoji_dict[word]) or word
        word_list.append(word)
    return word_list

In [44]:
# Paths of the lexicon files
BosonNLP = 'senti_dict/BosonNLP_sentiment_score/BosonNLP_sentiment_score/BosonNLP_sentiment_score.txt'  # BosonNLP sentiment lexicon
Nega = 'senti_dict/polar_dict/nega_dict.txt'  # negation words
degree = 'senti_dict/polar_dict/degree_dict.txt'  # degree adverbs
stop_file = 'senti_dict/stopwords/stopwords.txt'  # stop words
emoji_file = 'senti_dict/emoji_dict/emoji.txt'  # emoji lexicon

# Load every lexicon into memory
BosonNLP_dict = read_dict_file(BosonNLP)  # sentiment lexicon as a dict
Nega_list = read_list_file(Nega)  # negation words as a list
degree_dict = read_dict_file(degree)  # degree adverbs as a dict
stopwords = read_list_file(stop_file)  # stop words as a list
emoji_dict = read_dict_file(emoji_file)  # emoji lexicon as a dict

In [45]:
# Inspect the raw emoji lexicon as loaded from file.
emoji_dict

{'⚽': "{'足球'}",
 '🎆': "{'焰火'}",
 '🎇': "{'烟花'}",
 '🎁': "{'礼物'}",
 '🎖': "{'勋章'}",
 '🇨🇳': "{'中国'}",
 '🏁': "{'终点'}",
 '🇦🇩': "{'安道尔'}",
 '🇦🇪': "{'阿拉伯联合大公国'}",
 '🇦🇫': "{'阿富汗'}",
 '🇦🇱': "{'阿尔巴尼亚'}",
 '🇦🇲': "{'亚美尼亚'}",
 '🇦🇷': "{'阿根廷共和国'}",
 '🇦🇶': "{'南极洲'}",
 '🇦🇴': "{'安哥拉'}",
 '🇦🇹': "{'奥地利'}",
 '🇦🇺': "{'澳大利亚'}",
 '🇧🇪': "{'比利时'}",
 '🇧🇩': "{'孟加拉国'}",
 '🇧🇭': "{'巴林'}",
 '🇧🇬': "{'保加利亚'}",
 '🇧🇳': "{'文莱'}",
 '🇧🇷': "{'巴西'}",
 '🇧🇴': "{'玻利维亚'}",
 '🇨🇦': "{'加拿大'}",
 '🥪': "{'三明治'}",
 '☢': "{'辐射'}",
 '♈': "{'白羊座'}",
 '♌': "{'狮子座'}",
 '♉': "{'金牛座'}",
 '♊': "{'双子座'}",
 '♑': "{'摩羯座'}",
 '♋': "{'巨蟹座'}",
 '♎': "{'天秤座'}",
 '♍': "{'处女座'}",
 '♏': "{'天蝎座'}",
 '♓': "{'双鱼座'}",
 '🚾': "{'厕所'}",
 '♿': "{'轮椅'}",
 '🛂': "{'护照'}",
 '🛃': "{'海关'}",
 '🛅': "{'寄存'}",
 '㊗': "{'祝贺'}",
 '㊙': "{'秘密'}",
 '🏴\U000e0067\U000e0062\U000e0065\U000e006e\U000e0067\U000e007f': "{'英格兰'}",
 '🏴\U000e0067\U000e0062\U000e0073\U000e0063\U000e0074\U000e007f': "{'苏格兰'}",
 '🇿🇲': "{'赞比亚'}",
 '🇿🇼': "{'津巴布韦'}",
 '🇿🇦': "{'南非'}",
 '🇾🇪': "{'也门'}",
 '🇻🇳': "{'越

In [46]:
# Post-process emoji_dict: replace every raw text value with the list of
# texts found between its braces.
# Values that are no longer strings were already converted by an earlier run
# of this cell; skipping them keeps the cell safe to re-run (re.findall
# raises TypeError when handed a list).
pattern = re.compile('{(.*?)}')
for k, v in emoji_dict.items():
    if isinstance(v, str):
        emoji_dict[k] = pattern.findall(v)

In [47]:
def senti_ana(string, stopwords, BosonNLP_dict, Nega_list, degree_dict):
    """
        Compute a lexicon-based sentiment score for one piece of text.

        The text is tokenised with jieba_cut, which is also handed the
        module-level emoji_dict. Every token is labelled as a degree adverb,
        a negation word or a sentiment word, in that order of precedence.
        Sentiment values are then added up from left to right, each one
        multiplied by a running weight.

        string: Weibo text str
        stopwords: stop words list
        BosonNLP_dict: sentiment lexicon dict
        Nega_list: negation words list
        degree_dict: degree adverbs dict
        return: score; the int 0 when no sentiment word is found,
                otherwise a float
    """
    tokens = jieba_cut(string, stopwords, emoji_dict)  # tokenise the text

    # Token positions, grouped by the role the token plays.
    sentiment_at = dict()  # position -> sentiment value
    negation_at = set()  # positions of negation words
    degree_at = dict()  # position -> degree multiplier

    for pos, token in enumerate(tokens):
        if token in degree_dict:
            degree_at[pos] = degree_dict[token]
        elif token in Nega_list:
            negation_at.add(pos)
        elif token in BosonNLP_dict:
            sentiment_at[pos] = BosonNLP_dict[token]

    positions = list(sentiment_at)  # sentiment-word positions, ascending
    score = 0
    weight = 1

    for rank, pos in enumerate(positions):
        score += weight * float(sentiment_at[pos])
        if rank + 1 == len(positions):
            break  # modifiers after the last sentiment word are not used
        # Modifiers between this sentiment word and the next one adjust the
        # weight applied to the next one.
        # NOTE(review): the weight is never reset, so a modifier keeps
        # affecting every later sentiment word, and modifiers in front of
        # the first sentiment word are ignored. Confirm this is intended.
        for between in range(pos + 1, positions[rank + 1]):
            if between in negation_at:
                weight = -weight
            elif between in degree_at:
                weight *= float(degree_at[between])

    return score  # sentiment score

In [48]:
# Empty list that will receive one sentiment score per post.
scorelist=[]

In [49]:
# Score every post in content.
# The list is rebuilt from scratch instead of being appended to, so running
# this cell again cannot leave duplicated scores behind.
scorelist = [
    senti_ana(text, stopwords, BosonNLP_dict, Nega_list, degree_dict)
    for text in content
]

In [50]:
# Inspect the computed scores.
scorelist

[-1.135763976946,
 30.602560582269806,
 4.5173920744908,
 3.8686835700503006,
 7.414674054577999,
 2.25384130479,
 8.660106955145,
 1.462636277784,
 -1.0454766696545004,
 -6.6631419541664005,
 -13.527011818393001,
 9.796449948599,
 14.915809818248997,
 1.145660809559,
 8.88966867245,
 -2.29324480176777,
 -3.09685785465035,
 3.73739167021,
 4.8042289509554,
 -1.162493158958,
 2.4286209497855005,
 -0.471869645084,
 -3.8297721235413,
 -7.5982470021345,
 -3.7644088901362,
 0.16561660161299985,
 2.749019770253,
 -2.9141022840368,
 0.7674950737590001,
 -0.6381741959236,
 0,
 1.7565519562695,
 -6.021281124096,
 -4.517876423582,
 0.238138860445,
 -0.9884442060366698,
 -0.9491350766440001,
 0.475396905266,
 1.182219424647,
 -0.19383707691500018,
 0.3399235669108,
 -3.9181656053269998,
 0.297887116711,
 6.442784656269998,
 1.3791769032240002,
 3.84296452259,
 0,
 0,
 -1.26012141149,
 3.73739167021,
 -1.3100386763126002,
 18.989660783101,
 20.939836826729998,
 2.9393605944,
 3.602202084702,
 -0.4

In [51]:
# Number of scores computed.
len(scorelist)

71

In [52]:
# Show every post whose score is exactly 0: the score first, then the text.
for idx, zero_candidate in enumerate(scorelist):
    if zero_candidate == 0:
        print(zero_candidate)
        print(content[idx])

0
我也开始了  原图 
0
好闲啊。  原图 
0
转发理由:新年快乐！多夸夸自己！  


In [53]:
result_headers = ['微博id', '微博正文/转发理由', '是否原创', '情感得分']
# Reload the complete data rows (all columns, header skipped).
# NOTE: this rebinds `content`, which held only the post texts until now;
# cells that expect plain texts must not be re-run after this one.
# The file name comes from csv_name so it is defined in a single place, and
# newline='' is what the csv module expects from the file object.
with open(csv_name, 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (no error on an empty file)
    content = list(reader)

In [54]:
# Inspect content, which at this point holds the full CSV rows.
content

[['J61JpuIB4', '复习期末的这几天，我从钻五到了钻一。 \xa0', 'True'],
 ['J61Eck4cB',
  '转发理由:恭喜@-席瑾- 1名用户获得【1k】。微博官方唯一抽奖工具@微博抽奖平台 对本次抽奖进行监督，结果公正有效。公示链接：微博抽奖平台 //@_老番茄_:庆祝吃鸡，转发抽1k！明天20点开奖！ 查看图片\xa0\xa0',
  'False'],
 ['J5PbLfDL4', '转发理由:庆祝吃鸡，转发抽1k！明天20点开奖！ 查看图片\xa0\xa0', 'False'],
 ['J5InF2nQB', '又到了一年两度，万众期待的考前临时抱佛脚时间 \xa0', 'True'],
 ['J5dfz4Aer', '默契挑战！看看我和BOY谁才是真正的好兄弟#YSL212#  _老番茄_的微博视频 \xa0', 'True'],
 ['J4PiG9jQv', '生日快乐，我的偶像 _老番茄_的微博视频 \xa0', 'True'],
 ['J4FBYipCS', '儿童节快乐！希望我的脸能变回小时候那么圆润！ \xa0原图\xa0', 'True'],
 ['J4wyYePK4', '失去梦想变成大鸡腿 \xa0原图\xa0', 'True'],
 ['J4dd11Eu9',
  '决定利用每天上课开小差的时间学一点蒙语。如果下次回内蒙能看懂蒙文了，说明我这学期一定开了很多小差。 \xa0原图\xa0',
  'True'],
 ['J3isW0rI4', '为什么b站在我要交作业之前上线柯南剧场版。可恶，已经看了一天柯南了。 \xa0', 'True'],
 ['J2xp6awdt',
  '昨晚失眠睡不着，翻来覆去。四点钟的时候，突然悟到，好像是热得睡不着。于是毅然打开了空调，立马睡着了。 \xa0',
  'True'],
 ['J24SoEtY3',
  '转发理由:给我的好弟弟打call 查看图片 //@最强大脑陆英泽:#最强大脑开播##最强大脑#还有半小时就要开播拉，看不了电视的也可以访问http://t.cn/heFOA收看网络直播\xa0\xa0',
  'False'],
 ['J21Fu3W9u',
  '完美还原世界名画！你说我画挑战。天猫搜索OPPO官方旗舰店get同款耳机  _老番茄_的微博视频 \

In [55]:
# Inspect the scores again before they are written out.
scorelist

[-1.135763976946,
 30.602560582269806,
 4.5173920744908,
 3.8686835700503006,
 7.414674054577999,
 2.25384130479,
 8.660106955145,
 1.462636277784,
 -1.0454766696545004,
 -6.6631419541664005,
 -13.527011818393001,
 9.796449948599,
 14.915809818248997,
 1.145660809559,
 8.88966867245,
 -2.29324480176777,
 -3.09685785465035,
 3.73739167021,
 4.8042289509554,
 -1.162493158958,
 2.4286209497855005,
 -0.471869645084,
 -3.8297721235413,
 -7.5982470021345,
 -3.7644088901362,
 0.16561660161299985,
 2.749019770253,
 -2.9141022840368,
 0.7674950737590001,
 -0.6381741959236,
 0,
 1.7565519562695,
 -6.021281124096,
 -4.517876423582,
 0.238138860445,
 -0.9884442060366698,
 -0.9491350766440001,
 0.475396905266,
 1.182219424647,
 -0.19383707691500018,
 0.3399235669108,
 -3.9181656053269998,
 0.297887116711,
 6.442784656269998,
 1.3791769032240002,
 3.84296452259,
 0,
 0,
 -1.26012141149,
 3.73739167021,
 -1.3100386763126002,
 18.989660783101,
 20.939836826729998,
 2.9393605944,
 3.602202084702,
 -0.4

In [57]:
all_filename = 'lfq_with_score.csv'  # output CSV: source rows plus a score column

def write_csv(filename, source_filename='lfq_csv.csv', scores=None):
    """
        Write the source rows together with their sentiment scores to a CSV.

        The data rows of source_filename (header skipped) each get one score
        appended and are written, under a fixed header row, to filename.
        The output file is overwritten, so calling the function again does
        not duplicate the header or the rows.

        filename: path of the CSV file to write
        source_filename: path of the CSV file the rows are read from
        scores: one score per data row; defaults to the module-level scorelist
        return: None. File, decoding and csv errors are printed, not raised;
                nothing is written when there are fewer scores than rows.
    """
    result_headers = ['微博id', '微博正文/转发理由', '是否原创', '情感得分']
    if scores is None:
        scores = scorelist  # looked up at call time

    try:
        with open(source_filename, 'r', encoding='utf-8', newline='') as f:
            reader = csv.reader(f)
            next(reader, None)  # skip the header row
            rows = list(reader)

        # Report the mismatch explicitly instead of failing half-way through.
        if len(scores) < len(rows):
            print('cannot write {}: {} rows but only {} scores'.format(
                filename, len(rows), len(scores)))
            return

        scored_rows = [row + [score] for row, score in zip(rows, scores)]

        # 'w' rather than 'a': append mode would add another header and
        # another copy of every row on each call.
        with open(filename, 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(result_headers)
            writer.writerows(scored_rows)

    except (OSError, UnicodeError, csv.Error) as e:
        print(e)


In [58]:
# Write the rows together with their scores to all_filename.
write_csv(all_filename)