In [None]:
import pandas as pd
from pymongo import MongoClient

# Connect to the local MongoDB server and pull every document of the
# BilibiliComments collection into a DataFrame.
mongo = MongoClient("mongodb://localhost:27017/")
collection = mongo["MobileGameComments"]["BilibiliComments"]
data = list(collection.find())

df = pd.DataFrame(data)
df


In [None]:
# Keep only the rows that have no missing value in any column.
bdf = df.dropna()
# Unbind the raw frame; only the filtered frame `bdf` is used from here on.
del df
bdf

In [None]:
import re

def clean(content):
  """Reduce a raw comment to its Chinese characters.

  Removes, in this order: the reply prefix, bracketed emoticon codes such as
  "[doge]", @-mentions, and finally every character outside U+4E00..U+9FA5.

  Args:
    content: Raw comment text (str).

  Returns:
    The cleaned string, or None when nothing is left, so that a following
    ``dropna()`` discards the row.
  """
  content = re.sub(r"回复 @.*? :", "", content)  # reply prefix
  content = re.sub(r"\[.*?\]", "", content)  # emoticon codes
  # @-mention: the name runs up to the next whitespace OR to the end of the
  # text. Requiring trailing whitespace would leave a mention that closes the
  # comment in place, and its Chinese name would survive the filter below.
  content = re.sub(r"@\S*(?:\s+|$)", "", content)
  content = re.sub(r"[^\u4e00-\u9fa5]+", "", content)  # non-Chinese characters
  content = content.strip()
  return content or None

# Clean every Bilibili comment, then drop the rows whose text became None.
bdf = bdf.assign(content=bdf["content"].apply(clean)).dropna()
bdf

In [None]:
# Keep the first row for each distinct cleaned comment text.
bdf = bdf.drop_duplicates(subset=["content"], keep="first")
bdf.shape

In [None]:
# Pull every document of the WeiboComments collection into a DataFrame.
collection = mongo["MobileGameComments"]["WeiboComments"]
data = list(collection.find())

df = pd.DataFrame(data)
df

In [None]:
# Weibo comments go through the same pipeline as the Bilibili ones:
# drop incomplete rows, clean the text, drop rows whose text became None,
# then keep the first row per distinct text.
wdf = (
  df.dropna()
  .assign(content=lambda frame: frame["content"].apply(clean))
  .dropna()
  .drop_duplicates(subset="content")
)
del df
wdf

In [None]:
level_weights = (0, 0.6, 0.6, 0.8, 0.9, 1.0, 1.1) # one multiplier per level, indexed by level 0-6

def weight_map(w):
  """Map a like score to a tiered weight.

  Tiers (upper bound exclusive): below 50 -> 1, below 200 -> 1.2,
  below 500 -> 1.4, below 1000 -> 1.6. Anything else falls through to 1.

  NOTE(review): scores of 1000 and above get the same weight as the lowest
  tier; confirm this is intended and not a missing top tier.
  """
  tiers = ((50, 1), (200, 1.2), (500, 1.4), (1000, 1.6))
  for upper_bound, tier_weight in tiers:
    if w < upper_bound:
      return tier_weight
  return 1

# Bilibili: tier the like count scaled by the multiplier for the row's level,
# then halve the result for rows whose "is_root" is falsy.
# The level is cast to int because a float-typed column (e.g. one that held
# NaN before dropna) would otherwise fail as a tuple index.
bdf["weight"] = bdf.apply(
  lambda row: weight_map(row["like"] * level_weights[int(row["level"])]) * (1 if row["is_root"] else 0.5),
  axis=1,
)
# Weibo: the tier depends on the like count alone.
wdf["weight"] = wdf["likes"].apply(weight_map)
bdf.head(5)


In [None]:
from datetime import datetime

def to_timestamp(x):
  """Parse a time string such as "Mon Jan 01 00:00:00 +0000 2024".

  Args:
    x: Time string in the layout "%a %b %d %H:%M:%S %z %Y".

  Returns:
    The instant as whole Unix epoch seconds (int).
  """
  parsed = datetime.strptime(x, "%a %b %d %H:%M:%S %z %Y")
  return int(parsed.timestamp())


# Replace the Weibo time strings with epoch seconds.
wdf["time"] = wdf["time"].map(to_timestamp)
wdf.head()

In [None]:
# Look-up tables: video / article "_id" -> "author".
cursor = mongo["MobileGameComments"]["BilibiliVideos"].find()
bilibili_videos = {docu["_id"]: docu["author"] for docu in cursor}
cursor = mongo["MobileGameComments"]["WeiboArticles"].find()
weibo_articles = {docu["_id"]: docu["author"] for docu in cursor}

# Author name -> game name, one table per platform. Authors missing from a
# table are used as the game name unchanged by the lookup functions.
bilibili_game_map = {
  "网易第五人格手游": "第五人格",
  "光遇手游": "光遇",
  "Phigros官方": "Phigros",
  "韵律源点Arcaea": "Arcaea",
}
weibo_game_map = {
  "明日方舟Arknights": "明日方舟",
  "网易第五人格": "第五人格",
  "光遇手游": "光遇",
}

def get_bilibili_game(aid) -> str:
  """Return the game name for the Bilibili video `aid`.

  Looks up the video's author in `bilibili_videos` (KeyError if the id is
  unknown) and translates it through `bilibili_game_map`; an author without
  a mapping is returned as-is.
  """
  uploader = bilibili_videos[aid]
  if uploader in bilibili_game_map:
    return bilibili_game_map[uploader]
  return uploader
  
def get_weibo_game(blogid) -> str:
  """Return the game name for the Weibo article `blogid`.

  Looks up the article's author in `weibo_articles` (KeyError if the id is
  unknown) and translates it through `weibo_game_map`; an author without a
  mapping is returned as-is.
  """
  poster = weibo_articles[blogid]
  if poster in weibo_game_map:
    return weibo_game_map[poster]
  return poster

# Tag every comment with the game it belongs to.
bdf["game"] = bdf["aid"].map(get_bilibili_game)
wdf["game"] = wdf["blogid"].map(get_weibo_game)
bdf

In [None]:
# Label each frame with its platform, then stack the shared columns.
bdf["source"] = "Bilibili"
wdf["source"] = "Weibo"
cols = ["game", "time", "weight", "content", "source"]
# The frames are stacked, not joined: their "source" values differ, so no row
# of one can ever match a row of the other. concat yields the same rows as an
# outer merge keyed on every column, without the join work; rows come out in
# input order (Bilibili first) rather than sorted by key.
mdf = pd.concat([bdf[cols], wdf[cols]], ignore_index=True)
mdf

In [None]:
def to_month(x):
  """Truncate an epoch timestamp to the start of its month.

  The timestamp is interpreted in the machine's local time zone.

  Args:
    x: Unix epoch seconds.

  Returns:
    Epoch seconds (int) of local midnight on the first day of that month.
  """
  local = datetime.fromtimestamp(x)
  first_of_month = datetime(year=local.year, month=local.month, day=1)
  return int(first_of_month.timestamp())

# Bucket every comment into its month.
mdf["time"] = mdf["time"].map(to_month)
mdf.head()

In [None]:
import os
from jieba import cut

# Load the stop-word list (one word per line) from the working directory.
# os.path.join builds the path with the platform's separator; a hard-coded
# backslash is only a separator on Windows.
with open(os.path.join(os.getcwd(), "stopwords.txt"), "r", encoding="utf-8") as f:
  stopwords = {line.strip() for line in f}

def cut_content(content):
  """Tokenize `content` and drop stop words.

  Args:
    content: Text to segment with jieba's `cut`.

  Returns:
    The tokens that are not in `stopwords`, joined by single spaces.
  """
  return " ".join(token for token in cut(content) if token not in stopwords)

# Store the space-separated tokens of every comment.
mdf["words"] = mdf["content"].map(cut_content)
mdf


In [None]:
def _load_lexicon(filename):
  """Read `filename` from the working directory into a frozenset, one stripped line per entry."""
  # os.path.join builds the path with the platform's separator; a hard-coded
  # backslash is only a separator on Windows.
  with open(os.path.join(os.getcwd(), filename), "r", encoding="utf-8") as f:
    return frozenset(line.strip() for line in f)

advs = _load_lexicon("程度副词.txt")  # degree adverbs
nots = _load_lexicon("否定词.txt")  # negation words
bads = _load_lexicon("负面情绪词.txt")  # negative sentiment words
goods = _load_lexicon("正面情绪词.txt")  # positive sentiment words

def get_emotion(sentence: str, *, adverbs=None, negations=None, negatives=None, positives=None):
  """Score the sentiment of a space-separated token string.

  Tokens are read left to right. A degree adverb multiplies the pending
  modifier by 1.2 and a negation flips its sign; the next sentiment word
  adds (positive) or subtracts (negative) the pending modifier and resets
  it to 1. Tokens in none of the lexicons are ignored.

  Args:
    sentence: Tokens separated by whitespace.
    adverbs, negations, negatives, positives: Optional word collections
      overriding the module-level `advs`, `nots`, `bads` and `goods`.

  Returns:
    The summed score; 0 when no sentiment word occurs.
  """
  adverbs = advs if adverbs is None else adverbs
  negations = nots if negations is None else negations
  negatives = bads if negatives is None else negatives
  positives = goods if positives is None else positives

  emo = 0
  # The modifier must live across iterations: resetting it for every token
  # would discard each adverb/negation before the word it modifies is read.
  w = 1
  for word in sentence.split():
    if word in adverbs:
      w *= 1.2
    elif word in negations:
      w *= -1
    elif word in negatives:
      emo -= w
      w = 1
    elif word in positives:
      emo += w
      w = 1
  return emo

# Score every tokenized comment.
mdf["emotion"] = mdf["words"].map(get_emotion)
mdf

In [17]:
# Write the scored comments to mdf.csv in the working directory.
# os.path.join builds the path with the platform's separator; a hard-coded
# backslash is only a separator on Windows.
mdf.to_csv(os.path.join(os.getcwd(), "mdf.csv"), index=False)

In [None]:
# Per (game, month): sum of emotion * weight divided by the number of
# Bilibili comments in the group.
bmdf = mdf[mdf["source"] == "Bilibili"]
# Selecting the two value columns keeps the grouping columns out of what
# apply() receives, which newer pandas versions warn about.
grouped = bmdf.groupby(["game", "time"])[["emotion", "weight"]].apply(
  lambda x: (x["emotion"] * x["weight"]).sum() / x.shape[0]
).reset_index(name="emotion")
collection = mongo["MobileGameComments"]["BilibiliResult"]
data = grouped.to_dict(orient="records")
collection.insert_many(data)


In [None]:
# Per (game, month): sum of emotion * weight divided by the number of
# Weibo comments in the group.
wmdf = mdf[mdf["source"] == "Weibo"]
# Selecting the two value columns keeps the grouping columns out of what
# apply() receives, which newer pandas versions warn about.
grouped = wmdf.groupby(["game", "time"])[["emotion", "weight"]].apply(
  lambda x: (x["emotion"] * x["weight"]).sum() / x.shape[0]
).reset_index(name="emotion")
# Reuse the `mongo` client opened when the comments were loaded instead of
# opening another connection pool to the same server.
collection = mongo["MobileGameComments"]["WeiboResult"]
data = grouped.to_dict(orient="records")
collection.insert_many(data)

In [None]:
# Per (game, month) over both platforms: sum of emotion * weight divided by
# the number of comments in the group.
# Selecting the two value columns keeps the grouping columns out of what
# apply() receives, which newer pandas versions warn about.
grouped = mdf.groupby(["game", "time"])[["emotion", "weight"]].apply(
  lambda x: (x["emotion"] * x["weight"]).sum() / x.shape[0]
).reset_index(name="emotion")
# Reuse the `mongo` client opened when the comments were loaded instead of
# opening another connection pool to the same server.
collection = mongo["MobileGameComments"]["Result"]
data = grouped.to_dict(orient="records")
collection.insert_many(data)
