In [222]:
import pandas as pd
from hanziconv import HanziConv 
import unicodedata
import jieba
from stopwordsiso import stopwords
import emoji
from snownlp import SnowNLP
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

'''
1. Tranditional Chinese to Simplfied Chinese
2. *Count the original chinese letter 
3. Tokenization
4. Removal of stopwords and punctuation
5. Removal of Emoji *Count number of emoji used
6. *Count the review length : count Chinese words(after tokenization and removal of stopwords), count English words and number (no change)
7. *Sentiment score of Chinese token
8. Split into X_train, X_test, y_train, y_test
9. Calculate TFIDF: tfidf.fit_transform(train_X["text"]) tfidf.transform(test_X["text"])
10. Fit into SVC

'''

'\n1. Tranditional Chinese to Simplfied Chinese\n2. *Count the original chinese letter \n3. Tokenization\n4. Removal of stopwords and punctuation\n5. Removal of Emoji *Count number of emoji used\n6. *Count the review length : count Chinese words(after tokenization and removal of stopwords), count English words and number (no change)\n7. *Segmentation score of Chinese token\n8. Split into X_train, X_test, y_train, y_test\n9. Calculate TFDIF: tfidf.fit_transform(train_X["text"]) tfidf.transform(test_X["text"])\n10. Fit into SVC\n\n'

In [223]:
df = pd.read_csv("./finished/finalized_reviews.csv", delimiter= ',')
X = df[["short_comment","score"]].copy()
Y = df[["Fake"]].copy()
X.head()

Unnamed: 0,short_comment,score
0,没想到，这是唯一一部尊重春节档的电影。,4
1,能给负分吗卧槽 浪费我时间,1
2,太痛苦了 大过年一大早让我如坐针毡,1
3,要不是电影院没开空调一直冻我 我能直接睡过去,2
4,低俗谄媚粗制滥造，连起码的诚意都没有，推理全靠奇情轶闻，笑点东拼西凑，初一起了个大早去看电影...,2


In [224]:
Y

Unnamed: 0,Fake
0,1
1,1
2,1
3,1
4,1
...,...
1595,0
1596,0
1597,0
1598,0


In [225]:
X['short_comment'] = X['short_comment'].apply(HanziConv.toSimplified)
X

Unnamed: 0,short_comment,score
0,没想到，这是唯一一部尊重春节档的电影。,4
1,能给负分吗卧槽 浪费我时间,1
2,太痛苦了 大过年一大早让我如坐针毡,1
3,要不是电影院没开空调一直冻我 我能直接睡过去,2
4,低俗谄媚粗制滥造，连起码的诚意都没有，推理全靠奇情轶闻，笑点东拼西凑，初一起了个大早去看电影...,2
...,...,...
1595,披着推理外衣的合家欢喜剧片。最后的强行煽情让人似曾相识，名侦探柯南都是这么结尾的。,3
1596,至少比暗恋强点,3
1597,感觉还差点意思，看出四字努力了，但那装酷时用力的嘴再稍稍放松点就好了，这样表情会自然些。女生...,2
1598,有点好笑，又觉得没啥好看的。,3


In [226]:
X['ori_chinese_count'] = 0

def word_count(words):
  wordcount = 0
  wordcount_CN = 0
  wordcount_EN = 0
  wordcount_N = 0
  start = True
  for word in words:
    cat = unicodedata.category(word)
    if cat == 'Lo':        # Chinese Letter
      wordcount += 1       # each letter counted as a word
      wordcount_CN += 1
      start = True                       
    elif cat[0] == 'P':    # Some kind of punctuation
      # wordcount += 1     
      start = True                       
    elif cat[0] == 'Z':    # Some kind of separator
      start = True
    else:                  # Everything else
      if start:            
        wordcount += 1     # Only count at the start
        if cat[0]== 'N':
            wordcount_N += 1
        else:
            wordcount_EN += 1
      start = False
  return (wordcount_CN, wordcount_EN, wordcount_N, wordcount) 

for i in range(len(X)):
  X.at[i, "ori_chinese_count"] = word_count(X["short_comment"].iloc[i])[0]

X

Unnamed: 0,short_comment,score,ori_chinese_count
0,没想到，这是唯一一部尊重春节档的电影。,4,17
1,能给负分吗卧槽 浪费我时间,1,12
2,太痛苦了 大过年一大早让我如坐针毡,1,16
3,要不是电影院没开空调一直冻我 我能直接睡过去,2,21
4,低俗谄媚粗制滥造，连起码的诚意都没有，推理全靠奇情轶闻，笑点东拼西凑，初一起了个大早去看电影...,2,77
...,...,...,...
1595,披着推理外衣的合家欢喜剧片。最后的强行煽情让人似曾相识，名侦探柯南都是这么结尾的。,3,38
1596,至少比暗恋强点,3,7
1597,感觉还差点意思，看出四字努力了，但那装酷时用力的嘴再稍稍放松点就好了，这样表情会自然些。女生...,2,61
1598,有点好笑，又觉得没啥好看的。,3,12


In [227]:
def tokenize(word):
    return list(jieba.cut(word))

X['text'] = X['short_comment'].apply(tokenize)
X = X.drop("short_comment", axis=1)
X = X[list(("text", "score", "ori_chinese_count"))]
X


Unnamed: 0,text,score,ori_chinese_count
0,"[没想到, ，, 这是, 唯一, 一部, 尊重, 春节, 档, 的, 电影, 。]",4,17
1,"[能, 给, 负分, 吗, 卧槽, , 浪费, 我, 时间]",1,12
2,"[太, 痛苦, 了, , 大过年, 一大早, 让, 我, 如坐针毡]",1,16
3,"[要不是, 电影院, 没, 开, 空调, 一直, 冻, 我, , 我能, 直接, 睡过去]",2,21
4,"[低俗, 谄媚, 粗制滥造, ，, 连, 起码, 的, 诚意, 都, 没有, ，, 推理, ...",2,77
...,...,...,...
1595,"[披, 着, 推理, 外衣, 的, 合家欢, 喜剧片, 。, 最后, 的, 强行, 煽情, ...",3,38
1596,"[至少, 比, 暗恋, 强, 点]",3,7
1597,"[感觉, 还, 差点, 意思, ，, 看出, 四字, 努力, 了, ，, 但, 那, 装酷,...",2,61
1598,"[有点, 好笑, ，, 又, 觉得, 没, 啥, 好看, 的, 。]",3,12


In [228]:
stopwords_list = list(stopwords(["zh"]))
punc_list = '[\.\!︕\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）：；《）《》“”()»〔〕-]+-_-Ｄ '
def removal_sw(text_tokens):

    return [word for word in text_tokens if not word in list(stopwords(["zh"])) and not word in punc_list]

X['text'] = X['text'].apply(removal_sw)
X



Unnamed: 0,text,score,ori_chinese_count
0,"[没想到, 这是, 唯一, 一部, 尊重, 春节, 档, 电影]",4,17
1,"[负分, 卧槽, 浪费, 时间]",1,12
2,"[太, 痛苦, 大过年, 一大早, 如坐针毡]",1,16
3,"[电影院, 没, 开, 空调, 一直, 冻, 我能, 直接, 睡过去]",2,21
4,"[低俗, 谄媚, 粗制滥造, 起码, 诚意, 没有, 推理, 全靠, 奇情, 轶闻, 笑点,...",2,77
...,...,...,...
1595,"[披, 推理, 外衣, 合家欢, 喜剧片, 最后, 强行, 煽情, 似曾相识, 名, 侦探,...",3,38
1596,"[至少, 暗恋, 强]",3,7
1597,"[感觉, 差点, 意思, 看出, 四字, 努力, 装酷, 用力, 嘴, 稍稍, 放松, 表情...",2,61
1598,"[有点, 好笑, 觉得, 没, 好看]",3,12


In [229]:

X["Emoji"] = 0
X

Unnamed: 0,text,score,ori_chinese_count,Emoji
0,"[没想到, 这是, 唯一, 一部, 尊重, 春节, 档, 电影]",4,17,0
1,"[负分, 卧槽, 浪费, 时间]",1,12,0
2,"[太, 痛苦, 大过年, 一大早, 如坐针毡]",1,16,0
3,"[电影院, 没, 开, 空调, 一直, 冻, 我能, 直接, 睡过去]",2,21,0
4,"[低俗, 谄媚, 粗制滥造, 起码, 诚意, 没有, 推理, 全靠, 奇情, 轶闻, 笑点,...",2,77,0
...,...,...,...,...
1595,"[披, 推理, 外衣, 合家欢, 喜剧片, 最后, 强行, 煽情, 似曾相识, 名, 侦探,...",3,38,0
1596,"[至少, 暗恋, 强]",3,7,0
1597,"[感觉, 差点, 意思, 看出, 四字, 努力, 装酷, 用力, 嘴, 稍稍, 放松, 表情...",2,61,0
1598,"[有点, 好笑, 觉得, 没, 好看]",3,12,0


In [230]:
# [word for word in text_tokens if not word in list(stopwords(["zh"])) and not emoji.is_emoji(word)] 
X = X.copy()

def contain_emoji(text):
    global emoji_no
    is_contain = False
    for word in text:
        if emoji.is_emoji(word):
            emoji_no += 1
            is_contain = True
    return is_contain

for i in range(len(X)):
    emoji_no = 0
    if contain_emoji(X["text"].iloc[i]):
        X["text"].iloc[i] = [word for word in X["text"].iloc[i] if not emoji.is_emoji(word)] 
    X.at[i, "Emoji"] = emoji_no

X

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["text"].iloc[i] = [word for word in X["text"].iloc[i] if not emoji.is_emoji(word)]


Unnamed: 0,text,score,ori_chinese_count,Emoji
0,"[没想到, 这是, 唯一, 一部, 尊重, 春节, 档, 电影]",4,17,0
1,"[负分, 卧槽, 浪费, 时间]",1,12,0
2,"[太, 痛苦, 大过年, 一大早, 如坐针毡]",1,16,0
3,"[电影院, 没, 开, 空调, 一直, 冻, 我能, 直接, 睡过去]",2,21,0
4,"[低俗, 谄媚, 粗制滥造, 起码, 诚意, 没有, 推理, 全靠, 奇情, 轶闻, 笑点,...",2,77,0
...,...,...,...,...
1595,"[披, 推理, 外衣, 合家欢, 喜剧片, 最后, 强行, 煽情, 似曾相识, 名, 侦探,...",3,38,0
1596,"[至少, 暗恋, 强]",3,7,0
1597,"[感觉, 差点, 意思, 看出, 四字, 努力, 装酷, 用力, 嘴, 稍稍, 放松, 表情...",2,61,0
1598,"[有点, 好笑, 觉得, 没, 好看]",3,12,0


In [231]:
X["chinese_count"] = 0
X["english_count"] = 0
X["number_count"] = 0

for i in range(len(X)):
  wordcount_list = [0, 0, 0, 0]
  for words in X["text"].iloc[i]:
    wordcount_list = [a + b for a, b in zip(wordcount_list, word_count(words))]
  X.at[i, "chinese_count"] = wordcount_list[0]
  X.at[i, "english_count"] = wordcount_list[1]
  X.at[i, "number_count"] = wordcount_list[2]


X

Unnamed: 0,text,score,ori_chinese_count,Emoji,chinese_count,english_count,number_count
0,"[没想到, 这是, 唯一, 一部, 尊重, 春节, 档, 电影]",4,17,0,16,0,0
1,"[负分, 卧槽, 浪费, 时间]",1,12,0,8,0,0
2,"[太, 痛苦, 大过年, 一大早, 如坐针毡]",1,16,0,13,0,0
3,"[电影院, 没, 开, 空调, 一直, 冻, 我能, 直接, 睡过去]",2,21,0,17,0,0
4,"[低俗, 谄媚, 粗制滥造, 起码, 诚意, 没有, 推理, 全靠, 奇情, 轶闻, 笑点,...",2,77,0,62,0,1
...,...,...,...,...,...,...,...
1595,"[披, 推理, 外衣, 合家欢, 喜剧片, 最后, 强行, 煽情, 似曾相识, 名, 侦探,...",3,38,0,28,0,0
1596,"[至少, 暗恋, 强]",3,7,0,5,0,0
1597,"[感觉, 差点, 意思, 看出, 四字, 努力, 装酷, 用力, 嘴, 稍稍, 放松, 表情...",2,61,0,40,0,0
1598,"[有点, 好笑, 觉得, 没, 好看]",3,12,0,9,0,0


In [232]:
X["Sentiment"] = 0
X

Unnamed: 0,text,score,ori_chinese_count,Emoji,chinese_count,english_count,number_count,Sentiment
0,"[没想到, 这是, 唯一, 一部, 尊重, 春节, 档, 电影]",4,17,0,16,0,0,0
1,"[负分, 卧槽, 浪费, 时间]",1,12,0,8,0,0,0
2,"[太, 痛苦, 大过年, 一大早, 如坐针毡]",1,16,0,13,0,0,0
3,"[电影院, 没, 开, 空调, 一直, 冻, 我能, 直接, 睡过去]",2,21,0,17,0,0,0
4,"[低俗, 谄媚, 粗制滥造, 起码, 诚意, 没有, 推理, 全靠, 奇情, 轶闻, 笑点,...",2,77,0,62,0,1,0
...,...,...,...,...,...,...,...,...
1595,"[披, 推理, 外衣, 合家欢, 喜剧片, 最后, 强行, 煽情, 似曾相识, 名, 侦探,...",3,38,0,28,0,0,0
1596,"[至少, 暗恋, 强]",3,7,0,5,0,0,0
1597,"[感觉, 差点, 意思, 看出, 四字, 努力, 装酷, 用力, 嘴, 稍稍, 放松, 表情...",2,61,0,40,0,0,0
1598,"[有点, 好笑, 觉得, 没, 好看]",3,12,0,9,0,0,0


In [233]:

for i in range(len(X)):
    sentiment_sum = 0
    count = 0
    # if it is Chinese token:
    for word in X["text"].iloc[i]:
        cat = unicodedata.category(word[0])
        if not (cat == 'Ll' or cat == 'Lu'):
            sentiment_sum += SnowNLP(word).sentiments
            count += 1
    if count == 0:
        X.at[i, "Sentiment"] = 0.5 # neutral 
    else:
        X.at[i, "Sentiment"] = sentiment_sum/count


In [234]:
''' 
# This is not work with SNOWNLP because this tool tends to give score > 0.5 (positive)
for i in range(len(X)):
    sentiment_sum = 0
    positive_sum = 0
    negative_sum = 0
    # if it is Chinese token:
    for word in X["text"].iloc[i]:
        cat = unicodedata.category(word[0])
        if not (cat == 'Ll' or cat == 'Lu'):
            sentiment = SnowNLP(word).sentiments
            sentiment_sum += 1
            if sentiment>=0.5:
                positive_sum += 1
            else:
                negative_sum += 1
    if sentiment_sum == 0:
        X.at[i, "Sentiment"] = 0.5 # neutral 
    else:
        X.at[i, "Sentiment"] = (positive_sum + negative_sum) / sentiment_sum
'''

' \n# This is not work with SNOWNLP because this tool tends to give score > 0.5 (positive)\nfor i in range(len(X)):\n    sentiment_sum = 0\n    positive_sum = 0\n    negative_sum = 0\n    # if it is Chinese token:\n    for word in X["text"].iloc[i]:\n        cat = unicodedata.category(word[0])\n        if not (cat == \'Ll\' or cat == \'Lu\'):\n            sentiment = SnowNLP(word).sentiments\n            sentiment_sum += 1\n            if sentiment>=0.5:\n                positive_sum += 1\n            else:\n                negative_sum += 1\n    if sentiment_sum == 0:\n        X.at[i, "Sentiment"] = 0.5 # neutral \n    else:\n        X.at[i, "Sentiment"] = (positive_sum + negative_sum) / sentiment_sum\n'

In [235]:
X

Unnamed: 0,text,score,ori_chinese_count,Emoji,chinese_count,english_count,number_count,Sentiment
0,"[没想到, 这是, 唯一, 一部, 尊重, 春节, 档, 电影]",4,17,0,16,0,0,0.571120
1,"[负分, 卧槽, 浪费, 时间]",1,12,0,8,0,0,0.335371
2,"[太, 痛苦, 大过年, 一大早, 如坐针毡]",1,16,0,13,0,0,0.439382
3,"[电影院, 没, 开, 空调, 一直, 冻, 我能, 直接, 睡过去]",2,21,0,17,0,0,0.389829
4,"[低俗, 谄媚, 粗制滥造, 起码, 诚意, 没有, 推理, 全靠, 奇情, 轶闻, 笑点,...",2,77,0,62,0,1,0.521645
...,...,...,...,...,...,...,...,...
1595,"[披, 推理, 外衣, 合家欢, 喜剧片, 最后, 强行, 煽情, 似曾相识, 名, 侦探,...",3,38,0,28,0,0,0.670782
1596,"[至少, 暗恋, 强]",3,7,0,5,0,0,0.636980
1597,"[感觉, 差点, 意思, 看出, 四字, 努力, 装酷, 用力, 嘴, 稍稍, 放松, 表情...",2,61,0,40,0,0,0.615506
1598,"[有点, 好笑, 觉得, 没, 好看]",3,12,0,9,0,0,0.481583


In [236]:
#X.to_csv("cleaned_reviews.csv", index=False)
#Y.to_csv("cleaned_label.csv", index=False)

In [237]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)
X_train

Unnamed: 0,text,score,ori_chinese_count,Emoji,chinese_count,english_count,number_count,Sentiment
1467,"[剧作, 流畅, 完成度, 抗癌, 中国, 苦难, 一起, 讲, 讲出, 正, 能量, 摇摆...",3,40,0,25,1,0,0.650652
1327,"[动画片, 大纲, 基本, 狮子王, 不算, 成功]",3,24,0,14,0,0,0.578385
672,"[无语子, 钱]",1,8,0,4,0,0,0.341569
264,"[林一, 赵, 妹妹, 演点, 偶像剧, 真, 养眼]",5,15,0,13,0,0,0.732556
91,[无聊],2,4,0,2,0,0,0.200000
...,...,...,...,...,...,...,...,...
1490,"[科幻片, 经不起, 推敲, 喜剧片, 没, 好笑, 割裂, 用心, 值, 票价]",3,33,0,21,0,0,0.536627
620,"[全员, 演技, 在线]",5,6,1,6,0,0,0.380952
1175,"[没, 笑, 没, 哭, 平庸, 至极]",2,9,0,8,0,0,0.502113
756,"[世纪, 烂片, 全程, 尴尬, 癌, 发作, 搞笑, 梗, 烂死, 拜托, 不能, 航天,...",1,34,0,26,0,0,0.438715


In [238]:
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [239]:
def dummy_fun(doc):
    return doc

tfidf = TfidfVectorizer(tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None, use_idf=True) 


In [245]:
temp_text = list(X_train["text"])
temp_X = tfidf.fit_transform(temp_text)
mydf = pd.DataFrame(temp_X.toarray(), columns=tfidf.get_feature_names())

X_train_tfidf = pd.concat([X_train, mydf], axis=1)
X_train_tfidf = X_train_tfidf.drop("text", axis=1)
print(X_train_tfidf)
X_train_tfidf.to_csv("X_train_tfidf.csv")




      score  ori_chinese_count  Emoji  chinese_count  english_count  \
0         3                 40      0             25              1   
1         3                 24      0             14              0   
2         1                  8      0              4              0   
3         5                 15      0             13              0   
4         2                  4      0              2              0   
...     ...                ...    ...            ...            ...   
1275      3                 33      0             21              0   
1276      5                  6      1              6              0   
1277      2                  9      0              8              0   
1278      1                 34      0             26              0   
1279      5                 69      0             60              0   

      number_count  Sentiment   \t  \r\n   ..  ...   鼻酸   鼾声    齁  齐齐哈尔   龃龉  \
0                0   0.650652  0.0   0.0  0.0  ...  0.0  0.0  0.0  

In [246]:
temp_text = list(X_test["text"])
temp_X = tfidf.transform(temp_text)
mydf = pd.DataFrame(temp_X.toarray(), columns=tfidf.get_feature_names())

X_test_tfidf = pd.concat([X_test, mydf], axis=1)
X_test_tfidf = X_test_tfidf.drop("text", axis=1)
print(X_test_tfidf)
X_test_tfidf.to_csv("X_test_tfidf.csv")




     score  ori_chinese_count  Emoji  chinese_count  english_count  \
0        2                 94      0             62              0   
1        4                 45      0             34              0   
2        1                  7      0              4              0   
3        5                 18      0             16              0   
4        5                110      0             69              0   
..     ...                ...    ...            ...            ...   
315      1                 17      0             14              0   
316      5                  7      0              5              0   
317      3                 35      0             20              0   
318      3                  8      0              6              0   
319      4                299      0            212              0   

     number_count  Sentiment   \t  \r\n   ..  ...   鼻酸   鼾声    齁  齐齐哈尔   龃龉  \
0               0   0.604918  0.0   0.0  0.0  ...  0.0  0.0  0.0   0.0  0.0   
1

In [247]:
y_train.to_csv("y_train.csv")
y_test.to_csv("y_test.csv")