In [1]:
import pandas as pd
from pandas.io.parsers import TextFileReader

# Load the labelled training reviews and the unlabelled test reviews.
# Both files are tab-separated with a header row; quoting=3 is
# csv.QUOTE_NONE, so double quotes are kept as ordinary characters.
train = pd.read_csv(
    r"labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)
test = pd.read_csv(
    r"testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)


In [2]:
# Preview only the first rows instead of dumping the whole DataFrame.
train.head(10)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
5,"""8196_8""",1,"""I dont know why people think this is such a b..."
6,"""7166_2""",0,"""This movie could have been very good, but com..."
7,"""10633_1""",0,"""I watched this video at a friend's house. I'm..."
8,"""319_1""",0,"""A friend of mine bought this film for £1, and..."
9,"""8713_10""",1,"""<br /><br />This movie is full of references...."


In [3]:
import nltk
# Fetch only the stop-word corpus used by this notebook. Calling
# nltk.download() with no arguments opens the interactive downloader, which
# stalls an unattended "Restart & Run All".
nltk.download("stopwords")

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
from nltk.corpus import stopwords
# Membership tests on a set are faster than on a list in Python, so the
# stop-word list is converted to a set once here.
stops = set(stopwords.words("english"))

In [5]:
from bs4 import BeautifulSoup
import re

def review_to_words(raw_review):
    """Convert one raw review into a cleaned, space-separated string of words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case and split on whitespace, drop the words found in the
    module-level ``stops`` set, and join what is left with single spaces.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML tags such as ``<br />``.

    Returns
    -------
    str
        The remaining lower-case words joined by single spaces.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines. "html.parser" ships with Python itself.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Remove everything that is not a letter (punctuation, digits, ...)
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # Lower-case and split into individual words
    words = letters_only.lower().split()
    # Drop stop words
    meaningful_words = [w for w in words if w not in stops]
    # Join the kept words back into one space-separated string
    return " ".join(meaningful_words)

# %% Clean the training set
from tqdm import tqdm
# Total number of reviews, i.e. the number of rows in the training data
num_reviews = train["review"].size
# Clean every review in row order. Iterating the column directly avoids a
# label lookup per row (train["review"][i]), which only works while the
# frame has a default 0..n-1 index and is slow for large frames.
clean_train_reviews = [
    review_to_words(raw_review) for raw_review in tqdm(train["review"])
]

100%|██████████| 25000/25000 [02:56<00:00, 151.03it/s]


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: word-level counts over the 5000 most frequent terms.
# Other settings tried earlier (kept here as a note instead of dead code):
#   analyzer="char",    max_features=5000
#   analyzer="char_wb", max_features=10000
#   analyzer="word",    max_features=10000
# tokenizer, preprocessor and stop_words are left at their default of None.
vectorizer = CountVectorizer(analyzer="word", max_features=5000)

# Learn the vocabulary from the cleaned training reviews and build the dense
# document-term count matrix in one step.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()


In [7]:
# The feature names are the vocabulary terms kept by the vectorizer.
# vocabulary_ maps each term to its column position in the count matrix.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back
# to the old one on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

print(feature_names)
print(vectorizer.vocabulary_)
# The document-term count matrix itself
print(train_data_features)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [8]:
# %% Predict with a naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB as MNB

# Fit on the full training set; this fitted model is what later cells use
# for prediction.
model_NB = MNB()
model_NB.fit(train_data_features, train["sentiment"])

# K-fold cross-validation
from sklearn.model_selection import cross_val_score
import numpy as np

# Mean ROC AUC over 20 folds
score = np.mean(cross_val_score(model_NB,
                                train_data_features, train["sentiment"],
                                cv=20, scoring='roc_auc'))
# Format into a single string: print("...", score) would print a tuple when
# the notebook is run under Python 2.
print("score is: {}".format(score))
# Multinomial naive Bayes, 20-fold cross-validation score: 0.917144192

('score is: ', 0.9171565439999998)


In [None]:
# %% Clean the test set
clean_test_reviews = [
    review_to_words(raw_review) for raw_review in tqdm(test["review"])
]

# Bag of words
from sklearn.feature_extraction.text import CountVectorizer

# Reuse the vectorizer that was fitted on the training reviews and only
# transform() here. Fitting a fresh vectorizer on the test reviews would
# learn a different vocabulary, so column j of the test matrix would no
# longer be the word the model saw in column j during training, and the
# predictions would be meaningless.
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

# Write the naive Bayes predictions, one row per test review id
result = model_NB.predict(test_data_features)
output = pd.DataFrame(data={"id": test["id"], "sentiment": result})
output.to_csv("NB_model.csv", index=False, quoting=3)


 72%|███████▏  | 18084/25000 [02:00<01:05, 106.26it/s]