In [1]:
import re
import numpy as np
import pandas as pd
from collections import Counter
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer as TF
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # stopwords.words() re-reads the corpus on each call; cache the result
        # so applying clean_text row-by-row does not repeat that work.
        _STOP_WORDS_CACHE = frozenset(stopwords.words("english"))
    return _STOP_WORDS_CACHE


def clean_text(origin_text):
    """Normalize one raw review into a space-separated string of content words.

    Steps: strip HTML markup, replace every non-ASCII-letter character with a
    space, lowercase, split on whitespace, drop English stop words, and join
    the remaining tokens with single spaces.

    Args:
        origin_text: Raw review text, possibly containing HTML. ``None`` or a
            NaN placeholder (as pandas uses for missing values) is treated as
            an empty review.

    Returns:
        The cleaned text as a ``str``; ``""`` when nothing survives cleaning.
    """
    # Missing values: None, or NaN (the only value that is not equal to itself).
    if origin_text is None or (isinstance(origin_text, float) and origin_text != origin_text):
        return ""
    # Strip HTML tags. The parser is named explicitly so the result does not
    # depend on which optional parser happens to be installed, and a space
    # separator keeps words on either side of a tag (e.g. <br/>) from fusing.
    text = BeautifulSoup(origin_text, "html.parser").get_text(separator=" ")
    # Replace punctuation, digits and any other non-letter character with a space.
    text = re.sub("[^a-zA-Z]", " ", text)
    # Lowercase everything and tokenize on whitespace.
    words = text.lower().split()
    # Drop stop words.
    stop_words = _english_stop_words()
    # Join the surviving tokens back into a single string.
    return " ".join(w for w in words if w not in stop_words)

In [3]:
# Read the training and test splits from JSON (train first, then test),
# then preview the first training rows.
train_df, test_df = map(pd.read_json, ('../data/train.json', '../data/test.json'))
train_df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,23 September 2013,tt1392214,ur38303522,True,The film Prisoners from the outside looked lik...,10,"""Prisoners"" Better than expected"
1,2 October 2006,tt0095327,ur12460427,False,This is an animated story about a victim and h...,10,Great tearjerker
2,2 May 2017,tt5311514,ur75844575,False,I am going to keep this short but your name is...,10,it is amazing
3,22 November 2012,tt0017136,ur29659325,False,I would have given this movie more stars if it...,1,visuals riveting
4,21 December 2004,tt0119094,ur0263096,False,I mean come on. I thought this story was great...,8,Calm Down People - Give This Picture A Break


In [4]:
# Add a cleaned 'text' column to both frames by running clean_text over each
# raw review, then preview the cleaned training text.
train_df['text'] = train_df['review_text'].apply(clean_text)
test_df['text'] = test_df['review_text'].apply(clean_text)
train_df['text'].head()

0    film prisoners outside looked like kidnapped s...
1    animated story victim family ww era keep short...
2    going keep short name amazing art style camera...
3    would given movie stars appropriately cut musi...
4    mean come thought story great cop obviously wo...
Name: text, dtype: object

In [5]:
# Encode the spoiler flag as an integer label: truthy -> 1, falsy -> 0.
train_df['is_spoiler'] = train_df['is_spoiler'].apply(lambda flag: int(bool(flag)))

In [6]:
# Fixed seed so the shuffle below yields the same row order on every fresh run;
# anything later that depends on row order then becomes reproducible too.
RANDOM_STATE = 42

# Shuffle the training rows and renumber the index 0..n-1.
train_df = train_df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Word-level TF-IDF restricted to the 200 highest-frequency terms.
tfidf = TF(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=200)

# Vectorize the data
print("Creating the tfidf vector...\n")
# fit_transform learns the vocabulary/idf and builds the training matrix in a
# single pass instead of tokenizing the whole training set twice.
x_train = tfidf.fit_transform(train_df['text']).toarray()

# The test set is only transformed, using the vocabulary learned from training data.
x_test = tfidf.transform(test_df['text']).toarray()

print(x_train.shape)
print(x_test.shape)

Creating the tfidf vector...

(473913, 200)
(100000, 200)


In [7]:
# Target vector: the spoiler label column; display how many rows fall in each class.
y_train = train_df.loc[:, 'is_spoiler']
y_train.value_counts()

0    349286
1    124627
Name: is_spoiler, dtype: int64

In [8]:
# Fit a logistic-regression classifier (liblinear solver) on the TF-IDF features.
model = LR(solver='liblinear')
model.fit(X=x_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [9]:
# Report the mean accuracy over 10 cross-validation folds.
print("10折交叉验证：")
cv_scores = cross_val_score(model, x_train, y_train, scoring="accuracy", cv=10)
print(cv_scores.mean())

10折交叉验证：
0.7478672275924236


In [10]:
# Predict labels for the test features and write them as headerless "id,pred" rows.
preds = model.predict(x_test)
# NOTE(review): ids are simply row positions 0..n-1 — confirm this matches the
# id scheme the submission format expects.
submission = pd.DataFrame({'id': range(len(preds)), 'pred': preds})
submission.to_csv("../data/ml_submission.csv", index=False, header=False)
submission.head()

Unnamed: 0,id,pred
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
