# In [None]:
import numpy as np
import random
import pandas as pd
import sklearn
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Load the training and test datasets.
# NOTE(review): the training path is absolute ('/train_set.csv') while the
# test path is relative ('test_set.csv') -- confirm both are intended.
train_data = pd.read_csv(r'/train_set.csv')
test_data = pd.read_csv(r'test_set.csv')

# Extract the columns used downstream as plain Python lists.
seg_train = train_data['word_seg'].tolist()
label_train = train_data['class'].tolist()
seg_test = test_data['word_seg'].tolist()
id_test = test_data['id'].tolist()

# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vocabulary and weights to transform the test text.
vectorizer = TfidfVectorizer(max_df=0.6)
tdm_train = vectorizer.fit_transform(seg_train)  # training set
tdm_test = vectorizer.transform(seg_test)  # test set

# Convert raw documents into the tagged format consumed by gensim's Doc2Vec.
def labelizeReviews(reviews, label_type):
    """Wrap each review in a ``TaggedDocument`` tagged ``'<label_type>_<index>'``.

    Args:
        reviews: Iterable of documents. Each document is either a
            whitespace-separated string of tokens or an already tokenised
            sequence of tokens.
        label_type: Prefix used for every document tag (e.g. ``'train'``).

    Returns:
        A list of ``gensim.models.doc2vec.TaggedDocument`` objects, one per
        review, in input order. An empty input yields an empty list.
    """
    def _tokens(review):
        # Doc2Vec expects a list of tokens; a bare string would be consumed
        # one character at a time.
        return review.split() if isinstance(review, str) else list(review)

    # TaggedDocument replaces the deprecated LabeledSentence alias, which is
    # no longer available in current gensim releases.
    return [
        gensim.models.doc2vec.TaggedDocument(
            _tokens(review), ['%s_%s' % (label_type, i)])
        for i, review in enumerate(reviews)
    ]

seg_train = labelizeReviews(seg_train, 'train')
seg_test = labelizeReviews(seg_test, 'test')

# Train the Doc2Vec model (dm=0 selects the PV-DBOW training algorithm).
model = gensim.models.doc2vec.Doc2Vec(
    seg_train, min_count=1, window=5, vector_size=100, sample=1e-3,
    negative=5, workers=3, dm=0)

# Run additional training passes, reshuffling the documents before each one.
# A copy is shuffled so that seg_train keeps its original order and therefore
# stays row-aligned with label_train.
# NOTE(review): gensim's documentation advises a single train() call over a
# manual loop -- consider raising `epochs` in the constructor instead.
shuffled_docs = list(seg_train)
for epoch in range(10):
    random.shuffle(shuffled_docs)
    # `model.epochs` is the counterpart of the `vector_size`-era API; the old
    # `model.iter` attribute is not available there.
    model.train(shuffled_docs, total_examples=len(shuffled_docs),
                epochs=model.epochs)

# Build the document-vector matrices.
# The trained vectors live in `model.dv` (gensim >= 4) or `model.docvecs`
# (older releases); the container is indexed by tag, not called.
doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
train_matrix = np.array([doc_vectors[doc.tags[0]] for doc in seg_train])
# Test documents were never part of training, so they have no stored vector
# and must be inferred from their tokens.
test_matrix = np.array([model.infer_vector(doc.words) for doc in seg_test])

# Train a logistic-regression model on the Doc2Vec document vectors.
# The dual formulation is only implemented by the liblinear solver, so the
# solver is requested explicitly instead of relying on the library default.
lr_clf = LogisticRegression(C=4, dual=True, solver='liblinear')
lr_clf.fit(train_matrix, label_train)

# Train an SVM on the TF-IDF features. It keeps the name `clf`, which is the
# classifier the original script ended up predicting with.
clf = SVC(decision_function_shape='ovo')
clf.fit(tdm_train, label_train)

# Predict with the SVM on the TF-IDF test matrix, i.e. the same feature space
# it was fitted on (the Doc2Vec matrix has a different number of columns).
# NOTE(review): lr_clf is fitted but not used for the exported prediction;
# use `lr_clf.predict(test_matrix)` here if the LR output is the one wanted.
prediction = clf.predict(tdm_test)

# Show the predicted labels.
print(prediction)

# Assemble the result table: one predicted class per test id.
df = pd.DataFrame({'class': prediction})
df['id'] = id_test

# Export to CSV with the columns ordered id, class and no index column.
submission = df[['id', 'class']]
submission.to_csv(r'/sub.csv', index=None)
