## 读取数据

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
dataset = pd.read_csv('data/TPA2.csv', encoding='cp1252', index_col=0)

In [4]:
dataset.tail()

Unnamed: 0,Title,Abstract,Tag
18459,Registration Based on Projective Reconstructio...,"In AR systems, registration is one of the most...",5
18460,Blind Robust Watermarking Schemes for Copyrigh...,"In this paper, two novel methods suitable for ...",5
18461,Topological Lines in 3D Tensor Fields and Disc...,This paper addresses several issues related to...,5
18462,Decorating Surfaces with Bidirectional Texture...,We present a system for decorating arbitrary s...,5
18463,Comparison of Path Visualizations and Cognitiv...,We describe a between-subjects experiment that...,5


## 训练模型

In [5]:
titleTf = TfidfVectorizer().fit_transform(dataset.Title)

In [6]:
abstractTf = TfidfVectorizer().fit_transform(dataset.Abstract.values.astype('U'))

In [8]:
import time

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack

features = hstack((titleTf,abstractTf))
labels = dataset.Tag.values

# 随机选取33%数据作为测试集，剩余为训练集
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=0)

## 逻辑斯谛回归

In [10]:
from sklearn.linear_model import LogisticRegression


time_1 = time.time()
print('Start training...') 
# multi_class可选‘ovr’, ‘multinomial’，默认为ovr用于二类分类，multinomial用于多类分类
clf = LogisticRegression(max_iter=100,solver='saga',multi_class='multinomial')
clf.fit(train_features,train_labels)
time_2 = time.time()
print('training cost %f seconds' % (time_2 - time_1))


print('Start predicting...')
test_predict = clf.predict(test_features)
time_3 = time.time()
print('predicting cost %f seconds' % (time_3 - time_2))


accuracy = accuracy_score(test_labels, test_predict)
print("The accruacy score is %f" % accuracy)
precision = precision_score(test_labels, test_predict, average='macro') # MAP
print("The precision score is %f" % precision)
recall = recall_score(test_labels, test_predict, average='macro')
print("The recall score is %f" % recall)
f1 = f1_score(test_labels, test_predict, average='macro')
print("The f1 score is %f" % f1)

Start training...
training cost 2.402528 seconds
Start predicting...
predicting cost 0.005821 seconds
The accruacy score is 0.880538
The precision score is 0.873744
The recall score is 0.861293
The f1 score is 0.866540


## 朴素贝叶斯

In [11]:
from sklearn.naive_bayes import MultinomialNB

print('Start training...')
clf = MultinomialNB(alpha=1.0) # 加入laplace平滑
clf.fit(train_features, train_labels)

print('Start predicting...')
test_predict = clf.predict(test_features)


accuracy = accuracy_score(test_labels, test_predict)
print("The accruacy score is %f" % accuracy)
precision = precision_score(test_labels, test_predict, average='macro')
print("The precision score is %f" % precision)
recall = recall_score(test_labels, test_predict, average='macro')
print("The recall score is %f" % recall)
f1 = f1_score(test_labels, test_predict, average='macro')
print("The f1 score is %f" % f1)

Start training...
Start predicting...
The accruacy score is 0.829012
The precision score is 0.856992
The recall score is 0.774882
The f1 score is 0.788010


## 决策树-CART算法

In [10]:
from sklearn.tree import DecisionTreeClassifier

print('Start training...') 
# criterion可选‘gini’, ‘entropy’，默认为gini(对应CART算法)，entropy为信息增益（对应ID3算法）
clf = DecisionTreeClassifier(criterion='gini') 
clf.fit(train_features,train_labels)

print('Start predicting...')
test_predict = clf.predict(test_features)

score = accuracy_score(test_labels, test_predict)
print("The accruacy score is %f" % score)
precision = precision_score(test_labels, test_predict, average='macro')
print("The precision score is %f" % precision)
recall = recall_score(test_labels, test_predict, average='macro')
print("The recall score is %f" % recall)
f1 = f1_score(test_labels, test_predict, average='macro')
print("The f1 score is %f" % f1)

Start training...
Start predicting...
The accruacy score is 0.688054
The precision score is 0.673339
The recall score is 0.671797
The f1 score is 0.688054


## 决策树-ID3算法

In [12]:
from sklearn.tree import DecisionTreeClassifier

print('Start training...') 
# criterion可选‘gini’, ‘entropy’，默认为gini(对应CART算法)，entropy为信息增益（对应ID3算法）
clf = DecisionTreeClassifier(criterion='entropy') 
clf.fit(train_features,train_labels)

print('Start predicting...')
test_predict = clf.predict(test_features)

score = accuracy_score(test_labels, test_predict)
print("The accruacy score is %f" % score)
precision = precision_score(test_labels, test_predict, average='macro')
print("The precision score is %f" % precision)
recall = recall_score(test_labels, test_predict, average='macro')
print("The recall score is %f" % recall)
f1 = f1_score(test_labels, test_predict, average='macro')
print("The f1 score is %f" % f1)

Start training...
Start predicting...
The accruacy score is 0.669019
The precision score is 0.653375
The recall score is 0.651678
The f1 score is 0.669019


## AdaBoost

In [13]:
from sklearn.ensemble import AdaBoostClassifier

print('Start training...') 
# n_estimators表示要组合的弱分类器个数；
# algorithm可选{‘SAMME’, ‘SAMME.R’}，默认为‘SAMME.R’，表示使用的是real boosting算法，‘SAMME’表示使用的是discrete boosting算法
clf = AdaBoostClassifier(n_estimators=100,algorithm='SAMME.R')
clf.fit(train_features,train_labels)

print('Start predicting...')
test_predict = clf.predict(test_features)

score = accuracy_score(test_labels, test_predict)
print("The accruacy score is %f" % score)
precision = precision_score(test_labels, test_predict, average='macro')
print("The precision score is %f" % precision)
recall = recall_score(test_labels, test_predict, average='macro')
print("The recall score is %f" % recall)
f1 = f1_score(test_labels, test_predict, average='macro')
print("The f1 score is %f" % f1)

Start training...
Start predicting...
The accruacy score is 0.739252
The precision score is 0.744832
The recall score is 0.720623
The f1 score is 0.739252


## SVM

In [14]:
from sklearn import svm

print('Start training...')
clf = svm.SVC()  # svm class   
clf.fit(train_features, train_labels)  # training the svc model 

print('Start predicting...')
test_predict=clf.predict(test_features)

score = accuracy_score(test_labels, test_predict)
print("The accruacy score is %f" % score)
precision = precision_score(test_labels, test_predict, average='macro')
print("The precision score is %f" % precision)
recall = recall_score(test_labels, test_predict, average='macro')
print("The recall score is %f" % recall)
f1 = f1_score(test_labels, test_predict, average='macro')
print("The f1 score is %f" % f1)

Start training...
Start predicting...
The accruacy score is 0.269117
The precision score is 0.053823
The recall score is 0.200000
The f1 score is 0.269117


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
