In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
dataset = pd.read_csv('tagRec.csv', encoding='cp1252', index_col=0)

In [87]:
# dataset2=dataset.sample(n=1000) # 随机选取1000个样本
dataset2 = dataset
dataset2 = dataset2.drop(['Conference/Journal', 'Authors','Year'], axis=1) # 只留下Title和Tag两列

In [88]:
dataset2['Tag'].value_counts()

Database               5059
Visualization          4074
Theory                 3995
Medical Informatics    3066
Data Mining            2270
Name: Tag, dtype: int64

In [89]:
dataset2 = dataset2.replace({'Tag':'Data Mining'},1)
dataset2 = dataset2.replace({'Tag':'Database'},2)
dataset2 = dataset2.replace({'Tag':'Medical Informatics'},3)
dataset2 = dataset2.replace({'Tag':'Theory'},4)
dataset2 = dataset2.replace({'Tag':'Visualization'},5)
dataset2['Tag'].value_counts()

2    5059
5    4074
4    3995
3    3066
1    2270
Name: Tag, dtype: int64

In [102]:
dataset2.tail()

Unnamed: 0,Title,Abstract,Tag
18459,Registration Based on Projective Reconstructio...,"In AR systems, registration is one of the most...",5
18460,Blind Robust Watermarking Schemes for Copyrigh...,"In this paper, two novel methods suitable for ...",5
18461,Topological Lines in 3D Tensor Fields and Disc...,This paper addresses several issues related to...,5
18462,Decorating Surfaces with Bidirectional Texture...,We present a system for decorating arbitrary s...,5
18463,Comparison of Path Visualizations and Cognitiv...,We describe a between-subjects experiment that...,5


In [96]:
titleTf = TfidfVectorizer().fit_transform(dataset2.Title)

In [105]:
abstractTf = TfidfVectorizer().fit_transform(dataset2.Abstract.values.astype('U'))

## 逻辑斯谛回归

In [121]:
import time

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from scipy.sparse import hstack

from sklearn.linear_model import LogisticRegression


time_1 = time.time()


features = hstack((titleTf,abstractTf))
labels = dataset2.Tag.values

# 随机选取33%数据作为测试集，剩余为训练集
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.33, random_state=0)


print('Start training...') 
# multi_class可选‘ovr’, ‘multinomial’，默认为ovr用于二类分类，multinomial用于多类分类
clf = LogisticRegression(max_iter=100,solver='saga',multi_class='multinomial')
clf.fit(train_features,train_labels)
time_2 = time.time()
print('training cost %f seconds' % (time_2 - time_1))


print('Start predicting...')
test_predict = clf.predict(test_features)
time_3 = time.time()
print('predicting cost %f seconds' % (time_3 - time_2))


accuracy = accuracy_score(test_labels, test_predict)
print("The accruacy score is %f" % accuracy)
precision = precision_score(test_labels, test_predict, average='macro') # MAP
print("The precision score is %f" % precision)
recall = recall_score(test_labels, test_predict, average='macro')
print("The recall score is %f" % recall)
f1 = f1_score(test_labels, test_predict, average='macro')
print("The f1 score is %f" % score)

Start training...
training cost 2.367303 seconds
Start predicting...
predicting cost 0.005853 seconds
The accruacy score is 0.880538
The precision score is 0.873744
The recall score is 0.861293
The f1 score is 0.829012


## 朴素贝叶斯

In [122]:
from sklearn.naive_bayes import MultinomialNB

print('Start training...')
clf = MultinomialNB(alpha=1.0) # 加入laplace平滑
clf.fit(train_features, train_labels)

print('Start predicting...')
test_predict = clf.predict(test_features)


accuracy = accuracy_score(test_labels, test_predict)
print("The accruacy score is %f" % accuracy)
precision = precision_score(test_labels, test_predict, average='macro')
print("The precision score is %f" % precision)
recall = recall_score(test_labels, test_predict, average='macro')
print("The recall score is %f" % recall)
f1 = f1_score(test_labels, test_predict, average='macro')
print("The f1 score is %f" % score)

Start training...
Start predicting...
The accruacy score is 0.829012
The precision score is 0.856992
The recall score is 0.774882
The f1 score is 0.829012
