In [6]:
# 导入所需的package
import seaborn as sns #用于画图
from bs4 import BeautifulSoup #用于爬取arxiv的数据
import re #用于正则表达式，匹配字符串的模式
import requests #用于网络连接，发送网络请求，使用域名获取对应信息
import json #读取数据，我们的数据为json格式的
import pandas as pd #数据处理，数据分析
import matplotlib.pyplot as plt #画图工具

In [10]:
def readArxivFile(path, columns=['id', 'submitter', 'authors', 'title', 'comments', 'journal-ref', 'doi',
       'report-no', 'categories', 'license', 'abstract', 'versions',
       'update_date', 'authors_parsed'], count=None):
    '''
    定义读取文件的函数
        path: 文件路径
        columns: 需要选择的列
        count: 读取行数
    '''
    
    data  = []
    with open(path, 'r') as f: 
        for idx, line in enumerate(f): 
            if idx == count:
                break
                
            d = json.loads(line)
            d = {col : d[col] for col in columns}
            data.append(d)

    data = pd.DataFrame(data)
    return data

data = readArxivFile('arxiv-metadata-oai-2019.json', 
                     ['id', 'title', 'categories', 'abstract'],
                    200000)



In [11]:
print(data.head())

          id                                              title categories  \
0  0704.0297  Remnant evolution after a carbon-oxygen white ...   astro-ph   
1  0704.0342  Cofibrations in the Category of Frolicher Spac...    math.AT   
2  0704.0360  Torsional oscillations of longitudinally inhom...   astro-ph   
3  0704.0525  On the Energy-Momentum Problem in Static Einst...      gr-qc   
4  0704.0535  The Formation of Globular Cluster Systems in M...   astro-ph   

                                            abstract  
0    We systematically explore the evolution of t...  
1    Cofibrations are defined in the category of ...  
2    We explore the effect of an inhomogeneous ma...  
3    This paper has been removed by arXiv adminis...  
4    The most massive elliptical galaxies show a ...  


In [12]:
# 将标题和摘要拼接一起完成分类。
data['text'] = data['title'] + data['abstract']

data['text'] = data['text'].apply(lambda x: x.replace('\n',' '))
data['text'] = data['text'].apply(lambda x: x.lower())
data = data.drop(['abstract', 'title'], axis=1)

# 备注函数
Python lower() 方法转换字符串中所有大写字符为小写

In [13]:
# 多个类别，包含子分类
data['categories'] = data['categories'].apply(lambda x : x.split(' '))
print(data['categories'].head(10))

0    [astro-ph]
1     [math.AT]
2    [astro-ph]
3       [gr-qc]
4    [astro-ph]
5     [nucl-ex]
6    [quant-ph]
7     [math.DG]
8      [hep-ex]
9    [astro-ph]
Name: categories, dtype: object


In [15]:
# 单个类别，不包含子分类 因为这里的.后代表子分类
data['categories_big'] = data['categories'].apply(lambda x : [xx.split('.')[0] for xx in x])
print(data['categories_big'].head(10))

0    [astro-ph]
1        [math]
2    [astro-ph]
3       [gr-qc]
4    [astro-ph]
5     [nucl-ex]
6    [quant-ph]
7        [math]
8      [hep-ex]
9    [astro-ph]
Name: categories_big, dtype: object


In [16]:
# 进行编码
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
data_label = mlb.fit_transform(data['categories_big'].iloc[:])

# 思路1：
思路1使用TFIDF提取特征，限制最多4000个单词
思路1：TF-IDF+机器学习分类器
直接使用TF-IDF对文本提取特征，使用分类器进行分类，分类器的选择上可以使用SVM、LR、XGboost等


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=4000)
data_tfidf = vectorizer.fit_transform(data['text'].iloc[:])

In [20]:
# 划分训练集和验证集
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_tfidf, data_label,
                                                 test_size = 0.2,random_state = 1)

# 构建多标签分类模型
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
clf = MultiOutputClassifier(MultinomialNB()).fit(x_train, y_train)

In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_test, clf.predict(x_test)))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         0
           3       0.91      0.85      0.88      3625
           4       0.00      0.00      0.00         4
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
           8       0.77      0.76      0.77      3801
           9       0.84      0.89      0.86     10715
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00       186
          12       0.44      0.41      0.42      1621
          13       0.00      0.00      0.00         1
          14       0.75      0.59      0.66      1096
          15       0.61      0.80      0.69      1078
          16       0.90      0.19      0.32       242
          17       0.53    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 思路2：FastText
FastText是入门款的词向量，利用Facebook提供的FastText工具，可以快速构建分类器



In [23]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data['text'].iloc[:100000], 
                                                    data_label[:100000],
                                                 test_size = 0.95,random_state = 1)


In [24]:
# parameter
max_features= 500
max_len= 150
embed_size=100
batch_size = 128
epochs = 5

from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence

tokens = Tokenizer(num_words = max_features)
tokens.fit_on_texts(list(data['text'].iloc[:100000]))

y_train = data_label[:100000]
x_sub_train = tokens.texts_to_sequences(data['text'].iloc[:100000])
x_sub_train = sequence.pad_sequences(x_sub_train, maxlen=max_len)

ModuleNotFoundError: No module named 'keras'

# 思路3：WordVec+深度学习分类器
WordVec是进阶款的词向量，并通过构建深度学习分类完成分类。深度学习分类的网络结构可以选择TextCNN、TextRnn或者BiLSTM。

思路4：Bert词向量
Bert是高配款的词向量，具有强大的建模学习能力。