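Use machine learning (TF-IDF features over word n-grams fed to a linear SVM) to predict which arXiv categories (atom-ph, quant-ph, cond-mat, quant-gas, hep-th, hep-ex) an abstract belongs to. Unlike the journal classifier (Nature/Science vs. PRL), this one is multi-label and is trained only on articles whose journal reference is Phys. Rev. Lett.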

In [1]:
# Need to add the parent directory to sys.path to find 'metadataDB'
import sys
sys.path.append('../')

%matplotlib inline
import matplotlib.pyplot as plt 
import time
import numpy as np
import re
from itertools import combinations
import json

# Natural language processing toolkit
# To use this, run nltk.download() and download 'stopwords'
# from nltk.corpus import stopwords
# s=stopwords.words('english') + ['']

# Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# SQL
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from metadataDB.declareDatabase import *
from sqlalchemy import or_, and_

# Open the SQLite metadata database. The path is relative to the notebook's
# working directory (one level up); echo=False keeps SQLAlchemy from logging
# every emitted SQL statement.
engine = create_engine("sqlite:///../arXiv_metadata.db", echo=False)
# Bind the declarative metadata to this engine.
# NOTE(review): `Base` presumably comes from the star import of
# metadataDB.declareDatabase -- confirm.
Base.metadata.bind = engine
# Session factory plus the single module-level session used by the queries
# in the cells below.
DBsession = sessionmaker(bind=engine)
session = DBsession()

In [2]:
# Category name fragments used as the classification labels. get_abstracts
# matches each one as a substring of Category.name, so a broad fragment such
# as 'cond-mat' also matches any longer category name that contains it.
categories = ['atom-ph', 'quant-ph', 'cond-mat', 'quant-gas', 'hep-th', 'hep-ex']
# categories = ['atom-ph', 'quant-ph']

# Journal key -> list of patterns with a trailing '%' wildcard.
# NOTE(review): presumably meant for Article.journal_ref.like(...); nothing in
# this notebook reads the dict yet -- confirm against the other notebooks.
#
# The journals are called "Physical Review (Letters)", so the correctly
# spelled patterns are listed first. The original "Physics Review ..."
# spellings are kept so that existing users of this dict see a superset.
#
# Caveats visible from the patterns themselves:
#   * the 'PR' patterns are prefixes of the 'PRL' ones (likewise 'AP' vs 'APL'),
#     so the broader key also matches the letters journal;
#   * the 'Nature' key also covers 'Science%'.
journals_dict = {'PRL': ['Physical Review Letters%',
                         'Physics Review Letters%',
                         'Phys. Rev. Lett.%',
                         'Phys.Rev.Lett.%',
                         'PRL%'],
                 'PR':  ['Physical Review%',
                         'Physics Review%',
                         'Phys. Rev.%',
                         'Phys.Rev.%',
                         'PR%'],
                 'Nature': ['Nature%',
                            'Nat.%',
                            'Science%'],
                 'APL': ['APL%',
                         'Appl.Phys.Lett.%',
                         'Appl. Phys. Lett.%',
                         'Applied Physics Letters%'],
                 'AP': ['AP%',
                        'Appl.Phys.%',
                        'Appl. Phys.%',
                        'Applied Physics%'],
                 'PL': ['Physics Letters%',
                        'Phys. Lett.%',
                        'Phys.Lett.%'],
                 'All': ['%'],
                 }


In [3]:
def get_abstracts(category, journal_pattern='Phys.Rev.Lett.%'):
    """Build the query selecting one category's articles from one journal.

    Despite the name, this returns the query itself rather than a list of
    abstract strings. Iterating it yields ``Article_Category`` rows; callers
    read the text through ``row.article.abstract``.

    Parameters
    ----------
    category : str
        Fragment of a category name. It is matched anywhere inside
        ``Category.name`` (SQL ``LIKE '%<category>%'``).
    journal_pattern : str, optional
        SQL ``LIKE`` pattern applied to ``Article.journal_ref``. The default,
        ``'Phys.Rev.Lett.%'``, is the value that used to be hard-coded, so
        existing one-argument calls behave exactly as before.

    Returns
    -------
    The SQLAlchemy query object (not yet iterated) built on the module-level
    ``session``.
    """
    query = session.query(Article_Category)\
                    .join(Category)\
                    .join(Article)\
                    .filter(Category.name.like('%' + category + '%'),
                            Article.journal_ref.like(journal_pattern))

    # Don't need to clean up text: CountVectorizer will do everything
    return query


# def learn(journal, categories):
#     abstracts1 = get_abstracts(journals[0], category)
#     abstracts2 = get_abstracts(journals[1], category)
    
#     half_test_size = int(round(0.2*min(len(abstracts1),len(abstracts2))))
# #     print half_test_size

#     X1_train, X1_test, y1_train, y1_test = train_test_split(abstracts1, [0]*len(abstracts1), test_size=half_test_size, random_state=42)
#     X2_train, X2_test, y2_train, y2_test = train_test_split(abstracts2, [1]*len(abstracts2), test_size=half_test_size, random_state=42)

#     X_train = X1_train + X2_train
#     X_test = X1_test + X2_test
#     y_train = np.array(y1_train + y2_train)
#     y_test = np.array(y1_test + y2_test)
#     target_names = journals
    
    
#     clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
#                     ('tfidf', TfidfTransformer()),
#                     ('clf', OneVsRestClassifier(
#                                 LinearSVC(C=1,penalty='l1',dual=False,fit_intercept=True)))])
#     transform = clf.fit_transform(X_train, y_train)
#     y_predict_train = clf.predict(X_train)
#     y_predict_test = clf.predict(X_test)
    
#     X_train_tfidf = clf.named_steps['tfidf'].transform(
#                         clf.named_steps['vect'].transform(X_train))
    
#     print (metrics.classification_report(y_test, y_predict_test,
#                                     target_names=target_names))
# #                                     target_names=test_target_names))
#     print metrics.confusion_matrix(y_test, y_predict_test)
#     print 'Accuracy: %f' % (metrics.accuracy_score(y_test, y_predict_test))

# #     most_important_words = clf.named_steps['clf'].coef_.argsort()[:, ::-1]

# #     print np.squeeze(X_train_tfidf[y_train==0, 3].toarray()).shape
# #     print np.squeeze(X_train_tfidf[y_train==0, 3].toarray()).shape
    
    
# #     terms =  clf.named_steps['vect'].get_feature_names()
# #     result = [{'name': terms[word],
# #                'value': clf.named_steps['clf'].coef_[0,word],
# #                'vector1': np.squeeze(X_train_tfidf[y_train==0, word].toarray()).tolist(),
# #                'vector2': np.squeeze(X_train_tfidf[y_train==1, word].toarray()).tolist(),
# #               }
# #                 for word in (np.concatenate((most_important_words[0, :15],
# #                              most_important_words[0, -15:]))) ]
#     return clf



In [4]:
# Some abstracts belong to several categories. Index everything by article id
# so that each abstract appears once, paired with the list of all of its labels.
start = time.time()

# One query per category, in the same order as `categories`.
query_list = [get_abstracts(category) for category in categories]

abstract_dict = {}   # article id -> abstract text
category_dict = {}   # article id -> list of category labels
category_to_number = {name: index for index, name in enumerate(categories)}

for category, category_query in zip(categories, query_list):
    for row in category_query:
        article = row.article
        abstract_dict[article.id] = article.abstract
        # Start a new label list the first time an article id is seen.
        category_dict.setdefault(article.id, []).append(category)
keys = abstract_dict.keys()

# Both lists walk the same, unmodified key collection, so X_train[i] and
# Y_train_tmp[i] always describe the same article.
X_train = [abstract_dict[article_id] for article_id in keys]
Y_train_tmp = [category_dict[article_id] for article_id in keys]

# With 0.17, we have to use MultiLabelBinarizer
# Passing classes=categories fixes the column order of the indicator matrix.
label_binarizer = MultiLabelBinarizer(classes=categories)
Y_train = label_binarizer.fit_transform(Y_train_tmp)

print (time.time() - start)

21.7131679058


In [5]:
start = time.time()

# Per-label model: word 1- to 3-grams -> TF-IDF weights -> L1-penalised linear SVM.
category_pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1,penalty='l1',dual=False,fit_intercept=True)),
])

# NOTE(review): the whole pipeline is wrapped in OneVsRestClassifier, so the
# vectorizer and TF-IDF steps are presumably re-fitted once per label.
# Wrapping only the LinearSVC step would likely give the same predictions
# with less work -- confirm nothing downstream relies on the current object
# layout before changing it.
clf_category = OneVsRestClassifier(category_pipeline)
clf_category.fit(X_train, Y_train)

print (time.time() - start)

21.4936468601


In [6]:
# Spot check: classify a single hand-picked abstract.
sample_abstract = 'We measure the mass, gap, and magnetic moment of a magnon in the ferromagnetic F=1 spinor Bose-Einstein condensate of 87Rb. We find an unusually heavy magnon mass of 1.038(2)stat(8)sys times the atomic mass, as determined by interfering standing and running coherent magnon waves within the dense and trapped condensed gas. This measurement is shifted significantly from theoretical estimates. The magnon energy gap of h×2.5(1)stat(2)sysHz and the effective magnetic moment of −1.04(2)stat(8)μbare times the atomic magnetic moment are consistent with mean-field predictions. The nonzero energy gap arises from magnetic dipole-dipole interactions.'
prediction = clf_category.predict([sample_abstract])

# Translate the predicted indicator row back into category names; as the
# last expression of the cell, the result is what gets displayed.
label_binarizer.inverse_transform(prediction)

[('cond-mat', 'hep-ex')]

In [7]:
# print metrics.classification_report(Y_train, clf.predict(X_train))
#                                     target_names=test_target_names))
# print metrics.confusion_matrix(y_test, y_predict_test)
# print 'Accuracy: %f' % (metrics.accuracy_score(y_test, y_predict_test))

In [8]:
start = time.time()

# Store the fitted classifier together with its label binarizer in one file,
# so that whoever loads it can turn predictions back into category names.
model_bundle = (clf_category, label_binarizer)
joblib.dump(model_bundle, 'svm_category.pkl', compress=1)

print (time.time() - start)

72.4252691269


In [9]:
# Write the category labels, in the order used for training, to a JSON file.
# The file is opened in text mode: json.dump writes str objects, which a
# binary-mode ('wb') handle rejects with a TypeError on Python 3. Text mode
# works on Python 2 as well.
with open('category_list.json', 'w') as f:
    category_list = categories
    json.dump(category_list, f)