Use fancy machine learning to predict whether an article makes it into Nature/Science or PRL. This time we'll only look at articles in the physics.atom-ph section.

In [1]:
#Need to add parent directoy to sys.path to find 'metadataDB'
import sys
sys.path.append('../')

%matplotlib inline
import matplotlib.pyplot as plt 
import time
import numpy as np
import re
from itertools import combinations
import json

# Natural language processing toolkit
# To use this, run nltk.download() and download 'stopwords'
# from nltk.corpus import stopwords
# s=stopwords.words('english') + ['']

# Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# SQL
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from metadataDB.declareDatabase import *
from sqlalchemy import or_, and_

engine = create_engine("sqlite:///../arXiv_metadata.db", echo=False)
Base.metadata.bind = engine
DBsession = sessionmaker(bind=engine)
session = DBsession()

In [2]:
categories = ['astro-ph', 'atom-ph', 'cond-mat', 'hep-th', 'hep-ex', 'quant-ph']
# categories = ['atom-ph', 'quant-ph']
journals_dict = {'PRL': ['Physics Review Letters%',
                    'Phys. Rev. Lett.%',
                    'Phys.Rev.Lett.%',
                    'PRL%'],
                 'PR':  ['Physics Review%',
                         'Phys. Rev.%',
                         'Phys.Rev.%',
                         'PR%'],
                 'Nature': ['Nature%',
                            'Nat.%',
                            'Science%'],
                 'APL': ['APL%',
                         'Appl.Phys.Lett.%',
                         'Appl. Phys. Lett.%',
                         'Applied Physics Letters%'],
                 'AP': ['AP%',
                        'Appl.Phys.%',
                        'Appl. Phys.%',
                        'Applied Physics%'],
                 'PL': ['Physics Letters%',
                        'Phys. Lett.%',
                        'Phys.Lett.%'],
                 'All': ['%'],
                 }


In [3]:
def get_abstracts(category):
    query = session.query(Article_Category)\
                    .join(Category)\
                    .join(Article)\
                    .filter(Category.name.like('%' + category + '%'),
                            or_(*[Article.journal_ref.like(x)
                                  for x in journals_dict['Nature']+journals_dict['PR']]))
#                                   for x in journals_dict['Nature']+journals_dict['PRL']]))

                
    # Don't need to clean up text: CountVectorizer will do everything
    return query



In [4]:
# Some abstract have multiple categories. I'll make a dict based on the article
# id number to link these labels.
# 
# Train with at least 500 abstracts per category, so as not to overlook
# a small category (like atom-ph). At least because many categories are
# multiply listed

start = time.time()

# These numbers are based on the 2401 atom-ph articles in the database.
articles_per_category_train = 2000
articles_per_category_test = 400

query_list = []
for category in categories:
    current_abstracts = get_abstracts(category)
    query_list.append(get_abstracts(category))
#     break
    
abstract_dict_test = dict()
category_dict_test = dict()
category_to_number_test = dict(zip(categories, range(0, len(categories))))

abstract_dict_train = dict()
category_dict_train = dict()
category_to_number_train = dict(zip(categories, range(0, len(categories))))

for q, category in zip(query_list, categories):
    q = q.all() # this line really speeds things up!
    ind = np.random.choice(len(q),
                              articles_per_category_train+articles_per_category_test,
                              replace=False)
    
    # Training set
    for i in ind[:articles_per_category_train]:
#     for x in q:
        x = q[i]
        abstract_dict_train[x.article.id] = x.article.abstract
        try:
            category_dict_train[x.article.id].append(category)
        except KeyError:
            category_dict_train[x.article.id] = [category]
    
    # Testing set
    for i in ind[articles_per_category_train:]:
        x = q[i]
        abstract_dict_test[x.article.id] = x.article.abstract
        try:
            category_dict_test[x.article.id].append(category)
        except KeyError:
            category_dict_test[x.article.id] = [category]
    
    print (category, (time.time() - start), len(q))
    
keys_train = abstract_dict_train.keys()
keys_test = abstract_dict_test.keys()

label_binarizer = MultiLabelBinarizer(classes=categories)


X_train = [abstract_dict_train[key] for key in keys_train]
Y_train = label_binarizer.fit_transform([ category_dict_train[key] for key in keys_train])

X_test = [abstract_dict_test[key] for key in keys_test]
Y_test = label_binarizer.fit_transform([ category_dict_test[key] for key in keys_test])

print (time.time() - start)

('astro-ph', 3.97485613822937, 13027)
('atom-ph', 7.043087005615234, 2404)
('cond-mat', 12.409974098205566, 66018)
('hep-th', 16.21687912940979, 13135)
('hep-ex', 19.63132905960083, 6156)
('quant-ph', 23.21168613433838, 14844)
23.2522759438


In [5]:
start = time.time()

clf_category = OneVsRestClassifier(Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                    ('tfidf', TfidfTransformer()),
                    ('clf', LinearSVC(C=10,penalty='l1',dual=False,fit_intercept=True))]))
clf_category.fit(X_train, Y_train)

print (time.time() - start)

132.589030981


In [6]:
# To save memory, I will retrain the countvectorizer on only the nonzero entries to the coefficient matrix.

In [7]:
start = time.time()
useful_words = []

for x in clf_category.estimators_:
#     print x
    nonzero_coefs = np.nonzero(np.squeeze(x.named_steps['clf'].coef_))[0]
    useful_words.extend(np.array(x.named_steps['vect'].get_feature_names())[nonzero_coefs])

useful_words = list(set(useful_words))
print len(useful_words)
print (time.time() - start)

9863
27.0100297928


In [8]:
start = time.time()

clf_category2 = OneVsRestClassifier(Pipeline([('vect', CountVectorizer(ngram_range=(1,3), vocabulary=useful_words)),
                    ('tfidf', TfidfTransformer()),
                    ('clf', LinearSVC(C=10,penalty='l1',dual=False,fit_intercept=True))]))
clf_category2.fit(X_train, Y_train)

print (time.time() - start)

54.9926068783


In [9]:
# Get a classification report for each category.
def classification_report(clf):
    Y_pred = clf.predict(X_test)

    for i, category in enumerate(categories):
        print metrics.classification_report(Y_test[:,i], Y_pred[:,i], target_names=['not '+category, category])
        print metrics.accuracy_score(Y_test[:,i], Y_pred[:,i])

In [10]:
classification_report(clf_category)

              precision    recall  f1-score   support

not astro-ph       0.94      0.94      0.94      1987
    astro-ph       0.69      0.72      0.71       400

 avg / total       0.90      0.90      0.90      2387

0.899036447424
             precision    recall  f1-score   support

not atom-ph       0.96      0.94      0.95      1987
    atom-ph       0.71      0.78      0.74       400

avg / total       0.91      0.91      0.91      2387

0.909509844994
              precision    recall  f1-score   support

not cond-mat       0.95      0.94      0.94      1987
    cond-mat       0.71      0.74      0.73       400

 avg / total       0.91      0.91      0.91      2387

0.906996229577
             precision    recall  f1-score   support

 not hep-th       0.94      0.93      0.93      1987
     hep-th       0.66      0.69      0.67       400

avg / total       0.89      0.89      0.89      2387

0.888563049853
             precision    recall  f1-score   support

 not hep-ex       

In [11]:
classification_report(clf_category2)

              precision    recall  f1-score   support

not astro-ph       0.93      0.94      0.94      1987
    astro-ph       0.69      0.68      0.68       400

 avg / total       0.89      0.90      0.90      2387

0.895684960201
             precision    recall  f1-score   support

not atom-ph       0.95      0.95      0.95      1987
    atom-ph       0.74      0.73      0.74       400

avg / total       0.91      0.91      0.91      2387

0.912023460411
              precision    recall  f1-score   support

not cond-mat       0.94      0.95      0.94      1987
    cond-mat       0.73      0.69      0.71       400

 avg / total       0.90      0.91      0.90      2387

0.905739421868
             precision    recall  f1-score   support

 not hep-th       0.93      0.94      0.94      1987
     hep-th       0.69      0.64      0.66       400

avg / total       0.89      0.89      0.89      2387

0.891914537076
             precision    recall  f1-score   support

 not hep-ex       

In [12]:
current_abstract = ['We measure the mass, gap, and magnetic moment of a magnon in the ferromagnetic F=1 spinor Bose-Einstein condensate of 87Rb. We find an unusually heavy magnon mass of 1.038(2)stat(8)sys times the atomic mass, as determined by interfering standing and running coherent magnon waves within the dense and trapped condensed gas. This measurement is shifted significantly from theoretical estimates. The magnon energy gap of h×2.5(1)stat(2)sysHz and the effective magnetic moment of −1.04(2)stat(8)μbare times the atomic magnetic moment are consistent with mean-field predictions. The nonzero energy gap arises from magnetic dipole-dipole interactions.']
print label_binarizer.inverse_transform(clf_category.predict(current_abstract))
print label_binarizer.inverse_transform(clf_category2.predict(current_abstract))


[('atom-ph',)]
[('atom-ph',)]


In [None]:
current_abstract = ['Ultracold gases promise access to many-body quantum phenomena at convenient length and time scales. However, it is unclear whether the entropy of these gases is low enough to realize many phenomena relevant to condensed matter physics, such as quantum magnetism. Here we report reliable single-shot temperature measurements of a degenerate 87Rb gas by imaging the momentum distribution of thermalized magnons, which are spin excitations of the atomic gas. We record average temperatures as low as 0.022(1)stat(2)sys times the Bose-Einstein condensation temperature, indicating an entropy per particle, S/N≈0.001kB at equilibrium, that is well below the critical entropy for antiferromagnetic ordering of a Bose-Hubbard system. The magnons themselves can reduce the temperature of the system by absorbing energy during thermalization and by enhancing evaporative cooling, allowing low-entropy gases to be produced within deep traps.']
print label_binarizer.inverse_transform(clf_category.predict(current_abstract))
print label_binarizer.inverse_transform(clf_category2.predict(current_abstract))

[()]
[()]


In [None]:
start = time.time()

joblib.dump((clf_category, label_binarizer), 'svm_category_old.pkl', compress=1)

print (time.time() - start)

In [None]:
start = time.time()

joblib.dump((clf_category2, label_binarizer), 'svm_category.pkl', compress=1)

print (time.time() - start)

In [None]:
with open('category_list.json', 'wb') as f:
    category_list = categories
    json.dump(category_list, f)