Use fancy machine learning to predict which arXiv category (atom-ph, chem-ph, cond-mat, hep-th, hep-ex, quant-ph) an article belongs to, based on its abstract. The training set contains only articles whose journal reference matches Nature/Science or PRL.

In [20]:
# Need to add the parent directory to sys.path to find 'metadataDB'
import sys
sys.path.append('../')

%matplotlib inline
import matplotlib.pyplot as plt 
import time
import numpy as np
import re
from itertools import combinations
import json

# Natural language processing toolkit
# To use this, run nltk.download() and download 'stopwords'
# from nltk.corpus import stopwords
# s=stopwords.words('english') + ['']

# Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# SQL
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from metadataDB.declareDatabase import *
from sqlalchemy import or_, and_

# Open the SQLite metadata database at the relative path ../arXiv_metadata.db;
# echo=False keeps SQLAlchemy from logging every SQL statement it emits.
engine = create_engine("sqlite:///../arXiv_metadata.db", echo=False)
# Attach the engine to Base's metadata (Base presumably comes from the
# star-import of metadataDB.declareDatabase -- TODO confirm).
Base.metadata.bind = engine
# Session factory bound to the engine, plus the single session object that the
# query code in this notebook uses.
DBsession = sessionmaker(bind=engine)
session = DBsession()

In [21]:
# arXiv category name fragments; they are used both to select articles
# (substring match on the category name) and as the classifier's labels.
categories = ['atom-ph', 'chem-ph', 'cond-mat', 'hep-th', 'hep-ex', 'quant-ph']
# categories = ['atom-ph', 'quant-ph']

# SQL LIKE patterns (prefix matches, '%' is the wildcard) applied to an
# article's journal_ref, grouped by journal family.
# The journals are titled "Physical Review (Letters)"; the original
# "Physics Review..." spellings are kept for backward compatibility, but the
# correct titles are listed as well so fully written-out references match.
# Note that some groups overlap by prefix: 'PR%' also matches 'PRL...',
# 'Phys. Rev.%' also matches 'Phys. Rev. Lett. ...', and 'AP%' also matches
# 'APL...'. The 'Nature' group deliberately includes 'Science%'.
journals_dict = {'PRL': ['Physical Review Letters%',
                         'Physics Review Letters%',
                         'Phys. Rev. Lett.%',
                         'Phys.Rev.Lett.%',
                         'PRL%'],
                 'PR':  ['Physical Review%',
                         'Physics Review%',
                         'Phys. Rev.%',
                         'Phys.Rev.%',
                         'PR%'],
                 'Nature': ['Nature%',
                            'Nat.%',
                            'Science%'],
                 'APL': ['APL%',
                         'Appl.Phys.Lett.%',
                         'Appl. Phys. Lett.%',
                         'Applied Physics Letters%'],
                 'AP': ['AP%',
                        'Appl.Phys.%',
                        'Appl. Phys.%',
                        'Applied Physics%'],
                 'PL': ['Physics Letters%',
                        'Phys. Lett.%',
                        'Phys.Lett.%'],
                 'All': ['%'],
                 }


In [37]:
def get_abstracts(category):
    """Build the query for Article_Category rows belonging to one category.

    A row is selected when its category name contains ``category`` and the
    article's journal_ref matches at least one of the 'Nature' or 'PRL'
    LIKE patterns from ``journals_dict``.

    The query object itself is returned; callers iterate over it to get rows.
    """
    journal_patterns = journals_dict['Nature'] + journals_dict['PRL']
    journal_matches = [Article.journal_ref.like(pattern)
                       for pattern in journal_patterns]
    category_match = Category.name.like('%' + category + '%')

    # No text clean-up is done here: CountVectorizer takes care of it later.
    joined = session.query(Article_Category).join(Category).join(Article)
    return joined.filter(category_match, or_(*journal_matches))



In [42]:
# Some abstracts have multiple categories, so everything is keyed on the
# article id: one dict holds the abstract text, the other collects every
# category label that matched that article.
start = time.time()

query_list = [get_abstracts(category) for category in categories]

abstract_dict = dict()
category_dict = dict()
category_to_number = dict(zip(categories, range(0, len(categories))))

for q, category in zip(query_list, categories):
    for x in q:
        article = x.article
        abstract_dict[article.id] = article.abstract
        # An article can come back several times for the same label (several
        # category rows containing the same substring); record the label once.
        labels = category_dict.setdefault(article.id, [])
        if category not in labels:
            labels.append(category)

# Sort the ids so the sample order does not depend on dict ordering; together
# with the fixed random_state below this makes the training subset reproducible.
keys = sorted(abstract_dict.keys())

# With scikit-learn 0.17, multilabel targets have to be encoded with
# MultiLabelBinarizer (one indicator column per entry of `categories`).
label_binarizer = MultiLabelBinarizer(classes=categories)

X_train_tmp = [abstract_dict[key] for key in keys]
Y_train_tmp = label_binarizer.fit_transform([category_dict[key] for key in keys])

# Keep a 2000-sample training subset; the remainder of the split is discarded.
X_train, _, Y_train, _ = train_test_split(X_train_tmp, Y_train_tmp,
                                          train_size=2000, random_state=0)

print (time.time() - start)

48.0484337807


In [43]:
# 70 MB is about 2600 abstracts... not many!
# Show the training-set size. The parenthesised print matches the other cells
# and is valid on both Python 2 and Python 3.
print (len(X_train))

2000


In [44]:
start = time.time()

# Text pipeline used as the per-label estimator: 1- to 3-gram counts, tf-idf
# weighting, then an L1-penalised linear SVM.
text_pipeline = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1,penalty='l1',dual=False,fit_intercept=True)),
])

# One-vs-rest wrapper so each category column of Y_train gets its own classifier.
clf_category = OneVsRestClassifier(text_pipeline)
clf_category.fit(X_train, Y_train)

print (time.time() - start)

18.1282448769


In [45]:
# Spot check: classify a single hand-picked abstract and decode the predicted
# indicator row back into category names.
magnon_mass_abstract = 'We measure the mass, gap, and magnetic moment of a magnon in the ferromagnetic F=1 spinor Bose-Einstein condensate of 87Rb. We find an unusually heavy magnon mass of 1.038(2)stat(8)sys times the atomic mass, as determined by interfering standing and running coherent magnon waves within the dense and trapped condensed gas. This measurement is shifted significantly from theoretical estimates. The magnon energy gap of h×2.5(1)stat(2)sysHz and the effective magnetic moment of −1.04(2)stat(8)μbare times the atomic magnetic moment are consistent with mean-field predictions. The nonzero energy gap arises from magnetic dipole-dipole interactions.'
prediction = clf_category.predict([magnon_mass_abstract])

label_binarizer.inverse_transform(prediction)

[('cond-mat',)]

In [46]:
# Second spot check on another hand-picked abstract; the decoded labels are the
# cell's displayed value.
magnon_thermometry_abstract = 'Ultracold gases promise access to many-body quantum phenomena at convenient length and time scales. However, it is unclear whether the entropy of these gases is low enough to realize many phenomena relevant to condensed matter physics, such as quantum magnetism. Here we report reliable single-shot temperature measurements of a degenerate 87Rb gas by imaging the momentum distribution of thermalized magnons, which are spin excitations of the atomic gas. We record average temperatures as low as 0.022(1)stat(2)sys times the Bose-Einstein condensation temperature, indicating an entropy per particle, S/N≈0.001kB at equilibrium, that is well below the critical entropy for antiferromagnetic ordering of a Bose-Hubbard system. The magnons themselves can reduce the temperature of the system by absorbing energy during thermalization and by enhancing evaporative cooling, allowing low-entropy gases to be produced within deep traps.'
prediction = clf_category.predict([magnon_thermometry_abstract])

label_binarizer.inverse_transform(prediction)

[('cond-mat', 'quant-ph')]

In [47]:
# print metrics.classification_report(Y_train, clf.predict(X_train))
#                                     target_names=test_target_names))
# print metrics.confusion_matrix(y_test, y_predict_test)
# print 'Accuracy: %f' % (metrics.accuracy_score(y_test, y_predict_test))

In [None]:
start = time.time()

# Persist the fitted classifier together with its label binarizer as a single
# compressed pickle, so predictions can be decoded after loading.
model_bundle = (clf_category, label_binarizer)
joblib.dump(model_bundle, 'svm_category.pkl', compress=1)

print (time.time() - start)

In [29]:
# Save the category names next to the pickled model.
# json.dump writes text, so the file is opened in text mode: 'wb' raises a
# TypeError on Python 3, while 'w' works on both Python 2 and Python 3.
with open('category_list.json', 'w') as f:
    category_list = categories
    json.dump(category_list, f)