Use fancy machine learning to predict whether an article makes it into Nature/Science or PRL. This time we'll only look at articles in the physics.atom-ph section.

In [1]:
#Need to add parent directoy to sys.path to find 'metadataDB'
import sys
sys.path.append('../')

%matplotlib inline
import matplotlib.pyplot as plt 
import time
import numpy as np
import re
from itertools import combinations
import json

# Natural language processing toolkit
# To use this, run nltk.download() and download 'stopwords'
# from nltk.corpus import stopwords
# s=stopwords.words('english') + ['']

# Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib

# SQL
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from metadataDB.declareDatabase import *
from sqlalchemy import or_, and_

engine = create_engine("sqlite:///../arXiv_metadata.db", echo=False)
Base.metadata.bind = engine
DBsession = sessionmaker(bind=engine)
session = DBsession()

In [2]:
journals_dict = {'PRL': ['Physics Review Letters%',
                    'Phys. Rev. Lett.%',
                    'Phys.Rev.Lett.%',
                    'PRL%'],
                 'PR':  ['Physics Review%',
                         'Phys. Rev.%',
                         'Phys.Rev.%',
                         'PR%'],
                 'Nature': ['Nature%',
                            'Nat.%',
                            'Science%'],
                 'APL': ['APL%',
                         'Appl.Phys.Lett.%',
                         'Appl. Phys. Lett.%',
                         'Applied Physics Letters%'],
                 'AP': ['AP%',
                        'Appl.Phys.%',
                        'Appl. Phys.%',
                        'Applied Physics%'],
                 'PL': ['Physics Letters%',
                        'Phys. Lett.%',
                        'Phys.Lett.%'],
                 'All': ['%'],
                 }


In [3]:
def get_abstracts(journal):
#     query = session.query(Article)\
#                     .filter(*[Article.journal_ref.like(x) for x in journals_dict[journal]])
    query = session.query(Article_Category)\
                    .join(Category)\
                    .join(Article)\
                    .filter(Category.name.like('%'),
                            or_(*[Article.journal_ref.like(x) for x in journals_dict[journal]]))
    # Don't need to clean up text: CountVectorizer will do everything
    return [ result.article.abstract for result in query ]


def learn(journals):
    abstracts1 = get_abstracts(journals[0])
    abstracts2 = get_abstracts(journals[1])
    
    half_test_size = int(round(0.1*min(len(abstracts1),len(abstracts2))))


    X1_train, X1_test, y1_train, y1_test = train_test_split(abstracts1, [0]*len(abstracts1), test_size=half_test_size, random_state=42)
    X2_train, X2_test, y2_train, y2_test = train_test_split(abstracts2, [1]*len(abstracts2), test_size=half_test_size, random_state=42)
    
    X_train = X1_train + X2_train
    X_test = X1_test + X2_test
    y_train = np.array(y1_train + y2_train)
    y_test = np.array(y1_test + y2_test)
    target_names = journals
    
    print (len(X1_train), len(X2_train))
    
    clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3),stop_words='english')),
                    ('tfidf', TfidfTransformer()),
                    ('clf', LinearSVC(C=10,penalty='l1',dual=False,fit_intercept=True))])

    clf.fit(X_train, y_train)
#     y_predict_train = clf.predict(X_train)
    y_predict_test = clf.predict(X_test)
    
    # To save memory, I will retrain the countvectorizer on only the nonzero entries to the coefficient matrix.
    
    nonzero_coefs = np.nonzero(np.squeeze(clf.named_steps['clf'].coef_))
    useful_words = np.array(clf.named_steps['vect'].get_feature_names())[nonzero_coefs]
    
    clf2 = Pipeline([('vect', CountVectorizer(ngram_range=(1,3),stop_words='english',vocabulary=useful_words)),
                    ('tfidf', TfidfTransformer()),
                    ('clf', LinearSVC(C=10,penalty='l1',dual=False,fit_intercept=True))])
    clf2.fit(X_train, y_train)
    y_predict_test2 = clf2.predict(X_test)
    
    print 'Full model'
    print (metrics.classification_report(y_test, y_predict_test,
                                    target_names=target_names))

    print metrics.confusion_matrix(y_test, y_predict_test)
    print 'Accuracy: %f' % (metrics.accuracy_score(y_test, y_predict_test))
    
    print ''
    print 'Reduced model'
    print (metrics.classification_report(y_test, y_predict_test2,
                                    target_names=target_names))

    print metrics.confusion_matrix(y_test, y_predict_test2)
    print 'Accuracy: %f' % (metrics.accuracy_score(y_test, y_predict_test2))
    
    return (clf, clf2)



In [4]:
# categories = ['atom-ph']
# categories = ['atom-ph', 'quant-ph', 'cond-mat', 'str-el']
# journals = ['PRL', 'PL', 'Nature']
journals = ['PRL', 'Nature']

start = time.time()

# clf_journal = learn(journals, categories[0])
clf_journal, clf_journal2 = learn(journals)

print (time.time() - start)



(32590, 5215)
Full model
             precision    recall  f1-score   support

        PRL       0.92      0.97      0.95       579
     Nature       0.97      0.92      0.94       579

avg / total       0.94      0.94      0.94      1158

[[560  19]
 [ 46 533]]
Accuracy: 0.943869

Reduced model
             precision    recall  f1-score   support

        PRL       0.88      0.99      0.93       579
     Nature       0.98      0.87      0.92       579

avg / total       0.93      0.93      0.93      1158

[[571   8]
 [ 78 501]]
Accuracy: 0.925734
120.923759937


In [5]:
start = time.time()
joblib.dump((clf_journal, ['Phys. Rev. Lett.', 'Nature/Science']), 'svm_journal_old.pkl', compress=1)
print (time.time() - start)

79.4636948109


In [8]:
start = time.time()
joblib.dump((clf_journal2, ['Phys. Rev. Lett.', 'Nature/Science']), 'svm_journal.pkl', compress=1)
print (time.time() - start)

0.393079042435
