In [208]:
import requests
import time
import feedparser
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
# Used some stuff from here: https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a

# Notebook-wide configuration.
# Show up to 1000 items when pandas prints a long sequence.
pd.set_option("display.max_seq_items", 1000)
# Endpoint of the arXiv export API.
request_string = "http://export.arxiv.org/api/query"
#arxiv_category_list = ['cat:cond-mat.str-el','cat:cond-mat.mes-hall','cat:cond-mat.dis-nn',
#            'cat:cond-mat.stat-mech','cat:cond-mat.supr-con','cat:cond-mat.other','cat:cond-mat.quant-gas']

In [115]:
# The only thing needed from the group's own arXiv database is each paper's
# arXiv identifier (XXXX.YYYYY). It is taken as the first 10 characters of the
# last '/'-separated segment of the abstract / PDF link.
our_preprints = pd.read_csv("roy-group-arxiv-8-2-18.csv")
our_ids = [segments[-1][:10] for segments in our_preprints.link.str.split('/')]
# in_db = 1 flags every one of these rows as a paper present in our database.
our_preprints_clean = pd.DataFrame({'id': our_ids, 'in_db': 1},
                                   index=our_preprints.index)

In [79]:
# Load the arXiv listing, keeping only the link, title and summary columns.
# `usecols` does not reorder columns (the frame keeps the CSV's own order), so
# rename by name rather than assigning a positional list of labels; a
# positional assignment silently mislabels columns if the file order differs.
arxiv_preprints = (
    pd.read_csv("arxiv-since-10-18-17.csv",
                usecols=['link', 'title', 'summary'])
    .rename(columns={'link': 'id', 'summary': 'abstract'})
    # Fix the column order explicitly so downstream cells see a stable layout.
    [['title', 'id', 'abstract']]
)

In [82]:
# Reduce each link held in `id` to the arXiv identifier: the first 10
# characters of its last '/'-separated segment.
link_segments = arxiv_preprints.id.str.split('/')
arxiv_preprints['id'] = link_segments.map(lambda segments: segments[-1][:10])

In [116]:
# Left-join our database flags onto the full arXiv listing: every arXiv row is
# kept, and rows with no match get NaN in `in_db`.
# Deduplicate our ids first. If the same paper appears more than once in our
# database, the join would emit that arXiv row once per duplicate, and the
# copies could then end up on both sides of a train/test split.
all_preprints = arxiv_preprints.merge(
    our_preprints_clean.drop_duplicates(subset='id'),
    on='id',
    how='left',
)

In [181]:
# After the left merge, rows that are not in our database have NaN in `in_db`;
# label those 0. Fill column by column: a blanket fillna(0) would also turn a
# missing abstract into the integer 0, which CountVectorizer cannot tokenize.
# Missing abstracts become empty strings instead.
all_preprints = all_preprints.fillna({'in_db': 0, 'abstract': ''})

# Hold out 33% of the abstracts for testing; the fixed seed keeps the split
# reproducible across runs.
abs_train, abs_test, y_train, y_test = train_test_split(
    all_preprints.abstract,
    all_preprints.in_db,
    test_size=0.33,
    random_state=28008135,
)

In [189]:
# Two alternative abstract classifiers that share the same front end
# (token counts -> TF-IDF weighting); only the final estimator differs.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Hinge loss with an L2 penalty, trained by SGD. SGD shuffles the training
    # data, so random_state is pinned to make repeated fits reproducible.
    ('svm-clf', SGDClassifier(loss='hinge', penalty='l2', max_iter=5,
                              random_state=28008135)),
])
text_clf_nb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('nb-clf', MultinomialNB()),
])

In [190]:
# Fit the naive Bayes pipeline on the training abstracts and their in_db
# labels; fit() returns the pipeline, which the cell then displays.
text_clf_nb.fit(X=abs_train, y=y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ar_tf=False, use_idf=True)), ('nb-clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [202]:
# Hyperparameter grid for the naive Bayes pipeline. Keys follow the
# "<pipeline step>__<parameter>" convention.
parameters = {}
# Unigrams only, up to bigrams, up to trigrams.
parameters['vect__ngram_range'] = [(1, upper) for upper in (1, 2, 3)]
# With and without inverse-document-frequency reweighting.
parameters['tfidf__use_idf'] = (True, False)
# Smoothing strengths tried for MultinomialNB.
parameters['nb-clf__alpha'] = (1, 1e-1, 1e-2, 1e-3, 1e-4)

In [203]:
# Exhaustive cross-validated search over `parameters` for the naive Bayes
# pipeline, using all available cores (n_jobs=-1). fit() returns the fitted
# search object, so construction and fitting are chained.
gs_clf = GridSearchCV(
    estimator=text_clf_nb,
    param_grid=parameters,
    n_jobs=-1,
).fit(abs_train, y_train)

In [204]:
# Display the parameter combination that scored best in the grid search.
gs_clf.best_params_

{'nb-clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 3)}

In [205]:
# Display the mean cross-validated score achieved by the best parameter combination.
gs_clf.best_score_

0.9235913726801538