In [140]:
import requests
import time
import feedparser
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
# Used some stuff from here: https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a
pd.options.display.max_seq_items = 4000


In [None]:
for cat,max_result in number_per_category.items():
    query_params={'search_query':cat,'sortBy':'lastUpdatedDate',
                  'sortOrder':'descending',
                  'max_results':max_result,
                  'start':0}
    req = requests.post(request_string,data=query_params)
    req.raise_for_status()
    
    entry_dict = feedparser.parse(req.text)['entries']
    
    full_df = pd.DataFrame(entry_dict)
    
    # author list stored as an annoying dict, so unpack it to a comma-separated list.
    temp_df=pd.DataFrame(entry_dict,columns=['authors','title','updated','link','summary'])
    temp_df['authors']=[', '.join([n['name'] for n in entry]) for entry in list(temp_df['authors'].values)]
    
    preprint_df = preprint_df.append(temp_df)

preprint_df = preprint_df.drop_duplicates()

In [115]:
# All we need from our arxiv database is the arxiv id XXXX.YYYYY
# We need to extract this from the abstract or PDF link
our_preprints = pd.read_csv("roy-group-arxiv-8-2-18.csv")
our_preprints_clean = pd.DataFrame(our_preprints.link.str.split('/').map(lambda x: x[-1][:10]))
our_preprints_clean['in_db'] = 1
our_preprints_clean.columns=['id','in_db']

In [64]:
#arxiv_category_list = ['cat:cond-mat.str-el','cat:cond-mat.mes-hall','cat:cond-mat.dis-nn',
#            'cat:cond-mat.stat-mech','cat:cond-mat.supr-con','cat:cond-mat.other','cond-mat.quant-gas']
request_string = ("http://export.arxiv.org/api/query")

# We want to get all of the possible relevant articles from cond-mat going back
# to the start of our lit database.
# I queried each category separately (by trial and error) to find how many articles 
# were posted since we started our database.

#number_per_category = {'cat:cond-mat.str-el':3500,
#                       'cat:cond-mat.mes-hall':5000,
#                       'cat:cond-mat.dis-nn':950,
#                       'cat:cond-mat.stat-mech':3150,
#                       'cat:cond-mat.supr-con':1600,
#                       'cat:cond-mat.other':420,
#                       'cat:cond-mat.quant-gas':1500}
number_per_category = {}
preprint_df = pd.DataFrame()
full_df = pd.DataFrame()

In [79]:
arxiv_preprints = pd.read_csv("arxiv-since-10-18-17.csv",
                             usecols=['link','title','summary'])
arxiv_preprints.columns = ['title','id','abstract']

In [82]:
arxiv_preprints['id']=arxiv_preprints.id.str.split('/').map(lambda x: x[-1][:10])

In [116]:
all_preprints = arxiv_preprints.merge(our_preprints_clean, on='id', how='left')

In [129]:
all_preprints = all_preprints.fillna(0)
abs_train,abs_test,y_train,y_test=train_test_split(all_preprints.abstract,all_preprints.in_db, test_size=0.33, random_state=8008135)

In [131]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('svm-clf', SGDClassifier(loss='hinge', penalty='l2',max_iter=5))])

In [132]:
text_clf.fit(abs_train,y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])

In [133]:
svm_prediction=text_clf.predict(abs_test)

In [134]:
np.mean(svm_prediction==y_test)

0.9253224711473184

In [166]:
count_vec = CountVectorizer()

In [167]:
doc_term = count_vec.fit_transform(all_preprints.abstract)
tokens = count_vec.get_feature_names()

In [168]:
doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(doc_term)]
df = pd.DataFrame(data=doc_term.toarray(), index=doc_names,
                      columns=tokens)

In [172]:
print(df.columns)

Index(['00', '000', '00000', '00005', '0001', '00013', '00014', '00082',
       '00089', '001',
       ...
       'zrte5', 'zt', 'ztp', 'zumino', 'zureck', 'zurek', 'zv', 'zwanzig',
       'zy', 'zz'],
      dtype='object', length=24106)
