In [59]:
%matplotlib inline

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm

import dask.dataframe as dd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span, Token

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix

pd.options.display.max_columns = 50
pd.options.display.max_rows = 500

In [2]:
%%time
file = "./data/arxiv-metadata-oai-snapshot.json"

metadata  = []

lines = 1000000
with open(file, 'r') as f:
    for line in f: 
        metadata.append(json.loads(line))
        lines -= 1
        if lines == 0: break

orig_df = pd.DataFrame(metadata)

orig_df.head()

CPU times: user 24.7 s, sys: 2.57 s, total: 27.3 s
Wall time: 27.3 s


Unnamed: 0,id,submitter,authors,title,comments,journal-ref,doi,report-no,categories,license,abstract,versions,update_date,authors_parsed
0,704.0001,Pavel Nadolsky,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,"37 pages, 15 figures; published version","Phys.Rev.D76:013009,2007",10.1103/PhysRevD.76.013009,ANL-HEP-PR-07-12,hep-ph,,A fully differential calculation in perturba...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2008-11-26,"[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,..."
1,704.0002,Louis Theran,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,To appear in Graphs and Combinatorics,,,,math.CO cs.CG,http://arxiv.org/licenses/nonexclusive-distrib...,"We describe a new algorithm, the $(k,\ell)$-...","[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2008-12-13,"[[Streinu, Ileana, ], [Theran, Louis, ]]"
2,704.0003,Hongjun Pan,Hongjun Pan,The evolution of the Earth-Moon system based o...,"23 pages, 3 figures",,,,physics.gen-ph,,The evolution of Earth-Moon system is descri...,"[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...",2008-01-13,"[[Pan, Hongjun, ]]"
3,704.0004,David Callan,David Callan,A determinant of Stirling cycle numbers counts...,11 pages,,,,math.CO,,We show that a determinant of Stirling cycle...,"[{'version': 'v1', 'created': 'Sat, 31 Mar 200...",2007-05-23,"[[Callan, David, ]]"
4,704.0005,Alberto Torchinsky,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,,"Illinois J. Math. 52 (2008) no.2, 681-689",,,math.CA math.FA,,In this paper we show how to compute the $\L...,"[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...",2013-10-15,"[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]"


In [3]:
orig_df.shape

(1000000, 14)

In [21]:
def get_author_list(line):
    # Cleans author dataframe column, creating a list of authors in the row.
    
    return [e[1] + ' ' + e[0] for e in line]


def get_category_list(line):
    # Cleans category dataframe column, creating a list of categories in the
    # row.
    
    return list(line.split(" "))


df = orig_df
df['cleaned_authors_list'] = df['authors_parsed'].map(get_author_list)
df['category_list'] = df['categories'].map(get_category_list)
df = df.drop(['submitter', 'authors', 'comments', 'journal-ref', 'doi', 
              'report-no', 'license', 'versions', 'update_date', 
              'authors_parsed', 'categories'], axis=1)
df.head()

Unnamed: 0,id,title,abstract,cleaned_authors_list,category_list
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,"[C. Balázs, E. L. Berger, P. M. Nadolsky, C. -...",[hep-ph]
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...","[Ileana Streinu, Louis Theran]","[math.CO, cs.CG]"
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,[Hongjun Pan],[physics.gen-ph]
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,[David Callan],[math.CO]
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,"[Wael Abu-Shammala, Alberto Torchinsky]","[math.CA, math.FA]"


In [14]:
df.shape

(1000000, 5)

In [22]:
mlb = MultiLabelBinarizer()
mlb.fit([['astro-ph', 'hep-ph', 'hep-th', 'quant-ph', 'gr-qc']])
mlb.classes_

array(['astro-ph', 'gr-qc', 'hep-ph', 'hep-th', 'quant-ph'], dtype=object)

In [27]:
test = df.head()
test.head()

Unnamed: 0,id,title,abstract,cleaned_authors_list,category_list
0,704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,"[C. Balázs, E. L. Berger, P. M. Nadolsky, C. -...",[hep-ph]
1,704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...","[Ileana Streinu, Louis Theran]","[math.CO, cs.CG]"
2,704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,[Hongjun Pan],[physics.gen-ph]
3,704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,[David Callan],[math.CO]
4,704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,"[Wael Abu-Shammala, Alberto Torchinsky]","[math.CA, math.FA]"


In [33]:
%%time
s = df['category_list'].explode()
new_df = df[['title', 'abstract']].join(pd.crosstab(s.index, s))
new_df.head()

CPU times: user 16.1 s, sys: 6.78 s, total: 22.9 s
Wall time: 23 s


Unnamed: 0,title,abstract,astro-ph,astro-ph.CO,astro-ph.EP,astro-ph.GA,astro-ph.HE,astro-ph.IM,astro-ph.SR,cond-mat.dis-nn,cond-mat.mes-hall,cond-mat.mtrl-sci,cond-mat.other,cond-mat.quant-gas,cond-mat.soft,cond-mat.stat-mech,cond-mat.str-el,cond-mat.supr-con,cs.AI,cs.AR,cs.CC,cs.CE,cs.CG,cs.CL,cs.CR,...,q-bio.CB,q-bio.GN,q-bio.MN,q-bio.NC,q-bio.OT,q-bio.PE,q-bio.QM,q-bio.SC,q-bio.TO,q-fin.CP,q-fin.EC,q-fin.GN,q-fin.MF,q-fin.PM,q-fin.PR,q-fin.RM,q-fin.ST,q-fin.TR,quant-ph,stat.AP,stat.CO,stat.ME,stat.ML,stat.OT,stat.TH
0,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
small_df = new_df.filter(['title', 'abstract', 'astro-ph', 'hep-ph', 'hep-th', 'quant-ph', 'gr-qc'], axis=1)
small_df.head()

Unnamed: 0,title,abstract,astro-ph,hep-ph,hep-th,quant-ph,gr-qc
0,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,0,1,0,0,0
1,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",0,0,0,0,0
2,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,0,0,0,0,0
3,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,0,0,0,0,0
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,0,0,0,0,0


In [38]:
small_df.shape

(1000000, 7)

In [39]:
nlp = spacy.load('en_core_web_md')

In [40]:
def create_word_vectors(row):
    
    doc = nlp(row)
    return doc.vector

In [41]:
%%time
small_df['vectors'] = small_df['title'].map(create_word_vectors)
small_df.head()

CPU times: user 1h 7min 2s, sys: 1.25 s, total: 1h 7min 3s
Wall time: 1h 7min 3s


Unnamed: 0,title,abstract,astro-ph,hep-ph,hep-th,quant-ph,gr-qc,vectors
0,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,0,1,0,0,0,"[-0.11850046, 0.24028462, 0.07142646, -0.02185..."
1,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",0,0,0,0,0,"[-0.116540596, 0.10794, 0.026708007, 0.0741736..."
2,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,0,0,0,0,0,"[0.04193971, 0.12672335, -0.04016435, -0.04071..."
3,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,0,0,0,0,0,"[-0.2031677, 0.22416851, -0.12339214, 0.119759..."
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,0,0,0,0,0,"[-0.1287667, 0.09558357, 0.08988343, 0.0380371..."


In [45]:
# ONLY DO THIS IF YOU DON'T NEED THE FULL DF ANYMORE!!!  
# FOR SOME REASON IT GETS RID OF IT!!!
#%%time
small_df.to_pickle('./data/word_vecs.xz')

In [46]:
small_df.shape

(1000000, 8)

In [48]:
y = small_df[['astro-ph', 'hep-ph', 'hep-th', 'quant-ph', 'gr-qc']].values
y[0:5]

array([[0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [57]:
X_train, X_test, y_train, y_test = train_test_split(small_df['vectors'], 
                                                    y, 
                                                    test_size=0.25,
                                                    random_state=42)

In [58]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((750000,), (250000,), (750000, 5), (250000, 5))

In [62]:
"""CHECK THIS
Parameters
X(sparse) array-like of shape (n_samples, n_features)   <----
Data.

y(sparse) array-like of shape (n_samples,) or (n_samples, n_classes)
Multi-class targets. An indicator matrix turns on multilabel classification.
"""

'CHECK THIS\nParameters\nX(sparse) array-like of shape (n_samples, n_features)   <----\nData.\n\ny(sparse) array-like of shape (n_samples,) or (n_samples, n_classes)\nMulti-class targets. An indicator matrix turns on multilabel classification.\n'

In [60]:
clf = OneVsRestClassifier(SVC()).fit(X_train, y_train)

ValueError: setting an array element with a sequence.