In [1]:
# -*- coding: utf-8 -*-
#%matplotlib inline

import pylab

import xml.etree.ElementTree as ET
import json
import os
import pandas as pd
import re
import numpy as np

import matplotlib

import matplotlib.pyplot as plt
import seaborn as sns

# Apply the 'seaborn-white' matplotlib style sheet to every figure drawn below.
plt.style.use('seaborn-white')

# Switch matplotlib to interactive mode.
pylab.ion()

In [2]:
# Load the JSON dump into a DataFrame.
df=pd.read_json('inspire.json') #Change this to the name of the file, inspire_small.json on GitHub
# Work on a copy so the freshly loaded frame `df` is left untouched.
data=df.copy()

In [3]:
# Show the first record to see which columns are available.
data.head(1)

Unnamed: 0,abs1,cat0,cat1,cat2,key1,key2,title
0,The original Schrodinger's paper is translated...,arXiv,quant-ph/9903100,quant-ph,oai:arXiv.org:quant-ph/9903100,arXiv,About Heisenberg uncertainty relation


I select hep-th as the target for the generation as I believe it will lead to a better outcome: hep-th contains more jargon than other arXiv categories.

In [4]:
# Keep only the rows whose 'cat2' column equals 'hep-th'.
is_hep_th = data['cat2'] == 'hep-th'
data_ph = data[is_hep_th]

<h1> Abstracts </h1>

In order to make the generation simpler, I remove text in parentheses and replace text contained between two '\$' signs with the special string 'xxxxx'. In LaTeX, text contained between '\$' signs is typically a formula. Trying to train the NN on formulas would probably lead to awful results, and in any case I am not aiming for physically sensible abstracts. So I will put the formulas back later.

In [5]:
def _clean_abstract(text):
    """Return one abstract with asides removed, formulas masked and quotes dropped."""
    # Drop everything between '(' and the next ')', newlines included.
    text = re.sub(r'\(.*?\)', '', text, flags=re.DOTALL)
    # Text between two '$' signs is replaced by the placeholder token.
    text = re.sub(r'\$.*?\$', 'xxxxx', text, flags=re.DOTALL)
    # Remove curly/straight quote characters and every "hyphen + space" pair.
    for unwanted in (u"\u2018", u"\u2019", u"\u201c", u"\u201d", u"\"", u"- "):
        text = text.replace(unwanted, "")
    return text


# Cleaned abstract strings, then the same abstracts split on whitespace.
abs_ph = [_clean_abstract(raw) for raw in data_ph['abs1'].tolist()]
abs_ph_wordified = [cleaned.split() for cleaned in abs_ph]

In [6]:
# Characters a word may contain and still be kept as ordinary text by the filter below.
good_char = u'0123456789qwertyuiopasdfghjklzxcvbnmQWERTYUIOPLKJHGFDSAZXCVBNM,.!?;:-‘’"/'
# Set form of the same characters, extended with the two curly double quotes.
set_good_char = set(good_char) | {u'\u201c', u'\u201d'}

I put the whole text in lower case. Sometimes authors just write formulas in plain text, so I look for words which contain exotic characters and replace them with the special string 'xxxxx'.

In [7]:
def abs_filter(abstract): #abs is a list of words
    """Normalise one abstract given as a list of words and re-tokenise it.

    A word made only of characters found in the module-level `good_char`
    is lower-cased; any other word is replaced by the placeholder 'xxxxx'.
    The words are then joined with spaces and split again into tokens:
    runs of word characters/apostrophes, or a single mark out of . , ! ? ; : /
    Characters matching neither alternative (e.g. '-' or '"') are dropped.

    Returns the list of tokens.
    """
    masked = []
    for word in abstract:
        if set(word).issubset(good_char):
            masked.append(word.lower())
        else:
            masked.append('xxxxx')
    return re.findall(r"[\w']+|[.,!?;:/]", ' '.join(masked))

In [8]:
# Run the filter over every abstract, keeping both the per-abstract token
# lists and one flat list of all tokens (already lower case).
abs_ph_words = []
abs_ph_words_good = []
for wordified in abs_ph_wordified:
    tokens = abs_filter(wordified)
    abs_ph_words.append(tokens)
    abs_ph_words_good.extend(tokens)

Remove abstracts with rare words (appearing fewer than 5 times) and weird numbers. (Sorry, this is done a posteriori: I know that iloc[5:2985] contains very long strings of numbers with no clear meaning. Just modify the code appropriately.)

In [9]:
from collections import Counter

# Frequency of every token returned by abs_filter, over the whole corpus.
word_count=Counter(abs_ph_words_good)
# Report the number of abstracts and the number of distinct tokens (Python 2 print statement).
print 'Number of abstracts: ', len(abs_ph), '\nUnique words: ' , len(word_count)
# Tabulate the counts: one row per distinct token, with its number of occurrences.
word_count_series=pd.Series(word_count)
df_word_count=pd.DataFrame({'word':word_count_series.index,'count':word_count_series.values})
# Tokens that occur fewer than 5 times.
words4=df_word_count[df_word_count['count']<5]['word'].tolist()
# Hand-picked positional slice; per the accompanying note these rows hold long
# strings of digits in this dataset.
# NOTE(review): depends on the row order pandas gives the Series built from the
# Counter -- re-check the bounds for any other input file or pandas version.
list_nums=df_word_count['word'].iloc[5:2985].tolist()
# Blacklist: an abstract containing any of these tokens is discarded later.
rare_words=set(list_nums+words4)

Number of abstracts:  77984 
Unique words:  37170


Special character '#' is added at the end of abstract.

In [10]:
abs_ph_filtered = [abstract for abstract in abs_ph_words if set(abstract).intersection(rare_words)==set()]
abs_ph_filtered_joined = [' '.join(abstract)+' #' for abstract in abs_ph_filtered]
abs_ph_filtered= [abstract.split() for abstract in abs_ph_filtered_joined]
datawords = [word for sublist in abs_ph_filtered for word in sublist] 
print 'Number of filtered abstracts: ', len(abs_ph_filtered)

Number of filtered abstracts:  44668


In [11]:
words_unique = [word for abstract in abs_ph_filtered for word in abstract]
words_unique = set(words_unique)
print 'Number of distinct words: ', len(words_unique)

Number of distinct words:  12885


The file 'text8_abstract' contains all the preprocessed abstracts. It will be fed to GloVe to create word embeddings and to the RNN for training.

In [50]:
# Dump the whole token stream to 'text8_abstract', each token followed by one space.
# The `with` block closes the file even if a write fails; it replaces a
# close() call on the misspelled name `the_file`, which raised NameError
# and left the file open.
with open('text8_abstract', 'w') as thefile:
    for item in datawords:
        thefile.write("%s " % item)

In [38]:
# Vocabulary of the kept abstracts plus lookup tables in both directions.
vocab = words_unique
vocab_size = len(vocab)
# index -> word, numbered in the set's iteration order
idx_to_vocab = {idx: word for idx, word in enumerate(vocab)}
# word -> index, the inverse of the table above
vocab_to_idx = {word: idx for idx, word in idx_to_vocab.items()}

<h1> Extract formulas </h1>

In [12]:
# Collect, over all raw abstracts, every piece of text found between two '$' signs.
# No re.DOTALL here, so a single match cannot span a newline.
abs_ph_formulae = [
    formula
    for abstract in data_ph['abs1'].tolist()
    for formula in re.findall(r'\$(.*?)\$', abstract)
]

In [13]:
# Tally how many times each extracted formula string occurs.
formula_count=Counter(abs_ph_formulae)

In [67]:
# One row per distinct formula together with its number of occurrences.
formula_count_series = pd.Series(formula_count)
df_formula_count = pd.DataFrame({
    'formula': formula_count_series.index,
    'count': formula_count_series.values,
})

# Keep formulas seen more than 10 times, except those seen exactly 67 times.
# NOTE(review): the reason for excluding count == 67 is not visible here -- confirm.
is_frequent = df_formula_count['count'] > 10
is_not_67 = df_formula_count['count'] != 67
df_formula = df_formula_count[is_frequent & is_not_67]

In [68]:
# Display the retained formulas, most frequent first.
df_formula.sort_values(by='count', ascending=False)

Unnamed: 0,count,formula
7691,1706,N
24081,778,p
23642,717,n
7837,650,N=2
24407,613,q
5156,544,D
19448,534,\theta
7247,440,M
16427,431,\kappa
9286,415,S


List of recurring formulas

In [72]:
# Formula strings ordered from most to least frequent.
list_formulas = df_formula.sort_values('count', ascending=False)['formula'].tolist()
# Wrap each formula back in '$...$'. Empty matches are skipped first (the
# pattern yields '' for '$$', and formula[0] would raise IndexError on them),
# as are fragments that start with '_', '^', '~' or '*'.
list_formulas = ['$'+formula+'$' for formula in list_formulas
                 if formula and formula[0] not in ['_', '^', '~', '*']]

In [73]:
# Write the recurring formulas to 'formulas', one per line.
# The `with` block closes the file even when a write raises.
with open('formulas', 'w') as thefile_1:
    for item in list_formulas:
        thefile_1.write("%s\n" % item)

<h1> Titles </h1>

I do the same thing for the titles as I did for the abstracts. I will train a separate RNN on them. This is clearly not ideal, as there should be some semantic relation between the abstract and the title of a paper. Next time.

In [39]:
def _clean_title(text):
    """Return one title with asides removed, formulas masked and quotes dropped."""
    # Drop everything between '(' and the next ')', newlines included.
    text = re.sub(r'\(.*?\)', '', text, flags=re.DOTALL)
    # Text between two '$' signs is replaced by the (upper-case) placeholder.
    text = re.sub(r'\$.*?\$', 'XXXXX', text, flags=re.DOTALL)
    # Remove curly/straight quote characters and every "hyphen + space" pair.
    for unwanted in (u"\u2018", u"\u2019", u"\u201c", u"\u201d", u"\"", u"- "):
        text = text.replace(unwanted, "")
    return text


# Cleaned title strings, then the same titles split on whitespace.
title_ph = [_clean_title(raw) for raw in data_ph['title'].tolist()]
title_ph_wordified = [cleaned.split() for cleaned in title_ph]

In [40]:
# Characters a title word may contain and still be kept as ordinary text.
good_char = u'0123456789qwertyuiopasdfghjklzxcvbnmQWERTYUIOPLKJHGFDSAZXCVBNM,.!?;:-‘’"/'
# Set form of the same characters, plus the two curly double quotes.
set_good_char = set(good_char)
for curly_quote in (u'\u201c', u'\u201d'):
    set_good_char.add(curly_quote)

In [41]:
def title_filter(title): #title is a list of words
    """Normalise one title given as a list of words and re-tokenise it.

    Words built only from characters of the module-level `good_char` are
    lower-cased; every other word becomes the placeholder 'xxxxx'. The
    result is joined with spaces and split into tokens that are either a
    run of word characters/apostrophes or one of the marks . , ! ? ; : /

    Returns the list of tokens.
    """
    allowed = set(good_char)
    masked = [word.lower() if set(word) <= allowed else 'xxxxx' for word in title]
    return re.findall(r"[\w']+|[.,!?;:/]", ' '.join(masked))

In [42]:
# Run the filter over every title, keeping both the per-title token lists
# and one flat list of all tokens (already lower case).
title_ph_words = []
title_ph_words_good = []
for wordified in title_ph_wordified:
    tokens = title_filter(wordified)
    title_ph_words.append(tokens)
    title_ph_words_good.extend(tokens)

Titles with words appearing fewer than 5 times in the corpus are removed. This time I also remove all titles containing formulas. Come on guys.

In [57]:
from collections import Counter

# Frequency of every token returned by title_filter, over all titles.
word_count_title=Counter(title_ph_words_good)
# Report the number of titles and the number of distinct tokens (Python 2 print statement).
print 'Number of title: ', len(title_ph),'\nUnique words: ' , len(word_count_title)
# Tabulate the counts: one row per distinct token, with its number of occurrences.
word_count_title_series=pd.Series(word_count_title)
df_word_count_title=pd.DataFrame({'word':word_count_title_series.index,'count':word_count_title_series.values})
# Tokens that occur fewer than 5 times.
words4_title=df_word_count_title[df_word_count_title['count']<5]['word'].tolist()
# Hand-picked positional slice of the token table.
# NOTE(review): presumably the numeric tokens, as for the abstracts; the bounds
# depend on the row order pandas gives the Series -- re-check for other data.
list_nums_title=df_word_count_title['word'].iloc[4:151].tolist()
# Blacklist; the placeholder 'xxxxx' is included so that any title with a
# masked formula or unusual word is discarded as well.
rare_words_title=set(list_nums_title+words4_title+['xxxxx'])

Number of title:  77984 
Unique words:  11629


In [60]:
title_ph_filtered = [title for title in title_ph_words if set(title).intersection(rare_words_title)==set()]
title_ph_filtered_joined = [' '.join(title)+' #' for title in title_ph_filtered]
title_ph_filtered= [title.split() for title in title_ph_filtered_joined]
datawords_title = [word for sublist in title_ph_filtered for word in sublist] 
print 'Number of filtered titles: ', len(title_ph_filtered)

Number of filtered titles:  55841


In [61]:
words_unique_title = [word for title in title_ph_filtered for word in title]
words_unique_title = set(words_unique_title)
print 'Number of unique words in titles: ', len(words_unique_title)

Number of unique words in titles:  4305


In [49]:
import pickle

# Persist the tokenised titles (a list of token lists) in binary pickle form.
with open('title_file', 'wb') as title_handle:
    pickle.dump(title_ph_filtered, title_handle)

The file 'text8_title' contains all the preprocessed titles. It will be fed to GloVe to create word embeddings and to the RNN for training.

In [51]:
# Dump the title token stream to 'text8_title', each token followed by one space.
# The `with` block closes (and therefore flushes) the file; without an
# explicit close, buffered data may never reach the disk.
with open('text8_title', 'w') as thefile:
    for item in datawords_title:
        thefile.write("%s " % item)

<h1> Extract formulas </h1>

I extract formulas from the abstracts so that I can put them back later. A formula is defined as the text between two '\$' signs.

In [62]:
# Gather every piece of text found between two '$' signs, across all raw abstracts.
# No re.DOTALL here, so a single match cannot span a newline.
abs_ph_formulae = []
for abstract in data_ph['abs1'].tolist():
    abs_ph_formulae.extend(re.findall(r'\$(.*?)\$', abstract))

In [63]:
# Tally how many times each extracted formula string occurs.
formula_count=Counter(abs_ph_formulae)

In [64]:
# One row per distinct formula together with its number of occurrences.
formula_count_series = pd.Series(formula_count)
df_formula_count = pd.DataFrame({
    'formula': formula_count_series.index,
    'count': formula_count_series.values,
})

# Keep formulas seen more than 200 times, except those seen exactly 67 times.
# NOTE(review): the reason for excluding count == 67 is not visible here -- confirm.
is_frequent = df_formula_count['count'] > 200
is_not_67 = df_formula_count['count'] != 67
df_formula = df_formula_count[is_frequent & is_not_67]

Most frequent formulas

In [68]:
# Display the ten most frequent formulas.
df_formula.sort_values('count', ascending=False).head(10)

Unnamed: 0,count,formula
7691,1706,N
24081,778,p
23642,717,n
7837,650,N=2
24407,613,q
5156,544,D
19448,534,\theta
7247,440,M
16427,431,\kappa
9286,415,S
