In [20]:
import os,re
import time,pickle
from tqdm import *
from os.path import expanduser

# Introduction

The aim of this notebook is to find similar papers from the database based
on their abstract and their title.

The following is greatly inspired from [Amir Amini](https://www.kaggle.com/amirhamini/d/benhamner/nips-2015-papers/find-similar-papers-knn/notebook) and [brandonmrose](http://brandonrose.org/clustering).

# Data preprocessing 

The scripts used to scrap the [AGU wesbsite](https://fallmeeting.agu.org/2015/) as well as the resulting data are stored on this [repo](https://github.com/cthorey/agu_data) if you want to reproduce the following by yourself.

In [122]:
home = expanduser('~')
os.chdir(os.path.join(home,'Documents','repos','agu_data','agu_data'))
from Data_Utils import *

data = get_all_data('agu2015')
abstracts = [df.abstract for df in data if (df.title != '') and (df.abstract != '')]
titles = [df.title for df in data if (df.title != '') and (df.abstract != '')]
links = [df.link for df in data if (df.title != '') and (df.abstract != '')]



AGU abstract are short, $\sim 300$ words and looks like that

As you can see, some of them are empty which more likely corresponds to papers that have been retracted before the beginning of the meeting. We also have to:

- Make everything lower case
- Remove all strange character, convert unicode
- replace \n by space

In [121]:
def clean_text(text):
    ''' function to clean each abstract/title'''
    
    if text.split('\n')[0].split(' ')[0] =='ePoster':
        text = ' '.join(text.split('\n')[1:])
    list_of_cleaning_signs = ['\x0c', '\n']
    for sign in list_of_cleaning_signs:
        text = text.replace(sign, ' ')
    #text = unicode(text, errors='ignore')
    clean_text = re.sub('[^a-zA-Z]+', ' ', text)
    return clean_text.lower().strip()

papers = [clean_text(df.abstract) for df in data]

In [71]:
import nltk
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [114]:
def tokenize_and_stem(text):
    tokens = set(reduce(lambda x,y:x+y,[nltk.word_tokenize(clean_text(sent)) 
                                        for sent in nltk.sent_tokenize(text)]))
    stems = [stemmer.stem(t) for t in tokens]
    return stems


In [150]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer_abstract = TfidfVectorizer(max_df=0.95, 
                                            max_features=200000, 
                                            min_df=0.05, 
                                            stop_words='english',
                                            use_idf=True, 
                                            tokenizer=tokenize_and_stem,
                                            lowercase = True,
                                            ngram_range=(1,3))

In [151]:
%time tfidf_matrix_Abstract = tfidf_vectorizer_abstract.fit_transform(abstracts)

CPU times: user 3min 48s, sys: 4.14 s, total: 3min 52s
Wall time: 3min 52s


In [153]:
terms_Abstract = tfidf_vectorizer_abstract.get_feature_names()

In [156]:
import numpy as np
import pandas as pd

def top_tfidf_feats(row, terms, top_n=25):
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(terms[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df['feature']
def given_link_give_keywords(tfidfMatrix, terms, paper_link, top_n=20):
    row_id = links.index(paper_link)
    row = np.squeeze(tfidfMatrix[row_id].toarray())
    return top_tfidf_feats(row, terms, top_n)



In [160]:
paper_id_example = links[159]
print ("Keywords based on Abstract:")
print (given_link_give_keywords(tfidf_matrix_Abstract,
                                terms_Abstract, 
                                paper_id_example, 
                                top_n = 10))

Keywords based on Abstract:
0       melt
1    product
2      locat
3     extent
4      volum
5      short
6     center
7        age
8     nation
9    dataset
Name: feature, dtype: object


In [None]:
from sklearn.neighbors import NearestNeighbors
# Based on Abstract
num_neighbors = 4
nbrs_Abstract = NearestNeighbors(n_neighbors=num_neighbors,
                                 algorithm='auto').fit(tfidf_matrix_Abstract)
distances_Abstract, indices_Abstract = nbrs_Abstract.kneighbors(tfidf_matrix_Abstract)

In [None]:
print ("Nbrs of the example paper based on Abstract similarity: %r" % indices_Abstract[1])

In [85]:
a = [clean_text(f) for f in nltk.sent_tokenize(abstracts[0])]
tnltk.word_tokenize(f) for f in a
tokens

SyntaxError: invalid syntax (<ipython-input-85-495524424dc4>, line 2)

In [None]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [40]:
papers[200]

u'a weakening of the tropical tropospheric circulation has been inferred from historical observations and model projections but recent satellite based trends in surface wind speed precipitation and evaporation offer a conflicting view here this apparent contradiction is reconciled through consideration of sea surface temperature sst pattern effects and differences between tropospheric and surface winds the sst patterns are found to exert a strong influence on the surface winds acting against the intrinsic large scale circulation slow down to produce a near zero surface wind speed change averaged in space the intrinsic slow down and sst pattern effects combine to maintain a muted precipitation response despite the near zero change in surface wind speed because the planetary boundary layer is decoupled from the free troposphere the surface wind speed change cannot be regarded as an indicator for the trend of the tropical tropospheric circulation as a result there is no inconsistency betw

In [13]:
pathdata = os.path.join(root,'Data')
papers_list = [f for f in os.listdir(pathdata) if f.split('_')[0]=='agu2015']
papers = []
errors = []
for i in tqdm(range(len(papers_list))):
    name = papers_list[i]
    with open(os.path.join(pathdata,name), 'rb') as f:
        idxs = pickle.load(f)
        papers += idxs['papers']
        errors += idxs['error']



In [58]:
from nltk.parse.stanford import StanfordParser

In [63]:
root = '/Users/thorey/Documents/Tool/standfordParser/stanford-parser-full-2014-08-27'
english_parser = StanfordParser( os.path.join('stanford-parser-3.4.1-models.jar'))

LookupError: Could not find stanford-parser.jar jar file at stanford-parser-3.4.1-models.jar

In [57]:
from nltk.tag.stanford import NERTagger

ImportError: cannot import name NERTagger

In [52]:
paper = papers[0]
lsttag = pos_tag(word_tokenize(paper.abstract))
#zip(range(len(lsttag)),lsttag)

[('c', 'NN'),
 ('h', 'NN'),
 ('o', 'NN'),
 ('c', 'NN'),
 ('o', 'NN'),
 ('l', 'NN'),
 ('a', 'DT'),
 ('t', 'NN'),
 ('e', 'NN')]

In [31]:
paper.date

' Friday, 18 December 2015'

In [64]:
nltk.FreqDist(word_tokenize(paper.abstract))

Counter({u'%': 1,
         u'(': 3,
         u')': 3,
         u',': 8,
         u'.': 10,
         u'3-hour': 2,
         u'3-hourly': 1,
         u'40': 1,
         u'Concentration': 1,
         u'Europe': 3,
         u'Heavy': 1,
         u'It': 2,
         u'On': 1,
         u'Over': 1,
         u'Pathway': 1,
         u'RCP8.5': 1,
         u'Representative': 1,
         u'Spain': 1,
         u'The': 2,
         u'There': 1,
         u'This': 1,
         u'While': 1,
         u'a': 4,
         u'an': 1,
         u'analysed': 1,
         u'analyses': 1,
         u'and': 3,
         u'are': 3,
         u'as': 1,
         u'at': 7,
         u'be': 1,
         u'behaviour': 1,
         u'century': 1,
         u'changes': 1,
         u'changing': 1,
         u'climate': 2,
         u'comparing': 1,
         u'concentrations': 1,
         u'considered': 1,
         u'daily': 6,
         u'differences': 3,
         u'distributions': 1,
         u'established': 1,
         u'events': 3,
 

In [66]:
import json

In [67]:
a = { 'a' : {'b':'c'},'d':{'e':'f'}}

In [72]:
b = json.dumps(a)

In [81]:
b

'{"a": {"b": "c"}, "d": {"e": "f"}}'

In [75]:
with open('data.txt', 'w') as outfile:
    json.dump(b, outfile, sort_keys = True, indent = 4, ensure_ascii=False)

In [96]:
b = {'paper1':{'abstract':paper.abstract,'title':paper.title,'authors':paper.author},
     'paper2':{'abstract':paper.abstract,'title':paper.title,'authors':paper.author}}

In [97]:
    import codecs, json
    with codecs.open('data.json', 'w', 'utf8') as outfile:
         json.dump(b, outfile,sort_keys = True, indent = 4, ensure_ascii=False)

In [99]:
with codecs.open('data.json','r','utf8') as f :
    a =json.load(f)

In [101]:
a.keys()

[u'paper1', u'paper2']