In [27]:
import requests
import json
import re

from matplotlib import pyplot as plt
import numpy as np

In [10]:
def listToDict(l):
    """Count how many times each element occurs in ``l``.

    Args:
        l: Iterable of hashable items.

    Returns:
        dict mapping each distinct item to its number of occurrences.
    """
    counts = {}
    for item in l:
        # .get() supplies 0 the first time an item is seen.
        counts[item] = counts.get(item, 0) + 1
    return counts

In [11]:
from stop_words import get_stop_words

# Stop-word collection for language code 'en'; remove_stop_words() reads this module-level name.
stop_words = get_stop_words('en')

def remove_stop_words(d):
    """Filter a word-count mapping, dropping stop words and one-character keys.

    Args:
        d: dict mapping word -> count.

    Returns:
        A new dict holding only the entries whose key is not in the
        module-level ``stop_words`` collection and is longer than one
        character. ``d`` itself is not modified.
    """
    return {
        word: count
        for word, count in d.items()
        if word not in stop_words and len(word) > 1
    }

In [37]:
import operator

def get_word_freq(text):
    """Compute word frequencies for a piece of text.

    The text is split on whitespace; every token is lower-cased, stripped of
    characters that are neither word characters nor whitespace, and then
    stripped of digits. The cleaned tokens are counted with ``listToDict`` and
    filtered with ``remove_stop_words``.

    Args:
        text: The string to analyse.

    Returns:
        list of ``(word, count)`` tuples ordered by count, highest first
        (an ascending sort by count, reversed).
    """
    cleaned = []
    for raw in text.split():
        token = raw.lower()
        token = re.sub(r'[^\w\s]', '', token)
        token = re.sub(r'[\d]', '', token)
        cleaned.append(token)

    counts = remove_stop_words(listToDict(cleaned))

    # Sort ascending, then reverse in place: same ordering (including ties)
    # as reversing the sorted sequence.
    ranked = sorted(counts.items(), key=lambda pair: pair[1])
    ranked.reverse()
    return ranked

In [13]:
def get_abstract_word_freq(paper):
    """Word frequencies of the text stored at ``paper['paperAbstract']['text']``.

    Args:
        paper: Mapping with a ``'paperAbstract'`` entry that itself has a
            ``'text'`` entry.

    Returns:
        Whatever ``get_word_freq`` returns for that text.
    """
    return get_word_freq(paper['paperAbstract']['text'])

In [14]:
import urllib.request
    
def download_PDF(paper):
    """Download the PDF of ``paper`` to the location given by ``get_path(paper)``.

    The URL is taken from the last entry of ``paper['links']`` whose
    ``'linkType'`` is ``'s2'``. The file is first written to a temporary
    ``.part`` path and only moved into place once the download has finished,
    so an interrupted transfer never leaves a truncated file at the final
    path (callers use the existence of that path as a "already downloaded"
    check).

    Args:
        paper: Mapping with ``'hasPdf'``, ``'links'`` and whatever keys
            ``get_path`` reads.

    Raises:
        Exception: If ``paper['hasPdf']`` is falsy, or if no ``'s2'`` link
            with a non-empty URL is present.
        Any error raised by ``urllib.request.urlretrieve`` or the file-system
        calls is propagated.
    """
    import os

    if not paper['hasPdf']:
        raise Exception("no paper for this document")

    url = ""
    for link in paper['links']:
        if link['linkType'] == 's2':
            url = link['url']
    if not url:
        # Previously an empty URL was passed straight to urlretrieve, which
        # failed with an unrelated "unknown url type" error.
        raise Exception("no s2 link for this document")
    print(url)

    target = get_path(paper)
    target_dir = os.path.dirname(target)
    if target_dir:
        # The output directory is not guaranteed to exist on a fresh checkout.
        os.makedirs(target_dir, exist_ok=True)

    partial = target + ".part"
    try:
        urllib.request.urlretrieve(url, partial)
        os.replace(partial, target)
    finally:
        # Only still present when the download or the move failed.
        if os.path.exists(partial):
            os.remove(partial)
    

In [15]:
def get_path(paper):
    """Return the relative path of the PDF file for ``paper``.

    Args:
        paper: Mapping with a string ``'slug'`` entry.

    Returns:
        ``"papers/<slug>.pdf"``.
    """
    slug = paper['slug']
    return "".join(("papers/", slug, ".pdf"))

In [16]:
import textract

def get_file_word_freq(paper):
    """Word frequencies of the PDF stored at ``get_path(paper)``.

    The file is passed to ``textract.process`` with ``method='pdftotext'``,
    the returned bytes are decoded as UTF-8, and the resulting string is
    handed to ``get_word_freq``.

    Args:
        paper: Mapping accepted by ``get_path``.

    Returns:
        The result of ``get_word_freq`` for the extracted text.
    """
    pdf_path = get_path(paper)
    raw_bytes = textract.process(pdf_path, method='pdftotext')
    return get_word_freq(raw_bytes.decode("utf-8"))
    

In [18]:
import pandas as pd

def get_paper_data_frame(paper, label_len=5):
    """Build a one-column DataFrame of word counts for a single paper.

    Args:
        paper: Mapping with a ``'slug'`` entry plus whatever
            ``get_file_word_freq`` needs.
        label_len: Number of leading characters of ``paper['slug']`` used as
            the column label. The default of 5 keeps the original behaviour,
            but such short prefixes collide easily (several papers end up
            with the same column label); pass ``None`` to use the full slug.

    Returns:
        pandas.DataFrame indexed by word, with a single column holding the
        counts, in the order returned by ``get_file_word_freq``.
    """
    word_freq = get_file_word_freq(paper)
    words = [word for word, _ in word_freq]
    counts = [count for _, count in word_freq]
    column = paper['slug'][:label_len]
    return pd.DataFrame(counts, index=words, columns=[column])

In [24]:
import os.path

def get_results_data_frame(results):
    """Combine the per-paper word-count frames of ``results`` column-wise.

    For every result the PDF is downloaded unless a file already exists at
    ``get_path(result)``, then its word-count frame is built with
    ``get_paper_data_frame``. A result that raises at any of these steps is
    reported with ``print`` (error and slug) and skipped, so one bad paper
    does not abort the whole run.

    Args:
        results: Iterable of paper mappings.

    Returns:
        pandas.DataFrame with one column per successfully processed paper,
        aligned on the word index; an empty DataFrame when nothing could be
        processed.
    """
    frames = []
    for result in results:
        try:
            if not os.path.exists(get_path(result)):
                download_PDF(result)
            frames.append(get_paper_data_frame(result))
        except Exception as e:
            # Deliberate best-effort: log and continue with the next paper.
            print(e, result['slug'])

    if not frames:
        # pd.concat refuses an empty list.
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame per paper.
    return pd.concat(frames, axis=1)

In [21]:
def clean_and_add_sum(df):
    """Replace missing values with 0, append a row-total column, sort by it.

    Args:
        df: pandas.DataFrame of numeric counts; missing cells are treated
            as 0. The input frame is not modified.

    Returns:
        A new DataFrame with the original columns followed by a ``'sum'``
        column holding each row's total, ordered by ``'sum'`` descending.
        An empty input yields an empty frame that still has the ``'sum'``
        column.
    """
    filled = df.fillna(0)
    # Vectorized row totals instead of a Python-level sum() per row.
    totals = filled.sum(axis=1).rename('sum')
    combined = pd.concat([filled, totals], axis=1)
    return combined.sort_values('sum', ascending=False)

In [29]:
def author_word_freq(name, page_size=20, timeout=30):
    """Word-frequency table over the papers returned by a search for ``name``.

    Posts a relevance-sorted search query to the semanticscholar.org search
    endpoint, builds the per-paper word-count frame for the returned results
    with ``get_results_data_frame`` and finishes it with
    ``clean_and_add_sum``.

    Args:
        name: Query string sent to the search endpoint.
        page_size: Number of results requested (first page only).
            Defaults to 20, the previously hard-coded value.
        timeout: Seconds to wait for the HTTP response before giving up;
            without it a stalled connection would block the notebook
            indefinitely.

    Returns:
        The DataFrame produced by ``clean_and_add_sum``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
        KeyError: If the response JSON has no ``'results'`` entry.
    """
    params = {
        "autoEnableFilters": True,
        "queryString": name,
        "page": 1,
        "pageSize": page_size,
        "sort": "relevance",
        "authors": [],
        "coAuthors": [],
        "venues": [],
        "facets": {},
        "yearFilter": None,
    }
    headers = {"Content-Type": "application/json"}

    r = requests.post(
        'https://www.semanticscholar.org/api/1/search',
        data=json.dumps(params),
        headers=headers,
        timeout=timeout,
    )
    # Fail here with the HTTP status rather than later while parsing an
    # error page as JSON.
    r.raise_for_status()
    data = r.json()

    df = get_results_data_frame(data['results'])
    return clean_and_add_sum(df)

In [40]:
# Word-frequency table for the papers returned by searching this name (performs network and file I/O).
motahari = author_word_freq('Abolfazl S Motahari')

no paper for this document On-the-degrees-of-freedom-of-X-channel-with-Ghasemi-Motahari
no paper for this document Relay-aided-Interference-Alignment-for-the-quasi-Nourani-Motahari
no paper for this document Relay-aided-interference-alignment-for-the-quasi-Nourani-Motahari
no paper for this document On-the-Degrees-of-Freedom-of-the-3-user-Gaussian-Motahari-Khandani


In [44]:
# Show the top rows; author_word_freq returns the frame ordered by the 'sum' column, highest first.
motahari.head(n=20)

Unnamed: 0,Commu,Capac,Real-,Inter,Real-.1,Inter.1,Commu.1,On-th,On-th.1,Commu.2,Multi,The-S,Formi,Infor,To-De,The-S.1,sum
channel,89.0,81.0,160.0,61.0,106.0,91.0,16.0,78.0,91.0,74.0,5.0,59.0,78.0,1.0,70.0,123.0,1183.0
can,21.0,88.0,156.0,63.0,114.0,68.0,10.0,42.0,45.0,23.0,81.0,10.0,76.0,53.0,62.0,29.0,941.0
two,133.0,43.0,29.0,28.0,30.0,104.0,34.0,14.0,5.0,125.0,30.0,11.0,7.0,63.0,16.0,16.0,688.0
receiver,92.0,24.0,85.0,30.0,56.0,120.0,25.0,0.0,36.0,90.0,0.0,16.0,39.0,0.0,59.0,6.0,678.0
interference,56.0,36.0,99.0,80.0,91.0,52.0,33.0,2.0,7.0,50.0,9.0,2.0,77.0,0.0,33.0,0.0,627.0
one,98.0,36.0,56.0,13.0,47.0,80.0,35.0,13.0,13.0,92.0,23.0,10.0,25.0,30.0,15.0,9.0,595.0
gaussian,7.0,153.0,19.0,47.0,19.0,31.0,1.0,8.0,69.0,7.0,0.0,11.0,15.0,0.0,80.0,74.0,541.0
region,4.0,130.0,20.0,6.0,10.0,126.0,0.0,8.0,17.0,3.0,0.0,28.0,9.0,1.0,45.0,98.0,505.0
data,71.0,12.0,84.0,22.0,92.0,4.0,26.0,11.0,0.0,66.0,17.0,1.0,47.0,11.0,35.0,1.0,500.0
achievable,13.0,53.0,40.0,27.0,48.0,79.0,11.0,42.0,34.0,11.0,15.0,13.0,26.0,0.0,52.0,16.0,480.0


In [42]:
# Word-frequency table for the papers returned by searching this name (performs network and file I/O).
madahali = author_word_freq('Mohammad Ali Maddah-Ali')

no paper for this document Decentralized-Caching-Attains-Order-Optimal-Memory-Maddah-Ali-Niesen
no paper for this document Broadcast-in-MIMO-Systems-Based-on-a-Generalized-Maddah-Ali-Sadrabadi


In [45]:
# Show the top rows; author_word_freq returns the frame ordered by the 'sum' column, highest first.
madahali.head(n=20)

Unnamed: 0,Commu,Funda,Compl,Real-,Coded,Decen,On-th,Onlin,Hiera,Inter,Capac,Commu.1,Cache,Commu.2,Funda.1,Throu,Formi,An-Ef,sum
channel,89.0,0.0,100.0,160.0,4.0,0.0,118.0,0.0,0.0,227.0,162.0,16.0,82.0,74.0,22.0,24.0,78.0,43.0,1199.0
can,21.0,2.0,77.0,156.0,46.0,48.0,27.0,18.0,36.0,103.0,191.0,10.0,32.0,23.0,83.0,20.0,76.0,9.0,978.0
receiver,92.0,0.0,134.0,85.0,1.0,0.0,89.0,0.0,0.0,120.0,66.0,25.0,64.0,90.0,67.0,5.0,39.0,2.0,879.0
one,98.0,0.0,53.0,56.0,21.0,39.0,34.0,11.0,25.0,88.0,88.0,35.0,58.0,92.0,13.0,3.0,25.0,19.0,758.0
scheme,77.0,1.0,52.0,20.0,70.0,80.0,42.0,40.0,99.0,49.0,28.0,37.0,23.0,75.0,28.0,3.0,16.0,14.0,754.0
two,133.0,0.0,49.0,29.0,27.0,37.0,20.0,14.0,40.0,96.0,69.0,34.0,53.0,125.0,8.0,0.0,7.0,10.0,751.0
interference,56.0,0.0,21.0,99.0,0.0,0.0,71.0,0.0,0.0,72.0,103.0,33.0,77.0,50.0,43.0,4.0,77.0,22.0,728.0
transmitter,79.0,0.0,79.0,40.0,1.0,0.0,19.0,0.0,0.0,49.0,77.0,24.0,57.0,77.0,47.0,7.0,15.0,5.0,576.0
bits,0.0,0.0,0.0,0.0,33.0,32.0,0.0,21.0,16.0,61.0,389.0,0.0,7.0,0.0,2.0,0.0,0.0,0.0,561.0
receivers,32.0,0.0,117.0,56.0,0.0,0.0,27.0,0.0,0.0,36.0,91.0,17.0,24.0,30.0,99.0,6.0,9.0,0.0,544.0


In [51]:
# Word-frequency table for the papers returned by searching this name (performs network and file I/O).
nashtali = author_word_freq('Damoun Nashta-ali')

In [52]:
# Show the top rows; author_word_freq returns the frame ordered by the 'sum' column, highest first.
nashtali.head(n = 20)

Unnamed: 0,Funda,Break,sum
reads,62.0,143.0,205.0
read,85.0,102.0,187.0
genome,42.0,127.0,169.0
can,107.0,46.0,153.0
error,115.0,35.0,150.0
sequencing,72.0,68.0,140.0
bases,0.0,118.0,118.0
length,46.0,61.0,107.0
dna,94.0,10.0,104.0
one,76.0,22.0,98.0


In [53]:
from sklearn.decomposition import PCA

ImportError: No module named 'sklearn'