In [1]:
import os
import sqlite3
import ujson

# connect with db

In [2]:

# Location of the SQLite database. Taken from the SQLITE_DB_PATH environment
# variable when set, otherwise ~/.watchtower/watchtower.db.
SQLITE_DB_PATH = os.getenv("SQLITE_DB_PATH", f"{os.path.expanduser('~')}/.watchtower/watchtower.db")

# connect with db
conn = sqlite3.connect(SQLITE_DB_PATH)
try:
    # fetch all literature records as raw row tuples plus their column names
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM literatures")
    literature_records = cursor.fetchall()
    headers = [description[0] for description in cursor.description]
finally:
    # Always release the connection, even when the query fails (e.g. the
    # `literatures` table is missing); otherwise it stays open in the kernel.
    conn.close()

# Turn each row into a {column name: value} dict and decode the JSON stored in
# the `content` column. Done after the connection is closed because it no
# longer needs the database.
records = [dict(zip(headers, record)) for record in literature_records]
for record in records:
    record["content"] = ujson.loads(record["content"])



In [62]:
from collections import defaultdict

# Maps a paper title -> list of titles of the stored records that cite it.
citation_dict = defaultdict(list)
print(f"records:{len(records)}")
for record in records:
    record_content = record["content"]
    # Register the stored paper itself so it appears even when nothing cites it.
    # setdefault only creates the empty list when the title is not present yet;
    # assigning `[]` unconditionally would discard citations already collected
    # from records processed earlier in this loop.
    citation_dict.setdefault(record_content["bibliography"]["title"], [])
    # NOTE(review): the citing paper is identified by record["title"] while the
    # paper itself is registered under content["bibliography"]["title"];
    # presumably these are the same string - confirm against the data.
    for citation in record_content["citations"].values():
        citation_dict[citation["title"]].append(record["title"])
print(len(citation_dict))

# sort citation_dict by citation count
citation_dict = dict(sorted(citation_dict.items(), key=lambda x: len(x[1]), reverse=True))

# show the 20 most cited titles with their citation counts
for key, value in list(citation_dict.items())[:20]:
    print(f"{key} : {len(value)}")
    



records:28
697
A neural probabilistic language model : 8
A unified architecture for natural language processing: Deep neural networks with multitask learning : 4
A scalable hierarchical distributed language model : 4
Continuous space language models : 4
Long short-term memory : 4
Hierarchical probabilistic neural network language model : 3
Recurrent neural network based language model : 3
Strategies for Training Large Scale Neural Network Language Models : 3
Statistical Language Models based on Neural Networks : 3
Distributed representations of words and phrases and their compositionality : 3
Effective self-training for parsing : 2
Connectionist language modeling for large vocabulary continuous speech recognition : 2
Class-based n-gram models of natural language : 2
Three new graphical models for statistical language modelling : 2
Distributional clustering of english words : 2
From frequency to meaning: Vector space models of semantics : 2
Indexing by latent semantic analysis : 2
Learn

In [69]:
# generate citation graph
import networkx as nx
import matplotlib.pyplot as plt

# draw graph with node size proportional to centrality
def draw_graph(G, pos, node_size, title):
    """Draw ``G`` with labels on a new 20x20 figure, set ``title`` and show it.

    Args:
        G: graph object handed to ``nx.draw``.
        pos: node layout handed to ``nx.draw`` as its second argument.
        node_size: node size value(s) forwarded to ``nx.draw``.
        title: text used as the figure title.
    """
    # Fixed look-and-feel of the plot; only the node size varies per call.
    draw_options = {
        "with_labels": True,
        "node_size": node_size,
        "font_size": 10,
        "font_color": "black",
        "font_weight": "bold",
        "node_color": "skyblue",
        "edge_color": "gray",
        "linewidths": 1,
        "width": 1,
        "alpha": 0.7,
    }

    plt.figure(figsize=(20, 20))
    nx.draw(G, pos, **draw_options)
    plt.title(title)
    plt.show()
    

# Directed citation graph over paper titles: one edge per (citing -> cited) pair
# taken from citation_dict (cited title -> list of citing titles).
G = nx.DiGraph()
for cited_title, citing_titles in citation_dict.items():
    for citing_title in citing_titles:
        G.add_edge(citing_title, cited_title)

# PageRank score per paper title, then re-ordered from highest to lowest score.
paper_scores = nx.pagerank(G)
paper_pagerank = dict(
    sorted(paper_scores.items(), key=lambda item: item[1], reverse=True)
)

# Print the 30 best-ranked titles with their score rounded to 6 decimals.
top_papers = list(paper_pagerank.items())[:30]
for paper_title, score in top_papers:
    print(f"{paper_title}: {round(score,6)}")


# # draw graph
# pos = nx.spring_layout(G)
# node_size = [paper_pagerank[node] * 10000 for node in G]
# draw_graph(G, pos, node_size, "Citation Graph")



A neural probabilistic language model: 0.001706
Continuous space language models: 0.001588
A scalable hierarchical distributed language model: 0.001557
Strategies for Training Large Scale Neural Network Language Models: 0.001551
Long short-term memory: 0.00155
Indexing by latent semantic analysis: 0.001548
Learning distributed representations of concepts: 0.001548
A unified architecture for natural language processing: Deep neural networks with multitask learning: 0.00154
Hierarchical probabilistic neural network language model: 0.001535
Recurrent neural network based language model: 0.001527
Statistical Language Models based on Neural Networks: 0.001523
Distributed representations of words and phrases and their compositionality: 0.001523
Distributional clustering of english words: 0.00152
Learning representations by backpropagating errors: 0.001508
Comparative evaluation of index languages, Part II; Results: 0.001506
The effectiveness of automatically-generated weights and links: 0.00

## Author rank

In [37]:
def _author_key(author):
    """Return the ``(first initial, surname)`` tuple used to identify an author.

    ``author`` is indexed as ``author["person_name"]`` with a required
    ``"surname"`` key and an optional ``"first_name"`` key.

    A missing ``first_name`` key yields the initial ``" "``; an empty or ``None``
    first name yields ``""``; an empty or ``None`` surname yields ``""``.
    Citing and cited authors both go through this function so the same person
    always maps to the same key.
    """
    person = author["person_name"]
    first_name = person.get("first_name", " ")
    surname = person["surname"]
    return (first_name[0] if first_name else "", surname if surname else "")


# Maps a cited author key -> flat list of the author keys of every stored paper
# citing them (one entry per citing author, per citation).
author_citation_dict = defaultdict(list)
for record in records:
    record_content = record["content"]
    citing_authors = [_author_key(auth) for auth in record_content["bibliography"]["authors"]]
    for citation in record_content["citations"].values():
        cited_authors = [_author_key(auth) for auth in citation["authors"]]
        for cited_author in cited_authors:
            author_citation_dict[cited_author].extend(citing_authors)

## Citation count

In [58]:
# Order cited authors by how many citing-author entries were collected for them
# (largest first) and keep the mapping in that order.
ranked_author_items = sorted(
    author_citation_dict.items(),
    key=lambda item: len(item[1]),
    reverse=True,
)
author_citation_dict = dict(ranked_author_items)

# Show the 20 authors with the most entries.
for author_key, citing_entries in ranked_author_items[:20]:
    print(f"{author_key} : {len(citing_entries)}")
    

('Y', 'Bengio') : 223
('T', 'Mikolov') : 112
('G', 'Hinton') : 97
('I', 'Sutskever') : 76
('C', 'Manning') : 68
('R', 'Socher') : 68
('J', 'Weston') : 65
('H', 'Schwenk') : 57
('H', 'Chen') : 52
('N', 'Zhang') : 52
('Q', 'Le') : 50
('J', 'Dean') : 47
('A', 'Ng') : 46
('K', 'Cho') : 41
('L', 'Burget') : 40
('K', 'Liu') : 40
('A', 'Bordes') : 39
('R', 'Pascanu') : 38
('R', 'Collobert') : 37
('O', 'Vinyals') : 36


## Author PageRank

In [70]:
# Directed author graph: one edge from each citing author to each author they
# cite, taken from author_citation_dict (cited author -> list of citing authors).
G = nx.DiGraph()
for cited_author_key, citing_author_keys in author_citation_dict.items():
    for citing_author_key in citing_author_keys:
        G.add_edge(citing_author_key, cited_author_key)

# PageRank score per author, then re-ordered from highest to lowest score.
author_scores = nx.pagerank(G)
pagerank = dict(
    sorted(author_scores.items(), key=lambda item: item[1], reverse=True)
)

# Print the 30 best-ranked authors with their score rounded to 6 decimals.
top_authors = list(pagerank.items())[:30]
for author_key, score in top_authors:
    print(f"{author_key}: {round(score, 6)}")

# # draw graph
# pos = nx.spring_layout(G)
# node_size = [pagerank[node] * 10000 for node in G]
# draw_graph(G, pos, node_size, "Author Citation Graph")


('Y', 'Bengio'): 0.000827
('G', 'Hinton'): 0.000815
('J', 'Weston'): 0.000792
('C', 'Manning'): 0.000766
('H', 'Schwenk'): 0.000754
('J', 'Dean'): 0.000745
('R', 'Socher'): 0.000744
('R', 'Collobert'): 0.000743
('T', 'Mikolov'): 0.00074
('I', 'Sutskever'): 0.000738
('A', 'Bordes'): 0.000734
('A', 'Ng'): 0.000717
('P', 'Vincent'): 0.000713
('R', 'Ducharme'): 0.000712
('G', 'Corrado'): 0.000711
('X', 'Glorot'): 0.000706
('P', 'Turney'): 0.000705
('K', 'Chen'): 0.000705
('J', 'Turian'): 0.000701
('R', 'Harshman'): 0.000683
('R', 'Salakhutdinov'): 0.000679
('C', 'Xiong'): 0.000675
('S', 'Bengio'): 0.000674
('M', 'Zhang'): 0.000673
('N', 'Usunier'): 0.000666
('C', 'Lin'): 0.000665
('L', 'Burget'): 0.00066
('L', 'Ratinov'): 0.00066
('A', 'Mnih'): 0.00066
('J', 'Schmidhuber'): 0.000655
