In [1]:
import numpy as np
import glob
import open_PDF
import os
import gensim
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import nltk
from tqdm.notebook import tqdm
import pandas
from shutil import copyfile
import networkx as nx
import shutil
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import community.community_louvain as community_louvain
# Fetch the WordNet corpus into the local nltk_data directory
# (nothing is re-downloaded when the package is already up to date).
nltk.download('wordnet')
# Notebook-level Porter stemmer instance.
stemmer = PorterStemmer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gustaveronteix/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import utils
import open_PDF
import simple_sort
import recursive_sort

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gustaveronteix/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Smoke test: extract the topics of a single paper.
PDF_DIR = r'test_data'

# Sort the matches so that the same file is picked on every run (glob order
# depends on the filesystem), and fail with an explicit message rather than a
# bare IndexError when the directory contains no PDF at all.
pdf_paths = sorted(glob.glob(os.path.join(PDF_DIR, '*.pdf')))
if not pdf_paths:
    raise FileNotFoundError("No PDF file found in {!r}".format(PDF_DIR))
fname = pdf_paths[0]

text = open_PDF.open_PDF_tika(fname)

# Kept as the last expression so the topics are rendered as the cell output.
utils.get_topics_paper(text,
                       num_words=5,
                       num_topics=5)

We now have the different topics per paper. We generate a dict where we store:

 - paper name
 - position
 - list of topics
 
We then generate a networkx graph object in which each paper is a node carrying its topics as attributes. Papers that share topics are linked together, and the weight of an edge increases with each additional keyword the two papers have in common.

Once the network is generated, we run the Louvain community-detection algorithm to group the papers based on their common topics.

In [None]:
# Directory handed to make_dict_from_papers.
PDF_DIR = r'test_data_bis'

# Build the per-paper dictionary, requesting 5 topics of 5 words each.
paper_dictionnary = utils.make_dict_from_papers(
    PDF_DIR,
    num_words=5,
    num_topics=5,
)

In [None]:
# The notebook targets Python 3, where the C-accelerated pickler is already
# what `pickle` provides; the former `cPickle` fallback was dead code.
import pickle

# Uncomment to persist the dictionary to 'paper_dictionnary.p'.
#with open('paper_dictionnary.p', 'wb') as fp:
#    pickle.dump(paper_dictionnary, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Build the paper graph from the per-paper dictionary; its edges carry a
# 'weight' attribute, which the plotting cell below reads as line width.
G = utils.make_graph_from_dict(paper_dictionnary)

In [None]:
# Kamada-Kawai layout of the raw graph; each edge is drawn with a line width
# equal to its 'weight' attribute.
pos = nx.kamada_kawai_layout(G)
edgewidth = [G[u][v]['weight'] for (u, v) in G.edges()]

# Edges, then nodes.
nx.draw_networkx_edges(G, pos, width=edgewidth, edge_color="m", alpha=0.3)
nx.draw_networkx_nodes(G, pos, node_color="#210070", alpha=0.9)

# Optional node labels (disabled):
#label_options = {"ec": "k", "fc": "white", "alpha": 0.7}
#nx.draw_networkx_labels(G, pos,
#                        font_size=9, 
#                        bbox=label_options)

In [None]:
# Louvain community detection on the raw graph: maps each node to a community id.
partition = community_louvain.best_partition(G)

pos = nx.kamada_kawai_layout(G)
# Colour the nodes according to their community, one colour per community.
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` accepts the same (name, lut) arguments and is still supported.
cmap = plt.get_cmap('viridis', max(partition.values()) + 1)
nx.draw_networkx_nodes(G, pos, partition.keys(), node_size=40,
                       cmap=cmap, node_color=list(partition.values()))
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()

We now look for the most common words, measured by the percentage of nodes (papers) that express each word. If more than $x\%$ of the nodes express a word, it is too generic to discriminate between papers, so we remove it.

In [None]:
# Gather the corpus word statistics and derive the list of overly common words.
# NOTE(review): both helpers used to be called without a module prefix, which
# raises NameError on a fresh kernel because no cell imports them. They are
# presumably defined in `utils`, next to make_graph_from_dict -- confirm.
word_count = utils.make_word_list(paper_dictionnary)
# NOTE(review): 20 is the threshold handed to get_most_common_words; whether it
# is a count or a percentage is not visible from here -- confirm.
no_keywords = utils.get_most_common_words(word_count, 20)
# Hand-picked additions to the automatically detected words.
no_keywords += ['http', 'ncbi', 'experi', 'pubm', 'elsevi', 'refhub', 'cell']

In [None]:
# Rebuild the graph, this time passing the no_keywords list.
# NOTE(review): presumably make_graph_from_dict ignores these words when
# linking papers -- confirm in utils.
G_cleaned = utils.make_graph_from_dict(paper_dictionnary, no_keywords)

In [None]:
# Same drawing as for the raw graph, applied to the cleaned graph: each edge
# is drawn with a line width equal to its 'weight' attribute.
pos = nx.kamada_kawai_layout(G_cleaned)
edgewidth = [G_cleaned[u][v]['weight'] for (u, v) in G_cleaned.edges()]

# Edges, then nodes.
nx.draw_networkx_edges(G_cleaned, pos, width=edgewidth, edge_color="m", alpha=0.3)
nx.draw_networkx_nodes(G_cleaned, pos, node_color="#210070", alpha=0.9)

In [None]:
# Louvain community detection on the cleaned graph (resolution 1.1): maps each
# node to a community id.
partition = community_louvain.best_partition(G_cleaned,
                                             resolution=1.1)

pos = nx.kamada_kawai_layout(G_cleaned)

# Colour the nodes according to their community, one colour per community.
# `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` accepts the same (name, lut) arguments and is still supported.
cmap = plt.get_cmap('Set2', max(partition.values()) + 1)
nx.draw_networkx_nodes(G_cleaned,
                       pos,
                       partition.keys(),
                       node_size=40,
                       cmap=cmap, node_color=list(partition.values()))
nx.draw_networkx_edges(G_cleaned,
                       pos,
                       alpha=0.5)
plt.show()

In [None]:
# Run Louvain once more (resolution 1.1) and record, on every node of the
# cleaned graph, the id of the community it was assigned to.
partition = community_louvain.best_partition(G_cleaned, resolution=1.1)

for node, attributes in G_cleaned.nodes(data=True):
    attributes['partition'] = partition[node]

We have the partition of the nodes. What we now want to do is:
 - associate a probability for each word to be in each community
 - find the relatively most probable words between the communities
 - associate each community with these most probable words
 
Then within each community, build the subgraph with the most probable words removed and repeat the procedure.

Test: sorting the papers from a saved dictionary

In [3]:
# --- Paths -------------------------------------------------------------------
SORTED_DIR = r'sorted_test_data_dict'
PDF_DIR = r'test_data'
DICTNAME = 'paper_dictionnary.p'

# --- Tuning parameters forwarded to the sort ---------------------------------
n_largest_names = 2
partition_resolution = 0.9
n_largest_description = 9
max_common_words = 2

# Word stems handed to the sort as `no_keywords`.
# ('dataset' appears twice; the list is reproduced as is.)
no_keywords = [
    'http', 'ncbi', 'experi', 'biorxiv', 'pubm',
    'elsevi', 'refhub', 'dataset', 'licens', 'grant',
    'holder', 'preprint', 'copyright', 'dataset', 'funder',
    'intern', 'ncem', 'requir', 'creativecommon', 'certifi',
]

# Single-pass sort driven by the pickled paper dictionary.
simple_sort.sort_papers_from_dict(
    DICTNAME,
    PDF_DIR,
    SORTED_DIR,
    max_common_words,
    n_largest_names,
    partition_resolution,
    n_largest_description,
    no_keywords=no_keywords,
)

In [None]:
# --- Paths -------------------------------------------------------------------
PDF_DIR = r'test_data_bis'
SORTED_DIR = r'sorted_data_bis'
DICTDIR = r'dict_data_bis'

# --- Tuning parameters -------------------------------------------------------
# These variables are now the single source of truth for the call below.
# Previously they were assigned but never used, and the call hard-coded its own
# literals (n_largest_description was 9 here but 10 in the call). The value 10
# is kept because it is what the call actually received.
n_largest_names = 2
partition_resolution = 0.9
n_largest_description = 10

# Full pipeline: reads the PDFs in PDF_DIR, sorts them into SORTED_DIR, and
# (SAVEDICT=True) saves the paper dictionary under DICTDIR.
simple_sort.sort_papers_based_on_contents(PDF_DIR,
                              SORTED_DIR,
                              max_common_words = 2,
                              num_words = 4,
                              num_topics = 4,
                              n_largest_names = n_largest_names,
                              partition_resolution = partition_resolution,
                              n_largest_description = n_largest_description,
                              SAVEDICT = True,
                              DICTDIR = DICTDIR)

 50%|█████     | 1/2 [00:15<00:15, 15.74s/it]

In [5]:
# --- Paths -------------------------------------------------------------------
# NOTE(review): 'recusrive' looks like a misspelling of 'recursive'; it is kept
# unchanged because it is the name of the output directory.
SORTED_DIR = r'recusrive_sort_test_data_dict'
PDF_DIR = r'test_data'
DICTNAME = 'paper_dictionnary.p'

# --- Tuning parameters forwarded to the sort ---------------------------------
n_largest_names = 2
partition_resolution = 0.85
n_largest_description = 9
max_common_words = 2
min_graph_size = 25

# --- Recursion control -------------------------------------------------------
iteration = 0
max_depth = 4

# Word stems handed to the sort as `no_keywords`.
# ('dataset' appears twice; the list is reproduced as is.)
no_keywords = [
    'http', 'ncbi', 'experi', 'biorxiv', 'pubm',
    'elsevi', 'refhub', 'dataset', 'licens', 'grant',
    'holder', 'preprint', 'copyright', 'dataset', 'funder',
    'intern', 'ncem', 'requir', 'creativecommon', 'certifi',
    'version', 'fund', 'research',
]

# Recursive sort driven by the pickled paper dictionary.
recursive_sort.recursive_sort_from_dict(
    DICTNAME,
    PDF_DIR,
    SORTED_DIR,
    max_common_words,
    n_largest_names,
    partition_resolution,
    n_largest_description,
    min_graph_size,
    no_keywords=no_keywords,
    iteration=iteration,
    max_depth=max_depth,
)
    