In [16]:
import numpy as np
import PyPDF2
import glob
import open_PDF
import os
import gensim
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import nltk
from tqdm.notebook import tqdm
import pandas
from shutil import copyfile
import networkx as nx
import shutil
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import community.community_louvain as community_louvain
nltk.download('wordnet')
stemmer = PorterStemmer()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gustaveronteix/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [86]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    
    result=[]
    
    for token in gensim.utils.simple_preprocess(text):
        
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
            
    return result

def get_topics_paper(text,
                     num_topics = 10,
                     num_words = 2):
    
    text = preprocess(text)
    dictionary = gensim.corpora.Dictionary(np.array([text]))

    # generate BOW
    bow_corpus = [dictionary.doc2bow([doc]) for doc in text]

    lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics = num_topics, 
                                       id2word = dictionary,                                    
                                       passes = 10,
                                       workers = 2)

    lda_topics = lda_model.show_topics(num_words=num_words)

    keywords = []
    filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

    for topic in lda_topics:
        for word in preprocess_string(topic[1], filters):        
            keywords.append(word)
        
    return keywords

In [90]:
PDF_DIR = r'test_data'

import utils

fname = glob.glob(os.path.join(PDF_DIR, '*.pdf'))[0]

text = open_PDF.open_PDF_tika(fname)

utils.get_topics_paper(text, 
                 num_words = 5, 
                 num_topics = 5)

AttributeError: module 'utils' has no attribute 'get_topics_paper'

We now have the different topics per paper. We generate a dict where we store:

 - paper name
 - position
 - list of topics
 
We then generate a networkx graph object. The papers that have similar topics as attributes are linked together and the edges are reinforced with each new common keyword.

Once the network is generated, we run Louvain community building algorithm to sort the different papers based on their common topics.

In [82]:
def make_dict_from_papers(PDF_DIR,
                          num_words,
                          num_topics):

    paper_dictionnary = {}
    num_words = 5
    num_topics = 5

    error_files = []

    for fname in tqdm(glob.glob(os.path.join(PDF_DIR, '*.pdf'))):

        try:

            text = open_PDF.open_PDF_tika(fname)
            keywords = get_topics_paper(text, 
                         num_words = num_words, 
                         num_topics = num_topics)

            paper_name = os.path.basename(fname)
            dir_name = os.path.dirname(fname)

            paper_dictionnary[paper_name] = {}
            paper_dictionnary[paper_name]['directory'] = dir_name
            paper_dictionnary[paper_name]['full_path'] = fname
            paper_dictionnary[paper_name]['keywords'] = keywords

        except:

            error_files.append(fname)
           
        if len(error_files)>0:
            print('Error for: ')
            print(error_files)
            
    return paper_dictionnary

In [None]:
try:
    import cPickle as pickle
except ImportError:  # Python 3.x
    import pickle

with open('paper_dictionnary.p', 'wb') as fp:
    pickle.dump(paper_dictionnary, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
def make_graph_from_dict(paper_dictionnary, 
                         no_keywords = ['http', 'ncbi', 'experi', 'pubm', 'elsevi', 'refhub', 'cell']):
    
    G = nx.Graph()
    G.add_nodes_from(paper_dictionnary.keys())

    for key_n in paper_dictionnary.keys():

        for keyword in paper_dictionnary[key_n]['keywords']:

            # explore other articles
            for article in paper_dictionnary.keys():

                # not current article
                if article != key_n:

                    # if keyword in both articles then link
                    if (keyword in paper_dictionnary[article]['keywords']) & (keyword not in no_keywords):

                        edge_info = G.get_edge_data(article, key_n)
                        if edge_info is None:
                            G.add_edge(article,key_n, weight = 1)
                            G[article][key_n]['keywords'] = [keyword]
                        elif (keyword not in edge_info['keywords']):
                            w = edge_info['weight']
                            G.add_edge(article,key_n, weight = w+1)
                            G[article][key_n]['keywords'] += [keyword]
    return G

In [None]:
G = make_graph_from_dict(paper_dictionnary)

In [None]:
pos = nx.kamada_kawai_layout(G)
edgewidth = [G.get_edge_data(u, v)['weight'] for u, v in G.edges()]

nx.draw_networkx_edges(G, pos, 
                       alpha=0.3, 
                       width=edgewidth, 
                       edge_color="m")
nx.draw_networkx_nodes(G, pos, 
                       node_color="#210070", 
                       alpha=0.9)

#label_options = {"ec": "k", "fc": "white", "alpha": 0.7}e
#nx.draw_networkx_labels(G, pos,
#                        font_size=9, 
#                        bbox=label_options)

In [None]:
partition = community_louvain.best_partition(G)

pos = nx.kamada_kawai_layout(G)
# color the nodes according to their partition
cmap = cm.get_cmap('viridis', max(partition.values()) + 1)
nx.draw_networkx_nodes(G, pos, partition.keys(), node_size=40, 
                        cmap=cmap, node_color=list(partition.values()))
nx.draw_networkx_edges(G, pos, alpha=0.5)
plt.show()

In [7]:
from collections import Counter

def get_all_keywords(paper_dictionnary):
    
    keyword_list = []
    
    for paper in paper_dictionnary.keys():
        
        keywords_paper = paper_dictionnary[paper]['keywords']
        
        keyword_list += keywords_paper
        
    return keyword_list

def make_word_list(paper_dictionnary):
    
    keyword_list = get_all_keywords(paper_dictionnary)
        
    return Counter(keyword_list)

def get_most_common_words(word_count, n):
    
    common_word_list = []
    
    most_common_words = word_count.most_common(n)
    
    for word_tuple in most_common_words:
        
        common_word_list.append(word_tuple[0])
                
    return common_word_list

Most common words by percentage of nodes that express this word! If more than $x\%$ of the nodes express it then remove the word.

In [None]:
word_count = make_word_list(paper_dictionnary)
no_keywords = get_most_common_words(word_count, 20)
no_keywords += ['http', 'ncbi', 'experi', 'pubm', 'elsevi', 'refhub', 'cell']

In [None]:
G_cleaned = make_graph_from_dict(paper_dictionnary, no_keywords)

In [None]:
pos = nx.kamada_kawai_layout(G_cleaned)
edgewidth = [G_cleaned.get_edge_data(u, v)['weight'] for u, v in G_cleaned.edges()]

nx.draw_networkx_edges(G_cleaned, pos, 
                       alpha=0.3, 
                       width=edgewidth, 
                       edge_color="m")
nx.draw_networkx_nodes(G_cleaned, pos, 
                       node_color="#210070", 
                       alpha=0.9)

In [None]:
partition = community_louvain.best_partition(G_cleaned, 
                                             resolution = 1.1)

pos = nx.kamada_kawai_layout(G_cleaned)

# color the nodes according to their partition
cmap = cm.get_cmap('Set2', max(partition.values()) + 1)
nx.draw_networkx_nodes(G_cleaned, 
                       pos, 
                       partition.keys(), 
                       node_size=40, 
                       cmap=cmap, node_color=list(partition.values()))
nx.draw_networkx_edges(G_cleaned, 
                       pos, 
                       alpha=0.5)
plt.show()

In [None]:
partition = community_louvain.best_partition(G_cleaned, 
                                             resolution = 1.1)

for node in G_cleaned.nodes():
    
    G_cleaned.nodes[node]['partition'] = partition[node]

We have the partition of the nodes. What we now want to do is:
 - associate a probability for each word to be in each community
 - find the relatively most probable words between the communities
 - associate each community with these most probable words
 
Then within each community, build the subgraph with the most probable words removed and repeat the procedure.

In [78]:
def get_all_the_words(G):
    
    word_list = []
    
    for u,v,d in G.edges(data = True):
        
        word_list += d['keywords']
    
    return Counter(word_list)

def get_word_prob(counter_dict):
        
    total_word_number = np.sum([counter_dict[k] for k in counter_dict.keys()])
    
    counter_frame = pandas.DataFrame()
    i = 0
    
    for word in counter_dict.keys():
        
        counter_frame.loc[word, 'prob'] = counter_dict[word]/total_word_number

    return counter_frame

def get_subgraph(G, attribute, attribute_value):

    node_list = []

    for x, y in G.nodes(data=True):

        if (y[attribute] == attribute_value):

            node_list.append(x)

    return G.subgraph(node_list) 
    
def prob_in_communities(G):
    
    partition = nx.get_node_attributes(G, 'partition')
    partition_list = np.unique([partition[k] for k in partition.keys()])
    
    counter_dict = get_all_the_words(G)
    tot_graph_counter_frame = get_word_prob(counter_dict)
    
    tot_graph_counter_frame.columns = ['whole_graph']
    
    for partition_value in partition_list:
        
        subgraph = get_subgraph(G, 'partition', partition_value)
                
        counter_dict = get_all_the_words(subgraph)
        counter_frame = get_word_prob(counter_dict)
        
        if not counter_frame.empty:
            counter_frame.columns = [str(int(partition_value))]
        
            tot_graph_counter_frame = tot_graph_counter_frame.merge(counter_frame,
                                                                left_index=True,
                                                                right_index=True,
                                                                how = 'outer')
            
        else:
            
            counter_frame = pandas.DataFrame(data = np.zeros(len(tot_graph_counter_frame)))
            counter_frame.columns = [str(int(partition_value))]
            counter_frame.index = tot_graph_counter_frame.index.values
            
            tot_graph_counter_frame = tot_graph_counter_frame.merge(counter_frame,
                                                                left_index=True,
                                                                right_index=True,
                                                                how = 'outer')
            
    return tot_graph_counter_frame.fillna(0)
        

In [79]:
def get_unique_words(word_prob, column):
    
    if not column in word_prob.columns:
        print(column)
        print(word_prob)

    entropy_frame = pandas.DataFrame()

    for j in word_prob.columns:
        
        if column != j:

            s = -np.log(word_prob[j]/word_prob[column])
            s = pandas.DataFrame(s)
            s.columns = [j]
            entropy_frame = entropy_frame.merge(s,
                                                  left_index=True,
                                                  right_index=True,
                                                  how = 'outer')
        
    return pandas.DataFrame(entropy_frame.sum(axis = 1))

def sort_papers_from_graph(G,
                           PDF_DIR,
                           SORTED_DIR,
                           n_largest_names,
                           n_largest_description):
    
    partition = community_louvain.best_partition(G, 
                                                 resolution = partition_resolution)

    for node in G.nodes():
        G.nodes[node]['partition'] = partition[node]
    
    word_prob = prob_in_communities(G)
    partition = nx.get_node_attributes(G, 'partition')
    partition_list = np.unique([partition[k] for k in partition.keys()])

    # pour éviter les betises
    word_prob += 1e-10
    
    if os.path.exists(SORTED_DIR):
        shutil.rmtree(SORTED_DIR) 
    
    if not os.path.exists(SORTED_DIR):
        os.makedirs(SORTED_DIR)
    
    
    for partition_number in partition_list:
        
        # need to string the name
        partition_name = str(int(partition_number))
        
        s = get_unique_words(word_prob, partition_name)
        name_frame = s.nlargest(n_largest_names, columns = 0)
                
        # make the folder
        folder_name = ''
        for ind in name_frame.index:
            folder_name += ind +'_' 
        
        if not os.path.exists(os.path.join(SORTED_DIR, folder_name)):
            os.mkdir(os.path.join(SORTED_DIR, folder_name))
            
        # text file description of the folder
        word_frame = s.nlargest(n_largest_description, columns = 0)
        word_list = [i for i in word_frame.index]
        write_description(os.path.join(SORTED_DIR, folder_name),
                          word_list)
        
        for article_name in partition.keys():
            
            src = os.path.join(PDF_DIR, article_name)
            dst = os.path.join(SORTED_DIR, folder_name,article_name)
            
            if partition[article_name] == partition_number:
            
                copyfile(src, dst)
            
    return

def write_description(DIR, word_list):
    
    with open(os.path.join(DIR, 'readme.txt'), 'w') as f:
        for line in word_list:
            f.write(line)
            f.write('\n')
            
    return
        
def savedict(dict_to_save,
             DIR):
    
    try:
        import cPickle as pickle
    except ImportError:  # Python 3.x
        import pickle
        
    if os.path.exists(DIR):
        shutil.rmtree(DIR) 
    
    if not os.path.exists(DIR):
        os.makedirs(DIR)

    with open(os.path.join(DIR,'paper_dictionnary.p'), 'wb') as fp:
        pickle.dump(dict_to_save, 
                    fp, 
                    protocol=pickle.HIGHEST_PROTOCOL)

def sort_papers_based_on_contents(PDF_DIR,
                                  SORTED_DIR,
                                  max_common_words,
                                  num_words,
                                  num_topics,
                                  n_largest_names,
                                  partition_resolution,
                                  n_largest_description,
                                  SAVEDICT:bool,
                                  DICTDIR,
                                  no_keywords = ['http', 'ncbi', 'experi', 'biorxiv', 'pubm', 'elsevi', 'refhub']):
    
    paper_dictionnary = make_dict_from_papers(PDF_DIR,
                          num_words,
                          num_topics)
    
    if SAVEDICT:
         savedict(paper_dictionnary,
                  DICTDIR)
            
    
    word_count = make_word_list(paper_dictionnary)
    no_keywords += get_most_common_words(word_count, max_common_words)
    
    G = make_graph_from_dict(paper_dictionnary, no_keywords)
        
    sort_papers_from_graph(G, 
                           PDF_DIR,
                           SORTED_DIR,
                           n_largest_names,
                           n_largest_description)
    
    return



In [73]:
def sort_papers_from_dict(DICTNAME,
                          PDF_DIR,
                          SORTED_DIR,
                          max_common_words,
                          n_largest_names,
                          partition_resolution,
                          n_largest_description,
                          no_keywords = ['http', 'ncbi', 'experi', 'biorxiv', 'pubm', 'elsevi', 'refhub']):
    
    
    try:
        import cPickle as pickle
    except ImportError:  # Python 3.x
        import pickle

    with open(DICTNAME, 'rb') as fp:
        paper_dictionnary = pickle.load(fp)
    
    word_count = make_word_list(paper_dictionnary)
    no_keywords += get_most_common_words(word_count, max_common_words)
    
    G = make_graph_from_dict(paper_dictionnary, no_keywords)

    partition = community_louvain.best_partition(G, 
                                                 resolution = partition_resolution)

    for node in G.nodes():
        G.nodes[node]['partition'] = partition[node]
        
    sort_papers_from_graph(G, 
                           PDF_DIR,
                           SORTED_DIR,
                           n_largest_names,
                           n_largest_description)
    
    return

Test from dictionnary

In [33]:
SORTED_DIR = r'sorted_test_data_dict'
PDF_DIR = r'test_data'
DICTNAME = 'paper_dictionnary.p'
n_largest_names = 2
partition_resolution = 0.9
n_largest_description = 9
max_common_words = 2

no_keywords = ['http', 
               'ncbi', 
               'experi', 
               'biorxiv', 
               'pubm', 
               'elsevi', 
               'refhub',
               'dataset',
               'licens',
               'grant',
               'holder',
               'preprint',
               'copyright',
               'dataset',
               'funder',
               'intern',
               'ncem',
               'requir',
               'creativecommon',
               'certifi']

sort_papers_from_dict(DICTNAME,
                      PDF_DIR,
                      SORTED_DIR,
                      max_common_words,
                      n_largest_names,
                      partition_resolution,
                      n_largest_description,
                      no_keywords = no_keywords)

In [20]:
PDF_DIR = r'test_data_bis'
SORTED_DIR = r'sorted_test_data'

n_largest_names = 2
partition_resolution = 0.9
n_largest_description = 9

sort_papers_based_on_contents(PDF_DIR,
                              SORTED_DIR,
                              max_common_words = 2,
                              num_words = 4,
                              num_topics = 4,
                              n_largest_names = 2,
                              partition_resolution = 0.9,
                              n_largest_description = 10,
                              SAVEDICT = True,
                              DICTDIR = PDF_DIR)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83.0), HTML(value='')))




In [84]:
def recursive_sorter_from_graph(G,
                           PDF_DIR,
                           DESTINATION_DIR,
                           n_largest_names,
                           n_largest_description,
                           min_graph_size,
                           partition_resolution,
                           word_list,
                           iteration,
                           max_depth):
    
    # if the graph is too small to partition stop here
    if (len(G) < min_graph_size) | (iteration+1 > max_depth):
        
        move_articles(G,
                  PDF_DIR,
                  DESTINATION_DIR)
        
        # text file description of the folder
        write_description(DESTINATION_DIR,
                          word_list)
        
        return
    
    for node in G.nodes():
        G.nodes[node]['partition'] = np.nan
         
    # partition the graph
    partition = community_louvain.best_partition(G, 
                                                 resolution = partition_resolution)

    for node in G.nodes():
        G.nodes[node]['partition'] = partition[node]
    
    word_prob = prob_in_communities(G)
    partition = nx.get_node_attributes(G, 'partition')
    partition_list = np.unique([partition[k] for k in partition.keys()])

    # pour éviter les betises
    word_prob += 1e-10
    
    # prepare the folders
    if os.path.exists(DESTINATION_DIR):
        shutil.rmtree(DESTINATION_DIR) 
    
    if not os.path.exists(DESTINATION_DIR):
        os.makedirs(DESTINATION_DIR)
    
    # run the partition proper
    for partition_number in partition_list:
        
        # need to string the name
        partition_name = str(int(partition_number))
        s = get_unique_words(word_prob, partition_name)
        name_frame = s.nlargest(n_largest_names, columns = 0)
                
        # make the folder
        folder_name = ''
        for ind in name_frame.index:
            folder_name += ind +'_'
            
        word_frame = s.nlargest(n_largest_description, columns = 0)
        word_list = [i for i in word_frame.index]
        
        if not os.path.exists(os.path.join(DESTINATION_DIR, folder_name)):
            os.mkdir(os.path.join(DESTINATION_DIR, folder_name))
            
        LOCAL_DESTINATION_DIR = os.path.join(DESTINATION_DIR, folder_name)
        
        # make local graph
        subgraph = get_subgraph(G, 'partition', partition_number)
    
        recursive_sorter_from_graph(G = subgraph,
                           PDF_DIR = PDF_DIR,
                           DESTINATION_DIR = LOCAL_DESTINATION_DIR,
                           n_largest_names = n_largest_names,
                           n_largest_description = n_largest_description,
                           min_graph_size = min_graph_size,
                           partition_resolution = partition_resolution,
                           word_list = word_list,
                           iteration = iteration + 1,
                           max_depth = max_depth)
            
    return

def move_articles(G,
                  PDF_DIR,
                  DESTINATION_DIR):
    
    for article_name in G.nodes():

        src = os.path.join(PDF_DIR, article_name)
        dst = os.path.join(DESTINATION_DIR, article_name)

        copyfile(src, dst)
        
    return

def recursive_sort_from_dict(DICTNAME,
                          PDF_DIR,
                          SORTED_DIR,
                          max_common_words,
                          n_largest_names,
                          partition_resolution,
                          n_largest_description,
                          min_graph_size,
                          no_keywords = ['http', 'ncbi', 'experi', 'biorxiv', 'pubm', 'elsevi', 'refhub'],
                          iteration = 0,
                          max_depth = 5):
    
    
    try:
        import cPickle as pickle
    except ImportError:  # Python 3.x
        import pickle

    with open(DICTNAME, 'rb') as fp:
        paper_dictionnary = pickle.load(fp)
    
    word_count = make_word_list(paper_dictionnary)
    no_keywords += get_most_common_words(word_count, max_common_words)
    
    G = make_graph_from_dict(paper_dictionnary, no_keywords)

    partition = community_louvain.best_partition(G, 
                                                 resolution = partition_resolution)

    for node in G.nodes():
        G.nodes[node]['partition'] = partition[node]
        
    recursive_sorter_from_graph(G,
                           PDF_DIR = PDF_DIR,
                           DESTINATION_DIR = SORTED_DIR,
                           n_largest_names = n_largest_names,
                           n_largest_description = n_largest_description,
                           min_graph_size = min_graph_size,
                           partition_resolution = partition_resolution,
                           word_list = [],
                           iteration = iteration,
                           max_depth = max_depth)
    
    return


##############

SORTED_DIR = r'recusrive_sort_test_data_dict'
PDF_DIR = r'test_data'
DICTNAME = 'paper_dictionnary.p'
n_largest_names = 2
partition_resolution = 0.85
n_largest_description = 9
max_common_words = 2
min_graph_size = 25

iteration = 0
max_depth = 4

no_keywords = ['http', 
               'ncbi', 
               'experi', 
               'biorxiv', 
               'pubm', 
               'elsevi', 
               'refhub',
               'dataset',
               'licens',
               'grant',
               'holder',
               'preprint',
               'copyright',
               'dataset',
               'funder',
               'intern',
               'ncem',
               'requir',
               'creativecommon',
               'certifi',
               'version',
               'fund',
               'research']

recursive_sort_from_dict(DICTNAME,
                          PDF_DIR,
                          SORTED_DIR,
                          max_common_words,
                          n_largest_names,
                          partition_resolution,
                          n_largest_description,
                          min_graph_size,
                          no_keywords = no_keywords,
                          iteration = iteration,
                          max_depth = max_depth)
    