In [1]:
# Run in python console
import nltk; nltk.download('stopwords')
import spacy

import re
import os
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import en_core_web_sm

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielacollaguazo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ImportError: No module named spacy

## Prepare Stopwords

In [None]:
# NLTK Stop words
# English stop-word list from the NLTK 'stopwords' corpus downloaded in the first cell.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Optional corpus-specific additions (currently disabled):
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

## Importing Abstractive Summarization Literature

In [None]:
# %reset
import pandas as pd
import os
import zipfile

# Import Dataset
# Containers for the corpus; they are populated in the "Create corpus" cell.
content = []
filenames = []
df = pd.DataFrame()
cwd = os.getcwd()

# Build the paths with os.path.join rather than hard-coded '/' separators.
# The trailing '' keeps a path separator at the end of the directory, because
# other cells append sub-paths to it by plain string concatenation.
path_to_zip_file = os.path.join(cwd, 'abstractive_summ_academic_literature', 'txtAbstractiveSummarization.zip')
directory_to_extract_to = os.path.join(cwd, 'abstractive_summ_academic_literature', '')

# Unpack the archive next to the zip file.
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

## Create corpus

In [None]:
# directory_to_extract_to already ends with a path separator, so join the parts
# instead of concatenating another '/'. The trailing '' keeps the final separator.
corpus_path = os.path.join(directory_to_extract_to, 'txtAbstractiveSummarization', '')

# os.listdir() returns entries in arbitrary order; sorting makes the document
# numbering reproducible from run to run.
files = sorted(f for f in os.listdir(corpus_path) if f.endswith('.txt'))
print(len(files))

# Build fresh containers so that re-running this cell does not append every
# document a second time.
filenames = list(files)
content = []
for f in files:
    with open(os.path.join(corpus_path, f), "r") as myfile:
        content.append(myfile.read())

# One row per paper: its file name and its full text.
df = pd.DataFrame({'filename': filenames, 'content': content}, columns=['filename', 'content'])

df.head()

In [None]:
# Patterns are raw strings so that \S and \s reach the regex engine untouched
# instead of being read as (invalid) Python string escapes.
EMAIL_RE = re.compile(r'\S*@\S*\s?')   # anything containing '@', plus one trailing whitespace
WHITESPACE_RE = re.compile(r'\s+')     # runs of whitespace, including new lines


def _clean_document(text):
    """Strip e-mail addresses, collapse whitespace and drop single quotes from one document."""
    text = EMAIL_RE.sub('', text)
    text = WHITESPACE_RE.sub(' ', text)
    return text.replace("'", "")


# Convert to list and clean every document in a single pass
data = [_clean_document(sent) for sent in df.content.values.tolist()]

# pprint(data[:1]) pretty-prints the first document. pprint wraps the long string into
# several shorter chunks; those chunks are a display artifact, not sentences.
# pprint(data[:1])


## Tokenize words and clean-up text

In [None]:
# here we are tokenizing each document.
def sent_to_words(sentences):
    """Lazily tokenize each document in ``sentences``.

    Every item is converted to ``str`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True``, which strips accent marks from
    the tokens (punctuation is discarded by the tokenizer itself).

    Yields one list of tokens per input document, in input order.
    """
    for raw_doc in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_doc), deacc=True)
        yield tokens

# Materialise the generator into a list: data_words is read several times by later cells.
data_words = list(sent_to_words(data))

# data_words is a list with one entry per document; each entry is that document's list of tokens
# print(len(data_words))

# printing the first document of the list, tokenized
# print(data_words[:1])

## Creating Bigram and Trigram Models

In [None]:
# Bigram level: learn two-word phrases from the tokenised documents, then wrap the
# trained model in a Phraser, which is the faster way to apply it to a document.
bigram = gensim.models.Phrases(data_words, min_count=10, threshold=100)  # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigram level: train a second Phrases model on the bigram-transformed documents,
# so that a detected bigram can be joined with a neighbouring word.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

### Understanding Bigram Trigram Models

In [None]:
# An example on how to apply the trained phrases model to a new, unseen sentence.
# bigram[['opinion', 'consensus', 'minors']] # output: opinion_consensus is a bigram
# trigram[['opinion', 'consensus', 'minors']] # output: opinion_consensus as well, but since there is
                                              # not enough co-occurrence of another word after this bigram,
                                              # it remains a bigram only
# print(bigram[['long', 'short', 'term']])  # this creates a bigram of short_term
# print(trigram[['long', 'short', 'term']]) # this creates a bigram of short_term only. There is not enough words to create
                                          # a trigram



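# Q: What is the difference between Phrases and Phraser?
# A: Phrases is the trainable model that collects the co-occurrence statistics; Phraser is a
#    lighter, read-only export of a trained Phrases model that applies the same phrases faster.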
# print(type(bigram))
# print(type(bigram_mod))

# Bigrams are pairs of words that appear together at least 10 times in the corpus (min_count=10)
# bi1 = bigram[data_words[0]]
# bi2 = bigram_mod[data_words[0]]
# if bi1 == bi2:
#     print('nothing changes!')

# See trigram example
# Trigrams are three words that co-occur often enough to pass the trigram model's min_count and threshold
# tri1 = trigram_mod[bigram_mod[data_words[0]]]
# tri2 = trigram[bigram_mod[data_words[0]]]
# if tri1 == tri2:
#     print('nothing changes!')

# print(tri1)

## Remove Stopwords, Make Bigrams and Lemmatize

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return ``texts`` with every stop word removed.

    Each document is converted to ``str`` and re-tokenised with
    ``simple_preprocess`` before filtering, so ``texts`` may hold either raw
    strings or token lists.

    Parameters:
        texts: iterable of documents.

    Returns:
        A list with one list of remaining tokens per document.
    """
    # Set membership is O(1); testing every token against the stop-word list
    # would scan the whole list each time.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Apply the bigram Phraser to every tokenised document and return the results as a list."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply the bigram Phraser and then the trigram Phraser to every tokenised document."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenised documents, keeping only the allowed parts of speech.

    Each document's tokens are joined with spaces and run through the spaCy
    pipeline ``nlp``; a token's lemma is kept when its ``pos_`` tag is in
    ``allowed_postags``. Tag reference: https://spacy.io/api/annotation

    Parameters:
        texts: iterable of token lists, one per document.
        allowed_postags: part-of-speech tags to keep.

    Returns:
        A list with one list of lemmas per document.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]
# spaCy pipeline used by lemmatization(); it is loaded with the 'parser' and 'ner' components disabled.
nlp = en_core_web_sm.load(disable=['parser', 'ner'])

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
# (make_trigrams is defined as well but is not applied in this pipeline)
data_words_bigrams = make_bigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Uncomment to inspect the first lemmatized document
# print(data_lemmatized[:1])

## Create the Dictionary and Corpus needed for Topic Modeling

In [None]:
# Dictionary: maps word IDs to words. It fixes the vocabulary size and is also
# used for debugging and for printing topics.
id2word = corpora.Dictionary(data_lemmatized)
# print(len(id2word)) # corpus has 14118 unique tokens

# The lemmatized documents are referred to as `texts` from here on
texts = data_lemmatized

# Bag-of-words corpus: every document becomes the doc2bow() representation of its
# tokens, i.e. word IDs paired with their counts in that document
corpus = list(map(id2word.doc2bow, texts))

# View
# print(corpus[:1])

## Build list of topic models

In [None]:
def build_topics(num_topics):
    """Train one LDA model per requested topic count.

    Every model is trained on the notebook-level ``corpus`` and ``id2word``
    with identical hyper-parameters and a fixed ``random_state``, so the
    models differ only in their number of topics.

    Parameters:
        num_topics: iterable of topic counts.

    Returns:
        A list of trained ``LdaModel`` objects, in the order of ``num_topics``.
    """
    return [
        gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=n, random_state=100,
                                        update_every=1, chunksize=100, passes=10, alpha='auto',
                                        per_word_topics=True)
        for n in num_topics
    ]

# Candidate topic counts: every integer from 3 to 20 inclusive
num_topics = list(range(3, 21))
# One trained LDA model per candidate, in the same order as num_topics
models = build_topics(num_topics)

## View the topics in LDA model

In [None]:
# Print the Keyword in the 10 topics
# pprint(lda_model.print_topics())
# doc_lda = lda_model[corpus]

## Compute Model Perplexity and Coherence Score for all models

In [None]:
def calc_perplexity_coherence(models):
    """Score every model on the notebook-level corpus.

    For each model, in order, this records ``model.log_perplexity(corpus)``
    and the 'c_v' coherence computed by ``CoherenceModel`` from
    ``data_lemmatized`` and ``id2word``.

    NOTE(review): gensim documents ``log_perplexity`` as a per-word likelihood
    bound (perplexity = 2 ** -bound), so the first list is not perplexity
    itself. Confirm before interpreting the values.

    Returns:
        A ``(list_perplexity, list_coherence)`` tuple of two lists, each with
        one value per model.
    """
    def _score(lda):
        bound = lda.log_perplexity(corpus)
        scorer = CoherenceModel(model=lda, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
        return bound, scorer.get_coherence()

    scores = [_score(lda) for lda in models]
    return [bound for bound, _ in scores], [coherence for _, coherence in scores]

# x is a (perplexity values, coherence values) pair with one entry per model in `models`
x = calc_perplexity_coherence(models)      

In [None]:
# One row per trained model: its two scores and the number of topics it was built with
perplexity_values, coherence_values = x
df_metrics = pd.DataFrame({'Perplexity': perplexity_values, 'Coherence': coherence_values},
                          columns=['Perplexity', 'Coherence'])
df_metrics['Number of topics'] = num_topics
df_metrics

### Graphic of Number of Topics and Perplexity

In [None]:
# True perplexity should be as low as possible.
# NOTE(review): the 'Perplexity' column holds the output of gensim's log_perplexity(),
# documented as a per-word likelihood bound (perplexity = 2 ** -bound). If so, values
# closer to zero on this plot are the better ones - confirm before choosing a model.
fig, ax = plt.subplots()
ax.plot('Number of topics', 'Perplexity', data=df_metrics, color='skyblue')
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Perplexity")
plt.show()

### Graphic of Number of Topics and Coherence

In [None]:
# Coherence, unlike perplexity, should be as high as possible:
# look for the number of topics where the curve peaks.
fig, ax = plt.subplots()
ax.plot('Number of topics', 'Coherence', data=df_metrics, color='orange')
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Coherence")
plt.show()

## Most salient topic per file using results of Model of choice

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data, filenames=None):
    """Build a table with the dominant topic of every document.

    Parameters:
        ldamodel: trained LDA model; required despite the ``None`` default.
        corpus: bag-of-words corpus, one entry per document.
        texts: per-document content placed in the last column of the result.
        filenames: per-document file names for the first column. Defaults to
            the notebook-level ``df['filename']``.

    Returns:
        A DataFrame with the file name, ``Dominant_Topic`` (1-based integer
        topic number), ``Perc_Contribution`` (that topic's share of the
        document, rounded to 4 decimals), ``Topic_Keywords`` (comma-separated
        words from ``show_topic``) and the text, one row per document.

    Raises:
        TypeError: if ``ldamodel`` is None.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a trained LDA model, got None")
    if filenames is None:
        filenames = df['filename']

    # Collect plain tuples and build the frame once. DataFrame.append was removed
    # in pandas 2.0, and growing a frame row by row is quadratic.
    rows = []
    for doc_result in ldamodel[corpus]:
        # With per_word_topics enabled the document's topic list is the first
        # element of the model's result; otherwise the result is the list itself.
        topic_dist = doc_result[0] if ldamodel.per_word_topics else doc_result
        if not topic_dist:
            # Keep a placeholder row so later documents stay aligned with
            # their file names and texts.
            rows.append((None, None, None))
            continue
        # Dominant topic = the (topic, proportion) pair with the largest proportion
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])
        topic_keywords = ", ".join(word for word, _ in ldamodel.show_topic(topic_num))
        rows.append((int(topic_num) + 1, round(prop_topic, 4), topic_keywords))

    sent_topics_df = pd.DataFrame(rows, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add the file names in front and the original text at the end of the output
    contents = pd.Series(texts)
    if isinstance(filenames, pd.Series):
        file_names = filenames
    else:
        file_names = pd.Series(list(filenames), name='filename')
    return pd.concat([file_names, sent_topics_df, contents], axis=1)

    
# Model of choice: models[0], the first model in the list (built for num_topics[0] topics).
# Change the index to analyse a different model.
df_topic_sents_keywords = format_topics_sentences(ldamodel=models[0], corpus=corpus, texts=texts)

# Format
# reset_index() turns the row index into the leading column, renamed to 'Document_No' below
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Filename', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

In [None]:
# export to csv
# The file is written to the current working directory; os.path.join picks the
# right path separator for the platform.
current_path = os.getcwd()
df_dominant_topic.to_csv(os.path.join(current_path, 'dominant_topic_file.csv'), index=False)

## Splitting papers in different directories of Model of choice

In [None]:
# create a function that splits all the papers in different folders according to the topic it belongs to
# choose just one model first
import shutil
# Folder holding the extracted .txt papers; the concatenation relies on
# directory_to_extract_to ending with a path separator.
path_txt_files = directory_to_extract_to + 'txtAbstractiveSummarization/'
# Only the file name and its dominant topic are needed to sort the papers into folders
df_document_topic = df_dominant_topic[['Filename','Dominant_Topic']]

def split_corp_per_topic(abs_dirname, document_topics=None):
    """Move every paper into a sub-folder named after its dominant topic.

    Parameters:
        abs_dirname: directory that holds the papers; a trailing path
            separator is accepted but not required.
        document_topics: DataFrame with 'Filename' and 'Dominant_Topic'
            columns. Defaults to the notebook-level ``df_document_topic``.

    Side effects:
        Creates ``<abs_dirname>/<topic>`` folders as needed and moves the
        files into them. A file that is missing from ``abs_dirname`` but
        already present in its topic folder is skipped, so the function can be
        re-run safely; any other missing file makes ``shutil.move`` raise.
    """
    if document_topics is None:
        document_topics = df_document_topic

    for _, row in document_topics.iterrows():
        file_name = row['Filename']
        # int() gives '3' rather than '3.0' when the topic column holds floats
        folder_topic = os.path.join(abs_dirname, str(int(row['Dominant_Topic'])))
        from_path = os.path.join(abs_dirname, file_name)
        to_path = os.path.join(folder_topic, file_name)

        if not os.path.isdir(folder_topic):
            os.makedirs(folder_topic)

        # Already moved by a previous run: nothing left to do for this paper.
        if not os.path.exists(from_path) and os.path.exists(to_path):
            continue

        shutil.move(from_path, to_path)

        
# NOTE: this moves the .txt files on disk into per-topic sub-folders of path_txt_files
split_corp_per_topic(path_txt_files)


## Visualize the topics-keywords

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
# The trained models live in the `models` list; visualise the model of choice,
# i.e. the same models[0] that is used for the dominant-topic table.
lda_model = models[0]
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis