In [1]:
# from https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920

import nltk
nltk.download('stopwords')
import re
import numpy as np
import pandas as  pd
from pprint import pprint# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel# spaCy for preprocessing
import spacy# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
# %matplotlib inline
import parquet
import os

# Prepare stopwords
# NLTK Stop words
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/tfai/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2023-05-05 11:20:48.938752: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def remove_stopwords(texts):
    """Tokenize each document and drop English stop words.

    Args:
        texts: Iterable of documents. Every document is converted with
            ``str()`` and re-tokenized by ``simple_preprocess`` before
            filtering, so the elements do not have to be plain strings.

    Returns:
        A list with one entry per document; each entry is the list of
        tokens that are not in the stop-word collection.
    """
    # A set gives constant-time membership tests; with a list every token
    # caused a linear scan over the whole stop-word collection.
    stop_words = set(stopwords.words('english'))
    stop_words.update(['from', 'subject', 're', 'edu', 'use'])
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply a bigram phrase model, trained on ``texts``, to every document.

    Args:
        texts: Re-iterable collection of token lists. It is traversed once
            to train the phrase model and once more to transform each
            document.

    Returns:
        A list with one transformed token sequence per input document.
    """
    # Higher threshold -> fewer phrases are formed.
    phrase_model = gensim.models.Phrases(texts, min_count=5, threshold=100)
    # The Phraser wrapper is the faster way to apply a trained model.
    phraser = gensim.models.phrases.Phraser(phrase_model)

    transformed = []
    for tokens in texts:
        transformed.append(phraser[tokens])
    return transformed

def make_trigrams(texts):
    """Apply bigram and then trigram phrase models, both trained on ``texts``.

    Args:
        texts: Re-iterable collection of token lists. It is used to train
            the bigram model, then (after the bigram transform) the trigram
            model, and finally iterated to transform each document.

    Returns:
        A list with one transformed token sequence per input document.
    """
    # Higher threshold -> fewer phrases are formed.
    pair_model = gensim.models.Phrases(texts, min_count=5, threshold=100)
    # The second-level model is trained on the bigram-transformed corpus.
    triple_model = gensim.models.Phrases(pair_model[texts], threshold=100)

    # Phraser wrappers are the faster way to apply the trained models.
    pair_phraser = gensim.models.phrases.Phraser(pair_model)
    triple_phraser = gensim.models.phrases.Phraser(triple_model)

    transformed = []
    for tokens in texts:
        with_pairs = pair_phraser[tokens]
        transformed.append(triple_phraser[with_pairs])
    return transformed

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize documents, keeping only tokens with selected POS tags.

    See https://spacy.io/api/annotation for the tag inventory.

    Args:
        texts: Iterable of token lists. Each list is joined with single
            spaces into one string before being handed to spaCy.
        allowed_postags: Collection of coarse POS tags (``token.pos_``
            values) to keep. The default list is only read, never mutated.

    Returns:
        A list with one entry per document; each entry is the list of
        ``token.lemma_`` values for tokens whose ``pos_`` is allowed.
    """
    # The parser and NER components are disabled because only tagging and
    # lemmatization are needed here.
    # Requires the model: python3 -m spacy download en_core_web_sm
    nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

    keep_tags = set(allowed_postags)
    joined_docs = (" ".join(tokens) for tokens in texts)

    # nlp.pipe processes documents in batches instead of one call per
    # document; it yields the same Doc objects in input order.
    return [
        [token.lemma_ for token in doc if token.pos_ in keep_tags]
        for doc in nlp.pipe(joined_docs)
    ]


In [3]:
# Tokenize words and cleanup the text
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of items; each one is converted with ``str()``
            before tokenization.

    Yields:
        The token list produced by ``simple_preprocess`` for each sentence.
    """
    for raw_sentence in sentences:
        # deacc=True strips accent marks from the tokens.
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens

def preprocess(dataFrameName):
    """Load abstracts from a parquet file and return lemmatized token lists.

    Pipeline: read ``<dataFrameName>.parquet`` -> collapse whitespace ->
    strip single quotes -> tokenize -> remove stop words -> apply bigram
    model -> lemmatize (keeping nouns, adjectives, verbs and adverbs).

    Args:
        dataFrameName: Path of the parquet file WITHOUT the ``.parquet``
            extension. The file must contain an ``abstract`` column.

    Returns:
        A list with one list of lemmas per abstract.
    """
    # Load the dataset.
    filePath = '%s.parquet' % (dataFrameName)
    df = pd.read_parquet(filePath)

    # Convert the abstracts to a plain Python list.
    data = df.abstract.values.tolist()
    # Collapse newlines and other whitespace runs into a single space.
    # The pattern is a raw string: a plain '\s' is an invalid escape
    # sequence and makes Python emit a warning.
    data = [re.sub(r'\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes.
    data = [re.sub("'", "", sent) for sent in data]
    data_words = list(sent_to_words(data))

    # Call the preprocessing functions in order.
    # Remove stop words.
    data_words_nostops = remove_stopwords(data_words)

    # Form bigrams.
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Lemmatize, keeping only noun, adj, verb, adv.
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    return data_lemmatized
    

  data = [re.sub('\s+', ' ', sent) for sent in data]


In [4]:
# Create Dictionary and Corpus needed for Topic Modeling
    # Create Dictionary 

def get_corpus(df):
    """Build the gensim dictionary and bag-of-words corpus for a dataset.

    Args:
        df: Forwarded unchanged to ``preprocess``; despite the name it is
            the dataset path stem that ``preprocess`` expects, not a
            DataFrame.

    Returns:
        A ``(corpus, id2word, data_lemmatized)`` tuple: the list of
        ``doc2bow`` results (one per document), the ``corpora.Dictionary``
        built from the lemmatized documents, and the lemmatized documents
        themselves.
    """
    lemmatized_docs = preprocess(df)

    # Dictionary built from the lemmatized documents.
    vocabulary = corpora.Dictionary(lemmatized_docs)

    # Term-document frequency: one bag-of-words per document.
    bow_corpus = []
    for doc_tokens in lemmatized_docs:
        bow_corpus.append(vocabulary.doc2bow(doc_tokens))

    return bow_corpus, vocabulary, lemmatized_docs




In [5]:
def build_model(df, corpus, id2word, data_lemmatized,
                num_topics=16, random_state=100, chunksize=100, passes=10):
    """Train a gensim LDA topic model on a bag-of-words corpus.

    Args:
        df: Unused; kept so existing positional calls keep working.
        corpus: Bag-of-words corpus passed to ``LdaModel``.
        id2word: Dictionary passed to ``LdaModel`` as ``id2word``.
        data_lemmatized: Unused during training; kept for call
            compatibility. It is the ``texts`` input needed to score the
            model afterwards, e.g.
            ``CoherenceModel(model=lda_model, texts=data_lemmatized,
            dictionary=id2word, coherence='c_v').get_coherence()``.
        num_topics: Number of topics to extract (default 16).
        random_state: Seed forwarded to ``LdaModel`` (default 100).
        chunksize: Number of documents used in each training chunk
            (default 100).
        passes: Total number of training passes (default 10).

    Returns:
        The trained ``gensim.models.ldamodel.LdaModel``.
    """
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=num_topics,
        random_state=random_state,
        update_every=1,  # determines how often the model parameters should be updated
        chunksize=chunksize,
        passes=passes,
        alpha='auto',
        per_word_topics=True,
    )
    return lda_model

In [7]:
# One dataset identifier feeds both calls so the value handed to build_model
# matches the data the corpus was built from. build_model does not read its
# first argument, so the trained model is unaffected.
DATASET_NAME = 'subset_data/nlp_2007'

corpus, id2word, data_lemmatized = get_corpus(DATASET_NAME)
ldaModel = build_model(DATASET_NAME, corpus, id2word, data_lemmatized)




In [8]:
# Show the learned topics as (topic_id, weighted-keyword string) pairs;
# as the last expression of the cell, the result is rendered as the cell output.
ldaModel.print_topics()

[(0,
  '0.040*"low" + 0.030*"selection" + 0.027*"restrict" + 0.026*"series" + 0.021*"tag" + 0.020*"factor" + 0.019*"investigate" + 0.017*"suitable" + 0.016*"insight" + 0.015*"bad"'),
 (1,
  '0.056*"case" + 0.040*"planning" + 0.031*"learn" + 0.021*"category" + 0.020*"use" + 0.020*"structure" + 0.019*"develop" + 0.019*"methodology" + 0.019*"demonstrate" + 0.017*"problem"'),
 (2,
  '0.046*"language" + 0.031*"natural" + 0.025*"speech" + 0.024*"processing" + 0.018*"paper" + 0.016*"application" + 0.016*"use" + 0.016*"information" + 0.015*"system" + 0.015*"semantic"'),
 (3,
  '0.046*"line" + 0.024*"direct" + 0.021*"mathematical" + 0.019*"deadline" + 0.017*"table" + 0.017*"relation" + 0.014*"wiktionary" + 0.012*"large" + 0.010*"number" + 0.009*"definition"'),
 (4,
  '0.048*"model" + 0.045*"base" + 0.030*"probabilistic" + 0.029*"method" + 0.027*"estimation" + 0.026*"estimate" + 0.026*"parameter" + 0.025*"probability" + 0.019*"problem" + 0.018*"give"'),
 (5,
  '0.030*"base" + 0.025*"language" + 