## Libraries

In [4]:
import os
import re

import numpy as np
import pandas as pd

import gensim
import gensim.corpora as corpora

import pickle 
import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint
from bs4 import BeautifulSoup as bs

### import nltk
### nltk.download("averaged_perceptron_tagger")
### nltk.download('punkt')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import pos_tag

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

## Parameters

In [5]:
currentFolder = os.getcwd()

dataFolder = os.path.join(currentFolder, 'data')

In [6]:
stemmer = SnowballStemmer('english')

stopWords = stopwords.words('english')
stopWords.extend(['amp', 'lt', 'gt', 'br/'])

## Functions

In [7]:
def tokenize(text: str):
    """This function helps to tokenize the text
    Arguments
        text (str):
    
    Output
        tokens (list):
    """
    return word_tokenize(text.lower(), language='english')

def remove_stopwords(tokens: list):
    """This function is used to remove the stopwords from the text
    Arguments
        tokens (list):

    Output
        tokens (list):
    """
    return [
        word 
        for word in tokens 
        if word not in stopWords
    ]

def lemmatize(tokens: list):
    """This function is used to lemmatize the text
    Arguments
        tokens (list):

    Output:
        lemm_tokens (list):
    """
    return [
        stemmer.stem(word) 
        for word in tokens
    ]


def pos_tagging(tokens: list):
    """This function is used to pos_tagging the text
    Arguments
        tokens (list):

    Output
        nouns (list):
    """
    tagged = pos_tag(tokens)
    return " ".join([word for word, pos in tagged if pos == "NN"])

## Read Files & Process Texts

#### Get Files Names by Condition

In [8]:
filesList = [
    os.path.join(dataFolder, i) 
    for i in os.listdir(dataFolder) 
    if i.endswith('.xml')
]

### Extract Abstract from Files

In [6]:
abstractTag = 'AbstractNarration'

abstractsList = []

for filePath in filesList:
    with open(filePath, 'r', encoding = 'utf-8') as f:
        fileReaded = bs(f.read(), "xml")
        
    ## Get the abstract from fileReaded
    abstract = fileReaded.find_all('AbstractNarration')
    
    ### Test: if all files have the 'AbstractNarration' tag
    if not abstract:
        print('\n**************************************************')
        print('Abstract tag \'AbstractNarration\' not in file: {}'.format(filePath))
        print('**************************************************\n')

        abstractsList.append(None)
        continue
    ### All files have the 'AbstractNarration' tag
 
    abstract = re.sub(
        r'<\/?{abstractTag}>'.format(abstractTag = abstractTag), 
        '', 
        str(abstract[0])
    ).strip()
    
    if abstract == '':
        abstract = None

    abstractsList.append(abstract) 

    del abstract, fileReaded
    

### Process Data (Tokenize, Remove Stop Words, Lemmatize, Pos Tagging [NN])

In [7]:
abstractsListProcessed = [
    pos_tagging(
        lemmatize(
            remove_stopwords(
                tokenize(
                    re.sub('[,\.!?]', ' ', 
                        re.sub('[0-9]', '', i)
                    )
                )
            )
        )
    )
    for i in abstractsList
]

In [8]:
abstractDF = pd.DataFrame()
abstractDF['FILENAME'] = [os.path.basename(fileName) for fileName in filesList]
abstractDF['ABSTRACT'] = abstractsList
abstractDF['PROCESSED'] = abstractsListProcessed
del abstractsList

abstractDF = abstractDF.dropna(subset='ABSTRACT')
abstractDF.to_parquet(os.path.join(dataFolder, 'data_tranformed.parquet'), index = False)

In [9]:
if 'abstractDF' not in globals():
    abstractDF = pd.read_parquet(os.path.join(dataFolder, 'data_tranformed.parquet'))

## Model Fitting

### LDA

In [10]:
NUM_TOPICS = 5

In [11]:
dataWords = [i.split(' ') for i in abstractDF['PROCESSED'].tolist()]
id2word = corpora.Dictionary(dataWords)# Create Corpus
corpus = [id2word.doc2bow(text) for text in dataWords]

In [12]:
lda = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=NUM_TOPICS,
    random_state=123
)

pprint(lda.print_topics())

[(0,
  '0.022*"project" + 0.020*"use" + 0.017*"research" + 0.012*"support" + '
  '0.012*"student" + 0.011*"impact" + 0.008*"evalu" + 0.008*"nsf" + '
  '0.007*"foundat" + 0.006*"review"'),
 (1,
  '0.018*"research" + 0.016*"project" + 0.015*"use" + 0.014*"model" + '
  '0.010*"support" + 0.008*"student" + 0.008*"system" + 0.007*"review" + '
  '0.007*"nsf" + 0.007*"comput"'),
 (2,
  '0.024*"research" + 0.019*"project" + 0.015*"use" + 0.012*"support" + '
  '0.011*"impact" + 0.010*"student" + 0.009*"evalu" + 0.009*"system" + '
  '0.007*"mission" + 0.007*"merit"'),
 (3,
  '0.027*"research" + 0.022*"project" + 0.016*"use" + 0.012*"student" + '
  '0.012*"support" + 0.011*"system" + 0.010*"impact" + 0.007*"model" + '
  '0.007*"program" + 0.007*"review"'),
 (4,
  '0.024*"project" + 0.016*"research" + 0.014*"use" + 0.014*"student" + '
  '0.011*"support" + 0.011*"model" + 0.010*"impact" + 0.007*"system" + '
  '0.007*"foundat" + 0.007*"mission"')]


In [13]:
pyLDAvis.enable_notebook()

ldaPrepared = pyLDAvis.gensim.prepare(lda, corpus, id2word)
pyLDAvis.save_html(ldaPrepared, 'ldaPrepared_{}.html'.format(NUM_TOPICS))

ldaPrepared

### TfidfVectorizer

In [14]:
NUM_TOPICS = 5

In [15]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.95)
dtm = tfidf.fit_transform(abstractDF["PROCESSED"])

In [17]:
NUM_WORDS = 15

nmf = NMF(n_components=NUM_TOPICS, random_state=123)
nmf.fit(dtm)

W = nmf.transform(dtm)
H = nmf.components_

vocab = np.array(tfidf.get_feature_names_out())

topWordsFunc = lambda t: [vocab[i] for i in np.argsort(t)[: -NUM_WORDS - 1 : -1]]
topicWords = [topWordsFunc(t) for t in H]
topicWords = [" ".join(t) for t in topicWords]
topicWords

['network system project model comput problem research learn design algorithm machin method robot technolog optim',
 'stem student program research project teacher scienc school educ profession colleg career particip engin mentor',
 'abstractnarration star galaxi hole gravit gas observ model mass matter astrophys planet telescop merger neutrino',
 'chang water research plant climat model project environment speci carbon biolog ecosystem region communiti soil',
 'quantum materi electron properti research structur cell manufactur mechan metal magnet energi polym technolog process']

In [20]:
topicsDF = pd.DataFrame(
    np.round(W, 2), 
    columns = ["TOPIC_" + str(i) for i in range(nmf.n_components)], 
    index = ["ticket_" + str(i) for i in range(len(abstractDF['PROCESSED']))]
)

topTopic = np.argmax(topicsDF.values, axis=1)
topicsDF["TOPIC"] = topTopic
abstractDF["TOPIC"] = topTopic

display(topicsDF.head())

Unnamed: 0,TOPIC_0,TOPIC_1,TOPIC_2,TOPIC_3,TOPIC_4,TOPIC
ticket_0,0.0,0.0,0.0,0.04,0.02,3
ticket_1,0.01,0.11,0.0,0.02,0.0,1
ticket_2,0.02,0.04,0.0,0.01,0.02,1
ticket_3,0.01,0.11,0.0,0.01,0.0,1
ticket_4,0.0,0.03,0.0,0.02,0.03,1


In [21]:
topicsDict = dict(
    zip(
        range(0,nmf.n_components),
        [
            "Computational Problem-solving: Network Systems, Machine Learning, and Algorithmic Design",
            "STEM Education Program: Empowering Students, Teachers, and Career Advancement",
            "Astrophysical Abstract Narratives: Stars, Galaxies, Black Holes, and Gravitational Observations",
            "Environmental Changes: Climate, Water, and Plant Research in Specific Regions",
            "Quantum Materials and Energy: Electronic Properties, Structural Insights, and Manufacturing"
        ]
    )
)
abstractDF["TOPIC"] = abstractDF["TOPIC"].map(topicsDict)

In [22]:
abstractDF

Unnamed: 0,FILENAME,ABSTRACT,PROCESSED,TOPIC
0,2000005.xml,Head and heart development are closely intertw...,head heart share regulatori mechan progenitor ...,"Environmental Changes: Climate, Water, and Pla..."
1,2000009.xml,The National Academy of Engineering identified...,nation academi engin identifi issu challeng st...,"STEM Education Program: Empowering Students, T..."
2,2000012.xml,This award provides three years of funding to ...,award provid year fund help confer seri analys...,"STEM Education Program: Empowering Students, T..."
3,2000021.xml,"This collaborative research project, involving...",collabor research project state epistem form s...,"STEM Education Program: Empowering Students, T..."
4,2000028.xml,Research Initiation Awards provide support for...,research support mid-car faculti histor colleg...,"STEM Education Program: Empowering Students, T..."
...,...,...,...,...
13295,2055767.xml,Recent studies have highlighted the nation's i...,studi nation advanc manufactur exampl studi de...,"STEM Education Program: Empowering Students, T..."
13296,2055771.xml,"This project links two mathematical fields, dy...",project field dynam algebra elementari dynam s...,Computational Problem-solving: Network Systems...
13297,2055772.xml,Recent years have seen a dramatic rise in mobi...,year health monitor trend pandem collect massi...,Computational Problem-solving: Network Systems...
13298,2055773.xml,Recent years have seen a dramatic rise in mobi...,year health monitor trend pandem collect massi...,Computational Problem-solving: Network Systems...


In [25]:
abstractDF['TOPIC'].value_counts()

Environmental Changes: Climate, Water, and Plant Research in Specific Regions                      4285
Computational Problem-solving: Network Systems, Machine Learning, and Algorithmic Design           4079
Quantum Materials and Energy: Electronic Properties, Structural Insights, and Manufacturing        2455
STEM Education Program: Empowering Students, Teachers, and Career Advancement                      2340
Astrophysical Abstract Narratives: Stars, Galaxies, Black Holes, and Gravitational Observations     141
Name: TOPIC, dtype: int64