In [76]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from gensim import corpora, models
from pprint import pprint
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import os
import xml.etree.ElementTree as ET
import google

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

#Due to the nature of Latent Dirichlet Allocation (LDA), I am setting a seed so that results can be repplicated
np.random.seed(1998)

#Due to computing capabilities, I am retrieving 100 documents, which are in xml format inside abstracts/
folder_path = "abstracts/"
documents = []
file_names = []
for filename in os.listdir(folder_path):
    if filename.endswith(".xml"):
        file_path = os.path.join(folder_path, filename)
        tree = ET.parse(file_path)
        root = tree.getroot()
        abstract_narration = root.find("./Award/AbstractNarration")
        if abstract_narration is not None and abstract_narration.text:
            text = abstract_narration.text.strip()
            documents.append(text)
            file_names.append(filename)
########################

#Cleaning the data of common phrases in documents
documents = [str(document).replace('award reflects NSF\'s statutory mission and has been deemed worthy of support through evaluation using the Foundation\'s intellectual merit and broader impacts review criteria','') for document in documents]
documents = [str(document).replace('Nontechnical Summary:','') for document in documents]
documents = [str(document).replace('Technical summary:','') for document in documents]
documents = [str(document).replace('Nontechnical Description','') for document in documents]
documents = [str(document).replace('&lt;br/&gt;&lt;br/&gt','') for document in documents]

# Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

#I am setting a new set of stopwrods, which are based on the scenario given (NSF Research Awards Abstracts)
#This prevents ending up with topics that contain these words, which are commons accross most documents
stop_words_scenario = ['nsf','undergraduate','goal','graduate','develop','development','new','provide','provided','stem','study','system','systems','college','program','science','research','project','student','using','support','award','impact','university','students','impacts','also','researcher','researchers']

def preprocess_text(text):
    # Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenizing
    tokens = nltk.word_tokenize(text.lower())
    # Removing stop words and lemmatizing
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and token not in stop_words_scenario]
    return tokens

# Tokenizing the documents
tokenized_documents = [preprocess_text(doc) for doc in documents]

# Creating a dictionary from the tokenized documents
dictionary = corpora.Dictionary(tokenized_documents)

# Creating a document-term matrix using doc to bag of words
doc_term_matrix = [dictionary.doc2bow(doc) for doc in tokenized_documents]

# Performing topic modeling using Latent Dirichlet Allocation (LDA)
lda_model = models.LdaModel(doc_term_matrix, num_topics=3, id2word=dictionary, passes=20)

# Extracting the topic probabilities for each document
topic_probabilities = [lda_model.get_document_topics(doc) for doc in doc_term_matrix]

# Converting the topic probabilities into a feature matrix
feature_matrix = np.zeros((len(documents), lda_model.num_topics))
for i, doc_topics in enumerate(topic_probabilities):
    for topic, prob in doc_topics:
        feature_matrix[i, topic] = prob
        
# Printing the 5 most common words associated with each topic
print("MOST SIGNIFICANT WORDS FOR EACH TOPIC:")
topics = lda_model.print_topics(num_topics=lda_model.num_topics, num_words=10)  # Adjust the number of words as needed
for topic in topics:
    topic_id, topic_words = topic
    topic_words = topic_words.split("+")
    topic_words = [word.split("*")[1].strip().replace('"', '') for word in topic_words]
    topic_words_string = ", ".join(topic_words)
    print(f"Topic {topic_id}: {topic_words_string}")
        
# Useing k-means clustering for document classification
num_clusters = 3  
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(feature_matrix)

# Exporting a csv file containing:
# Name of document, cluster number, abstract
classification_nsf = pd.DataFrame(columns=['file_name','cluster_number','abstract'])
for i, label in enumerate(kmeans.labels_):
    classification_nsf.loc[len(classification_nsf)] = [file_names[i],str(label),documents[i]]
    

classification_nsf.to_csv('results.csv',header=True,index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


MOST SIGNIFICANT WORDS FOR EACH TOPIC:
Topic 0: cell, theory, conference, algebra, understanding, quantum, teaching, fadd, faculty, learning
Topic 1: theory, problem, analysis, equation, operator, mathematical, pi, faculty, one, group
Topic 2: material, education, technology, industry, biotechnology, data, field, model, high, technician
