<a href="https://colab.research.google.com/github/clarariachi/MAIS-202-Final-Project/blob/main/Topic_Modelling_With_NIPS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import all needed libraries, transfer my workspace to google colab, and load csv files into dataframes
import pandas as pd
import gensim 
import wordcloud
import gensim.downloader as api
!pip install -U scikit-learn
!pip3 install spacy
!python3 -m spacy download en # language model
!pip3 install pyLDAvis # for visualizing topic models
import nltk # for preprocessing
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
import os
from google.colab import drive

drive.mount("/content/gdrive", force_remount=True)            
WORK_AREA = "/content/gdrive/" + r'MyDrive/neurips_project/'
os.chdir(WORK_AREA) 

authors_url = 'https://drive.google.com/uc?id=1n8TifV2zNsePkVHXv8iOTKObSA_rhfPS'
authors = pd.read_csv(f'{WORK_AREA}/authors_nips.csv')

papers_url = 'https://drive.google.com/uc?id=1BsRS4uD54hupdk7XI4S3o5tIpql5koAH' 
docs = pd.read_csv(f'{WORK_AREA}/papers.csv')

papers = pd.merge(authors, docs)

papers.drop(['source_id'], axis=1, inplace=True) # removing metadata
print(papers.head())

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.8/9.8 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2
Successfully installed scikit-learn-1.2.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2023-03-02 02:24:31.543289: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild Tens

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/gdrive
  first_name last_name institution  year                          title  \
0       Alan    Murray         NaN  1987     Bit-Serial Neural Networks   
1       Alan    Murray         NaN  2015  Robust Portfolio Optimization   
2    Anthony     Smith         NaN  1987     Bit-Serial Neural Networks   
3    Anthony     Smith         NaN  2015  Robust Portfolio Optimization   
4        Zoe    Butler         NaN  1987     Bit-Serial Neural Networks   

                                            abstract  \
0                                                NaN   
1  We propose a robust portfolio optimization app...   
2                                                NaN   
3  We propose a robust portfolio optimization app...   
4                                                NaN   

                                           full_text  
0  573 \n\nBIT - SERIAL NEURAL  NETWORKS \n\nAlan...  
1  Robust Portfolio Optimization\n\nHuitong Qiu\n...  
2  573 \n\nBIT - SER

In [None]:
# Cleaning & preprocessing: turn each paper's `full_text` into a list of
# lemmatized, stop-word-free tokens stored in `papers['preprocessed_text']`.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# retrieving english stopwords corpus from NLTK
stop_words = stopwords.words('english')
# extra stop words that are not in the NLTK corpus and are specific to NIPS papers, to further reduce noise
stop_words.extend(['from', 'indeed', 'even', 'know', 'look', 'page', 'would', 'select', 'choose', 'university', 'example', 'group', 'unit', 'much', 'many', 'form', 'note', 'case', 'particular', 'could', 'might', 'approximate', 'about', 'thus', 'therefore', 'et', 'al', 'f', 'n', 'x', 'y', 'eg', 'ie', 'p', 'well', 'give', 'word', 'although', 'though', 'either', 'general', 'assume', 'second', 'represent', 'respective', 'correspond', 'input', 'output', 'finally', 'fact', 'define', 'update', 'next', 'compute', 'pair', 'require', 'label', 'change'])
stop_words_set = set(stop_words) # set membership tests are O(1), unlike a list

lemmatizer = WordNetLemmatizer() # creating an instance of the WordNetLemmatizer class


def preprocess_text(text):
    """Return the cleaned token list for one document.

    Steps: lowercase -> tokenize -> keep purely alphabetic tokens longer than
    3 characters -> drop stop words -> lemmatize -> drop stop words again.

    The second stop-word pass is needed because the first one runs on surface
    forms: a plural such as "examples" is not in the stop list, but its lemma
    "example" is, so without the second pass the stop word reappears.

    Parameters
    ----------
    text : any
        Raw document text; coerced with ``str`` so missing values do not raise.

    Returns
    -------
    list of str
        Lemmatized tokens with stop words removed.
    """
    tokens = word_tokenize(str(text).lower())
    # removes punctuation, special characters, numbers and very short tokens
    tokens = [token for token in tokens if token.isalpha() and len(token) > 3]
    tokens = [token for token in tokens if token not in stop_words_set]
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return [lemma for lemma in lemmas if lemma not in stop_words_set]


papers['preprocessed_text'] = papers['full_text'].map(preprocess_text)

print(papers.head())

In [None]:
# Collect the per-document token lists into a plain Python list (one sublist of words per document)
tokenized_papers = papers['preprocessed_text'].tolist()

In [None]:
# Build the vocabulary and the bag-of-words corpus from the tokenized papers
import gensim.corpora as corpora

# Maps every distinct token to a unique integer id; used below to encode each paper
dictionary = corpora.Dictionary(tokenized_papers)

# One bag-of-words per paper: doc2bow returns a list of (token_id, count) tuples,
# where count is how often that token occurs in the document
corpus = list(map(dictionary.doc2bow, tokenized_papers))

In [None]:
# Hold out 20% of the bag-of-words documents for testing; the other 80% are used for training
from sklearn.model_selection import train_test_split

# random_state is pinned to an arbitrary fixed value (42) so the same 80-20 split
# is produced every time the cell runs
training_papers, testing_papers = train_test_split(
    corpus,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train an LDA topic model on the training split, print its topics, and report
# the dominant topic of every document in the full corpus.
from pprint import pprint

# set the desired number of topics (hyperparameter)
num_topics = 10

# Build LDA model
# `id2token` is an attribute (not a method) of the gensim Dictionary holding the
# reverse id -> word mapping; looking up one id first makes sure it is populated.
temp = dictionary[0]
id2word = dictionary.id2token

# LdaMulticore is used instead of LdaModel for large corpuses of text data.
# random_state seeds the model so the topics do not change arbitrarily between
# runs, matching the fixed seed already used for the train/test split.
lda_model = gensim.models.LdaMulticore(
    corpus=training_papers,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
)

# Prints the keywords in the 10 topics
pprint(lda_model.print_topics())

# Topic distribution of every document (training and held-out alike)
doc_lda = lda_model[corpus]

# Prints the most dominant topic in each document
for i, doc in enumerate(doc_lda):
    print('Document', i)
    # each entry of `doc` is a (topic_id, probability) pair; take the most probable topic
    dominant_topic = max(doc, key=lambda x: x[1])[0]
    print('Dominant topic:', dominant_topic)

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the interactive topic visualization from the trained LDA model and the training documents
vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=training_papers,
    dictionary=dictionary,
)

# Render the prepared visualization in the notebook output
pyLDAvis.display(vis)

In [None]:
### LDA MODEL DONE. NOW TRAIN A MODEL THAT USES TF-IDF AND K-MEANS. 

In [None]:
# Fit TF-IDF weights on the training split only
from gensim.models import TfidfModel

tfidf = TfidfModel(corpus=training_papers)

# Re-weight every document of the full corpus: each document becomes a list of
# (token_id, tf-idf score) tuples
corpus_tfidf = tfidf[corpus]

In [None]:
# Cluster the TF-IDF document vectors with k-means and print the highest-weighted
# words of every cluster.
from gensim import matutils
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer

# gensim yields each document as a sparse list of (token_id, weight) pairs, which
# scikit-learn cannot consume directly. corpus2csc builds a (terms x documents)
# sparse matrix, so transpose it to the (documents x terms) layout scikit-learn expects.
tfidf_matrix = matutils.corpus2csc(corpus_tfidf, num_terms=len(dictionary)).T.tocsr()

# Normalize TF-IDF vectors so that their magnitude = 1
tfidf_normalized = normalize(tfidf_matrix)

# Reduce the dimensionality of the TF-IDF vectors to 2 components.
# TruncatedSVD is used because it works directly on sparse matrices; the variable
# keeps the name `pca` only so later references to it stay valid.
pca = TruncatedSVD(n_components=2, random_state=42)
tfidf_pca = pca.fit_transform(tfidf_normalized)

# Choose number of clusters (topics) hyperparameter
num_clusters = 10

# Apply the k-means algorithm to cluster the reduced vectors.
# random_state makes the clustering repeatable; n_init is stated explicitly so the
# number of restarts does not depend on the installed scikit-learn version.
km = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
km.fit(tfidf_pca)

# The centroids live in the 2-D reduced space, so they are projected back into
# term space before ranking; sorting the 2-D coordinates would rank components, not words.
centroids_in_term_space = pca.inverse_transform(km.cluster_centers_)
order_centroids = centroids_in_term_space.argsort()[:, ::-1]

# Column j of the matrix is gensim token id j, so the vocabulary comes from the
# same dictionary that produced the corpus.
terms = [dictionary[token_id] for token_id in range(len(dictionary))]

# Print the top words of each cluster (topic)
for i in range(num_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# ATTEMPT AT COMBINING TF-IDF WITH LDA ALTHOUGH THIS IS NOT USUALLY DONE IN PRACTICE // ignore

# desired number of topics (hyperparameter)
num_topics = 10

# look up one id so the dictionary's reverse mapping is available, then take the
# mapping from integer id back to the corresponding word (token)
temp = dictionary[0]
id2word = dictionary.id2token

# Train an LDA model on the TF-IDF weighted vectors instead of raw counts
lda_model_tfidf = gensim.models.LdaModel(
    corpus=corpus_tfidf,
    num_topics=num_topics,
    id2word=id2word,
)

# Print each topic id followed by the list of its top words
for topic_id, word_weights in lda_model_tfidf.show_topics(num_topics=num_topics, formatted=False):
    print("Topic #{}:".format(topic_id))
    print([word for word, weight in word_weights])