<a href="https://colab.research.google.com/github/clarariachi/MAIS-202-Final-Project/blob/main/Topic_Modelling_With_NIPS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# import all needed libraries, transfer my workspace to google colab, and load csv files into dataframes
import pandas as pd
import gensim 
import wordcloud
import gensim.downloader as api
!pip install -U scikit-learn
!pip3 install pyLDAvis # for visualizing topic models
import nltk # for preprocessing
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
import os
from google.colab import drive

drive.mount("/content/gdrive", force_remount=True)            
WORK_AREA = "/content/gdrive/" + r'MyDrive/neurips_project/'
os.chdir(WORK_AREA) 

authors_url = 'https://drive.google.com/uc?id=1n8TifV2zNsePkVHXv8iOTKObSA_rhfPS'
authors = pd.read_csv(f'{WORK_AREA}/authors_nips.csv')

papers_url = 'https://drive.google.com/uc?id=1BsRS4uD54hupdk7XI4S3o5tIpql5koAH' 
papers = pd.read_csv(f'{WORK_AREA}/papers.csv')

#papers = pd.merge(authors, docs, on = "source_id")

#papers.drop(['source_id'], axis=1, inplace=True) # removing metadata
print(papers.head())

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Mounted at /content/gdrive
   source_id  year                                              title  \
0         27  1987                         Bit-Serial Neural Networks   
1         63  1987                        Connectivity Versus Entropy   
2         60  1987        The Hopfield Model with Multi-Level Neurons   
3         59  1987                               How Neural Nets Work   
4         69  1987  Spatial Organization of Neural Networks: A Pro...   

  abstract                                          full_text  
0      NaN  573 \n\nBIT - SERIAL NEURAL  NETWORKS \n\nAlan...  
1      NaN  1 \n\nCONNECTIVITY VERSUS ENTROPY \n\nYaser  S...  
2      NaN  278 \n\nTHE HOPFIELD MODEL WITH MUL TI-LEVEL N...  
3      NaN  442 \n\nAlan  Lapedes \nRobert  Farber \n\nThe...  
4      NaN  740 \n\nSPATIAL  ORGANIZATION  OF  NEURAL  NEn...  


In [12]:
# cleaning & preprocessing full_text

# tokenization of full_text
from nltk.tokenize import word_tokenize
papers['preprocessed_text'] = papers['full_text'].apply(lambda x: word_tokenize(str(x))) # splitting each string of text in the full_text column into a list of individual words
papers['preprocessed_text'] = papers['preprocessed_text'].apply(lambda x: [word.lower() for word in x if word.isalpha() and len(word)>3]) # removes punctuation, special characters, and numbers from the paper text data

# retrieving english stopwords corpus from NLTK
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'indeed', 'even', 'know', 'look', 'page', 'would', 'select', 'choose', 'university', 'example', 'group', 'unit', 'much', 'many', 'form', 'note', 'case', 'particular', 'could', 'might', 'approximate', 'about', 'thus', 'therefore', 'et', 'al', 'f', 'n', 'x', 'y', 'eg', 'ie', 'p', 'well', 'give', 'word', 'although', 'though', 'either', 'general', 'assume', 'second', 'represent', 'respective', 'correspond', 'input', 'output', 'finally', 'fact', 'define', 'update', 'next', 'compute', 'pair', 'require', 'label', 'change']) # removing more stopwords not already in the NLTK corpus and which are specific to NIPS papers to further reduce noise
stop_words_set = set(stop_words)
papers['preprocessed_text'] = papers['preprocessed_text'].apply(lambda x: [word for word in x if word not in stop_words_set]) # removing stopwords

# lemmatizing words in paper_text
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() # creating an instance of the WordNetLemmatizer class
papers['preprocessed_text'] = papers['preprocessed_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x]) # passing each word in the text of each document into the lemmatize method 

print(papers.head())

   source_id  year                                              title  \
0         27  1987                         Bit-Serial Neural Networks   
1         63  1987                        Connectivity Versus Entropy   
2         60  1987        The Hopfield Model with Multi-Level Neurons   
3         59  1987                               How Neural Nets Work   
4         69  1987  Spatial Organization of Neural Networks: A Pro...   

  abstract                                          full_text  \
0      NaN  573 \n\nBIT - SERIAL NEURAL  NETWORKS \n\nAlan...   
1      NaN  1 \n\nCONNECTIVITY VERSUS ENTROPY \n\nYaser  S...   
2      NaN  278 \n\nTHE HOPFIELD MODEL WITH MUL TI-LEVEL N...   
3      NaN  442 \n\nAlan  Lapedes \nRobert  Farber \n\nThe...   
4      NaN  740 \n\nSPATIAL  ORGANIZATION  OF  NEURAL  NEn...   

                                   preprocessed_text  
0  [serial, neural, network, alan, murray, anthon...  
1  [connectivity, versus, entropy, yaser, califor...  
2  [h

In [13]:
# creating a list of sublists where each sublist contains the words of each document
tokenized_papers = [row for row in papers['preprocessed_text']]

In [14]:
# Create a dictionary that maps tokens (words) to unique integer ids for each word
import gensim.corpora as corpora
dictionary = corpora.Dictionary(tokenized_papers) # this dictionary will be used to create the bag-of-words representation for each paper
# Create the bag-of-words representation for each paper
corpus = [dictionary.doc2bow(doc) for doc in tokenized_papers] # the doc2bow method returns a list of tuples where the 1st element of the tuple is the unique integer id of each word and the 2nd element is the frequency of each word in the document

In [15]:
# Split the data into 80% training and 20% testing
from sklearn.model_selection import train_test_split
training_papers, testing_papers = train_test_split(corpus, test_size=0.2, random_state=42) # set random_state parameter to an arbitrary fixed value (42) to ensure 80-20 split each time we run the code

In [16]:
from pprint import pprint
# set the desired number of topics (hyperparameter)
num_topics = 10
# Build LDA model
temp = dictionary[0] # loads the dictionary to be able to use it
id2word=dictionary.id2token # creates a dictionary that maps the integer id back to the corresponding word (token)
lda_model = gensim.models.LdaMulticore(corpus=training_papers, id2word=id2word, num_topics=num_topics) # LdaMulticore is used instead of LdaModel for large corpuses of text data
                                                                                                                   # id2token is a method of the Dictionary class that reverses keys and values
# Prints the keywords in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
# Prints the most dominant topic in each document
for i, doc in enumerate(doc_lda):
    print('Document', i)
    dominant_topic = max(doc, key=lambda x: x[1])[0]
    print('Dominant topic:', dominant_topic)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Document 7180
Dominant topic: 5
Document 7181
Dominant topic: 9
Document 7182
Dominant topic: 3
Document 7183
Dominant topic: 9
Document 7184
Dominant topic: 1
Document 7185
Dominant topic: 0
Document 7186
Dominant topic: 9
Document 7187
Dominant topic: 9
Document 7188
Dominant topic: 2
Document 7189
Dominant topic: 9
Document 7190
Dominant topic: 3
Document 7191
Dominant topic: 8
Document 7192
Dominant topic: 7
Document 7193
Dominant topic: 5
Document 7194
Dominant topic: 2
Document 7195
Dominant topic: 7
Document 7196
Dominant topic: 5
Document 7197
Dominant topic: 7
Document 7198
Dominant topic: 7
Document 7199
Dominant topic: 4
Document 7200
Dominant topic: 3
Document 7201
Dominant topic: 7
Document 7202
Dominant topic: 7
Document 7203
Dominant topic: 0
Document 7204
Dominant topic: 5
Document 7205
Dominant topic: 7
Document 7206
Dominant topic: 8
Document 7207
Dominant topic: 3
Document 7208
Dominant topic: 3
Documen

In [17]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Visualize the topics
vis = gensimvis.prepare(lda_model, training_papers, dictionary=dictionary)
pyLDAvis.display(vis)

  default_term_info = default_term_info.sort_values(


In [None]:
### LDA MODEL DONE. NOW TRAIN A MODEL THAT USES TF-IDF AND K-MEANS. 

In [18]:
# Train a TF-IDF model on the training_papers
from gensim.models import TfidfModel
tfidf = gensim.models.TfidfModel(training_papers)
# Convert each document into a list of tuples where the first element is the word's integer id within the document and the second element is its TF_IDF score 
corpus_tfidf = tfidf[corpus]

In [19]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer

# Normalize TF-IDF vectors so that their magnitude = 1
tfidf_normalized = normalize(corpus_tfidf) 

# Apply PCA to reduce the dimensionality of the TF-IDF vectors
pca = PCA(n_components=2, random_state=42) 
tfidf_pca = pca.fit_transform(tfidf_normalized)

# Chooose number of clusters (topics) hyperparameter
num_clusters = 10 

# Apply the k-means algorithm to cluster the TF-IDF vectors
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_pca)

# Print the top words of each cluster (topic)
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df= 0.05, stop_words=stop_words) # max_df filters words that appear in over 80% of the documents and min_df filters words that appear in less than 5 percent of documents
terms = tfidf_vectorizer.get_feature_names()
for i in range(num_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

ValueError: ignored

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# ATTEMPT AT COMBINING TF-IDF WITH LDA ALTHOUGH THIS IS NOT USUALLY DONE IN PRACTICE // ignore
# set the desired number of topics (hyperparameter)
num_topics = 10
# load the dictionary to be able to use it
temp = dictionary[0]
# create a dictionary that maps the integer id back to the corresponding word (token) 
id2word=dictionary.id2token 
# Train an LDA model on the corpus using the TF-IDF weighted vectors
lda_model_tfidf = gensim.models.LdaModel(corpus_tfidf, num_topics=num_topics, id2word=id2word)

# Print the topics and their top words
for topic in lda_model_tfidf.show_topics(num_topics=num_topics, formatted=False):
    print("Topic #{}:".format(topic[0]))
    print([word[0] for word in topic[1]])