## Load Dependencies

In [19]:
# load dependencies
import pandas as pd
import numpy as np

# NLTK dependencies
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Gensim dependencies
from gensim import corpora
from gensim.models.ldamodel import LdaModel

In [20]:
# load the dataset
df = pd.read_csv("Research_Articles.csv")

# select first 1000 rows of data 
df = df[:1000]

# check first few rows of data 
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


## Preprocess Text

In [21]:
# tokenize and preprocess the text
stop_words = stopwords.words('english')
lemma = WordNetLemmatizer()

def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    tokens = [lemma.lemmatize(t) for t in tokens]
    return tokens

In [22]:
# preprocess text of ABSTRACT column
documents = df["ABSTRACT"].apply(preprocess_text).tolist()

## Create Dictionary and Corpus for LDA

In [23]:
# create a dictionary and corpus for the LDA model
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

## Train LDA Model 

In [24]:
# train the LDA model
num_topics = 10
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, passes=10)

## Check Top 10 Topics

In [25]:
# print the topics and their top words
for topic_id in range(num_topics):
    top_words = [word for word, prob in lda_model.show_topic(topic_id, topn=10)]
    print("Topic {}: {}".format(topic_id, ", ".join(top_words)))

Topic 0: data, model, state, system, analysis, problem, result, k, show, two
Topic 1: star, system, planet, data, model, present, result, problem, function, paper
Topic 2: network, model, system, data, algorithm, problem, method, time, number, neural
Topic 3: p, model, c, show, case, also, system, state, using, result
Topic 4: algorithm, problem, model, network, result, function, method, paper, show, study
Topic 5: method, model, field, magnetic, state, result, quantum, using, energy, phase
Topic 6: data, result, used, control, distribution, function, problem, model, algorithm, also
Topic 7: model, method, network, learning, image, using, data, approach, neural, show
Topic 8: model, result, show, performance, data, study, simulation, based, distribution, system
Topic 9: system, graph, result, study, n, show, set, boundary, using, property


## Check 10 Topics with Probability

In [26]:
# check 10 topics with probability 
lda_model.print_topics(num_topics=10, num_words=10)

[(0,
  '0.008*"data" + 0.006*"model" + 0.006*"state" + 0.005*"system" + 0.005*"analysis" + 0.005*"problem" + 0.004*"result" + 0.004*"k" + 0.004*"show" + 0.004*"two"'),
 (1,
  '0.008*"star" + 0.007*"system" + 0.006*"planet" + 0.006*"data" + 0.006*"model" + 0.004*"present" + 0.004*"result" + 0.004*"problem" + 0.004*"function" + 0.004*"paper"'),
 (2,
  '0.017*"network" + 0.013*"model" + 0.009*"system" + 0.009*"data" + 0.008*"algorithm" + 0.007*"problem" + 0.006*"method" + 0.006*"time" + 0.005*"number" + 0.005*"neural"'),
 (3,
  '0.006*"p" + 0.006*"model" + 0.005*"c" + 0.005*"show" + 0.005*"case" + 0.005*"also" + 0.005*"system" + 0.004*"state" + 0.004*"using" + 0.004*"result"'),
 (4,
  '0.009*"algorithm" + 0.007*"problem" + 0.007*"model" + 0.006*"network" + 0.006*"result" + 0.005*"function" + 0.005*"method" + 0.005*"paper" + 0.005*"show" + 0.005*"study"'),
 (5,
  '0.011*"method" + 0.008*"model" + 0.006*"field" + 0.006*"magnetic" + 0.005*"state" + 0.005*"result" + 0.004*"quantum" + 0.004*"u