In [1]:
# Fetch the NLTK resources used below (stop-word list, WordNet, punkt models).
# The stopwords download has to run before stopwords.words('english') is called.
import nltk
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
# English stop words, held as a set.
stop = set(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import string
# Punctuation characters from string.punctuation, held as a set.
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
# NOTE(review): stop, exclude, lemma and the tokenizer/stemmer/gensim imports are
# not referenced by any cell visible in this part of the notebook -- confirm
# whether they are still needed.

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
import csv
from csv import reader

# Read the whole titles CSV into `tit`: one entry per CSV row, each entry being
# the list of that row's column strings.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive.
tit = []
# newline='' is what the csv module requires of file objects handed to reader();
# without it, line breaks embedded in quoted fields are not interpreted correctly.
with open("/content/drive/My Drive/topic modelling dataset/COVID-19_title.csv", 'r', newline='') as read_obj:
    # reader() yields each row as a list of strings; consume it while the
    # file is still open.
    tit.extend(reader(read_obj))
        

In [4]:
# Keep only the first column of every CSV row (the title text).
val = [row[0] for row in tit]

In [5]:
# Sanity check: show the first five entries of val.
print(val[:5])

['clinical features culture-proven mycoplasma pneumoniae infections king abdulaziz university hospital jeddah saudi arabia', 'nitric oxide pro-inflammatory mediator lung diseas', 'surfactant protein-d pulmonary host defens', 'role endothelin-1 lung diseas', 'gene expression epithelial cells response pneumovirus infect']


Generating a document term matrix

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF document-term matrix over the titles: English stop words removed,
# terms present in more than half of the documents dropped (max_df=0.5),
# vocabulary capped at the 1000 most frequent remaining terms.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=1000,
    max_df=0.5,
    smooth_idf=True,
)
X = vectorizer.fit_transform(val)

# (number of documents, number of terms)
X.shape

(242108, 1000)

Topic Modeling

In [7]:
from sklearn.decomposition import TruncatedSVD

# Decompose the TF-IDF matrix with truncated SVD so that terms and documents
# are both represented as vectors in a low-dimensional space.
# First attempt: 8 components, i.e. 8 topics.
svd_model = TruncatedSVD(
    n_components=8,
    algorithm="randomized",
    n_iter=100,
    random_state=122,  # fixed seed for the randomized solver
)
svd_model.fit(X)

# One row of components_ per topic.
len(svd_model.components_)

8

In [9]:
# The rows of svd_model.components_ are the topics: each row holds one weight
# per vocabulary term.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    # Pair every term with its weight in this component and keep the 7
    # highest-weighted terms.
    sorted_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0],end=" ")
    print()
        

Topic 0: 
covid 19 pandem pandemic patients coronavirus disease 
Topic 1: 
sars cov coronavirus disease 2019 respiratory infection 
Topic 2: 
coronavirus disease 2019 novel respiratory acute syndrome 
Topic 3: 
virus respiratory acute syndrome severe viral patients 
Topic 4: 
health pandemic pandem public care virus mental 
Topic 5: 
pandem respiratory acute syndrome severe care management 
Topic 6: 
virus influenza disease protein infectious human pandem 
Topic 7: 
patients review clinical systematic disease meta analysi 


In [12]:
from sklearn.decomposition import TruncatedSVD

# Refit on the same TF-IDF matrix, this time with 7 topics.
svd_model = TruncatedSVD(
    n_components=7,
    algorithm="randomized",
    n_iter=100,
    random_state=122,  # fixed seed for the randomized solver
)
svd_model.fit(X)


TruncatedSVD(algorithm='randomized', n_components=7, n_iter=100,
             random_state=122, tol=0.0)

In [13]:
# Print the 7 highest-weighted terms of every topic in the current svd_model.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    # Rank (term, weight) pairs by weight, largest first.
    sorted_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0],end=" ")
    print()

Topic 0: 
covid 19 pandem pandemic patients coronavirus disease 
Topic 1: 
sars cov coronavirus disease 2019 respiratory infection 
Topic 2: 
coronavirus disease 2019 novel respiratory acute syndrome 
Topic 3: 
virus respiratory acute syndrome severe viral patients 
Topic 4: 
health pandemic pandem public care virus mental 
Topic 5: 
pandem respiratory acute syndrome severe care management 
Topic 6: 
virus influenza disease protein infectious human pandem 


In [14]:
from sklearn.decomposition import TruncatedSVD

# Refit on the same TF-IDF matrix, this time with 11 topics.
svd_model = TruncatedSVD(
    n_components=11,
    algorithm="randomized",
    n_iter=100,
    random_state=122,  # fixed seed for the randomized solver
)
svd_model.fit(X)

TruncatedSVD(algorithm='randomized', n_components=11, n_iter=100,
             random_state=122, tol=0.0)

In [15]:
# Print the 7 highest-weighted terms of every topic in the current svd_model.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    # Rank (term, weight) pairs by weight, largest first.
    sorted_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0],end=" ")
    print()

Topic 0: 
covid 19 pandem pandemic patients coronavirus disease 
Topic 1: 
sars cov coronavirus disease 2019 respiratory infection 
Topic 2: 
coronavirus disease 2019 novel respiratory acute syndrome 
Topic 3: 
virus respiratory acute syndrome severe viral patients 
Topic 4: 
health pandemic pandem public care virus mental 
Topic 5: 
pandem respiratory acute syndrome severe care management 
Topic 6: 
virus influenza disease protein infectious human pandem 
Topic 7: 
patients review clinical systematic disease meta analysi 
Topic 8: 
pandemic pandem impact management patients influenza surgery 
Topic 9: 
review systematic meta analysi case report treatment 
Topic 10: 
disease 2019 virus acute respiratory severe syndrome 


Changing the maximum number of features (`max_features`) in the document-term matrix

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same TF-IDF settings as before, but the vocabulary cap is raised:
# keep the top 10000 terms.
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=10000,
    max_df=0.5,
    smooth_idf=True,
)
X = vectorizer.fit_transform(val)

# (number of documents, number of terms)
X.shape

(242108, 10000)

In [17]:
from sklearn.decomposition import TruncatedSVD

# Decompose the rebuilt TF-IDF matrix with truncated SVD so that terms and
# documents are represented as vectors; 8 topics again.
svd_model = TruncatedSVD(
    n_components=8,
    algorithm="randomized",
    n_iter=100,
    random_state=122,  # fixed seed for the randomized solver
)
svd_model.fit(X)

TruncatedSVD(algorithm='randomized', n_components=8, n_iter=100,
             random_state=122, tol=0.0)

In [19]:
# The rows of svd_model.components_ are the topics: each row holds one weight
# per vocabulary term.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    # Pair every term with its weight in this component and keep the 7
    # highest-weighted terms.
    sorted_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0],end=" ")
    print()

# Almost the same results were observed.

Topic 0: 
covid 19 pandem patients coronavirus pandemic disease 
Topic 1: 
coronavirus sars cov 2019 disease respiratory acute 
Topic 2: 
sars cov infection covid 19 infect potential 
Topic 3: 
respiratory acute syndrome severe virus viral infections 
Topic 4: 
health pandemic public care mental virus global 
Topic 5: 
pandem coronavirus syndrome acute respiratory severe sars 
Topic 6: 
virus pandem influenza human protein infectious infect 
Topic 7: 
pandem patients review systematic clinical care meta 


Raising `max_features` to a very high value

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same TF-IDF settings, with the vocabulary cap raised to 150000 terms.
# The cap is larger than the vocabulary that survives filtering, so it does
# not bind here (the resulting matrix has 75177 columns).
vectorizer = TfidfVectorizer(
    stop_words="english",
    max_features=150000,
    max_df=0.5,
    smooth_idf=True,
)
X = vectorizer.fit_transform(val)

# (number of documents, number of terms)
X.shape

(242108, 75177)

In [21]:
from sklearn.decomposition import TruncatedSVD

# Decompose the rebuilt TF-IDF matrix with truncated SVD so that terms and
# documents are represented as vectors; 8 topics.
svd_model = TruncatedSVD(
    n_components=8,
    algorithm="randomized",
    n_iter=100,
    random_state=122,  # fixed seed for the randomized solver
)
svd_model.fit(X)

TruncatedSVD(algorithm='randomized', n_components=8, n_iter=100,
             random_state=122, tol=0.0)

In [22]:
# The rows of svd_model.components_ are the topics: each row holds one weight
# per vocabulary term.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    # Pair every term with its weight in this component and keep the 7
    # highest-weighted terms.
    sorted_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0],end=" ")
    print()

# Almost the same results were observed.

Topic 0: 
covid 19 coronavirus patients pandem disease pandemic 
Topic 1: 
coronavirus sars cov respiratory 2019 disease acute 
Topic 2: 
sars cov infection infect covid 19 detection 
Topic 3: 
respiratory acute syndrome severe virus viral syndrom 
Topic 4: 
health public mental care pandemic pandem global 
Topic 5: 
review systematic patients virus health meta clinical 
Topic 6: 
virus influenza human infectious viral infect protein 
Topic 7: 
pandem review systematic virus care patients meta 


In [23]:
from sklearn.decomposition import TruncatedSVD

# Decompose the same TF-IDF matrix with truncated SVD so that terms and
# documents are represented as vectors; 11 topics this time.
svd_model = TruncatedSVD(
    n_components=11,
    algorithm="randomized",
    n_iter=100,
    random_state=122,  # fixed seed for the randomized solver
)
svd_model.fit(X)

TruncatedSVD(algorithm='randomized', n_components=11, n_iter=100,
             random_state=122, tol=0.0)

In [24]:
# The rows of svd_model.components_ are the topics: each row holds one weight
# per vocabulary term.

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    # Pair every term with its weight in this component and keep the 7
    # highest-weighted terms.
    sorted_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:7]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0],end=" ")
    print()

# Almost the same results were observed.

Topic 0: 
covid 19 coronavirus patients pandem disease pandemic 
Topic 1: 
coronavirus sars cov respiratory 2019 disease acute 
Topic 2: 
sars cov infection infect covid 19 detection 
Topic 3: 
respiratory acute syndrome severe virus viral syndrom 
Topic 4: 
health public mental care pandemic pandem global 
Topic 5: 
review systematic patients virus health meta clinical 
Topic 6: 
virus influenza human infectious viral infect protein 
Topic 7: 
pandem review systematic virus care patients meta 
Topic 8: 
review systematic meta analysi coronavirus pandemic 19 
Topic 9: 
pandemic pandem care patients influenza impact management 
Topic 10: 
clinical novel infection china coronavirus pneumonia pandem 
