In [15]:
import pickle
import pandas as pd
import numpy as np
from pprint import pprint

In [6]:
corpus = pickle.load(open('corpus','rb'))           #list of text/doc
id2word = pickle.load(open('id2word', 'rb'))        #dictionary
model = pickle.load(open('optimal_lda_model', 'rb'))#LDA model



In [22]:
#reading in original df
df=pd.read_csv('processed_clean_df.csv')
df.shape

(258961, 11)

In [25]:
## Tokenize words and further clean-up text
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data = df.abstract_processed.values.tolist()
data_words = list(sent_to_words(data))

# Finding the dominant topic in each document

In [27]:
def format_topics_sentences(ldamodel=model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [28]:
#this will take awhile
df_topic_sents_keywords = format_topics_sentences(ldamodel=model, corpus=corpus, texts=data)

In [37]:
df_topic_sents_keywords.head()

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,15.0,0.3562,"risk, procedure, mask, surgical, use, surgery,...",since 1988 when world health organization (who...
1,17.0,0.3029,"system, use, technology, application, base, di...",the paper is focused on analysis of compliance...
2,22.0,0.3984,"care, patient, health, pandemic, service, heal...",purpose due to covid 19 pandemic the treatment...
3,22.0,0.3532,"care, patient, health, pandemic, service, heal...",backgroundwith no vaccine or treatment for sar...
4,6.0,0.5648,"health, pandemic, disease, public, country, wo...",human history is observing a very horrible and...


In [29]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
# Show
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,15.0,0.3562,"risk, procedure, mask, surgical, use, surgery,...",since 1988 when world health organization (who...
1,1,17.0,0.3029,"system, use, technology, application, base, di...",the paper is focused on analysis of compliance...
2,2,22.0,0.3984,"care, patient, health, pandemic, service, heal...",purpose due to covid 19 pandemic the treatment...
3,3,22.0,0.3532,"care, patient, health, pandemic, service, heal...",backgroundwith no vaccine or treatment for sar...
4,4,6.0,0.5648,"health, pandemic, disease, public, country, wo...",human history is observing a very horrible and...
5,5,14.0,0.265,"study, treatment, review, use, trial, clinical...",inhaled therapy is the cornerstone in the mana...
6,6,12.0,0.4439,"cell, response, expression, immune, induce, me...",fibrosis is driven by a misdirected cell respo...
7,7,3.0,0.4693,"cov, sar, infection, virus, respiratory, viral...",a subset of patients with sars-cov-2 infection...
8,8,12.0,0.839,"cell, response, expression, immune, induce, me...",the reproductive tract in avian females is sen...
9,9,20.0,0.597,"research, development, paper, article, new, pr...",this article discusses the impact of the covid...


In [30]:
df_dominant_topic.shape

(258961, 5)

In [47]:
df_dominant_topic['Title'] = df.title
df_dominant_topic['url'] = df.url

In [48]:
df_dominant_topic.head()

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,Title,url
0,0,15.0,0.3562,"risk, procedure, mask, surgical, use, surgery,...",since 1988 when world health organization (who...,Progress Toward Poliovirus Containment Impleme...,
1,1,17.0,0.3029,"system, use, technology, application, base, di...",the paper is focused on analysis of compliance...,Anti)social Monitoring: Law and (or) Expediency?,
2,2,22.0,0.3984,"care, patient, health, pandemic, service, heal...",purpose due to covid 19 pandemic the treatment...,Running of high patient volume radiation oncol...,
3,3,22.0,0.3532,"care, patient, health, pandemic, service, heal...",backgroundwith no vaccine or treatment for sar...,Rapid Development of a De Novo Convalescent Pl...,http://medrxiv.org/cgi/content/short/2020.10.2...
4,4,6.0,0.5648,"health, pandemic, disease, public, country, wo...",human history is observing a very horrible and...,COVID – 19 and gravid mothers,


In [52]:
df_dominant_topic.to_csv("df_dominant_topic.csv", index=False)

# Find the most representative document for each topic

In [51]:
# Group top 5 doc under each topic
sent_topics_sorted_df = pd.DataFrame()

sent_topics_outdf_grpd = df_dominant_topic.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorted_df = pd.concat([sent_topics_sorted_df, 
                                             grp.sort_values(['Topic_Perc_Contrib'], ascending=[0]).head(5)], 
                                            axis=0)

# Reset Index    
sent_topics_sorted_df.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorted_df.columns = ['Document_No', 'Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text", "Title", "url"]

# Show
sent_topics_sorted_df.head(10)

Unnamed: 0,Document_No,Topic_Num,Topic_Perc_Contrib,Keywords,Text,Title,url
0,863,0.0,0.8339,"student, education, learning, online, school, ...",when the covid-19 pandemic closed her universi...,First Person: The pandemic’s silver lining: Re...,
1,231,0.0,0.7932,"student, education, learning, online, school, ...",the east london gp and honorary clinical senio...,Supporting GPs to tackle domestic violence: fi...,
2,114868,0.0,0.764,"student, education, learning, online, school, ...",during the covid-19 pandemic universities arou...,Perspective from a Teaching and Learning Cente...,https://doi.org/10.5688/ajpe8142; https://www....
3,35771,0.0,0.764,"student, education, learning, online, school, ...",during the covid-19 pandemic universities arou...,Perspective from a Teaching and Learning Cente...,
4,95651,0.0,0.7266,"student, education, learning, online, school, ...",remote learning has been thrust into the spotl...,Promote an unexpected online experience throug...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...
5,27064,1.0,0.9311,"increase, lockdown, period, number, country, c...",we investigate for the first time the empirica...,The Macroeconomic Determinants of COVID19 Mort...,
6,142793,1.0,0.9236,"increase, lockdown, period, number, country, c...",in this study we characterize the impacts of c...,Spatial and temporal variations of air polluti...,https://www.ncbi.nlm.nih.gov/pubmed/33024128/;...
7,60134,1.0,0.9001,"increase, lockdown, period, number, country, c...",in the present study pollutants levels from 24...,Covid-19 and air pollution in indian cities: W...,
8,181787,1.0,0.8929,"increase, lockdown, period, number, country, c...",ron fricker assesses the impact of the pandemi...,Covid-19: One year on…,
9,217422,1.0,0.8929,"increase, lockdown, period, number, country, c...",ron fricker assesses the impact of the pandemi...,Covid‐19: One year on…,https://www.ncbi.nlm.nih.gov/pubmed/33821159/;...


In [43]:
sent_topics_sorted_df[['Topic_Num','Keywords', 'Title']]

Unnamed: 0,Topic_Num,Keywords,Title
0,0.0,"student, education, learning, online, school, ...",First Person: The pandemic’s silver lining: Re...
1,1.0,"increase, lockdown, period, number, country, c...",The Macroeconomic Determinants of COVID19 Mort...
2,2.0,"protein, drug, target, vaccine, host, interact...",In Silico Identification of a Potent Arsenic B...
3,3.0,"cov, sar, infection, virus, respiratory, viral...",Evaluation of cutaneous symptoms in children i...
4,4.0,"patient, case, symptom, day, ct, follow, clini...",Olfactory disorder in patients infected with S...
5,5.0,"child, health, mental, anxiety, psychological,...",The immediate mental health impacts of the COV...
6,6.0,"health, pandemic, disease, public, country, wo...",Strategies for prevention and control of COVID...
7,7.0,"ace, lung, receptor, tissue, entry, angiotensi...","[2019 novel coronavirus, angiotensin convertin..."
8,8.0,"policy, economic, pandemic, social, crisis, im...",An ayuda to the least advantaged: providing a ...
9,9.0,"food, high, energy, temperature, surface, prod...","Highly regioselective 1,3-dipolar cycloadditio..."


# Topic distribution across documents

In [46]:
# Number of Documents for Each Topic
topic_counts = df_dominant_topic['Dominant_Topic'].value_counts()

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 4)

# Topic Number and Keywords
topic_num_keywords = df_dominant_topic[['Dominant_Topic', 'Keywords']]

# Concatenate Column wise
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)

# Change Column names
df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']

# Show
df_dominant_topics.head()

Unnamed: 0,Dominant_Topic,Topic_Keywords,Num_Documents,Perc_Documents
0.0,15.0,"risk, procedure, mask, surgical, use, surgery,...",4262.0,0.0165
1.0,17.0,"system, use, technology, application, base, di...",11065.0,0.0427
2.0,22.0,"care, patient, health, pandemic, service, heal...",11384.0,0.044
3.0,22.0,"care, patient, health, pandemic, service, heal...",16376.0,0.0632
4.0,6.0,"health, pandemic, disease, public, country, wo...",13957.0,0.0539
