In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#LDA training
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA

documents = pd.read_csv('Aug25_feeds.csv')


def classifyDocuments(text):
	# Initialise the count vectorizer with the English stop words
	count_vectorizer = CountVectorizer(stop_words='english')
	# Fit and transform the processed titles
	count_data = count_vectorizer.fit_transform(documents[text].astype('str'))
	# Helper function
	def print_topics(model, count_vectorizer, n_top_words):
	    words = count_vectorizer.get_feature_names()
	    for topic_idx, topic in enumerate(model.components_):
	        print("\nTopic #%d:" % topic_idx)
	        print(" ".join([words[i]
	                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
	# Tweak the two parameters below
	number_topics = 10
	number_words = 10

	# Create and fit the LDA model
	lda = LDA(n_components=number_topics, n_jobs=-1, max_iter = 10)
	lda.fit(count_data)

	# Print the topics found by the LDA model
	print("Topics found via LDA based on:" + text)
	print_topics(lda, count_vectorizer, number_words)
	#label each document to different topics
	topic_values = lda.transform(count_data)
	column_name = text + 'topic_lda'
	documents[column_name] = topic_values.argmax(axis=1)

classifyDocuments("preprocessed_title")
classifyDocuments("description_lem")

documents.to_csv('Aug25_feeds.csv',index = False)

Topics found via LDA based on:preprocessed_title

Topic #0:
covid19 medical new devices manufacturing disease health heart drug cancer

Topic #1:
fda covid19 coronavirus guidance test new update clinical issues use

Topic #2:
fda new drug research approves ema covid19 board shortages generics

Topic #3:
covid19 coronavirus 2020 update roundup recon daily vaccine medtech merck

Topic #4:
cancer therapy gene recon drug cell new phase trial fda

Topic #5:
fda device medical guidance approves new drug patients finalizes treatment

Topic #6:
fda biosimilars biosimilar drug regulatory products pact biological asia roundup

Topic #7:
eu regulatory uk vaccine medical mdr covid19 device new roundup

Topic #8:
supplier biotech week nabs stories covid19 raises rd mdm west

Topic #9:
new company 3d medtech printing medtronic covid19 fda blood point
Topics found via LDA based on:description_lem

Topic #0:
patient device company said data technology medical use study year

Topic #1:
health biosimila

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#LDA training
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA

documents = pd.read_csv('Aug25_feeds.csv')


def classifyDocuments(text):
    # Initialise the count vectorizer with the English stop words
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the processed titles
    count_data = count_vectorizer.fit_transform(documents[text].astype('str'))
    # build lda model
    lda_model = LDA(n_components=10,max_iter=10,learning_method='online',
                    random_state=100,batch_size=128,evaluate_every=-1,n_jobs=-1,)
    lda_output = lda_model.fit_transform(count_data)
    print(lda_model)
    
classifyDocuments("preprocessed_title")
classifyDocuments("description_lem")

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=-1,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)
LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=-1,
                          perp_tol=0.1, random_state=100, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)


In [19]:
from sklearn.model_selection import GridSearchCV
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Fit and transform the processed titles
data_vectorized = count_vectorizer.fit_transform(documents['description_lem'].astype('str'))
search_params= {'n_components':[10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]}
lda = LDA(max_iter=5, learning_method='online', learning_offset=50., random_state=0)
#init grid search class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
       estimator=LDA(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
            perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
         n_jobs=1,
       param_grid={'n_components': [10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)



GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method=None,
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=1,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total_

In [20]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 10}
Best Log Likelihood Score:  -1009615.978137878
Model Perplexity:  5724.106893796683


In [32]:
# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = documents['id'].to_numpy()
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic["dominant_topic"] = dominant_topic
# Styling
def color_green(val):
    color = "green" if val > .1 else "black"
    return "color: {col}".format(col=color)
def make_bold(val):
    weight = 700 if val > .1 else 400
    return "font-weight: {weight}".format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
16552,0.0,0.75,0.0,0.0,0.0,0.22,0.0,0.0,0.0,0.0,1
16553,0.0,0.89,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,1
16554,0.0,0.91,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
16555,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.9,0.01,8
16556,0.0,0.73,0.0,0.0,0.0,0.0,0.0,0.15,0.09,0.0,1
16571,0.01,0.42,0.01,0.01,0.01,0.01,0.01,0.01,0.53,0.01,8
16572,0.01,0.65,0.01,0.01,0.01,0.01,0.01,0.01,0.3,0.01,1
16573,0.01,0.44,0.01,0.14,0.32,0.01,0.01,0.01,0.06,0.01,1
16574,0.0,0.96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1111,0.0,0.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [34]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
df_topic_keywords.columns = count_vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

Unnamed: 0,00,000,000028,0001,0005,001,0014â,0018â,002,0020,...,î¼g,î¼gpatch,česky,œuvre,širokou,životy,здравоохранения,министерство,российской,федерации
Topic0,0.168749,0.159472,0.169487,0.15091,0.15625,0.159539,0.161847,0.157676,0.159756,0.15904,...,0.162936,0.155155,0.148532,0.167282,0.153631,0.156014,0.157805,0.15525,0.153787,0.153757
Topic1,0.900397,2.714902,0.166981,2.386586,1.411663,1.159303,1.017418,1.001755,0.189989,0.20871,...,0.403766,0.32844,0.157298,0.14835,0.164739,0.151736,0.178705,0.191376,0.178033,0.175259
Topic2,0.161869,0.163331,0.16874,0.163161,0.153656,0.15676,0.154282,0.157431,0.180176,0.152079,...,0.161295,0.16367,1.000538,0.15256,1.081641,1.046528,0.154779,0.157001,0.153023,0.158199
Topic3,0.153437,0.151178,1.028097,0.494333,0.157918,0.894259,0.173122,0.161204,1.019912,0.144961,...,0.146625,0.154692,0.148362,0.149776,0.144432,0.160781,0.160068,0.154916,0.151464,0.156971
Topic4,0.158714,0.162864,0.160099,0.158465,0.150876,0.165568,0.155906,0.148754,0.15495,0.159514,...,1.799325,1.796135,0.160377,0.154893,0.160179,0.160215,0.153111,0.166934,0.152507,0.157215


In [38]:
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=count_vectorizer, lda_model=best_lda_model, n_words=10)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,,impella,ghx,amblyoplay,shuttle,seraph,abiomed,beep,odenkirk,pizza
Topic 1,device,company,said,patient,medical,fda,new,drug,covid19,product
Topic 2,martha,5g,rempel,gupta,enamel,mmwaves,blocker,tosaf,mcdc,butterfly
Topic 3,jewellery,streamone,cadmium,newsletter,og,ripretinib,mxf,heffernen,cerebral,paxxus
Topic 4,badams,biosimilar,drug,ntaylor,phase,thu,trial,biotech,mon,wed
Topic 5,le,la,et,texte,du,en,santé,presse,covid19,welding
Topic 6,sicd,recallsâ,gbil,retrieving,toole,dualchamber,oâ,autolap,burst,difficulty
Topic 7,biosticker,copolymer,mault,csf,gfap,lnp,parikh,dural,biointellisense,sabic
Topic 8,regulatory,news,daily,welcome,intelligence,briefing,reconnaissance,roundup,update,overview
Topic 9,plastic,medaccred,audit,moulding,injection,accreditation,staking,tip,pulsestaking,swaging


In [39]:
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Fit and transform the processed titles
data_vectorized = count_vectorizer.fit_transform(documents['preprocessed_title'].astype('str'))
search_params= {'n_components':[10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]}
lda = LDA(max_iter=5, learning_method='online', learning_offset=50., random_state=0)
#init grid search class
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search
model.fit(data_vectorized)
GridSearchCV(cv=None, error_score='raise',
       estimator=LDA(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=-1,
            perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0),
         n_jobs=1,
       param_grid={'n_components': [10, 15, 20, 25, 30], 'learning_decay': [0.5, 0.7, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)



GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method=None,
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=-1,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                                 total

In [40]:
# Best Model
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'learning_decay': 0.7, 'n_components': 10}
Best Log Likelihood Score:  -159116.58984956337
Model Perplexity:  4657.5724669559895


In [41]:
# Create Document — Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)
# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# index names
docnames = documents['id'].to_numpy()
# Make the pandas dataframe
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Get dominant topic for each document
dominant_topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic["dominant_topic"] = dominant_topic
# Styling
def color_green(val):
    color = "green" if val > .1 else "black"
    return "color: {col}".format(col=color)
def make_bold(val):
    weight = 700 if val > .1 else 400
    return "font-weight: {weight}".format(weight=weight)
# Apply Style
df_document_topics = df_document_topic.head(15).style.applymap(color_green).applymap(make_bold)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
16552,0.01,0.01,0.01,0.01,0.89,0.01,0.01,0.01,0.01,0.01,4
16553,0.01,0.01,0.01,0.01,0.01,0.01,0.14,0.01,0.76,0.01,8
16554,0.01,0.01,0.58,0.01,0.01,0.01,0.01,0.01,0.19,0.14,2
16555,0.01,0.65,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.3,1
16556,0.02,0.02,0.02,0.02,0.02,0.85,0.02,0.02,0.02,0.02,5
16571,0.72,0.21,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0
16572,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.26,0.01,0.64,9
16573,0.76,0.14,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0
16574,0.01,0.01,0.01,0.01,0.01,0.01,0.9,0.01,0.01,0.01,6
1111,0.01,0.55,0.01,0.35,0.01,0.01,0.01,0.01,0.01,0.01,1


In [42]:
# Topic-Keyword Matrix
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# Assign Column and Index
df_topic_keywords.columns = count_vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

Unnamed: 0,039death,039discommon039,039finally039,039igniting,039iphone,039more,039pulse039,039robot,039rosa039colored,039there039s,...,zirabev,zogenix,zolgensma,zoll,zotefoams,zumutor,zydelig,zydus,zynteglo,čr
Topic0,0.722593,0.100629,0.100735,0.100539,2.278888,1.507963,0.100654,2.27813,0.963769,0.100625,...,0.100577,0.100642,0.100604,0.10089,0.100534,0.100686,0.100618,0.100603,0.100594,0.805651
Topic1,0.100593,0.896583,0.100778,0.100601,0.100583,0.100556,0.100632,0.100601,0.100644,0.125211,...,0.100699,0.100534,0.10068,0.101002,0.100708,0.100519,0.100639,0.100661,0.100549,0.100627
Topic2,0.100672,0.102052,0.100534,0.100619,0.100726,0.100642,0.100622,0.100648,0.102241,1.348197,...,0.101878,0.106808,0.100822,0.10064,0.100623,0.100493,0.106243,0.10049,0.100644,0.100665
Topic3,0.100579,0.100597,0.100716,0.100677,0.103507,0.101446,0.74872,0.104349,0.100579,0.100535,...,0.100636,0.100703,0.101881,0.100744,0.100759,0.101369,0.105811,0.102122,0.105879,0.100543
Topic4,0.10065,0.10066,0.100478,0.100571,0.100521,0.100683,0.100718,0.100562,0.100632,0.109237,...,0.100617,0.100634,0.100657,0.100602,0.103548,0.100667,0.10056,0.10065,0.100537,0.100552


In [43]:
# Show top n keywords for each topic
def show_topics(vectorizer, lda_model, n_words):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
topic_keywords = show_topics(vectorizer=count_vectorizer, lda_model=best_lda_model, n_words=10)
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,nabs,biotech,lilly,ipo,vaccines,future,buyout,cancer,report,way
Topic 1,recon,deal,drugs,amid,merck,chutes,ladders,ceo,abbvie,device
Topic 2,2020,medtech,cell,biosimilar,antibody,cancer,launches,rd,uk,company
Topic 3,covid19,coronavirus,fda,update,medical,new,device,devices,treatment,test
Topic 4,fda,guidance,covid19,new,clinical,trials,issues,drug,use,development
Topic 5,roundup,regulatory,daily,eu,study,announces,3d,global,health,asia
Topic 6,covid19,vaccine,therapy,gene,pfizer,supplier,recon,production,raises,week
Topic 7,patients,improve,blood,demand,medtronic,surgical,heart,know,develop,systems
Topic 8,approves,fda,drug,phase,manufacturing,trial,biosimilars,cancer,data,recon
Topic 9,fda,drug,gilead,recon,approvals,remdesivir,shortages,hit,delivery,breast
