In [1]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim.models import LdaMulticore
import pandas as pd
import numpy as np
np.random.seed(400)

In [2]:
import nltk
# Fetch the WordNet corpus into the local nltk_data directory
# (reported as already up to date when it is present).
# NOTE(review): no cell visible in this notebook uses an NLTK lemmatizer
# (lemmas come from spaCy) - confirm whether this download is still needed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import spacy.cli

SPACY_MODEL = "en_core_web_sm"

# Load spaCy model (parser and NER are disabled: only POS tags and lemmas are used).
# The model is downloaded only when it is not installed yet, so re-running the
# notebook does not hit the network / pip on every execution.
try:
    nlp = spacy.load(SPACY_MODEL, disable=['parser', 'ner'])
except OSError:
    # spacy.load raises OSError when the model package cannot be found
    spacy.cli.download(SPACY_MODEL)
    nlp = spacy.load(SPACY_MODEL, disable=['parser', 'ner'])

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [7]:
# Load the 'New Tweet' column of the combined Drugs workbook as a pandas Series.
# NOTE(review): presumably this column holds the already-cleaned tweet text - confirm.
data = pd.read_excel('Drugs.xlsx')['New Tweet']
data.head()

0    lauren boebert calling abolishment dept educat...
1    wish had read seen post last week bought ascor...
2    its covid amoxicillin will not work pretty sur...
3    alarm got coincidentally i have just started a...
4    #beantibioticsaware it is #antibioticawareness...
Name: New Tweet, dtype: object

In [5]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    Parameters
    ----------
    texts : iterable
        Raw documents. Missing values (None / NaN) are treated as empty
        strings; any other non-string value is converted with ``str``.
    allowed_postags : collection of str
        Coarse spaCy POS tags (``token.pos_``) to keep.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input text, in input order.

    Notes
    -----
    Uses the notebook-level ``nlp`` spaCy pipeline.
    """
    # Handle NaN values (and non-string cells) before handing text to spaCy
    cleaned = ('' if pd.isna(text) else str(text) for text in texts)
    # nlp.pipe processes the texts in batches instead of one call per document
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(cleaned)
    ]


lemmatized_texts = lemmatization(data)

In [6]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

data_words = gen_words(lemmatized_texts)
print(data_words[0][0:20])

['call', 'dept', 'education', 'chlamydia', 'speak', 'danger']


In [7]:
# Build a gensim Dictionary from the tokenized documents: it assigns an integer id
# to every unique token (the id -> token pairs are printed in the next cell).

dictionary = gensim.corpora.Dictionary(data_words)

In [8]:
# Checking dictionary created: show the first 11 (id, token) pairs

for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 call
1 chlamydia
2 danger
3 dept
4 education
5 speak
6 always
7 amoxiclave
8 ascorbic
9 buy
10 cough


In [9]:
# Create the bag-of-words representation: each document becomes a list of
# (token_id, count) pairs produced by dictionary.doc2bow. Saved as 'bow_corpus'.
bow_corpus = [dictionary.doc2bow(doc) for doc in data_words]

In [10]:
# Preview BOW for our sample preprocessed document

document_num = 20
bow_doc_x = bow_corpus[document_num]

# every BOW entry is a (token id, count) pair
for word_id, word_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(
        word_id, dictionary[word_id], word_count))

Word 20 ("doctor") appears 1 time.
Word 28 ("get") appears 1 time.
Word 55 ("take") appears 1 time.
Word 70 ("right") appears 1 time.
Word 162 ("cefexime") appears 1 time.
Word 163 ("fever") appears 1 time.
Word 164 ("late") appears 1 time.
Word 165 ("prescribed") appears 1 time.
Word 166 ("relief") appears 1 time.


In [11]:
# Explore other documents in your corpus
# Slicing (instead of indexing with range(10)) avoids an IndexError when the
# corpus holds fewer than 10 documents.
for document_num, bow_doc_x in enumerate(bow_corpus[:10]):  # first 10 documents
    for word_id, word_count in bow_doc_x:
        print("Document {}: Word {} (\"{}\") appears {} time.".format(
            document_num, word_id, dictionary[word_id], word_count
        ))
    print("\n")

Document 0: Word 0 ("call") appears 1 time.
Document 0: Word 1 ("chlamydia") appears 1 time.
Document 0: Word 2 ("danger") appears 1 time.
Document 0: Word 3 ("dept") appears 1 time.
Document 0: Word 4 ("education") appears 1 time.
Document 0: Word 5 ("speak") appears 1 time.


Document 1: Word 6 ("always") appears 1 time.
Document 1: Word 7 ("amoxiclave") appears 1 time.
Document 1: Word 8 ("ascorbic") appears 1 time.
Document 1: Word 9 ("buy") appears 1 time.
Document 1: Word 10 ("cough") appears 1 time.
Document 1: Word 11 ("last") appears 1 time.
Document 1: Word 12 ("post") appears 1 time.
Document 1: Word 13 ("read") appears 1 time.
Document 1: Word 14 ("see") appears 1 time.
Document 1: Word 15 ("syrup") appears 1 time.
Document 1: Word 16 ("time") appears 1 time.
Document 1: Word 17 ("week") appears 1 time.


Document 2: Word 18 ("amoxicillin") appears 1 time.
Document 2: Word 19 ("covid") appears 2 time.
Document 2: Word 20 ("doctor") appears 1 time.
Document 2: Word 21 ("pres

In [12]:
# Train a 5-topic LDA model on the combined-drugs BOW corpus.
# random_state seeds the model, passes is the number of training passes over the
# corpus, and workers is the number of worker processes.
# NOTE(review): multi-worker training may not be bit-for-bit reproducible even
# with a fixed random_state - confirm if exact topic weights matter.
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5,
                                   random_state=46,
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [13]:
# For each topic, show the most heavily weighted words and their relative weights.
# print_topics(-1) is called with -1 to list every topic of the model.

for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.018*"amoxicillin" + 0.014*"get" + 0.012*"lorazepam" + 0.009*"infection" + 0.009*"week" + 0.009*"say" + 0.008*"give" + 0.008*"pain" + 0.008*"prescription" + 0.007*"antibiotic"


Topic: 1 
Words: 0.016*"take" + 0.011*"amoxicillin" + 0.011*"use" + 0.011*"patient" + 0.008*"give" + 0.007*"short" + 0.007*"help" + 0.007*"link" + 0.006*"lorazepam" + 0.006*"get"


Topic: 2 
Words: 0.024*"take" + 0.022*"amoxicillin" + 0.015*"get" + 0.012*"lorazepam" + 0.012*"make" + 0.010*"go" + 0.009*"work" + 0.008*"nurofen" + 0.008*"so" + 0.007*"time"


Topic: 3 
Words: 0.024*"amoxicillin" + 0.014*"get" + 0.011*"go" + 0.010*"use" + 0.010*"take" + 0.009*"lorazepam" + 0.009*"infection" + 0.009*"give" + 0.008*"antibiotic" + 0.007*"tooth"


Topic: 4 
Words: 0.052*"lorazepam" + 0.018*"take" + 0.013*"just" + 0.012*"get" + 0.010*"so" + 0.010*"need" + 0.010*"say" + 0.008*"sleep" + 0.008*"cancer" + 0.007*"doctor"




### Amoxicillin LDA Model

In [14]:
# Load the 'New Tweet' column of the Amoxicillin workbook as a pandas Series.
am_data = pd.read_excel('Amoxicillin.xlsx')['New Tweet']

In [15]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    Parameters
    ----------
    texts : iterable
        Raw documents. Missing values (None / NaN) are treated as empty
        strings; any other non-string value is converted with ``str``.
    allowed_postags : collection of str
        Coarse spaCy POS tags (``token.pos_``) to keep.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input text, in input order.

    Notes
    -----
    Uses the notebook-level ``nlp`` spaCy pipeline.
    """
    # Handle NaN values (and non-string cells) before handing text to spaCy
    cleaned = ('' if pd.isna(text) else str(text) for text in texts)
    # nlp.pipe processes the texts in batches instead of one call per document
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(cleaned)
    ]


lemmatized_texts = lemmatization(am_data)
print(lemmatized_texts[0][0:100])

call dept education chlamydia speak danger


In [16]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

am_data_words = gen_words(lemmatized_texts)
print(am_data_words[0][0:20])

['call', 'dept', 'education', 'chlamydia', 'speak', 'danger']


In [17]:
# Build a gensim Dictionary from the tokenized Amoxicillin tweets: it assigns an
# integer id to every unique token. Stored as 'am_dict'.

am_dict = gensim.corpora.Dictionary(am_data_words)

In [18]:
# Checking dictionary created: show the first 11 (id, token) pairs

for count, (token_id, token) in enumerate(am_dict.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 call
1 chlamydia
2 danger
3 dept
4 education
5 speak
6 always
7 amoxiclave
8 ascorbic
9 buy
10 cough


In [19]:
# Create the bag-of-words representation of the Amoxicillin tweets: each document
# becomes a list of (token_id, count) pairs produced by am_dict.doc2bow.
# Note: this rebinds 'bow_corpus', replacing the corpus built earlier in the notebook.
bow_corpus = [am_dict.doc2bow(doc) for doc in am_data_words]

In [20]:
# Preview BOW for our sample preprocessed document

document_num = 20
bow_doc_x = bow_corpus[document_num]

# every BOW entry is a (token id, count) pair
for word_id, word_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(
        word_id, am_dict[word_id], word_count))

Word 20 ("doctor") appears 1 time.
Word 28 ("get") appears 1 time.
Word 55 ("take") appears 1 time.
Word 70 ("right") appears 1 time.
Word 162 ("cefexime") appears 1 time.
Word 163 ("fever") appears 1 time.
Word 164 ("late") appears 1 time.
Word 165 ("prescribed") appears 1 time.
Word 166 ("relief") appears 1 time.


In [21]:
# Explore other documents in your corpus
# Slicing (instead of indexing with range(10)) avoids an IndexError when the
# corpus holds fewer than 10 documents.
for document_num, bow_doc_x in enumerate(bow_corpus[:10]):  # first 10 documents
    for word_id, word_count in bow_doc_x:
        print("Document {}: Word {} (\"{}\") appears {} time.".format(
            document_num, word_id, am_dict[word_id], word_count
        ))
    print("\n")

Document 0: Word 0 ("call") appears 1 time.
Document 0: Word 1 ("chlamydia") appears 1 time.
Document 0: Word 2 ("danger") appears 1 time.
Document 0: Word 3 ("dept") appears 1 time.
Document 0: Word 4 ("education") appears 1 time.
Document 0: Word 5 ("speak") appears 1 time.


Document 1: Word 6 ("always") appears 1 time.
Document 1: Word 7 ("amoxiclave") appears 1 time.
Document 1: Word 8 ("ascorbic") appears 1 time.
Document 1: Word 9 ("buy") appears 1 time.
Document 1: Word 10 ("cough") appears 1 time.
Document 1: Word 11 ("last") appears 1 time.
Document 1: Word 12 ("post") appears 1 time.
Document 1: Word 13 ("read") appears 1 time.
Document 1: Word 14 ("see") appears 1 time.
Document 1: Word 15 ("syrup") appears 1 time.
Document 1: Word 16 ("time") appears 1 time.
Document 1: Word 17 ("week") appears 1 time.


Document 2: Word 18 ("amoxicillin") appears 1 time.
Document 2: Word 19 ("covid") appears 2 time.
Document 2: Word 20 ("doctor") appears 1 time.
Document 2: Word 21 ("pres

In [22]:
# Train a 5-topic LDA model on the Amoxicillin BOW corpus (same settings as the
# other sections: seed 46, 10 passes, 2 worker processes).
# Note: this rebinds 'lda_model', so the previously trained model is no longer reachable.
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5,
                                   random_state=46,
                                   id2word = am_dict,                                    
                                   passes = 10,
                                   workers = 2)

In [23]:
# For each topic, show the most heavily weighted words and their relative weights.
# print_topics(-1) is called with -1 to list every topic of the model.

for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.021*"amoxicillin" + 0.017*"get" + 0.016*"infection" + 0.015*"go" + 0.012*"right" + 0.010*"take" + 0.009*"antibiotic" + 0.008*"treatment" + 0.007*"week" + 0.007*"day"


Topic: 1 
Words: 0.042*"amoxicillin" + 0.016*"antibiotic" + 0.016*"infection" + 0.015*"get" + 0.013*"use" + 0.011*"give" + 0.008*"start" + 0.007*"take" + 0.007*"back" + 0.007*"strep"


Topic: 2 
Words: 0.013*"amoxicillin" + 0.013*"get" + 0.011*"use" + 0.011*"drug" + 0.007*"work" + 0.007*"antibiotic" + 0.007*"go" + 0.007*"prophylaxis" + 0.006*"think" + 0.006*"take"


Topic: 3 
Words: 0.042*"amoxicillin" + 0.026*"get" + 0.015*"give" + 0.011*"day" + 0.009*"now" + 0.009*"go" + 0.008*"medicine" + 0.007*"say" + 0.006*"week" + 0.005*"take"


Topic: 4 
Words: 0.025*"amoxicillin" + 0.019*"take" + 0.012*"prescribe" + 0.010*"week" + 0.009*"still" + 0.008*"make" + 0.008*"infection" + 0.007*"say" + 0.007*"know" + 0.007*"tooth"




### Nurofen LDA Model

In [24]:
# Load the 'New Tweet' column of the Nurofen workbook as a pandas Series.
nu_data = pd.read_excel('Nurofen.xlsx')['New Tweet']

In [25]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    Parameters
    ----------
    texts : iterable
        Raw documents. Missing values (None / NaN) are treated as empty
        strings; any other non-string value is converted with ``str``.
    allowed_postags : collection of str
        Coarse spaCy POS tags (``token.pos_``) to keep.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input text, in input order.

    Notes
    -----
    Uses the notebook-level ``nlp`` spaCy pipeline.
    """
    # Handle NaN values (and non-string cells) before handing text to spaCy
    cleaned = ('' if pd.isna(text) else str(text) for text in texts)
    # nlp.pipe processes the texts in batches instead of one call per document
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(cleaned)
    ]


lemmatized_texts = lemmatization(nu_data)
print(lemmatized_texts[0][0:100])

look box nurofen take too many


In [26]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

nu_data_words = gen_words(lemmatized_texts)
print(nu_data_words[0][0:20])

['look', 'box', 'nurofen', 'take', 'too', 'many']


In [27]:
# Build a gensim Dictionary from the tokenized Nurofen tweets: it assigns an
# integer id to every unique token. Stored as 'nu_dict'.

nu_dict = gensim.corpora.Dictionary(nu_data_words)

In [28]:
# Checking dictionary created: show the first 11 (id, token) pairs

for count, (token_id, token) in enumerate(nu_dict.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 box
1 look
2 many
3 nurofen
4 take
5 too
6 offer
7 tell
8 else
9 give
10 only


In [29]:
# Create the bag-of-words representation of the Nurofen tweets: each document
# becomes a list of (token_id, count) pairs produced by nu_dict.doc2bow.
# Note: this rebinds 'bow_corpus', replacing the corpus built earlier in the notebook.
bow_corpus = [nu_dict.doc2bow(doc) for doc in nu_data_words]

In [30]:
# Preview BOW for our sample preprocessed document

document_num = 20
bow_doc_x = bow_corpus[document_num]

# every BOW entry is a (token id, count) pair
for word_id, word_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(
        word_id, nu_dict[word_id], word_count))

Word 107 ("time") appears 1 time.


In [31]:
# Explore other documents in your corpus
# Slicing (instead of indexing with range(10)) avoids an IndexError when the
# corpus holds fewer than 10 documents.
for document_num, bow_doc_x in enumerate(bow_corpus[:10]):  # first 10 documents
    for word_id, word_count in bow_doc_x:
        print("Document {}: Word {} (\"{}\") appears {} time.".format(
            document_num, word_id, nu_dict[word_id], word_count
        ))
    print("\n")

Document 0: Word 0 ("box") appears 1 time.
Document 0: Word 1 ("look") appears 1 time.
Document 0: Word 2 ("many") appears 1 time.
Document 0: Word 3 ("nurofen") appears 1 time.
Document 0: Word 4 ("take") appears 1 time.
Document 0: Word 5 ("too") appears 1 time.


Document 1: Word 3 ("nurofen") appears 1 time.
Document 1: Word 4 ("take") appears 1 time.
Document 1: Word 6 ("offer") appears 1 time.
Document 1: Word 7 ("tell") appears 1 time.


Document 2: Word 3 ("nurofen") appears 1 time.
Document 2: Word 4 ("take") appears 1 time.
Document 2: Word 8 ("else") appears 1 time.
Document 2: Word 9 ("give") appears 1 time.
Document 2: Word 10 ("only") appears 1 time.


Document 3: Word 1 ("look") appears 3 time.
Document 3: Word 11 ("capsule") appears 1 time.
Document 3: Word 12 ("entrance") appears 1 time.
Document 3: Word 13 ("especially") appears 1 time.
Document 3: Word 14 ("full") appears 1 time.
Document 3: Word 15 ("imagine") appears 1 time.
Document 3: Word 16 ("just") appears 1 t

In [32]:
# Train a 5-topic LDA model on the Nurofen BOW corpus (same settings as the
# other sections: seed 46, 10 passes, 2 worker processes).
# Note: this rebinds 'lda_model', so the previously trained model is no longer reachable.
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5,
                                   random_state=46,
                                   id2word = nu_dict,                                    
                                   passes = 10,
                                   workers = 2)

In [33]:
# For each topic, show the most heavily weighted words and their relative weights.
# print_topics(-1) is called with -1 to list every topic of the model.

for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.033*"nurofen" + 0.023*"just" + 0.022*"get" + 0.018*"take" + 0.018*"use" + 0.018*"night" + 0.012*"too" + 0.012*"couple" + 0.012*"feel" + 0.012*"child"


Topic: 1 
Words: 0.042*"take" + 0.026*"nurofen" + 0.018*"tell" + 0.018*"many" + 0.018*"make" + 0.018*"good" + 0.018*"help" + 0.018*"time" + 0.010*"pain" + 0.010*"thank"


Topic: 2 
Words: 0.022*"take" + 0.022*"say" + 0.022*"nurofen" + 0.017*"bad" + 0.017*"help" + 0.011*"give" + 0.011*"just" + 0.011*"go" + 0.011*"good" + 0.011*"panadol"


Topic: 3 
Words: 0.028*"nurofen" + 0.028*"look" + 0.021*"just" + 0.021*"pain" + 0.015*"head" + 0.015*"give" + 0.015*"so" + 0.008*"open" + 0.008*"people" + 0.008*"full"


Topic: 4 
Words: 0.053*"nurofen" + 0.024*"say" + 0.019*"so" + 0.016*"get" + 0.015*"pain" + 0.010*"day" + 0.010*"really" + 0.010*"same" + 0.010*"now" + 0.010*"buy"




### Lorazepam LDA Model

In [34]:
# Load the 'New Tweet' column of the Lorazepam workbook as a pandas Series.
lo_data = pd.read_excel('Lorazepam.xlsx')['New Tweet']

In [35]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    Parameters
    ----------
    texts : iterable
        Raw documents. Missing values (None / NaN) are treated as empty
        strings; any other non-string value is converted with ``str``.
    allowed_postags : collection of str
        Coarse spaCy POS tags (``token.pos_``) to keep.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input text, in input order.

    Notes
    -----
    Uses the notebook-level ``nlp`` spaCy pipeline.
    """
    # Handle NaN values (and non-string cells) before handing text to spaCy
    cleaned = ('' if pd.isna(text) else str(text) for text in texts)
    # nlp.pipe processes the texts in batches instead of one call per document
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(cleaned)
    ]


lemmatized_texts = lemmatization(lo_data)
print(lemmatized_texts[0][0:100])

lorazepam stimulate il6 production associate poor survival outcome pancreatic cancer clinical cancer


In [36]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

lo_data_words = gen_words(lemmatized_texts)
print(lo_data_words[0][0:20])

['lorazepam', 'stimulate', 'il', 'production', 'associate', 'poor', 'survival', 'outcome', 'pancreatic', 'cancer', 'clinical', 'cancer', 'research', 'cancer', 'research']


In [37]:
# Build a gensim Dictionary from the tokenized Lorazepam tweets: it assigns an
# integer id to every unique token. Stored as 'lo_dict'.

lo_dict = gensim.corpora.Dictionary(lo_data_words)

In [38]:
# Checking dictionary created: show the first 11 (id, token) pairs

for count, (token_id, token) in enumerate(lo_dict.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 associate
1 cancer
2 clinical
3 il
4 lorazepam
5 outcome
6 pancreatic
7 poor
8 production
9 research
10 stimulate


In [39]:
# Create the bag-of-words representation of the Lorazepam tweets: each document
# becomes a list of (token_id, count) pairs produced by lo_dict.doc2bow.
# Note: this rebinds 'bow_corpus', replacing the corpus built earlier in the notebook.
bow_corpus = [lo_dict.doc2bow(doc) for doc in lo_data_words]

In [40]:
# Preview BOW for our sample preprocessed document

document_num = 20
bow_doc_x = bow_corpus[document_num]

# every BOW entry is a (token id, count) pair
for word_id, word_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(
        word_id, lo_dict[word_id], word_count))

Word 4 ("lorazepam") appears 1 time.
Word 34 ("make") appears 2 time.
Word 45 ("take") appears 1 time.
Word 53 ("go") appears 1 time.
Word 68 ("really") appears 1 time.
Word 80 ("possible") appears 1 time.
Word 152 ("today") appears 1 time.
Word 156 ("search") appears 1 time.
Word 157 ("able") appears 1 time.
Word 158 ("appt") appears 2 time.
Word 159 ("dose") appears 1 time.
Word 160 ("find") appears 2 time.
Word 161 ("foot") appears 1 time.
Word 162 ("hard") appears 1 time.
Word 163 ("hope") appears 1 time.
Word 164 ("injure") appears 1 time.
Word 165 ("keep") appears 1 time.
Word 166 ("oxy") appears 1 time.
Word 167 ("shelf") appears 1 time.
Word 168 ("still") appears 2 time.
Word 169 ("thank") appears 1 time.
Word 170 ("tomorrow") appears 1 time.


In [41]:
# Explore other documents in your corpus
# Slicing (instead of indexing with range(10)) avoids an IndexError when the
# corpus holds fewer than 10 documents.
for document_num, bow_doc_x in enumerate(bow_corpus[:10]):  # first 10 documents
    for word_id, word_count in bow_doc_x:
        print("Document {}: Word {} (\"{}\") appears {} time.".format(
            document_num, word_id, lo_dict[word_id], word_count
        ))
    print("\n")

Document 0: Word 0 ("associate") appears 1 time.
Document 0: Word 1 ("cancer") appears 3 time.
Document 0: Word 2 ("clinical") appears 1 time.
Document 0: Word 3 ("il") appears 1 time.
Document 0: Word 4 ("lorazepam") appears 1 time.
Document 0: Word 5 ("outcome") appears 1 time.
Document 0: Word 6 ("pancreatic") appears 1 time.
Document 0: Word 7 ("poor") appears 1 time.
Document 0: Word 8 ("production") appears 1 time.
Document 0: Word 9 ("research") appears 2 time.
Document 0: Word 10 ("stimulate") appears 1 time.
Document 0: Word 11 ("survival") appears 1 time.


Document 1: Word 12 ("buy") appears 1 time.
Document 1: Word 13 ("discount") appears 1 time.
Document 1: Word 14 ("huge") appears 1 time.
Document 1: Word 15 ("low") appears 1 time.
Document 1: Word 16 ("online") appears 1 time.
Document 1: Word 17 ("pharmacy") appears 1 time.
Document 1: Word 18 ("prescription") appears 2 time.
Document 1: Word 19 ("price") appears 1 time.
Document 1: Word 20 ("trust") appears 1 time.


D

In [42]:
# Train a 5-topic LDA model on the Lorazepam BOW corpus (same settings as the
# other sections: seed 46, 10 passes, 2 worker processes).
# Note: this rebinds 'lda_model', so the previously trained model is no longer reachable.
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5,
                                   random_state=46,
                                   id2word = lo_dict,                                    
                                   passes = 10,
                                   workers = 2)

In [43]:
# For each topic, show the most heavily weighted words and their relative weights.
# print_topics(-1) is called with -1 to list every topic of the model.

for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.046*"lorazepam" + 0.042*"take" + 0.014*"help" + 0.010*"think" + 0.006*"find" + 0.006*"hour" + 0.006*"say" + 0.006*"sleep" + 0.006*"get" + 0.006*"experience"


Topic: 1 
Words: 0.038*"lorazepam" + 0.019*"give" + 0.012*"dose" + 0.011*"anxiety" + 0.011*"go" + 0.010*"take" + 0.010*"sleep" + 0.010*"find" + 0.009*"treatment" + 0.008*"make"


Topic: 2 
Words: 0.035*"lorazepam" + 0.025*"take" + 0.024*"work" + 0.020*"make" + 0.019*"so" + 0.013*"too" + 0.012*"say" + 0.012*"just" + 0.011*"get" + 0.009*"go"


Topic: 3 
Words: 0.070*"lorazepam" + 0.017*"need" + 0.011*"get" + 0.011*"drug" + 0.011*"take" + 0.010*"give" + 0.009*"just" + 0.008*"use" + 0.007*"cause" + 0.007*"sleep"


Topic: 4 
Words: 0.049*"lorazepam" + 0.030*"cancer" + 0.030*"patient" + 0.019*"survival" + 0.019*"pancreatic" + 0.017*"outcome" + 0.015*"link" + 0.013*"associate" + 0.012*"benzodiazepine" + 0.011*"so"




### Positives

In [4]:
df = pd.read_excel('Drugs.xlsx')

# Keep only the tweets whose sentiment label is 'Positive'
is_positive = df['Analysis'].eq('Positive')
positives = df.loc[is_positive]
positives.head()

Unnamed: 0.1,Unnamed: 0,Author ID,Tweet ID,Language,Created at,Text,Word Count,New Tweet,Word Count.1,Location,Retweet Count,Subjectivity,Polarity,Analysis,Results
2,3.0,1030000000000000000,1690000000000000000,en,2023-08-21 09:09:19+00:00,"@bearish0411 if its COVID, amoxicillin will no...",23,its covid amoxicillin will not work pretty sur...,10,,0,0.944444,0.375,Positive,1
3,4.0,2245928095,1690000000000000000,en,2023-08-21 09:09:16+00:00,"@missmulrooney alarm. got it. coincidentally, ...",14,alarm got coincidentally i have just started a...,10,,0,0.0,0.3,Positive,1
4,5.0,1590000000000000000,1690000000000000000,en,2023-08-21 08:47:58+00:00,RT @skdembe: #BeAntibioticsAware... It's #Anti...,15,#beantibioticsaware it is #antibioticawareness...,8,,1,0.3,1.0,Positive,1
5,6.0,388774547,1690000000000000000,en,2023-08-21 08:47:22+00:00,''took her straight to drs AGAIN where the doc...,49,took straight drs again where doctor time says...,32,,0,0.466667,0.233333,Positive,1
8,9.0,830271474,1690000000000000000,en,2023-08-21 05:23:58+00:00,I do have a drawer with all kinds of odd meds ...,20,do have drawer kinds odd meds found some amoxi...,13,,0,0.392857,0.059524,Positive,1


In [5]:
# Text column of the positive-sentiment tweets.
# Note: this rebinds 'data', which earlier in the notebook held the full 'New Tweet' column.
data = positives['New Tweet']
data.head()

2    its covid amoxicillin will not work pretty sur...
3    alarm got coincidentally i have just started a...
4    #beantibioticsaware it is #antibioticawareness...
5    took straight drs again where doctor time says...
8    do have drawer kinds odd meds found some amoxi...
Name: New Tweet, dtype: object

In [9]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    Parameters
    ----------
    texts : iterable
        Raw documents. Missing values (None / NaN) are treated as empty
        strings; any other non-string value is converted with ``str``.
    allowed_postags : collection of str
        Coarse spaCy POS tags (``token.pos_``) to keep.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input text, in input order.

    Notes
    -----
    Uses the notebook-level ``nlp`` spaCy pipeline.
    """
    # Handle NaN values (and non-string cells) before handing text to spaCy
    cleaned = ('' if pd.isna(text) else str(text) for text in texts)
    # nlp.pipe processes the texts in batches instead of one call per document
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(cleaned)
    ]


lemmatized_texts = lemmatization(data)

In [10]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list holding one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

data_words = gen_words(lemmatized_texts)
print(data_words[0][0:20])

['covid', 'amoxicillin', 'work', 'pretty', 'sure', 'doctor', 'prescribe', 'think', 'covid']


In [11]:
# Build a gensim Dictionary from the tokenized positive tweets: it assigns an
# integer id to every unique token. Stored as 'dictionary'.

dictionary = gensim.corpora.Dictionary(data_words)

In [12]:
# Checking dictionary created: show the first 11 (id, token) pairs

for count, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if count > 10:
        break

0 amoxicillin
1 covid
2 doctor
3 prescribe
4 pretty
5 sure
6 think
7 work
8 alarm
9 coincidentally
10 get


In [14]:
# Create the bag-of-words representation of the positive tweets: each document
# becomes a list of (token_id, count) pairs produced by dictionary.doc2bow.
# Note: this rebinds 'bow_corpus', replacing the corpus built earlier in the notebook.
bow_corpus = [dictionary.doc2bow(doc) for doc in data_words]

In [15]:
# Preview BOW for our sample preprocessed document

document_num = 20
bow_doc_x = bow_corpus[document_num]

# every BOW entry is a (token id, count) pair
for word_id, word_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(
        word_id, dictionary[word_id], word_count))

Word 44 ("med") appears 1 time.
Word 185 ("addition") appears 1 time.
Word 186 ("arrive") appears 1 time.
Word 187 ("baby") appears 1 time.
Word 188 ("bad") appears 1 time.
Word 189 ("fur") appears 1 time.
Word 190 ("gray") appears 1 time.
Word 191 ("hand") appears 1 time.
Word 192 ("happy") appears 1 time.
Word 193 ("hirudoid") appears 1 time.
Word 194 ("keep") appears 1 time.
Word 195 ("know") appears 2 time.
Word 196 ("manage") appears 1 time.
Word 197 ("mth") appears 1 time.
Word 198 ("reduce") appears 1 time.
Word 199 ("savlon") appears 1 time.
Word 200 ("seizure") appears 1 time.
Word 201 ("slave") appears 1 time.
Word 202 ("supply") appears 1 time.
Word 203 ("swell") appears 1 time.
Word 204 ("watch") appears 1 time.


In [16]:
# Explore other documents in your corpus
# Slicing (instead of indexing with range(10)) avoids an IndexError when the
# sentiment-filtered corpus holds fewer than 10 documents.
for document_num, bow_doc_x in enumerate(bow_corpus[:10]):  # first 10 documents
    for word_id, word_count in bow_doc_x:
        print("Document {}: Word {} (\"{}\") appears {} time.".format(
            document_num, word_id, dictionary[word_id], word_count
        ))
    print("\n")

Document 0: Word 0 ("amoxicillin") appears 1 time.
Document 0: Word 1 ("covid") appears 2 time.
Document 0: Word 2 ("doctor") appears 1 time.
Document 0: Word 3 ("prescribe") appears 1 time.
Document 0: Word 4 ("pretty") appears 1 time.
Document 0: Word 5 ("sure") appears 1 time.
Document 0: Word 6 ("think") appears 1 time.
Document 0: Word 7 ("work") appears 1 time.


Document 1: Word 8 ("alarm") appears 1 time.
Document 1: Word 9 ("coincidentally") appears 1 time.
Document 1: Word 10 ("get") appears 1 time.
Document 1: Word 11 ("just") appears 1 time.
Document 1: Word 12 ("so") appears 1 time.
Document 1: Word 13 ("start") appears 1 time.
Document 1: Word 14 ("today") appears 1 time.
Document 1: Word 15 ("useful") appears 1 time.


Document 2: Word 0 ("amoxicillin") appears 1 time.
Document 2: Word 16 ("antibiotic") appears 1 time.
Document 2: Word 17 ("good") appears 1 time.
Document 2: Word 18 ("match") appears 1 time.
Document 2: Word 19 ("personality") appears 1 time.


Document 

In [17]:
# Train a 5-topic LDA model on the positive-sentiment BOW corpus (same settings
# as the other sections: seed 46, 10 passes, 2 worker processes).
# Note: this rebinds 'lda_model', so the previously trained model is no longer reachable.
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5,
                                   random_state=46,
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [18]:
# For each topic, show the most heavily weighted words and their relative weights.
# print_topics(-1) is called with -1 to list every topic of the model.

for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.015*"lorazepam" + 0.012*"amoxicillin" + 0.012*"drug" + 0.008*"take" + 0.008*"give" + 0.008*"first" + 0.006*"medicine" + 0.006*"flavor" + 0.006*"throat" + 0.006*"think"


Topic: 1 
Words: 0.021*"lorazepam" + 0.013*"get" + 0.011*"know" + 0.011*"go" + 0.010*"more" + 0.009*"amoxicillin" + 0.009*"so" + 0.009*"week" + 0.009*"dosage" + 0.008*"drug"


Topic: 2 
Words: 0.029*"amoxicillin" + 0.022*"get" + 0.014*"give" + 0.012*"right" + 0.012*"lorazepam" + 0.010*"too" + 0.009*"day" + 0.009*"pill" + 0.009*"so" + 0.008*"say"


Topic: 3 
Words: 0.031*"lorazepam" + 0.019*"work" + 0.015*"take" + 0.014*"help" + 0.012*"really" + 0.012*"use" + 0.011*"make" + 0.010*"good" + 0.010*"amoxicillin" + 0.009*"year"


Topic: 4 
Words: 0.020*"take" + 0.013*"amoxicillin" + 0.013*"lorazepam" + 0.012*"go" + 0.010*"get" + 0.010*"prescribe" + 0.010*"well" + 0.008*"make" + 0.007*"more" + 0.007*"use"




### Negatives

In [23]:
# Keep only the tweets whose sentiment label is 'Negative'
is_negative = df['Analysis'].eq('Negative')
negatives = df.loc[is_negative]
negatives.head()

Unnamed: 0.1,Unnamed: 0,Author ID,Tweet ID,Language,Created at,Text,Word Count,New Tweet,Word Count.1,Location,Retweet Count,Subjectivity,Polarity,Analysis,Results
6,7.0,1480000000000000000,1690000000000000000,en,2023-08-21 07:40:46+00:00,Oh but Im allergic to amoxicillin,6,but im allergic amoxicillin,4,,0,0.0,0.0,Negative,-1
7,8.0,1130000000000000000,1690000000000000000,en,2023-08-21 06:53:13+00:00,I thought I was weird for liking Amoxicillin t...,23,thought weird liking amoxicillin taste but see...,15,,0,0.625,-0.041667,Negative,-1
9,10.0,1580000000000000000,1690000000000000000,en,2023-08-21 04:41:05+00:00,@LouDobbs @drpaulmarik1 You trump cult fools....,43,trump cult fools pushed hydroxy ivermectin ant...,30,,1,0.42,-0.07,Negative,-1
14,15.0,22717621,1690000000000000000,en,2023-08-21 01:58:31+00:00,Medical things Ive learned I am allergic to in...,27,medical things ive learned allergic last 2 yea...,16,,0,0.188889,-0.0625,Negative,-1
20,21.0,94802940,1690000000000000000,en,2023-08-20 18:43:51+00:00,@anujtiwari11 Cefexime in amoxicillin right? H...,23,cefexime amoxicillin right? have taken fever p...,13,,0,0.567857,-0.007143,Negative,-1


In [27]:
# Text column of the negative-sentiment tweets.
# Note: this rebinds 'data', which previously held the positive-sentiment tweets.
data = negatives['New Tweet']
data.head()

6                           but im allergic amoxicillin
7     thought weird liking amoxicillin taste but see...
9     trump cult fools pushed hydroxy ivermectin ant...
14    medical things ive learned allergic last 2 yea...
20    cefexime amoxicillin right? have taken fever p...
Name: New Tweet, dtype: object

In [28]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    Parameters
    ----------
    texts : iterable
        Raw documents. Missing values (None / NaN) are treated as empty
        strings; any other non-string value is converted with ``str``.
    allowed_postags : collection of str
        Coarse spaCy POS tags (``token.pos_``) to keep.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input text, in input order.

    Notes
    -----
    Uses the notebook-level ``nlp`` spaCy pipeline.
    """
    # Handle NaN values (and non-string cells) before handing text to spaCy
    cleaned = ('' if pd.isna(text) else str(text) for text in texts)
    # nlp.pipe processes the texts in batches instead of one call per document
    return [
        ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        for doc in nlp.pipe(cleaned)
    ]


lemmatized_texts = lemmatization(data)

In [29]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list containing one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

# Tokenize the lemmatized tweets, then show at most the first 20 tokens of
# the first document as a quick sanity check.
data_words = gen_words(lemmatized_texts)
print(data_words[0][:20])

['allergic', 'amoxicillin']


In [30]:
# Build a gensim Dictionary from the tokenized documents and call it
# `dictionary`; the cells below use it to look up the token behind each
# integer word id.

dictionary = gensim.corpora.Dictionary(data_words)

In [31]:
# Sanity-check the dictionary by printing its first 11 (id, token) entries

for position, (token_id, token) in enumerate(dictionary.iteritems()):
    if position > 10:
        break
    print(token_id, token)

0 allergic
1 amoxicillin
2 antibiotic
3 indeed
4 like
5 other
6 seem
7 taste
8 think
9 weird
10 well


In [32]:
# Create the bag-of-words representation of every document and save it to
# 'bow_corpus'. The cells below read each entry as a sequence of
# (word id, count) pairs, one pair per distinct word in the document.
bow_corpus = [dictionary.doc2bow(doc) for doc in data_words]

In [33]:
# Preview the bag-of-words entries of one sample preprocessed document.
# NOTE(review): the sample index is hard-coded, so the corpus must contain
# at least 21 documents.

document_num = 20
bow_doc_x = bow_corpus[document_num]

# Each entry is a (word id, count) pair; the dictionary resolves the id to its token.
for word_id, word_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(word_id,
                                                     dictionary[word_id],
                                                     word_count))

Word 1 ("amoxicillin") appears 1 time.
Word 21 ("infection") appears 1 time.
Word 50 ("take") appears 1 time.
Word 60 ("prescription") appears 1 time.
Word 88 ("sick") appears 1 time.
Word 117 ("never") appears 1 time.
Word 188 ("decide") appears 1 time.


In [34]:
# Explore other documents in your corpus: print the bag-of-words entries of
# the first documents (at most 10).
# Slicing the corpus, rather than indexing it with range(10), keeps this cell
# from raising IndexError when the selection holds fewer than 10 documents.
for document_num, bow_doc_x in enumerate(bow_corpus[:10]):
    # Each entry is a (word id, count) pair.
    for word_id, word_count in bow_doc_x:
        print("Document {}: Word {} (\"{}\") appears {} time.".format(
            document_num, word_id, dictionary[word_id], word_count
        ))
    print("\n")

Document 0: Word 0 ("allergic") appears 1 time.
Document 0: Word 1 ("amoxicillin") appears 1 time.


Document 1: Word 2 ("antibiotic") appears 1 time.
Document 1: Word 3 ("indeed") appears 1 time.
Document 1: Word 4 ("like") appears 1 time.
Document 1: Word 5 ("other") appears 1 time.
Document 1: Word 6 ("seem") appears 1 time.
Document 1: Word 7 ("taste") appears 2 time.
Document 1: Word 8 ("think") appears 1 time.
Document 1: Word 9 ("weird") appears 1 time.
Document 1: Word 10 ("well") appears 1 time.


Document 2: Word 11 ("anti") appears 1 time.
Document 2: Word 12 ("budesinine") appears 1 time.
Document 2: Word 13 ("corticosteroid") appears 1 time.
Document 2: Word 14 ("cult") appears 1 time.
Document 2: Word 15 ("curceritin") appears 1 time.
Document 2: Word 16 ("dose") appears 1 time.
Document 2: Word 17 ("fool") appears 1 time.
Document 2: Word 18 ("high") appears 1 time.
Document 2: Word 19 ("hundred") appears 1 time.
Document 2: Word 20 ("hydroxy") appears 1 time.
Document 2

In [35]:
# Train an LDA topic model on the bag-of-words corpus of the negative tweets,
# using `dictionary` to map word ids back to tokens.
# Settings: 5 topics, 10 passes, 2 workers, random_state pinned to 46.
# NOTE(review): the topic/pass counts are hard-coded and repeated in the
# other sentiment sections; consider shared constants. TODO confirm whether
# training with more than one worker is fully reproducible with a fixed
# random_state.
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5,
                                   random_state=46,
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [36]:
# For every topic of the trained model, print the words occurring in it
# together with their relative weights

for topic_id, weighted_words in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic_id, weighted_words))
    print("\n")

Topic: 0 
Words: 0.024*"get" + 0.022*"amoxicillin" + 0.014*"so" + 0.012*"help" + 0.011*"go" + 0.010*"bad" + 0.008*"doctor" + 0.008*"still" + 0.008*"give" + 0.008*"little"


Topic: 1 
Words: 0.018*"amoxicillin" + 0.016*"take" + 0.014*"use" + 0.013*"lorazepam" + 0.010*"go" + 0.008*"get" + 0.008*"allergic" + 0.008*"infection" + 0.008*"time" + 0.008*"only"


Topic: 2 
Words: 0.019*"infection" + 0.014*"antibiotic" + 0.011*"day" + 0.011*"amoxicillin" + 0.008*"buy" + 0.008*"give" + 0.008*"doctor" + 0.008*"ear" + 0.008*"say" + 0.006*"covid"


Topic: 3 
Words: 0.029*"lorazepam" + 0.027*"take" + 0.021*"cancer" + 0.015*"pancreatic" + 0.011*"outcome" + 0.010*"work" + 0.010*"patient" + 0.010*"treatment" + 0.009*"go" + 0.009*"say"


Topic: 4 
Words: 0.017*"lorazepam" + 0.012*"get" + 0.012*"antibiotic" + 0.010*"just" + 0.010*"other" + 0.010*"so" + 0.008*"bad" + 0.008*"use" + 0.007*"think" + 0.007*"end"




### Neutrals

In [38]:
# Filter rows with neutral sentiment: keep the rows of `df` whose 'Analysis'
# label equals 'Neutral', then preview the first few of them.
neutrals = df[df['Analysis'] == 'Neutral']
neutrals.head()

Unnamed: 0.1,Unnamed: 0,Author ID,Tweet ID,Language,Created at,Text,Word Count,New Tweet,Word Count.1,Location,Retweet Count,Subjectivity,Polarity,Analysis,Results
0,1.0,599628533,1690000000000000000,en,2023-08-21 10:26:24+00:00,RT @theliamnissan: Lauren Boebert calling for ...,21,lauren boebert calling abolishment dept educat...,12,,3560,0.0,0,Neutral,0
1,2.0,1090000000000000000,1690000000000000000,en,2023-08-21 09:14:02+00:00,"@amerix I wish I had read seen post last week,...",26,wish had read seen post last week bought ascor...,19,,0,0.066667,0,Neutral,0
13,14.0,877000000000000000,1690000000000000000,en,2023-08-21 02:01:00+00:00,@Phil_Lewis_ Gonna be on a 90 day treatment of...,9,gonna 90 day treatment amoxicillin,5,,0,0.0,0,Neutral,0
15,16.0,359191827,1690000000000000000,en,2023-08-21 01:13:38+00:00,Me vs amoxicillin \nMeriang ðŸ‘,4,me vs amoxicillin,3,,0,0.0,0,Neutral,0
16,17.0,2844157329,1690000000000000000,en,2023-08-20 23:48:05+00:00,amoxicillin is a god-like antibiotic... been d...,15,amoxicillin god-like antibiotic been dealing d...,10,,0,0.0,0,Neutral,0


In [39]:
# Work on the 'New Tweet' column of the neutral-sentiment rows only.
# NOTE(review): this rebinds the notebook-wide name `data` again (it held the
# negative tweets before); every later cell that reads `data` now sees only
# the neutral tweets.
data = neutrals['New Tweet']
data.head()

0     lauren boebert calling abolishment dept educat...
1     wish had read seen post last week bought ascor...
13                   gonna 90 day treatment amoxicillin
15                                   me vs amoxicillin 
16    amoxicillin god-like antibiotic been dealing d...
Name: New Tweet, dtype: object

In [40]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp_model=None):
    """Lemmatize each text, keeping only tokens whose POS tag is allowed.

    Parameters
    ----------
    texts : iterable
        Raw documents. Missing values (``None``/``NaN``) are treated as empty
        text; any other non-string value is converted with ``str``.
    allowed_postags : collection of str
        Tags compared against ``token.pos_``; tokens with other tags are dropped.
    nlp_model : callable, optional
        Callable that turns a string into an iterable of tokens exposing
        ``pos_`` and ``lemma_``. When omitted, the notebook-level ``nlp``
        pipeline is used.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input text, in input order.
    """
    if nlp_model is None:
        # Fall back to the pipeline loaded near the top of the notebook.
        nlp_model = nlp

    texts_out = []
    for text in texts:
        # Missing cells become empty documents; other non-string cells
        # (e.g. a purely numeric tweet) are stringified because the pipeline
        # is only ever handed text here.
        text = '' if pd.isna(text) else str(text)
        doc = nlp_model(text)
        texts_out.append(
            ' '.join(token.lemma_ for token in doc if token.pos_ in allowed_postags)
        )
    return texts_out


# Lemmatize the currently selected tweets (`data`); the result is one
# space-joined string of lemmas per tweet.
lemmatized_texts = lemmatization(data)

In [41]:
def gen_words(texts):
    """Tokenize every text with gensim's ``simple_preprocess`` (``deacc=True``).

    Returns a list containing one token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]

# Tokenize the lemmatized tweets, then show at most the first 20 tokens of
# the first document as a quick sanity check.
data_words = gen_words(lemmatized_texts)
print(data_words[0][:20])

['call', 'dept', 'education', 'chlamydia', 'speak', 'danger']


In [42]:
# Build a gensim Dictionary from the tokenized documents and call it
# `dictionary`; the cells below use it to look up the token behind each
# integer word id.

dictionary = gensim.corpora.Dictionary(data_words)

In [43]:
# Sanity-check the dictionary by printing its first 11 (id, token) entries

for position, (token_id, token) in enumerate(dictionary.iteritems()):
    if position > 10:
        break
    print(token_id, token)

0 call
1 chlamydia
2 danger
3 dept
4 education
5 speak
6 always
7 amoxiclave
8 ascorbic
9 buy
10 cough


In [44]:
# Create the bag-of-words representation of every document and save it to
# 'bow_corpus'. The cells below read each entry as a sequence of
# (word id, count) pairs, one pair per distinct word in the document.
bow_corpus = [dictionary.doc2bow(doc) for doc in data_words]

In [45]:
# Preview the bag-of-words entries of one sample preprocessed document.
# NOTE(review): the sample index is hard-coded, so the corpus must contain
# at least 21 documents.

document_num = 20
bow_doc_x = bow_corpus[document_num]

# Each entry is a (word id, count) pair; the dictionary resolves the id to its token.
for word_id, word_count in bow_doc_x:
    print("Word {} (\"{}\") appears {} time.".format(word_id,
                                                     dictionary[word_id],
                                                     word_count))

Word 18 ("amoxicillin") appears 1 time.
Word 52 ("get") appears 1 time.
Word 73 ("function") appears 1 time.
Word 74 ("know") appears 1 time.
Word 75 ("pull") appears 1 time.


In [46]:
# Explore other documents in your corpus: print the bag-of-words entries of
# the first documents (at most 10).
# Slicing the corpus, rather than indexing it with range(10), keeps this cell
# from raising IndexError when the selection holds fewer than 10 documents.
for document_num, bow_doc_x in enumerate(bow_corpus[:10]):
    # Each entry is a (word id, count) pair.
    for word_id, word_count in bow_doc_x:
        print("Document {}: Word {} (\"{}\") appears {} time.".format(
            document_num, word_id, dictionary[word_id], word_count
        ))
    print("\n")

Document 0: Word 0 ("call") appears 1 time.
Document 0: Word 1 ("chlamydia") appears 1 time.
Document 0: Word 2 ("danger") appears 1 time.
Document 0: Word 3 ("dept") appears 1 time.
Document 0: Word 4 ("education") appears 1 time.
Document 0: Word 5 ("speak") appears 1 time.


Document 1: Word 6 ("always") appears 1 time.
Document 1: Word 7 ("amoxiclave") appears 1 time.
Document 1: Word 8 ("ascorbic") appears 1 time.
Document 1: Word 9 ("buy") appears 1 time.
Document 1: Word 10 ("cough") appears 1 time.
Document 1: Word 11 ("last") appears 1 time.
Document 1: Word 12 ("post") appears 1 time.
Document 1: Word 13 ("read") appears 1 time.
Document 1: Word 14 ("see") appears 1 time.
Document 1: Word 15 ("syrup") appears 1 time.
Document 1: Word 16 ("time") appears 1 time.
Document 1: Word 17 ("week") appears 1 time.


Document 2: Word 18 ("amoxicillin") appears 1 time.
Document 2: Word 19 ("day") appears 1 time.
Document 2: Word 20 ("go") appears 1 time.
Document 2: Word 21 ("treatment"

In [47]:
# Train an LDA topic model on the bag-of-words corpus of the neutral tweets,
# using `dictionary` to map word ids back to tokens.
# Settings: 5 topics, 10 passes, 2 workers, random_state pinned to 46.
# NOTE(review): the topic/pass counts are hard-coded and repeated in the
# other sentiment sections; consider shared constants. TODO confirm whether
# training with more than one worker is fully reproducible with a fixed
# random_state.
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 5,
                                   random_state=46,
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [48]:
# For every topic of the trained model, print the words occurring in it
# together with their relative weights

for topic_id, weighted_words in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(topic_id, weighted_words))
    print("\n")

Topic: 0 
Words: 0.058*"take" + 0.033*"amoxicillin" + 0.029*"lorazepam" + 0.016*"patient" + 0.016*"know" + 0.014*"say" + 0.014*"allergic" + 0.010*"now" + 0.010*"short" + 0.010*"survival"


Topic: 1 
Words: 0.068*"amoxicillin" + 0.044*"lorazepam" + 0.014*"get" + 0.014*"day" + 0.011*"time" + 0.011*"sleep" + 0.011*"use" + 0.007*"need" + 0.007*"include" + 0.007*"die"


Topic: 2 
Words: 0.034*"lorazepam" + 0.031*"give" + 0.022*"need" + 0.010*"amoxicillin" + 0.010*"buy" + 0.010*"antibiotic" + 0.010*"go" + 0.007*"metronidazole" + 0.007*"also" + 0.007*"work"


Topic: 3 
Words: 0.027*"get" + 0.017*"amoxicillin" + 0.014*"infection" + 0.014*"start" + 0.011*"lorazepam" + 0.011*"take" + 0.011*"stop" + 0.011*"use" + 0.011*"antibiotic" + 0.011*"today"


Topic: 4 
Words: 0.035*"lorazepam" + 0.022*"amoxicillin" + 0.018*"use" + 0.009*"cause" + 0.009*"infection" + 0.009*"doctor" + 0.009*"user" + 0.009*"psychosis" + 0.009*"see" + 0.009*"dream"


