# Load model

In [103]:
from flair.data import Sentence
from flair.models import SequenceTagger

# load the NER tagger
model_output_folder = 'twimed-ner-bert'
tagger = SequenceTagger.load_from_file(model_output_folder + '/final-model.pt')

def predict(tagger, text):
    print('===============================================')
    text = text.replace('#', '')
    sentence = Sentence(text)
    tagger.predict(sentence)
    print(sentence)
    print('--------------------------------')

    # iterate over entities and print
    for entity in sentence.get_spans('ner'):
        print(entity)

    

2019-03-21 10:58:44,302 loading file twimed-ner-bert/final-model.pt


In [104]:
#
predict(tagger, """Starting back on fluoxetine tonight, I did pick up the prescription a few days ago but in the past when I’ve started/upped meds I’ve had to call in sick to work due to side effects. So I waited. Now I have 3 days off to adjust""")


Sentence: "Starting back on fluoxetine tonight, I did pick up the prescription a few days ago but in the past when I’ve started/upped meds I’ve had to call in sick to work due to side effects. So I waited. Now I have 3 days off to adjust" - 46 Tokens
--------------------------------
Drug-span [4]: "fluoxetine"
DATE-span [13,14]: "few days"
DATE-span [42,43]: "3 days"


In [139]:
from twitterscraper import query_tweets

class TwitterWatch: 
    def __init__(self, tagger):
        self.tagger = tagger
        
    def query(self, product):
        list_of_tweets = query_tweets(product, 100, lang='en')
        for tweet in list_of_tweets:            
            source_entity = self.getSourceEntity(tweet.fullname)            
            sentence = self.predict(tweet.text)
            tweet.text_entities = self.getEntityTags(sentence)
            tweet.source_entity = source_entity
            tweet.hasDrug = 'Drug' in tweet.text_entities
            tweet.hasSymptom = 'Disease_Symptom' in tweet.text_entities
            tweet.product = product
        return list_of_tweets
        
    def detect(self, sourcename, text):
        sentence = self.predict(text)        
        text_entities = self.getEntityTags(sentence)        
        hasDrug = 'Drug' in text_entities
        hasSymptom = 'Disease_Symptom' in text_entities        
        return {
            'Name'          : sourcename, 
            'SourceEntity'  : self.getSourceEntity(sourcename), 
            'TextEntity'    : text_entities,
            'Text'          : text, 
            'hasDrug'       : 'Drug' in text_entities,
            'hasSymptom'    : 'Disease_Symptom' in text_entities,
            'entities'      : sentence.to_dict(tag_type='ner'),
            'sentence'      : sentence
        }
        
    
    def getSourceEntity(self, name):
        result = 'UNKNOWN'
        sentence = self.predict(name)        
        for entity in sentence.get_spans('ner'):
            result = entity.tag
        return result
    
    def getEntityTags(self, sentence):
        if(sentence == None):
            return []
        else:
            return list(map( lambda entity: entity.tag, sentence.get_spans('ner') ))
        
    def predict(self, text):
        text = self.cleanTwitterTag(text)
        sentence = Sentence(text)
        self.tagger.predict(sentence)
        return sentence
    
    def cleanTwitterTag(self, text):
        text = text.replace('#', '').replace("@", '')
        return text
    

In [21]:
import pandas as pd

def getTwitter(tagger, product):
    tw = TwitterWatch(tagger)
    tweets = tw.query(product);
    tweetsdict = list(map(lambda t: 
    {
        'Product'       : t.product,
        'Name'          : t.fullname, 
        'SourceEntity'  : t.source_entity, 
        'TextEntity'    : t.text_entities,
        'Text'          : t.text, 
        'hasDrug'       : t.hasDrug,
        'hasSymptom'    : t.hasSymptom
    }, tweets))
    df = pd.DataFrame.from_dict(tweetsdict)
    df = df[['Product', 'Name', 'SourceEntity', 'TextEntity', 'hasDrug', 'hasSymptom', 'Text']]
    df = df.sort_values(['SourceEntity', 'hasDrug', 'hasSymptom'])
    return df
    #df.to_excel(f"output_{product}.xlsx")
    

In [108]:
from requests import request
import json
from pandas.io.json import json_normalize
def getAdverseEvents(limit):
    adverseEventUrl=f"""https://api.fda.gov/drug/event.json?search=patient.drug.openfda.pharm_class_epc:"drug"&count=patient.drug.medicinalproduct.exact&limit={limit}"""
    response=request(url=adverseEventUrl, method='get')
    ae = response.json()
    return pd.DataFrame(ae['results'])

In [None]:
outputfile = "TopAdversaryDrugs.xlsx"
ae = getAdverseEvents(200)
top100AdversaryDrug = pd.DataFrame(ae['results'])
top100AdversaryDrug

In [25]:
count = len(top100AdversaryDrug)
tweetsDF = []
for i, product in enumerate(top100AdversaryDrug['term']):
    clear_output(wait=True)       
    print(product, " (", i+1, "/", count, ") ..." )
    df = getTwitter(tagger, product)
    tweetsDF.append(df)
    clear_output(wait=True)       
result = pd.concat(tweetsDF)
result.to_excel(outputfile)


  force_unicode(url))
' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))
  force_unicode(url))
  force_unicode(url))
The%20diabetes%20drug%20Avandia,%20also%20known%20as%20rosiglitazone,%20has%20been%20suspended%20by%20UK%20and%20European%20drug%20watchdogs.%20The%20drug,%20which%20is%20used%20to%20control%20blood%20sugar%20in%20type%202%20diabetes,%20has%20been%20linked%20to%20an%20increased%20risk%20of%20heart%20attack%20and%20stroke%20pic.twitter.com/9E8KzRNZuy' with link or location/anchor > 255 characters since it exceeds Excel's limit for URLS
  force_unicode(url))
  force_unicode(url))
  force_unicode(url))
%22Compared%20with%20usual%20care,%20open-label%20pravastatin%20use%20(40%20mg/d)%20did%20not%20reduce%20all-cause%20mortality%20or%20CHD%20mortality%20plus%20nonfatal%20MI%20in%20patients%20with%20(n%20=%201475)%20or%20without%20(n%20=%208880)%20CHD.%20There%20was%20no%20reduction%20in%20all-cause%20mortality...%22' 

In [27]:
result.to_csv("TopAdversaryDrugs.csv")

In [28]:
import pandas as pd
df = pd.read_excel(outputfile)

In [29]:
total_drug = len(df.Product.unique())
total_tweets = len(df)

print(f"Total Drug Count: {total_drug}")
print(f"Total Tweets: {total_tweets}")

Total Drug Count: 200
Total Tweets: 69626


### Many of them is advertisement or drug news tweets
### We are interested in personal tweets for drug side effects

In [30]:
df.head(20)

Unnamed: 0,Product,Name,SourceEntity,TextEntity,hasDrug,hasSymptom,Text
161,ASPIRIN,JENNY TALIA from Australia 🇦🇺,GPE,[],False,False,partied like a rock star. Note to self - you a...
35,ASPIRIN,☃️ The Hornbread ❄️,ORG,[],False,False,@clutchbucket it tasted like baby aspirin
315,ASPIRIN,Gargamel is my bitch,ORG,['CARDINAL'],False,False,3 aspirin and a huge ass cup of coffee
2,ASPIRIN,Charlotte Pollard,PERSON,['PERSON'],False,False,@NatalieAPollard I've got some aspirin on hold...
5,ASPIRIN,Nabil Kilany,PERSON,"['ORG', 'DATE']",False,False,"@RobaAssi Been there last week end ruba, take ..."
8,ASPIRIN,Richard Thomas,PERSON,[],False,False,"ha @JedidiahAyres - i had a good time too, too..."
10,ASPIRIN,Nati Baquerizo,PERSON,[],False,False,"@lcicaza I wish, tendré que self medicate myse..."
11,ASPIRIN,Trisha Blish,PERSON,"['DATE', 'ORG', 'PERSON']",False,False,Well today was stressful but thanks to Aspirin...
19,ASPIRIN,Greg McLeod,PERSON,"['CARDINAL', 'CARDINAL', 'CARDINAL', 'CARDINAL...",False,False,"For tea: half a bottle of whisky, half a bottl..."
20,ASPIRIN,Lloyd Smith,PERSON,[],False,False,No baby lots of aspirin...


In [31]:
personalTweets = df[df.SourceEntity == 'PERSON']
personalTweets

Unnamed: 0,Product,Name,SourceEntity,TextEntity,hasDrug,hasSymptom,Text
2,ASPIRIN,Charlotte Pollard,PERSON,['PERSON'],False,False,@NatalieAPollard I've got some aspirin on hold...
5,ASPIRIN,Nabil Kilany,PERSON,"['ORG', 'DATE']",False,False,"@RobaAssi Been there last week end ruba, take ..."
8,ASPIRIN,Richard Thomas,PERSON,[],False,False,"ha @JedidiahAyres - i had a good time too, too..."
10,ASPIRIN,Nati Baquerizo,PERSON,[],False,False,"@lcicaza I wish, tendré que self medicate myse..."
11,ASPIRIN,Trisha Blish,PERSON,"['DATE', 'ORG', 'PERSON']",False,False,Well today was stressful but thanks to Aspirin...
19,ASPIRIN,Greg McLeod,PERSON,"['CARDINAL', 'CARDINAL', 'CARDINAL', 'CARDINAL...",False,False,"For tea: half a bottle of whisky, half a bottl..."
20,ASPIRIN,Lloyd Smith,PERSON,[],False,False,No baby lots of aspirin...
26,ASPIRIN,Katt 🧚🏽‍♀️,PERSON,[],False,False,"@carlyyminkk bring me water , aspirin, and a c..."
30,ASPIRIN,Mikey says hey,PERSON,['DATE'],False,False,Discharged! It is indeed pericarditis. Need to...
36,ASPIRIN,Louisa Thomas,PERSON,[],False,False,"@CarlBialik @andrewflowers I know, but that so..."


In [33]:
tp = df[(df.hasDrug == True) & (df.hasSymptom == True)]
tp.groupby('Product').size()

Product
ACETAMINOPHEN            9
ACETAMINOPHEN.           9
ACETYLSALICYLIC ACID     2
ACIPHEX                  1
ACYCLOVIR.               4
ADVAIR HFA               1
ADVIL                    1
ALBUTEROL                2
ALBUTEROL.               2
ALEVE                    9
ALLEGRA                  4
ALLOPURINOL              4
ALLOPURINOL.             4
ALPRAZOLAM.              4
AMBIEN                   4
AMITRIPTYLINE           24
AMLODIPINE              10
ASPIRIN                  3
ASPIRIN.                 3
ATENOLOL                 3
ATENOLOL.                3
ATIVAN                   8
ATORVASTATIN             6
AVANDIA                  3
BACLOFEN.               19
BENADRYL                19
BISOPROLOL               1
CALCIUM                  2
CARVEDILOL.             17
CELEBREX                 8
                        ..
SPIRIVA                  2
SPIRONOLACTONE.          4
SYMBICORT               12
SYNTHROID                7
TAMSULOSIN               3
TOPAMAX             

In [37]:
total_correct = len(tp)
total_personal_tweets = len(personalTweets)

print(f"Total Correctly identified Drug Name and Symptom/Total Personal Tweets: {total_correct}/{total_personal_tweets}")
print(f"Identified:  {int(total_correct/total_personal_tweets*100)} %")

# TODO: Display Text Entity and highlight of the tweet texts 
tp

Total Correctly identified Drug Name and Symptom/Total Personal Tweets: 2271/19923
Identified:  11 %


Unnamed: 0,Product,Name,SourceEntity,TextEntity,hasDrug,hasSymptom,Text
64,ASPIRIN,Alice Hayes,PERSON,"['Drug', 'Disease_Symptom']",True,True,Why aspirin and paracetamol could be making yo...
312,ASPIRIN,cesultra,UNKNOWN,"['GPE', 'ORG', 'Drug', 'GPE', 'Disease_Symptom']",True,True,Russia has an opioid abuse problem as big as ...
313,ASPIRIN,cesultra,UNKNOWN,"['GPE', 'ORG', 'Drug', 'GPE', 'Disease_Symptom']",True,True,Russia has an opioid abuse problem as big as ...
44,ASPIRIN.,Alice Hayes,PERSON,"['Drug', 'Disease_Symptom']",True,True,Why aspirin and paracetamol could be making yo...
371,ASPIRIN.,cesultra,UNKNOWN,"['GPE', 'ORG', 'Drug', 'GPE', 'Disease_Symptom']",True,True,Russia has an opioid abuse problem as big as ...
372,ASPIRIN.,cesultra,UNKNOWN,"['GPE', 'ORG', 'Drug', 'GPE', 'Disease_Symptom']",True,True,Russia has an opioid abuse problem as big as ...
46,CELEBREX,Dennis Raye,PERSON,"['Drug', 'Disease_Symptom']",True,True,New blog post: Celebrex disrupts heart rhythm ...
95,CELEBREX,Jina Riggs M.D.,PERSON,"['Drug', 'Drug', 'Disease_Symptom']",True,True,celebrex uses | celebrex high | generic celebr...
160,CELEBREX,Sextus Fulloway,PERSON,"['Drug', 'Disease_Symptom', 'ORG']",True,True,Celebrex appears to have some activity against...
168,CELEBREX,Batya Mcgiven,PERSON,"['Drug', 'Disease_Symptom']",True,True,celebrex over the counter pain reliever http:/...


In [38]:
DrugCountDF = df[(df.hasDrug == True)]
total_product_recognized = len(DrugCountDF)
total_personal_tweets = len(personalTweets)
print(f"Total Correctly identified Drug Name and Symptom/Total Personal Tweets: {total_product_recognized}/{total_personal_tweets}")
print(f"Correctness: {int(total_product_recognized/total_personal_tweets*100)}%")

Total Correctly identified Drug Name and Symptom/Total Personal Tweets: 10321/19923
Correctness: 51%


# After finding these result to retrain the model see if we can get better result.

In [40]:
prods = tp.Product.unique()
products = list(map(lambda p: p, prods))
print(f"Number of products to monitor : {len(products)}")

Number of products to monitor : 176


# Real Time Twitter Streaming Monitoring

In [174]:
from IPython.display import HTML, display
from datetime import datetime


name = "Dr Gordon Caldwell"
text = "No it was after the ibuprofen for the gout caused by the furosemide given to clear oedema from too much iv fluids https://twitter.com/betabetic/status/676544651248189443 …"

class DisplayTwitterDetector:     
    color = {
        'DRUG'           :'#bfeeb7', 
        'DISEASE_SYMPTOM':'#ffbf00', 
        #'PERSON'         :'#7aecec', 
        #'ORG'            :'#A9CCE3',
        'OTHER'          :'#D3D3D3'
    }
    def detect(self, name, text, displayDetected = 'both'):
        tw = TwitterWatch(tagger)
        processed = tw.detect(name, text)         
        if(processed['hasDrug'] and processed['hasSymptom']):             
            self.displayResult(name, processed)    
            return processed
        elif (displayDetected == 'either'):
            if(len(processed['TextEntity']) > 0):
                self.displayResult(name, processed)            
        else:
            if(displayDetected == 'none'):                
                self.displayResult(name, processed)   
        return None
        
    def findEntity(self, p, start, end):
        entities = p['entities']['entities']
        for ent in entities:
            if(start == ent['start_pos'] and end == ent['end_pos']):
                return ent                    
        return None
    def getHtmlText(self, color, text, label):
        return f"""
            <mark class="entity" style="background: {color}; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em; box-decoration-break: clone; -webkit-box-decoration-break: clone">{text}
            <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">{label}</span></mark>
        """

    def getColor(self, entityType):
        if(entityType in self.color): 
            return self.color[entityType]
        else:
            return self.color['OTHER']
        
    def displayResult(self,name, p):
        now = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        result = f"""<span>{now}</span> <span style="padding-left:5em">{name}</span> <span style="padding-left:5em">"""
        for token in p['sentence']:
            ent = self.findEntity(p, token.start_pos,  token.end_pos)
            if(ent != None):
                result = result + ' '+ self.getHtmlText(self.getColor(ent['type'].upper()), ent['text'], ent['type'])
            else:
                result = result + ' ' + token.text
        result = result + "</span>"
        h = HTML(result); 
        display(h)

d = DisplayTwitterDetector()
d.detect(name, text)
d.detect(name, """RT BougieLa: It IS killing us. When folks are having strokes because they can’t afford their blood pressure meds, and diabetics are going…""", 'either')

# Personal Twitter Developer api only allows to track 400 max

In [160]:
df = getAdverseEvents(330)
products = df.term.unique()
products = list(map(lambda p: p.lower(), products))
print(f"Number of products to monitor : {len(products)}")
products

Number of products to monitor : 330


['aspirin',
 'aspirin.',
 'celebrex',
 'ibuprofen.',
 'acetaminophen',
 'ibuprofen',
 'lipitor',
 'plavix',
 'humira',
 'vitamin d',
 'metformin',
 'omeprazole.',
 'acetaminophen.',
 'lisinopril.',
 'lasix',
 'nexium',
 'amlodipine',
 'acetylsalicylic acid',
 'atorvastatin',
 'synthroid',
 'advil',
 'lisinopril',
 'gabapentin.',
 'revlimid',
 'prednisone.',
 'simvastatin',
 'methotrexate',
 'lyrica',
 'calcium',
 'fish oil',
 'prilosec',
 'simvastatin.',
 'crestor',
 'omeprazole',
 'folic acid.',
 'furosemide.',
 'vitamin d3',
 'folic acid',
 'naproxen.',
 'enbrel',
 'tylenol',
 'xarelto',
 'meloxicam.',
 'protonix',
 'furosemide',
 'norvasc',
 'metoprolol',
 'atenolol',
 'pantoprazole',
 'dexamethasone.',
 'cymbalta',
 'prednisone',
 'vitamin b12',
 'naproxen',
 'niaspan',
 'xanax',
 'zocor',
 'clopidogrel',
 'neurontin',
 'lantus',
 'mobic',
 'levothyroxine.',
 'prevacid',
 'hydrochlorothiazide',
 'motrin',
 'paracetamol',
 'spiriva',
 'metoprolol.',
 'avandia',
 'multivitamin',
 'me

In [175]:
from __future__ import absolute_import, print_function

import ipywidgets as widgets

from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

from IPython.display import clear_output, display
import json
from collections import deque


# Go to http://apps.twitter.com and create an app.
# The consumer key and secret will be generated for you after
consumer_key="6jWuKi5xOokKx65MI35Q"
consumer_secret="I0cqvp8CFhLe8Xer9GoJKLg6J0NfwHddjVUjIQFTeI"

# After the step above, you will be redirected to your app's page.
# Create an access token under the the "Your access token" section
access_token="127221870-92JvNvmBlrJt7bteAjM9csRA5LRHebTaH3FmKlzg"
access_token_secret="aRwTHxCTbnIk6Ba9drK8H8qGEfXBBo1rjGN4HZfy0BGKc"


    
class StdOutListener(StreamListener):    
    lines = deque([])
    def on_data(self, data):       
        tweet = json.loads(data)
        #if(len(self.lines)>5):
        #    self.lines.popleft()                        
        username = tweet['user']['name']
        text = tweet['text']
        d = DisplayTwitterDetector()
        r = d.detect(name, text, 'either')
        #r = detect(username, text)
        #if(r != None):            
            #self.lines.append(displayLine)
            #clear_output(wait=True)   
            #for line in self.lines:
            #    print(line)
        #else:
            #print(f"Not detected: {username}\t\t\t{text}")
    def on_error(self, status):
        print(status)

l = StdOutListener()
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, l)
print(f"Monitoring {len(products)} products ...")
stream.filter(languages=["en"], track=products)

    

Monitoring 330 products ...


KeyboardInterrupt: 