In [36]:
import twint
import pandas as pd
import argparse
import os
import datetime
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [37]:
df = pd.read_csv('WHO_covid.csv')

In [40]:
def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    return tweet

my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet



def compute_lda_model(df, number_of_topics):
    
    removed = df.tweet.apply(remove_links)
    removed = df.tweet.apply(remove_users)
    removed = df.tweet.apply(clean_tweet)
    
    vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')
    tf = vectorizer.fit_transform(removed).toarray()
    tf_feature_names = vectorizer.get_feature_names()
    

    model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
    return model.fit(tf), tf_feature_names
    
    
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

    
    

In [43]:
model, features = compute_lda_model(df, 5) #5 topics

In [44]:
display_topics(model, features, 25) #25 words from each topic

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights
0,com,357.9,#covid,332.9,#covid,318.7,twitter,866.3,case,480.2
1,pic,357.9,#coronavirus,274.8,…,183.9,pic,865.1,#coronavirus,311.0
2,twitter,357.6,countri,213.9,#coronavirus,166.5,com,865.1,report,241.2
3,#coronavirus,192.6,’,122.3,countri,156.5,health,363.9,#covid,232.0
4,health,178.7,we,118.2,#ncov,147.7,peopl,205.6,twitter,162.4
5,#covid,140.2,’r,105.2,outbreak,137.8,diseas,155.9,pic,160.4
6,emerg,123.1,”,97.5,health,127.2,year,136.6,com,160.4
7,food,96.1,take,78.4,respons,111.5,children,129.9,countri,158.9
8,protect,91.4,work,73.5,brief,89.2,#healthforall,117.2,transmiss,156.1
9,suppli,82.2,togeth,72.4,peopl,84.2,risk,107.1,situat,94.3


In [26]:
tf_feature_names

['#beatncds',
 '#cancer',
 '#china',
 '#china🇨🇳',
 '#coronavirus',
 '#covid',
 '#depression',
 '#drc',
 '#eb',
 '#ebola',
 '#healthforall',
 '#healthworkers',
 '#hiv',
 '#letstalk',
 '#mentalhealth',
 '#msc',
 '#ncov',
 '#roadsafety',
 '#safehands',
 '#supportnursesandmidwives',
 'acceler',
 'access',
 'achiev',
 'act',
 'action',
 'activ',
 'address',
 'adult',
 'advic',
 'affect',
 'africa',
 'age',
 'alcohol',
 'almost',
 'alreadi',
 'also',
 'although',
 'among',
 'anim',
 'approach',
 'appropri',
 'area',
 'around',
 'ask',
 'assess',
 'avail',
 'avoid',
 'base',
 'becom',
 'best',
 'better',
 'brief',
 'bring',
 'build',
 'call',
 'cancer',
 'capac',
 'care',
 'case',
 'caus',
 'challeng',
 'chang',
 'child',
 'children',
 'clean',
 'clear',
 'clinic',
 'close',
 'cluster',
 'com',
 'come',
 'commit',
 'committe',
 'common',
 'commun',
 'concern',
 'condit',
 'confirm',
 'contact',
 'contain',
 'continu',
 'contribut',
 'control',
 'coordin',
 'cough',
 'could',
 'countri',
 'cov

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [32]:
no_top_words = 20
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,com,227.8,#covid,201.5,#covid,207.3,twitter,491.9,case,381.0,#covid,159.9,emerg,140.1,peopl,143.8,#covid,184.9,health,388.9
1,pic,227.8,#coronavirus,182.8,#coronavirus,153.8,com,490.0,report,241.1,#coronavirus,129.2,health,130.2,diseas,118.8,countri,157.1,com,317.1
2,twitter,227.7,countri,156.6,health,94.7,pic,490.0,#coronavirus,193.7,…,105.2,#coronavirus,112.4,pic,114.2,we,118.1,pic,317.1
3,help,94.9,”,78.6,countri,83.4,infect,119.5,#covid,129.9,brief,89.1,case,98.9,com,114.2,work,115.9,twitter,316.9
4,peopl,93.8,risk,71.0,’t,72.2,prevent,107.8,countri,111.5,com,83.8,outbreak,94.8,twitter,114.0,#ncov,107.5,#healthforall,117.1
5,#covid,82.1,’,67.1,peopl,69.8,use,97.2,twitter,92.8,pic,83.8,intern,92.4,year,100.9,#coronavirus,106.0,access,98.3
6,#coronavirus,66.8,nurs,60.1,patient,64.9,new,82.3,com,90.9,twitter,83.8,transmiss,71.4,million,89.7,’r,105.1,servic,86.1
7,activ,66.3,commun,54.6,care,55.9,tobacco,67.1,pic,90.9,hand,71.0,committe,68.1,let,79.8,respons,101.5,women,85.9
8,health,59.3,take,54.6,know,55.3,risk,61.1,#china,87.5,media,66.1,public,59.0,cancer,66.1,togeth,94.1,countri,73.3
9,support,56.5,epidem,52.1,protect,51.3,vaccin,60.6,situat,80.7,use,52.5,#ebola,58.7,live,57.4,global,90.2,’,54.5


In [49]:
date_time_str = '2018-06-29'
    date_time_obj = datetime.datetime.strptime(date_time_str, '%Y-%m-%d')
date_time_obj

datetime.datetime(2018, 6, 29, 0, 0)

In [40]:


start_date = datetime.datetime(2008, 1, 1)
threshold = datetime.datetime(2008,5,1)

while start_date < threshold:
    end_date = start_date + datetime.timedelta(days = 30)
    c = twint.Config()
    c.Username = "WHO"
    c.Until = end_date.strftime("%Y-%m-%d")
    c.Since = start_date.strftime("%Y-%m-%d")
    c.Store_csv = True
    c.Output = "WHO.csv"
    twint.run.Search(c)
    
    
    start_date = end_date + datetime.timedelta(days = 1)
    
    

RuntimeError: This event loop is already running

In [24]:
timestampStr = start_date.strftime("%Y-%m-%d")

Current Timestamp :  2010-12-31


In [27]:
c = twint.Config()
c.Username = "WHO"
c.Until = end_date.strftime("%Y-%m-%d")
c.Since = start_date.strftime("%Y-%m-%d")
c.Store_csv = True
c.Output = "WHO.csv"
c.Lang = "en"
#c.Translate = True
c.TranslateDest = "it"
twint.run.Search(c)


RuntimeError: This event loop is already running

30217923326906368 2011-01-26 11:58:09 W. Europe Daylight Time <WHO> Emergency #vaccination campaign against yellow fever outbreak in Cote d'Ivoire    http://tinyurl.com/687rjsh #globalhealth #yellowfever
28415915137699840 2011-01-21 12:37:36 W. Europe Daylight Time <WHO> Keep schools and places where #children gather free from #marketing of unhealthy #foods  http://tinyurl.com/65ljtp7  #globalhealth
27402806835027968 2011-01-18 17:31:52 W. Europe Daylight Time <WHO> See the latest trends in the uptake of #telemedicine globally  http://tinyurl.com/4676goy #globalhealth
26670705563144192 2011-01-16 17:02:46 W. Europe Daylight Time <WHO> Exclusive #breastfeeding for six months best for babies everywhere.  http://tinyurl.com/6bhvrjh #globalhealth
25636224207167488 2011-01-13 20:32:06 W. Europe Daylight Time <WHO> Theme for World No #Tobacco Day 2011: The WHO Framework Convention on Tobacco Control  http://tiny.cc/lwco6 #health #globalhealth #FCTC
25167532688146432 2011-01-12 13:29:42 W. Eu

In [None]:
print(twint.output.panda.Tweets_df)

In [None]:
print(len(l))

In [None]:
print(x[-1])