## Import the required libraries

In [38]:
import tweepy
import csv
import time
import glob
import pandas as pd
from bs4 import BeautifulSoup as BS

## Twitter API credentials

In [2]:
consumer_key = ""
consumer_secret = ""
access_key = ""
access_secret = ""

## Authorize twitter, initialize tweepy

In [3]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

## Create a list of all counties for which we need tweets. We will iterate through this list

In [10]:
counties = ["BondCountyHD", "boonecohealth", "CU_PublicHealth", "ChiPublicHealth", "CookCtyHealth", "DeKalbCoHD", "DuPageHD", 
            "fwbicohealth", "KaneCoHealth", "kankakeehealth", "kendallhealth", "kchd92", "LakeCoHealth", "LaSalleCoHealth", "LCHD_IL", 
            "LCHD_Illinois", "Maconcountyhd", "MacoupinHealth", "McHenryCoHealth", "McLeanHealth", "vopnews", 
            "peoriaprepare", "polkcohealth", "RICO_HealthDept", "HenryStarkHD", "stclairhealth", "ERCSCHD", 
            "tazewellhealth", "Whiteside_CHC", "WillCoHealth", "WinnCoHealth", "WoodfordHealth"]

## Function to retrieve all hashtags in a tweet

In [23]:
def getHashtags(tweet):
    #hashtags=[None]*len(tweet["entities"]["hashtags"])
    hashtags=[]
    if len(tweet["entities"]["hashtags"]) > 0:
        for i in range(len(tweet["entities"]["hashtags"])-1):
            hashtags.append(tweet["entities"]["hashtags"][i]['text'])
        print(hashtags)
        hashtags=list(set(hashtags))
        hashtags=' '.join(hashtags)
    return hashtags

## Function to know if the tweet has image/video/etc

In [22]:
def getMedia(tweet):
    try:
        value = tweet['extended_entities']['media'][0]['type']
        return value
    except KeyError:
        return "NA"

## Function to get date time in a proper format

In [21]:
def getDate(tweet):
    return time.strftime('%Y-%m-%d %H:%M:%S', time.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y'))

## Function to get tweet source from HTML format (Get value of anchor tag)

In [20]:
def getTweetSource(tweet):
    #Beautiful Soup is used here
    soup=BS(tweet)
    return soup.find('a', {'rel':'nofollow'}).text

## Iterating through the list and generating a csv for each county

In [25]:
for county in counties:
    #initialize a list to hold all the tweepy Tweets
    alltweets = []

    new_tweets = api.user_timeline(screen_name = county,count=200, parser=tweepy.parsers.JSONParser(), tweet_mode = "extended")

    #save most recent tweets
    alltweets.extend(new_tweets)
    
    
    oldest = alltweets[-1]['id'] - 1
    
    #keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:

        
        print("getting tweets before %s" % (oldest))
        #all subsiquent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name = county,count=200,max_id=oldest, parser=tweepy.parsers.JSONParser(), 
                                       tweet_mode = "extended")

        #save most recent tweets
        alltweets.extend(new_tweets)

         #update the id of the oldest tweet less one
        oldest = alltweets[-1]['id'] - 1

        print("...%s tweets downloaded so far" % (len(alltweets)))

        #transform the tweepy tweets into a 2D array that will populate the csv	
        outtweets = [[tweet['id_str'], getDate(tweet), tweet['full_text'], tweet['user']['name'], tweet['user']['location'],  
                      tweet['user']['followers_count'], tweet['user']['friends_count'], tweet['user']['favourites_count'], 
                      tweet['user']['statuses_count'], tweet['retweet_count'], tweet['favorite_count'], 
                      tweet['favorited'], getHashtags(tweet), getTweetSource(tweet['source']), getMedia(tweet)] for tweet in alltweets]

        #write the csv
        with open('%s_tweets.csv' % county, 'w') as f:
            writer = csv.writer(f)
            writer.writerow(["id","created_at","tweet", "twitter_handle", "location","followers", 
                             "following", "profile_likes", "total_tweets", "retweets", "favorites", "is_favorited",
                             "Hashtags", "source", "is_media"])
            writer.writerows(outtweets)
    print(county," is done----------------------------------------------------------")

getting tweets before 621298741216030720
...400 tweets downloaded so far
[]




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


getting tweets before 517118483264311295
...600 tweets downloaded so far
[]
getting tweets before 416654220444844031
...800 tweets downloaded so far
[]
getting tweets before 325299092253204479
...1000 tweets downloaded so far
[]
getting tweets before 280763428317036543
...1200 tweets downloaded so far
[]
getting tweets before 226380556026392575
...1224 tweets downloaded so far
[]
getting tweets before 220253819223810047
...1224 tweets downloaded so far
[]
BondCountyHD  is done----------------------------------------------------------
getting tweets before 822123848728543233
...400 tweets downloaded so far
[]
[]
['NPHW']
['NationalDiabetesMonth']
[]
['BreakUpWithSalt', 'SodiumReduction']
['MedsUpAway']
[]
[]
[]
['GetVaccinated']
['GetVaccinated']
['Behavioralhealth']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
['Parents']
['smoking']
['frostbite']
['Flu', 'GetAFluVax']
['Flu']
['fluvax', 'NIVW']
['fluvax', 'flu']
['fluvax']
['flu', 'NIVW']
[]
[]
[]
[]
['HPV']
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]

## Combining all csv files

In [28]:
csvs = glob.glob('*.csv')

In [29]:
csvs

['ChiPublicHealth_tweets.csv',
 'tazewellhealth_tweets.csv',
 'DuPageHD_tweets.csv',
 'HenryStarkHD_tweets.csv',
 'peoriaprepare_tweets.csv',
 'MacoupinHealth_tweets.csv',
 'WoodfordHealth_tweets.csv',
 'KaneCoHealth_tweets.csv',
 'Whiteside_CHC_tweets.csv',
 'BondCountyHD_tweets.csv',
 'stclairhealth_tweets.csv',
 'LakeCoHealth_tweets.csv',
 'CookCtyHealth_tweets.csv',
 'fwbicohealth_tweets.csv',
 'McLeanHealth_tweets.csv',
 'WillCoHealth_tweets.csv',
 'vopnews_tweets.csv',
 'LCHD_IL_tweets.csv',
 'boonecohealth_tweets.csv',
 'McHenryCoHealth_tweets.csv',
 'DeKalbCoHD_tweets.csv',
 'CU_PublicHealth_tweets.csv',
 'LaSalleCoHealth_tweets.csv',
 'kankakeehealth_tweets.csv',
 'LCHD_Illinois_tweets.csv',
 'ERCSCHD_tweets.csv',
 'kendallhealth_tweets.csv',
 'RICO_HealthDept_tweets.csv',
 'Maconcountyhd_tweets.csv',
 'WinnCoHealth_tweets.csv',
 'kchd92_tweets.csv',
 'polkcohealth_tweets.csv']

In [30]:
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in csvs])
#export to csv
combined_csv.to_csv( "ALLTWEETS.csv", index=False, encoding='utf-8-sig')