# Load packages

In [1]:
# 'private_twitter_credentials' contains my Twitter credentials. Replace by 'twitter_credentials' with your credentials
import private_twitter_credentials
import twitter
import datetime
import pandas as pd
import time

TodaysDate = time.strftime("%Y-%m-%d-%H-%M")

# Seeting up twitter authentication

In [2]:
consumer_key = private_twitter_credentials.consumer_key
consumer_secret = private_twitter_credentials.consumer_secret
access_token = private_twitter_credentials.access_token
access_token_secret = private_twitter_credentials.access_token_secret

api = twitter.Api(
    consumer_key         =   consumer_key,
    consumer_secret      =   consumer_secret,
    access_token_key     =   access_token,
    access_token_secret  =   access_token_secret,
    tweet_mode = 'extended'
)

# Functions

Class `TweetMiner` class contains two methods: 

* `mine_user_tweets` which mine user's tweets making use of [Get user_timeline](https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline)

* `search_tweets` which mine tweets using [GetSearch](https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets)

`search_tweets` gives you possibility to perform queries. You can for instances perform a search at Twitter and copy what comes after `q` in your browser.

For example, if I search `@picnic @JumboSupermarkt @albertheijn covid-19` the query used as argument in `search_tweets` is `q=%40picnic%20%40JumboSupermarkt%20%40albertheijn%20covid-19&src=typed_query`. 


In [22]:
import datetime

class TweetMiner(object):
    """ Make possible obtaining tweets using twitter user id (mine_user_tweets) or performing a standard Twitter 
    API search"""

    
    def __init__(self, api, result_limit = 20, max_pages = 40):
        """result_limit = count that can take max 200 (mine_user_tweets) and max 100 (search_tweets)"""
        
        self.api = api        
        self.result_limit = result_limit
        self.max_pages = max_pages
        

    def mine_user_tweets(self, user, mine_retweets=False):
        """ Mine tweets of user = screen_name or user_id"""

        data           =  []
        last_tweet_id  =  False
        page           =  1
        
        while page <= self.max_pages:
            
            if last_tweet_id:
                statuses   =   self.api.GetUserTimeline(screen_name=user, count=self.result_limit, max_id=last_tweet_id - 1, 
                                                        include_rts=mine_retweets)
                statuses = [ _.AsDict() for _ in statuses]
            else:
                statuses   =   self.api.GetUserTimeline(screen_name=user, count=self.result_limit, 
                                                        include_rts=mine_retweets)
                statuses = [_.AsDict() for _ in statuses]
                
            for item in statuses:
                # Using try except here.
                # When retweets = 0 we get an error (GetUserTimeline fails to create a key, 'retweet_count')
                try:
                    mined = {
                        'mined_at':         datetime.datetime.now(),
                        'created_at':       item['created_at'],
                        'tweet_id':         item['id'],
                        'tweet_id_str':     item['id_str'],
                        'screen_name':      item['user']['screen_name'],
                        'favorite_count':   item['favorite_count'],
                        'text':             item['full_text'],
                        'source':           item['source'],
                        'language':         item['lang'],
                        'retweet_count':    item['retweet_count'],
                        #user info
                        'user_favourites_count': item['user']['favourites_count'],
                        'followers_count':  item['user']['followers_count'],
                        'friends_count':    item['user']['friends_count']
                    }
            
                
                except:
                    mined = {
                        'mined_at':         datetime.datetime.now(),
                        'created_at':       item['created_at'],
                        'tweet_id':         item['id'],
                        'tweet_id_str':     item['id_str'],
                        'screen_name':      item['user']['screen_name'],
#                         'favorite_count':   item['favorite_count'],
                        'text':             item['full_text'],
                        'source':           item['source'],
                        'language':         item['lang'],
                        'retweet_count':    0,
                        # user info
                        'user_favourites_count': item['user']['favourites_count'],
                        'followers_count':  item['user']['followers_count'],
                        'friends_count':    item['user']['friends_count']
                        }
                
                last_tweet_id = item['id']
                data.append(mined)
                
            page += 1
            
        return data
    
    def search_tweets(max_pages = 20, count = 20, raw_query = None, result_type = 'mixed'):
        """ Search tweets """

        data           =  []
        last_tweet_id  =  False
        page           =  1
        
        while page <= max_pages:
            
            if last_tweet_id:
                statuses = api.GetSearch(raw_query=raw_query, count = count, result_type=result_type, 
                                         max_id=last_tweet_id - 1)
                statuses = [ _.AsDict() for _ in statuses]
            else:
                statuses = api.GetSearch(raw_query=raw_query, count = count, result_type=result_type)
                statuses = [_.AsDict() for _ in statuses]
                
            for item in statuses:
                # Using try except here.
                # When retweets = 0 we get an error (GetUserTimeline fails to create a key, 'retweet_count')
                try:
                    mined = {
                        'mined_at':                datetime.datetime.now(),
                        'created_at':              item['created_at'],
                        'tweet_id':                item['id'],
                        'tweet_id_str':            item['id_str'],
                        'in_reply_to_screen_name': item['in_reply_to_screen_name'],
                        'in_reply_to_status_id':   item['in_reply_to_status_id'],
                        'in_reply_to_user_id':     item['in_reply_to_user_id'],
                        'language':                item['lang'],
                        'text':                    item['full_text'],
                        'hashtags':                item['hashtags'],
                        'source':                  item['source'],
                       # info about user
                        'screen_name':             item['user']['screen_name'],
                        'user_tweet_id':           item['user']['id'],
                        'user_tweet_id_str':       item['user']['id_str'],
                        'user_favourites_count':   item['user']['favourites_count'],
                        'followers_count':         item['user']['followers_count'],
                        'friends_count':           item['user']['friends_count']
                    }
                    
                except:
                    mined = {
                        'mined_at':                datetime.datetime.now(),
                        'created_at':              item['created_at'],
                        'tweet_id':                item['id'],
                        'tweet_id_str':            item['id_str'],
#                         'in_reply_to_screen_name': item['in_reply_to_screen_name'],
#                         'in_reply_to_status_id':   item['in_reply_to_status_id'],
#                         'in_reply_to_user_id':     item['in_reply_to_user_id'],
                        'language':                item['lang'],
                        'text':                    item['full_text'],
                        'hashtags':                item['hashtags'],
                        'source':                  item['source'],
                       # info about user
                        'screen_name':             item['user']['screen_name'],
                        'user_tweet_id':           item['user']['id'],
                        'user_tweet_id_str':       item['user']['id_str'],
                        'user_favourites_count':   item['user']['favourites_count'],
                        'followers_count':         item['user']['followers_count'],
                        'friends_count':           item['user']['friends_count']
                    }
                                            
                
                last_tweet_id = item['id']
                data.append(mined)
                
            page += 1
            
        return data

In [19]:
def processing_and_saving(df, file_name, mine_user_twitter=1):
    """ Save retrieved tweets in csv file.
    
    Input:
    
    df : dataframe of tweets'data
    file_name: name with which the csv will be saved (without extension)
    mine_user_twitter: Indicates if df came contains tweets from a twitter user, i.e., was obtained using 
    GetUserTimeline since the information obtained from this method is different from an API search from GetSearch
    
    """
    
    TodaysDate = time.strftime("%Y-%m-%d-%H-%M")

    
    # Create columns 'year', 'month', 'day', 'hour', 'min' from 'created_at'
    df['created_at'] = pd.to_datetime(df['created_at'])
        
    df['year'] = df['created_at'].dt.year 
    df['month'] = df['created_at'].dt.month 
    df['day'] = df['created_at'].dt.day 
    df['hour'] = df['created_at'].dt.hour 
    df['minute'] = df['created_at'].dt.minute
    df['day_of_week'] = df['created_at'].dt.weekday
    
    if mine_user_twitter:
    
        df = df[['mined_at', 'screen_name', 'tweet_id', 'tweet_id_str', 'created_at', 'year', 'month', 'day','day_of_week', 
             'hour', 'minute', 'retweet_count', 'favorite_count', 'source', 'language', 'user_favourites_count', 
             'followers_count','friends_count','text']]
    else:
        df = df[['mined_at', 'tweet_id', 'tweet_id_str', 'in_reply_to_screen_name','in_reply_to_status_id',
                 'in_reply_to_user_id', 'hashtags','source','language', 'created_at', 'year', 'month', 'day','day_of_week', 
                 'hour', 'minute','screen_name','user_tweet_id','user_tweet_id_str','user_favourites_count','followers_count',
                 'friends_count', 'text']]
        
    df.sort_values(by='created_at',inplace = True)
    df.reset_index(drop = True, inplace = True)
    
    
    df.to_csv("../data/tweets/"+file_name+"_"+TodaysDate+".csv", index = False)
    
    return df
    
    

# Retrieving twitter data

The goal of this project is to check the sentiment of users towards the main providers of online grocery shopping, i.e., Jumbo Supermarkten, AH, and Picnic.

Everything changed since the first case of corona virus in The Netherlands (February 27th) and the way of shopping groceries suffered important change with a jump in number of users that opted for online grocery shopping. Supermarkts were not ready for such a explosion of demand, some adapted faster than others. Specially, `Picnic` that has the sole focus on online shopping.

I have my own experiences but I want to via tweets messages over these 3 providers of online grocery shopping get the sentiment of the users in this 'special' moments faced by both consumers and providers.

The idea is to get twitters covering the period from the 1st case until today both for info retrieved by user (`GetUserTimeline`) and by query (`GetSearch`).

Although, it is not possible to have control over the period covered by the search, we will play with parameters and go as far as possible.

## Getting twitter by user

To start we will obtain tweets for `picnic`, `JumboSupermarkt`, and `albertheijn` which are the tweet screen name of the 3 providers of online grocery shopping.


### Picnic

In [23]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=20, max_pages = 100)
picnic = miner.mine_user_tweets(user="picnic")
df_picnic = processing_and_saving(pd.DataFrame(picnic), "picnic")

In [38]:
df_picnic.tail()

Unnamed: 0,mined_at,screen_name,tweet_id,tweet_id_str,created_at,year,month,day,day_of_week,hour,minute,retweet_count,favorite_count,source,language,user_favourites_count,followers_count,friends_count,text
1990,2020-06-21 00:29:23.693529,picnic,1273662862096060417,1273662862096060417,2020-06-18 17:04:24+00:00,2020,6,18,3,17,4,0,,"<a href=""http://www.zendesk.com"" rel=""nofollow...",nl,3881,4844,5,"@rolandweyers Ah, dat is inderdaad een dode mu..."
1991,2020-06-21 00:29:23.693529,picnic,1273918233834393602,1273918233834393602,2020-06-19 09:59:09+00:00,2020,6,19,4,9,59,0,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",nl,3881,4844,5,@MiriamVermeulen Wat fijn dat je al 100 plekke...
1992,2020-06-21 00:29:23.693529,picnic,1273920800064757760,1273920800064757760,2020-06-19 10:09:21+00:00,2020,6,19,4,10,9,0,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",nl,3881,4844,5,@MrsVlamingo De Presto maaltijden variëren af ...
1993,2020-06-21 00:29:23.693529,picnic,1274275880756482049,1274275880756482049,2020-06-20 09:40:19+00:00,2020,6,20,5,9,40,0,,"<a href=""http://www.zendesk.com"" rel=""nofollow...",nl,3881,4844,5,@Mr_Widewood Goedemorgen! Goed dat je dit even...
1994,2020-06-21 00:29:23.693529,picnic,1274431035896467466,1274431035896467466,2020-06-20 19:56:51+00:00,2020,6,20,5,19,56,0,,"<a href=""https://mobile.twitter.com"" rel=""nofo...",es,3881,4844,5,@dbenshachar ^Lino https://t.co/XK06YZyXVh


In [24]:
df_picnic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1995 entries, 0 to 1994
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   mined_at               1995 non-null   datetime64[ns]     
 1   screen_name            1995 non-null   object             
 2   tweet_id               1995 non-null   int64              
 3   tweet_id_str           1995 non-null   object             
 4   created_at             1995 non-null   datetime64[ns, UTC]
 5   year                   1995 non-null   int64              
 6   month                  1995 non-null   int64              
 7   day                    1995 non-null   int64              
 8   day_of_week            1995 non-null   int64              
 9   hour                   1995 non-null   int64              
 10  minute                 1995 non-null   int64              
 11  retweet_count          1995 non-null   int64            

In [25]:
min(df_picnic.created_at),max(df_picnic.created_at)

(Timestamp('2019-07-13 18:42:42+0000', tz='UTC'),
 Timestamp('2020-06-20 19:56:51+0000', tz='UTC'))

For `Picnic` we went far enough. We got data back to July 2019!

In [54]:
print("Picnic's followers", df_picnic.loc[df_picnic.shape[0]-1,'followers_count'])
print("Picnic's friends", df_picnic.loc[df_picnic.shape[0]-1,'friends_count'])

Picnic's followers 4844
Picnic's friends 5


In [62]:
df_picnic['language'].value_counts()

nl     1780
und     145
en       53
es        5
de        3
in        2
fr        2
it        1
ht        1
cy        1
sv        1
fi        1
Name: language, dtype: int64

### JumboSupermarkt

In [26]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=20, max_pages = 200)
JumboSupermarkt = miner.mine_user_tweets(user="JumboSupermarkt")
df_JumboSupermarkt = processing_and_saving(pd.DataFrame(JumboSupermarkt), "JumboSupermarkt")


In [27]:
df_JumboSupermarkt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3221 entries, 0 to 3220
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   mined_at               3221 non-null   datetime64[ns]     
 1   screen_name            3221 non-null   object             
 2   tweet_id               3221 non-null   int64              
 3   tweet_id_str           3221 non-null   object             
 4   created_at             3221 non-null   datetime64[ns, UTC]
 5   year                   3221 non-null   int64              
 6   month                  3221 non-null   int64              
 7   day                    3221 non-null   int64              
 8   day_of_week            3221 non-null   int64              
 9   hour                   3221 non-null   int64              
 10  minute                 3221 non-null   int64              
 11  retweet_count          3221 non-null   int64            

In [28]:
min(df_JumboSupermarkt.created_at),max(df_JumboSupermarkt.created_at)

(Timestamp('2020-03-06 19:12:05+0000', tz='UTC'),
 Timestamp('2020-06-20 19:51:11+0000', tz='UTC'))

In [31]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=25, max_pages = 250)
JumboSupermarkt = miner.mine_user_tweets(user="JumboSupermarkt")
df_JumboSupermarkt = processing_and_saving(pd.DataFrame(JumboSupermarkt), "JumboSupermarkt")


In [32]:
min(df_JumboSupermarkt.created_at),max(df_JumboSupermarkt.created_at)

(Timestamp('2020-03-06 19:12:05+0000', tz='UTC'),
 Timestamp('2020-06-20 19:51:11+0000', tz='UTC'))

Changing parameters does not seem to help here and we could only retrieve data from March 6th, 2020 until now for Jumbo.

In [53]:
print("Jumbo's followers", df_JumboSupermarkt.loc[df_JumboSupermarkt.shape[0]-1,'followers_count'])
print("Jumbo's friends", df_JumboSupermarkt.loc[df_JumboSupermarkt.shape[0]-1,'friends_count'])

Jumbo's followers 16211
Jumbo's friends 1710


In [56]:
df_JumboSupermarkt['language'].value_counts()

nl     3121
en       68
und      25
in        3
de        2
da        1
tl        1
Name: language, dtype: int64

### albertheijn

In [57]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=20, max_pages = 200)
albertheijn = miner.mine_user_tweets(user="albertheijn")
df_albertheijn = processing_and_saving(pd.DataFrame(albertheijn), "albertheijn")


In [66]:
df_albertheijn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3204 entries, 0 to 3203
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   mined_at               3204 non-null   datetime64[ns]     
 1   screen_name            3204 non-null   object             
 2   tweet_id               3204 non-null   int64              
 3   tweet_id_str           3204 non-null   object             
 4   created_at             3204 non-null   datetime64[ns, UTC]
 5   year                   3204 non-null   int64              
 6   month                  3204 non-null   int64              
 7   day                    3204 non-null   int64              
 8   day_of_week            3204 non-null   int64              
 9   hour                   3204 non-null   int64              
 10  minute                 3204 non-null   int64              
 11  retweet_count          3204 non-null   int64            

In [67]:
min(df_albertheijn.created_at),max(df_albertheijn.created_at)

(Timestamp('2020-04-03 15:35:47+0000', tz='UTC'),
 Timestamp('2020-06-20 18:18:50+0000', tz='UTC'))

In [68]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=20, max_pages = 300)
albertheijn = miner.mine_user_tweets(user="albertheijn")
df_albertheijn = processing_and_saving(pd.DataFrame(albertheijn), "albertheijn")


In [69]:
min(df_albertheijn.created_at),max(df_albertheijn.created_at)

(Timestamp('2020-04-03 15:35:47+0000', tz='UTC'),
 Timestamp('2020-06-20 18:18:50+0000', tz='UTC'))

Again the changing parameters didn't work. For `AH` we only succeeded in getting back to April 3th, 2020.

In [70]:
print("AH's followers", df_JumboSupermarkt.loc[df_albertheijn.shape[0]-1,'followers_count'])
print("AH's friends", df_JumboSupermarkt.loc[df_albertheijn.shape[0]-1,'friends_count'])

AH's followers 16211
AH's friends 1710


In [71]:
df_albertheijn['language'].value_counts()

nl     3096
en       74
und       7
et        5
fr        5
tr        4
fi        3
in        2
da        2
pl        1
de        1
is        1
ht        1
sv        1
es        1
Name: language, dtype: int64

## A little EDA

In [72]:
# concatenate all dataframes

df_tweet_conc = pd.concat([df_picnic,df_JumboSupermarkt, df_albertheijn])
df_tweet_conc.head()

Unnamed: 0,mined_at,screen_name,tweet_id,tweet_id_str,created_at,year,month,day,day_of_week,hour,minute,retweet_count,favorite_count,source,language,user_favourites_count,followers_count,friends_count,text
0,2020-06-21 00:29:45.875729,picnic,1150113338153742337,1150113338153742337,2019-07-13 18:42:42+00:00,2019,7,13,5,18,42,0,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",nl,3881,4844,5,"@mgrhendriks1984 Hier, Martijn! https://t.co/J..."
1,2020-06-21 00:29:45.875729,picnic,1150125973951700992,1150125973951700992,2019-07-13 19:32:55+00:00,2019,7,13,5,19,32,0,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",en,3881,4844,5,@xarinatan Nice to hear this Alex! We are glad...
2,2020-06-21 00:29:45.875729,picnic,1150341671000956928,1150341671000956928,2019-07-14 09:50:01+00:00,2019,7,14,6,9,50,0,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",nl,3881,4844,5,@remkohartman Wat een gezelligheid zeg!🥰 Hopel...
3,2020-06-21 00:29:45.875729,picnic,1150346698012725248,1150346698012725248,2019-07-14 10:09:59+00:00,2019,7,14,6,10,9,0,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",und,3881,4844,5,@remkohartman https://t.co/2gqMsJJAHu
4,2020-06-21 00:29:45.875729,picnic,1150452684107014144,1150452684107014144,2019-07-14 17:11:08+00:00,2019,7,14,6,17,11,0,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",nl,3881,4844,5,"@joost_akker Ah, dat is inderdaad wat vervelen..."


In [74]:
df_tweet_conc.shape[0] == df_picnic.shape[0]+df_JumboSupermarkt.shape[0]+df_albertheijn.shape[0]

True

### Number of followers and friends 

### Languages of messages

# Applying GetSearch to search for a defined query

As seem previously in this notebook making a search in Twitter using `@picnic @JumboSupermarkt @albertheijn covid-19` results in `q=%40picnic%20%40JumboSupermarkt%20%40albertheijn%20covid-19&src=typed_query`.

In [75]:
raw_query = "q=%40picnic%20%40JumboSupermarkt%20%40albertheijn%20covid-19&src=typed_query"
search_picnic_jumbo_ah_covid = TweetMiner.search_tweets(max_pages = 15, count = 20, raw_query = raw_query, result_type = 'mixed')

In [76]:
len(search_picnic_jumbo_ah_covid)

0

In [14]:
search_bolsonaro_covid[0]

{'mined_at': datetime.datetime(2020, 6, 20, 23, 28, 8, 65310),
 'created_at': 'Sat Jun 20 20:26:50 +0000 2020',
 'tweet_id': 1274438583731064835,
 'tweet_id_str': '1274438583731064835',
 'in_reply_to_screen_name': 'secomvc',
 'in_reply_to_status_id': 1274416731453259779,
 'in_reply_to_user_id': 1158389772920020993,
 'language': 'pt',
 'text': '@secomvc @jairbolsonaro @minsaude @funaioficial @DefesaGovBr @anvisa_oficial @MinEconomia É urgente q o Ministério da Saúde cumpra sua função de liderar a crise do Corona\nEle tem q FALAR c/ a população, unir estados e municípios p/ q juntos evitem a disseminação\nEsse 👇 é um triste exemplo de q as pessoas NÃO podem ficar doentes AO MESMO TEMPO\n\nhttps://t.co/TCaK16LAEB',
 'hashtags': [],
 'source': '<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>',
 'screen_name': 'SandraMTRibeir1',
 'user_tweet_id': 1101575981356236803,
 'user_tweet_id_str': '1101575981356236803',
 'user_favourites_count': 1455,
 'followers_co

In [15]:
df_search_bolsonaro_corona = processing_and_saving(pd.DataFrame(search_bolsonaro_covid),"search_bolsonaro_corona",mine_user_twitter=0)

In [16]:
df_search_bolsonaro_corona.tail()

Unnamed: 0,mined_at,tweet_id,tweet_id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,hashtags,source,language,created_at,...,day_of_week,hour,minute,screen_name,user_tweet_id,user_tweet_id_str,user_favourites_count,followers_count,friends_count,text
220,2020-06-20 23:28:08.696027,1274438583731064835,1274438583731064835,secomvc,1.274417e+18,1.15839e+18,[],"<a href=""http://twitter.com/#!/download/ipad"" ...",pt,2020-06-20 20:26:50+00:00,...,5,20,26,SandraMTRibeir1,1101575981356236803,1101575981356236803,1455,20,342,@secomvc @jairbolsonaro @minsaude @funaioficia...
221,2020-06-20 23:28:11.781770,1274438583731064835,1274438583731064835,secomvc,1.274417e+18,1.15839e+18,[],"<a href=""http://twitter.com/#!/download/ipad"" ...",pt,2020-06-20 20:26:50+00:00,...,5,20,26,SandraMTRibeir1,1101575981356236803,1101575981356236803,1455,20,342,@secomvc @jairbolsonaro @minsaude @funaioficia...
222,2020-06-20 23:28:09.875278,1274438583731064835,1274438583731064835,secomvc,1.274417e+18,1.15839e+18,[],"<a href=""http://twitter.com/#!/download/ipad"" ...",pt,2020-06-20 20:26:50+00:00,...,5,20,26,SandraMTRibeir1,1101575981356236803,1101575981356236803,1455,20,342,@secomvc @jairbolsonaro @minsaude @funaioficia...
223,2020-06-20 23:28:11.514203,1274438583731064835,1274438583731064835,secomvc,1.274417e+18,1.15839e+18,[],"<a href=""http://twitter.com/#!/download/ipad"" ...",pt,2020-06-20 20:26:50+00:00,...,5,20,26,SandraMTRibeir1,1101575981356236803,1101575981356236803,1455,20,342,@secomvc @jairbolsonaro @minsaude @funaioficia...
224,2020-06-20 23:28:08.065310,1274438583731064835,1274438583731064835,secomvc,1.274417e+18,1.15839e+18,[],"<a href=""http://twitter.com/#!/download/ipad"" ...",pt,2020-06-20 20:26:50+00:00,...,5,20,26,SandraMTRibeir1,1101575981356236803,1101575981356236803,1455,20,342,@secomvc @jairbolsonaro @minsaude @funaioficia...


In [17]:
df_search_bolsonaro_corona.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 225 entries, 0 to 224
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   mined_at                 225 non-null    datetime64[ns]     
 1   tweet_id                 225 non-null    int64              
 2   tweet_id_str             225 non-null    object             
 3   in_reply_to_screen_name  90 non-null     object             
 4   in_reply_to_status_id    90 non-null     float64            
 5   in_reply_to_user_id      90 non-null     float64            
 6   hashtags                 225 non-null    object             
 7   source                   225 non-null    object             
 8   language                 225 non-null    object             
 9   created_at               225 non-null    datetime64[ns, UTC]
 10  year                     225 non-null    int64              
 11  month                    225 non

REFS:

> https://python-twitter.readthedocs.io/en/latest/index.html

> https://developer.twitter.com/en/docs

Info:

1st corona case in The Netherlands: 27/02/2020

1st corona case in Brazil: 26/02/2020


**TODO**

- Check functions to see if I have all I need.
- Generalize as much as possible
- Order infor per date (created_at?)
- Should I have a function also for search?
- Retrieve info for:
    - jairbolsonaro
    - picnic
    - JumboSupermarkt
    - albertheijn
    - evanescence
    - WTofficial
    
- Include in class `TweetMiner` a function to user `GetSearch`

Using [`python-twitter`](https://python-twitter.readthedocs.io/en/latest/getting_started.html) since I've been facing some time out problems with [`Tweepy`](http://docs.tweepy.org/en/latest/getting_started.html).

In [None]:
# !pip install python-twitter

# Load packages

# Setting up twitter authentication

# Functions

Function `mine_user_tweets` made use of [Get user_timeline](https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline)


In [None]:
import datetime
#TweetMiner function from Mike Roman

# some modification added by me - 15/04/19

class TweetMiner(object):

    
    def __init__(self, api, result_limit = 20, max_pages = 40):
        """result_limit = count that can take max 200"""
        
        self.api = api        
        self.result_limit = result_limit
        self.max_pages = max_pages
        

    def mine_user_tweets(self, user, mine_retweets=False):
        """ Mine tweets of user = screen_name or user_id"""

        data           =  []
        last_tweet_id  =  False
        page           =  1
        
        while page <= self.max_pages:
            
            if last_tweet_id:
                statuses   =   self.api.GetUserTimeline(screen_name=user, count=self.result_limit, max_id=last_tweet_id - 1, 
                                                        include_rts=mine_retweets)
                statuses = [ _.AsDict() for _ in statuses]
            else:
                statuses   =   self.api.GetUserTimeline(screen_name=user, count=self.result_limit, include_rts=mine_retweets)
                statuses = [_.AsDict() for _ in statuses]
                
            for item in statuses:
                # Using try except here.
                # When retweets = 0 we get an error (GetUserTimeline fails to create a key, 'retweet_count')
                try:
                    mined = {
                        'tweet_id':        item['id'],
                        'handle':          item['user']['screen_name'],
                        'retweet_count':   item['retweet_count'],
                        'text':            item['full_text'],
                        'mined_at':        datetime.datetime.now(),
                        'created_at':      item['created_at'],
                        'favorite_count':  item['favorite_count'],
                        'tweet_id_str':    item['id_str'],
                        'source':          item['source'],
                        
                    }
                    
            
                
                except:
                        mined = {
                        'tweet_id':        item['id'],
                        'handle':          item['user']['screen_name'],
                        'retweet_count':   0,
                        'text':            item['full_text'],
                        'mined_at':        datetime.datetime.now(),
                        'created_at':      item['created_at'],
#                         'favorite_count':  item['favorite_count'],
                        'tweet_id_str':    item['id_str'],
                        'source':          item['source'],
                    }
                
                last_tweet_id = item['id']
                data.append(mined)
                
            page += 1
            
        return data

In [None]:
def processing_and_saving(df, file_name):
    
    TodaysDate = time.strftime("%Y-%m-%d-%H-%M")

    
    # Create columns 'year', 'month', 'day', 'hour', 'min' from 'created_at'
    df['created_at'] = pd.to_datetime(df['created_at'])
        
    df['year'] = df['created_at'].dt.year 
    df['month'] = df['created_at'].dt.month 
    df['day'] = df['created_at'].dt.day 
    df['hour'] = df['created_at'].dt.hour 
    df['minute'] = df['created_at'].dt.minute
    df['day_of_week'] = df['created_at'].dt.weekday
    
    df.sort_values(by='created_at',inplace = True)
    
    df = df[['mined_at', 'handle','tweet_id', 'tweet_id_str','created_at', 
             'year', 'month', 'day','day_of_week', 'hour', 'minute','retweet_count', 'source', 'text']]
    
    df.to_csv("./data_tweets/"+file_name+"_"+TodaysDate+".csv", index = False)
    
    return df
    
    

# Getting twitter by user

## Instantiate the class
---

Make sure you pass the keys dictionary and the api as arguments.

**Check:** call the object's `mine_user_tweets()` method, providing a user to pull the tweets of.

In [None]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=20, max_pages = 200)

In [None]:
jairbolsonaro = miner.mine_user_tweets(user="jairbolsonaro")

In [None]:
for x in range(5):
    print(jairbolsonaro[x]['text'])
    print('--')

## Convert the tweet ouputs to a pandas DataFrame
### jairbolsonaro

In [None]:
jairbolsonaro

In [None]:
df_bolsonaro = processing_and_saving(pd.DataFrame(jairbolsonaro), 'jairbolsonaro')

In [None]:
df_bolsonaro.head()

In [None]:
df_bolsonaro.info()

In [None]:
min(df_bolsonaro.created_at),max(df_bolsonaro.created_at)

In [None]:
df_bolsonaro.sort_values(by='created_at')

### picnic

In [None]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=20, max_pages = 100)
picnic = miner.mine_user_tweets(user="picnic")
df_picnic = processing_and_saving(pd.DataFrame(picnic), "picnic")


In [None]:
df_picnic.info()

In [None]:
min(df_picnic.created_at),max(df_picnic.created_at)

### JumboSupermarkt

In [None]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=20, max_pages = 200)
JumboSupermarkt = miner.mine_user_tweets(user="JumboSupermarkt")
df_JumboSupermarkt = processing_and_saving(pd.DataFrame(JumboSupermarkt), "JumboSupermarkt")


In [None]:
df_JumboSupermarkt.info()

In [None]:
min(df_JumboSupermarkt.created_at),max(df_JumboSupermarkt.created_at)

### albertheijn

In [None]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=20, max_pages = 300)
albertheijn = miner.mine_user_tweets(user="albertheijn")
df_albertheijn = processing_and_saving(pd.DataFrame(albertheijn), "albertheijn")


In [None]:
df_albertheijn.info()

In [None]:
min(df_albertheijn.created_at),max(df_albertheijn.created_at)

### evanescence

In [None]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=20, max_pages = 300)
evanescence = miner.mine_user_tweets(user="evanescence")
df_evanescence = processing_and_saving(pd.DataFrame(evanescence), "evanescence")


In [None]:
df_evanescence.info()

In [None]:
min(df_evanescence.created_at),max(df_evanescence.created_at)

In [None]:
df_evanescence.tail()

### Within Temptation - WTofficial

In [None]:
# Result limit == count parameter from our GetUserTimeline() it can take max 200
# More pages more back in time you can go
miner = TweetMiner(api, result_limit=20, max_pages = 300)
WTofficial = miner.mine_user_tweets(user="WTofficial")
df_WTofficial = processing_and_saving(pd.DataFrame(WTofficial), "WTofficial")


In [None]:
df_WTofficial.info()

In [None]:
min(df_WTofficial.created_at),max(df_WTofficial.created_at)

# Experimenting with GetSearch

In [None]:
def search_tweets(max_pages = 20, count = 20, raw_query = None, result_type = 'mixed'):
        """ Search tweets """

        data           =  []
        last_tweet_id  =  False
        page           =  1
        
        while page <= max_pages:
            
            if last_tweet_id:
                statuses = api.GetSearch(raw_query=raw_query, count = count, result_type=result_type, max_id=last_tweet_id - 1)
                statuses = [ _.AsDict() for _ in statuses]
            else:
                statuses = api.GetSearch(raw_query=raw_query, count = count, result_type=result_type)
                statuses = [_.AsDict() for _ in statuses]
                
            for item in statuses:
                # Using try except here.
                # When retweets = 0 we get an error (GetUserTimeline fails to create a key, 'retweet_count')
                try:
                    mined = {
                        'tweet_id':        item['id'],
                        'handle':          item['user']['screen_name'],
                        'retweet_count':   item['retweet_count'],
                        'text':            item['full_text'],
                        'mined_at':        datetime.datetime.now(),
                        'created_at':      item['created_at'],
                        'favorite_count':  item['favorite_count'],
                        'tweet_id_str':    item['id_str'],
                        'source':          item['source'],
                        
                    }
                    
            
                
                except:
                        mined = {
                        'tweet_id':        item['id'],
                        'handle':          item['user']['screen_name'],
                        'retweet_count':   0,
                        'text':            item['full_text'],
                        'mined_at':        datetime.datetime.now(),
                        'created_at':      item['created_at'],
#                         'favorite_count':  item['favorite_count'],
                        'tweet_id_str':    item['id_str'],
                        'source':          item['source'],
                    }
                
                last_tweet_id = item['id']
                data.append(mined)
                
            page += 1
            
        return data

In [None]:
raw_query="q=jairbolsonaro%2C%20corona%2C%20covid&src=typed_query"
results = search_tweets(max_pages = 15, count = 20, raw_query = raw_query, result_type = 'mixed')

In [None]:
len(results)

In [None]:
results[0]

In [None]:
df_search_bolsonaro_corona = processing_and_saving(pd.DataFrame(results),"search_bolsonaro_corona")

In [None]:
df_search_bolsonaro_corona.tail()

In [None]:
df_search_bolsonaro_corona.info()

In [None]:
results = api.GetSearch(raw_query="q=picnic", count = 100, result_type='popular')

In [None]:
results

In [None]:
raw_query = "q=picnic%20jumbo%20ah%20delivery&src=typed_query"

picnic_ah_jumbo_search = search_tweets(max_pages = 1, count = 5, raw_query = raw_query, result_type = 'mixed')
picnic_ah_jumbo_search

In [None]:
df_picnic_ah_jumbo_search = processing_and_saving(pd.DataFrame(picnic_ah_jumbo_search),"picnic_ah_jumbo_search")

In [None]:
df_picnic_ah_jumbo_search