# Twitter scraping using Tweepy

To install: `pip install tweepy`

In [8]:
import tweepy
from configparser import ConfigParser
import pandas as pd
import os.path as path

## Load credentials

In [3]:
cp = ConfigParser()
cp.read('../.secret/credentials.ini')

['../.secret/credentials.ini']

## Get companies' Twitter ids

Load the Twitter handles for companies.

In [4]:
DATA_DIR = "../data/"

In [12]:
companies_twitter_handles_uk = pd.read_csv(path.join(DATA_DIR, 'companies_twitter_handles_uk.csv'), header=None)
companies_twitter_handles_uk.columns = ['twitter_handle', 'ticker', 'sector']
companies_twitter_handles_uk = companies_twitter_handles_uk.append(
    {'twitter_handle': 'UKGamesWorkshop'}, ignore_index=True
)
companies_twitter_handles_uk_list = list(companies_twitter_handles_uk['twitter_handle'].values)

In [10]:
companies_twitter_handles_uk.head()

Unnamed: 0,twitter_handle,ticker,sector
0,3i,III,FinancialServices
1,3iInfrastructure,3IN,
2,888Holdings,888,
3,AA,AA.,
4,AberforthSmallerCompaniesTrust,ASL,


Create a Tweepy authentication handler.

In [17]:
auth = tweepy.OAuthHandler(
    cp['emas_twitter_credentials']['consumer_key'],
    cp['emas_twitter_credentials']['consumer_secret'])
auth.set_access_token(
    cp['emas_twitter_credentials']['access_token'],
    cp['emas_twitter_credentials']['access_token_secret']
)

Insantiate a Tweepy API object (a wrapper for the Twitter API).

In [19]:
api = tweepy.API(auth_handler=auth)

Get users' (companies') ids given the handles.

In [23]:
api.get_user(companies_twitter_handles_uk_list[0]).id

18563305

In [25]:
twitter_ids_dict = {}

In [26]:
def get_users_ids(api, twitter_handles_list, twitter_ids_dict, n_users=None):
    """
    PARAMS
    ------
        api: Tweepy API object.
        twitter_handles_list: list of Twitter handles (screen names)
        twitter_ids_dict: dictionary to which to append the new {'handle': 'id'}
            key-value pairs
        n_users: maximum number of users of which to fetch the ids. Default: None
         (i.e. range(twitter_handles_list)).
    """
    if not n_users:
        n_users = len(twitter_handles_list)
        
    print(f"Fetching {n_users} ids")
    
    for i in range(n_users):
        twitter_handle = twitter_handles_list[i]
        
        print(f"Fetching id: @{twitter_handle}")
        
        try:
            twitter_id = api.get_user(twitter_handle).id
            
            twitter_ids_dict.update(
                {twitter_handle: twitter_id}
            )
        except Exception as e:
            print(e)

In [27]:
get_users_ids(api, companies_twitter_handles_uk_list, twitter_ids_dict, n_users=10)

Fetching 10 ids
Fetching id: @3i
Fetching id: @3iInfrastructure
[{'code': 50, 'message': 'User not found.'}]
Fetching id: @888Holdings
Fetching id: @AA
Fetching id: @AberforthSmallerCompaniesTrust
[{'code': 50, 'message': 'User not found.'}]
Fetching id: @AdmiralGroup
Fetching id: @Aggreko
Fetching id: @AllianceTrust
Fetching id: @Amigo
Fetching id: @AngloAmerican


In [29]:
twitter_ids_dict['3i']

18563305

## Printing rate limits

In [30]:
def print_rate_limits(api):
    rate_limit_status = api.rate_limit_status()
    
    for key in rate_limit_status['resources'].keys():
        for endpoint in rate_limit_status['resources'][key].keys():
            limit = rate_limit_status['resources'][key][endpoint]['limit']
            remaining = rate_limit_status['resources'][key][endpoint]['remaining']
            
            if limit!=remaining:
                print(endpoint)
                print(rate_limit_status['resources'][key][endpoint])

In [72]:
print_rate_limits(api)

/application/rate_limit_status
{'limit': 180, 'remaining': 179, 'reset': 1540474516}


## Fetching tweets

For a full list of what's in a `tweepy.models.Status` object, see: https://gist.github.com/dev-techmoe/ef676cdd03ac47ac503e856282077bf2

In [99]:
tweets_cols = [
    'twitter_id',
    'created_at',
    'text',
    'user_id',
    'twitter_handle',
    'is_retweeted',
    'retweet_count',
    'favorite_count'
]

tweets_df = pd.DataFrame(columns=tweets_cols)

In [104]:
twitter_timeline = api.user_timeline(
    id=twitter_ids_dict['888Holdings'],
    count=100,
    since_id=None
)

In [105]:
for tweet in twitter_timeline:
    tweet_dict = {
        'twitter_id': tweet.id_str,
        'created_at': tweet.created_at,
        'text': tweet.text,
        'user_id': tweet.user.id_str,
        'twitter_handle': tweet.user.screen_name,
        'is_retweeted': tweet.retweeted,
        'retweet_count': tweet.retweet_count,
        'favorite_count': tweet.favorite_count
    }
    tweets_df = tweets_df.append(tweet_dict, ignore_index=True)

In [106]:
tweets_df

Unnamed: 0,twitter_id,created_at,text,user_id,twitter_handle,is_retweeted,retweet_count,favorite_count
0,63599074531749889,2011-04-28 13:42:55,Win Win Win at 888casino,203915171,888Holdings,False,1,0
1,63599074531749889,2011-04-28 13:42:55,Win Win Win at 888casino,203915171,888Holdings,False,1,0
2,63563315896532992,2011-04-28 11:20:50,Don't worry about the Royal Wedding - win a ho...,203915171,888Holdings,False,0,0
3,63137741629698049,2011-04-27 07:09:45,http://t.co/qiKmh9y,203915171,888Holdings,False,0,0
4,51976748086214657,2011-03-27 11:59:57,Play 888casino’s Live Casino and you could win...,203915171,888Holdings,False,0,0
5,51975912245301248,2011-03-27 11:56:37,Play Live Casino through LiveDealer.org for an...,203915171,888Holdings,False,0,1
6,51974680638267392,2011-03-27 11:51:44,365 Free Spins Still Available!,203915171,888Holdings,False,0,0
7,46451127146905601,2011-03-12 06:03:06,"UK: 365 Free Spins Absolutely FREE, for the ch...",203915171,888Holdings,False,0,0
8,43213827940155393,2011-03-03 07:39:14,Get $13 FREE upon your first deposit: http://w...,203915171,888Holdings,False,0,0
9,43213111943106560,2011-03-03 07:36:23,Like Live Dealer? \n$13 Free upon First Deposi...,203915171,888Holdings,False,0,0
