# Twitter scraping using Tweepy

To install: `pip install tweepy`

In [30]:
import tweepy
from configparser import ConfigParser
import pandas as pd
import os.path as path
import json

## Load credentials

In [2]:
# Parse the INI file that holds the Twitter API credentials.
# The read() call is kept as the last expression of the cell so that the
# list of files it returns is displayed.
CREDENTIALS_PATH = '../.secret/credentials.ini'

cp = ConfigParser()
cp.read(CREDENTIALS_PATH)

['../.secret/credentials.ini']

## Get companies' Twitter ids

Load the Twitter handles for companies.

In [3]:
# Relative directory for this notebook's data files; the (currently disabled)
# JSON dump of the handle -> id mapping joins its file name onto this path.
DATA_DIR = "../data/data_tweepy/"

In [4]:
# Screen names (without the leading '@') whose numeric Twitter ids are
# looked up further down via api.get_user().
twitter_handles_list = [
    'dpalmisano',
    'Intel',
    'AMD',
    'nvidia'
]

Create a Tweepy authentication handler.

In [5]:
# All four secrets live in the same section of the credentials file, so the
# section is looked up once and reused.
credentials = cp['emas_twitter_credentials']

# The consumer key/secret go to the handler's constructor; the access token
# and its secret are attached to the handler afterwards.
auth = tweepy.OAuthHandler(
    credentials['consumer_key'],
    credentials['consumer_secret'],
)
auth.set_access_token(
    credentials['access_token'],
    credentials['access_token_secret'],
)

Instantiate a Tweepy API object (a wrapper for the Twitter API).

In [6]:
# API client authenticated with the handler built above; the user lookups,
# rate-limit query and timeline fetch in this notebook are made through it.
api = tweepy.API(auth_handler=auth)

Get users' (companies') ids given the handles.

In [7]:
# Sanity check: resolve the first handle to its numeric id.
# The handle is passed explicitly as `screen_name` so that it can never be
# mistaken for a numeric user id, and so the call also works on Tweepy
# versions where get_user() only accepts keyword arguments.
api.get_user(screen_name=twitter_handles_list[0]).id

14656799

In [8]:
# Accumulator for {handle: numeric id}; get_users_ids() fills it in place.
twitter_ids_dict = {}

In [9]:
def get_users_ids(api, twitter_handles_list, twitter_ids_dict, n_users=None):
    """
    Look up the numeric Twitter id of each handle and store it in a dictionary.

    The lookup is best-effort: if fetching one handle fails, the exception is
    printed and the remaining handles are still processed.

    PARAMS
    ------
        api: Tweepy API object (anything exposing
            ``get_user(screen_name=...)`` whose result has an ``id`` attribute).
        twitter_handles_list: list of Twitter handles (screen names)
        twitter_ids_dict: dictionary updated in place with the new
            {handle: id} key-value pairs
        n_users: maximum number of users of which to fetch the ids. Default: None
            (i.e. every handle in twitter_handles_list). Values larger than the
            list are capped at its length; 0 or a negative value fetches nothing.

    RETURNS
    -------
        None. Results are written into twitter_ids_dict.
    """
    n_available = len(twitter_handles_list)

    # `is None` (rather than a truthiness test) keeps an explicit n_users=0
    # meaning "fetch nothing" instead of silently falling back to "fetch all".
    if n_users is None:
        n_users = n_available
    else:
        # Clamp so an oversized request cannot index past the end of the list
        # and a negative one cannot wrap around when slicing.
        n_users = max(0, min(n_users, n_available))

    print(f"Fetching {n_users} ids")

    for twitter_handle in twitter_handles_list[:n_users]:
        print(f"Fetching id: @{twitter_handle}")

        try:
            # Keyword form: an all-digit handle must not be treated as a
            # numeric user id, and newer Tweepy releases require keywords.
            twitter_ids_dict[twitter_handle] = api.get_user(
                screen_name=twitter_handle
            ).id
        except Exception as e:
            # Deliberately broad: one failing handle must not abort the rest.
            print(e)

In [11]:
# Fill twitter_ids_dict in place; n_users is left at its default, so every
# handle in the list is looked up.
get_users_ids(api, twitter_handles_list, twitter_ids_dict)

Fetching 4 ids
Fetching id: @dpalmisano
Fetching id: @Intel
Fetching id: @AMD
Fetching id: @nvidia


In [12]:
# Show the handle -> id mapping collected by get_users_ids().
twitter_ids_dict

{'dpalmisano': 14656799, 'Intel': 2803191, 'AMD': 14861876, 'nvidia': 61559439}

In [31]:
# Optional: persist the handle -> id mapping as JSON under DATA_DIR.
# Left commented out, so nothing is written when this cell runs.
# with open(path.join(DATA_DIR, 'twitter_ids_dict.json'), 'w') as f:
#     json.dump(twitter_ids_dict, f)

## Printing rate limits

In [17]:
def print_rate_limits(api):
    """
    Print the rate-limit status of every endpoint whose quota has been touched.

    Queries ``api.rate_limit_status()`` and walks its ``'resources'`` mapping
    (resource family -> endpoint -> status dict). An endpoint is reported,
    as its name followed by its full status dict, only when its ``'limit'``
    and ``'remaining'`` values differ.

    PARAMS
    ------
        api: Tweepy API object.
    """
    resources = api.rate_limit_status()['resources']

    for endpoints in resources.values():
        for endpoint, status in endpoints.items():
            # Untouched quota: nothing worth reporting for this endpoint.
            if status['limit'] == status['remaining']:
                continue

            print(endpoint)
            print(status)

In [29]:
# Report only the endpoints whose remaining quota differs from their limit.
print_rate_limits(api)

/application/rate_limit_status
{'limit': 180, 'remaining': 177, 'reset': 1540477717}
/users/show/:id
{'limit': 900, 'remaining': 895, 'reset': 1540477555}
/statuses/user_timeline
{'limit': 900, 'remaining': 898, 'reset': 1540477760}


## Fetching tweets

For a full list of what's in a `tweepy.models.Status` object, see: https://gist.github.com/dev-techmoe/ef676cdd03ac47ac503e856282077bf2

In [32]:
# Column layout of the tweets table. The per-tweet dict built further down
# uses exactly these keys.
# NOTE: 'twitter_id' is filled from tweet.id (the id of the tweet itself),
# while the account's id goes into 'user_id'; 'is_retweeted' is filled from
# tweet.retweeted.
tweets_cols = [
    'twitter_id',
    'created_at',
    'text',
    'user_id',
    'twitter_handle',
    'is_retweeted',
    'retweet_count',
    'favorite_count'
]

# Empty frame with the columns above, to be filled with one row per tweet.
tweets_df = pd.DataFrame(columns=tweets_cols)

In [33]:
# Fetch up to `count` of the most recent tweets from one account's timeline.
# The account is selected with the explicit `user_id` keyword: the generic
# `id` keyword is ambiguous (numeric id or screen name) and is not accepted
# by newer Tweepy releases.
# since_id=None places no lower bound on the tweet ids returned.
twitter_timeline = api.user_timeline(
    user_id=twitter_ids_dict['nvidia'],
    count=200,
    since_id=None
)

In [34]:
# Flatten each Status object into a plain record holding only the fields
# listed in tweets_cols.
tweet_records = [
    {
        'twitter_id': tweet.id,
        'created_at': tweet.created_at,
        'text': tweet.text,
        'user_id': tweet.user.id_str,
        'twitter_handle': tweet.user.screen_name,
        'is_retweeted': tweet.retweeted,
        'retweet_count': tweet.retweet_count,
        'favorite_count': tweet.favorite_count
    }
    for tweet in twitter_timeline
]

# Build the frame once from all records instead of appending row by row:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration. Rebuilding from the records also makes this cell safe to
# re-run without duplicating rows. `columns=` pins the column order (and
# keeps the columns present even if the timeline is empty).
tweets_df = pd.DataFrame(tweet_records, columns=tweets_cols)

In [35]:
# Preview only the first rows rather than dumping the whole frame, so the
# saved output stays small and readable.
tweets_df.head(10)

Unnamed: 0,twitter_id,created_at,text,user_id,twitter_handle,is_retweeted,retweet_count,favorite_count
0,1055228806972096512,2018-10-24 22:45:27,RT @NvidiaAI: We’re honored NVIDIA’s “I am AI”...,61559439,nvidia,False,13,0
1,1055228222000885762,2018-10-24 22:43:07,RT @NVIDIAEmbedded: Agriculture is ripe for AI...,61559439,nvidia,False,21,0
2,1055143286900441088,2018-10-24 17:05:37,"In a keynote at #GTC18 DC, U.S. Chief Informat...",61559439,nvidia,False,19,53
3,1055127392170070016,2018-10-24 16:02:28,RT @NvidiaAI: .@SuzetteKent45 discusses how wi...,61559439,nvidia,False,17,0
4,1054855701074370560,2018-10-23 22:02:52,RT @NVIDIADC: AI continues making giant leaps ...,61559439,nvidia,False,28,0
5,1054855477065015296,2018-10-23 22:01:58,"RT @NVIDIADC: For healthcare, this is the next...",61559439,nvidia,False,47,0
6,1054839314373271552,2018-10-23 20:57:45,RT @NVIDIAAIDev: NVIDIA GPU Cloud reduces the ...,61559439,nvidia,False,43,0
7,1054802321840988166,2018-10-23 18:30:45,.@ChaosGroup‘s latest experiments with @NVIDIA...,61559439,nvidia,False,53,381
8,1054798324581179392,2018-10-23 18:14:52,Join the #GTC18 DC Keynote by Federal CIO Suze...,61559439,nvidia,False,9,38
9,1054795137585270785,2018-10-23 18:02:12,RT @NvidiaAI: Announced at #GTC18 in Washingto...,61559439,nvidia,False,35,0
