# Twitter scraping using Tweepy

To install: `pip install tweepy`

In [1]:
import tweepy
from configparser import ConfigParser
import pandas as pd
import os.path as path
import json

## Load credentials

In [2]:
cp = ConfigParser()
cp.read('../.secret/credentials.ini')

['../.secret/credentials.ini']

## Get companies' Twitter ids

Load the Twitter handles for companies.

In [3]:
DATA_DIR = "../data/data_tweepy/"

In [4]:
twitter_handles_list = [
    'dpalmisano',
    'Intel',
    'AMD',
    'nvidia'
]

Create a Tweepy authentication handler.

In [29]:
auth = tweepy.OAuthHandler(
    cp['emas_twitter_credentials']['consumer_key'],
    cp['emas_twitter_credentials']['consumer_secret'])
auth.set_access_token(
    cp['emas_twitter_credentials']['access_token'],
    cp['emas_twitter_credentials']['access_token_secret']
)

Insantiate a Tweepy API object (a wrapper for the Twitter API).

In [30]:
api = tweepy.API(auth_handler=auth)

Get users' (companies') ids given the handles.

In [7]:
api.get_user(twitter_handles_list[0]).id

14656799

In [8]:
twitter_ids_dict = {}

In [9]:
def get_users_ids(api, twitter_handles_list, twitter_ids_dict, n_users=None):
    """
    PARAMS
    ------
        api: Tweepy API object.
        twitter_handles_list: list of Twitter handles (screen names)
        twitter_ids_dict: dictionary to which to append the new {'handle': 'id'}
            key-value pairs
        n_users: maximum number of users of which to fetch the ids. Default: None
         (i.e. range(twitter_handles_list)).
    """
    if not n_users:
        n_users = len(twitter_handles_list)
        
    print(f"Fetching {n_users} ids")
    
    for i in range(n_users):
        twitter_handle = twitter_handles_list[i]
        
        print(f"Fetching id: @{twitter_handle}")
        
        try:
            twitter_id = api.get_user(twitter_handle).id
            
            twitter_ids_dict.update(
                {twitter_handle: twitter_id}
            )
        except Exception as e:
            print(e)

In [10]:
get_users_ids(api, twitter_handles_list, twitter_ids_dict)

Fetching 4 ids
Fetching id: @dpalmisano
Fetching id: @Intel
Fetching id: @AMD
Fetching id: @nvidia


In [11]:
twitter_ids_dict

{'dpalmisano': 14656799, 'Intel': 2803191, 'AMD': 14861876, 'nvidia': 61559439}

In [31]:
# with open(path.join(DATA_DIR, 'twitter_ids_dict.json'), 'w') as f:
#     json.dump(twitter_ids_dict, f)

## Printing rate limits

In [12]:
def print_rate_limits(api):
    rate_limit_status = api.rate_limit_status()
    
    for key in rate_limit_status['resources'].keys():
        for endpoint in rate_limit_status['resources'][key].keys():
            limit = rate_limit_status['resources'][key][endpoint]['limit']
            remaining = rate_limit_status['resources'][key][endpoint]['remaining']
            
            if limit!=remaining:
                print(endpoint)
                print(rate_limit_status['resources'][key][endpoint])

In [13]:
print_rate_limits(api)

/application/rate_limit_status
{'limit': 180, 'remaining': 179, 'reset': 1540897987}
/users/show/:id
{'limit': 900, 'remaining': 895, 'reset': 1540897970}


## Fetching tweets

For a full list of what's in a `tweepy.models.Status` object, see: https://gist.github.com/dev-techmoe/ef676cdd03ac47ac503e856282077bf2

In [155]:
tweets_cols = [
    'twitter_id',
    'created_at',
    'text',
    'user_id',
    'twitter_handle',
    'is_retweet',
    'retweet_count',
    'favorite_count'
]

tweets_df = pd.DataFrame(columns=tweets_cols)

In [156]:
twitter_timeline = api.user_timeline(
    id=twitter_ids_dict['nvidia'],
    count=200,
    tweet_mode="extended",
    since_id=None,
    include_rts=1,
    exclude_replies=1
)

In [157]:
for tweet in twitter_timeline:
    tweet_dict = {
        'twitter_id': tweet.id,
        'created_at': tweet.created_at,
        'user_id': tweet.user.id_str,
        'twitter_handle': tweet.user.screen_name,
        'is_retweet': str(tweet.retweeted),
        'retweet_count': tweet.retweet_count,
        'favorite_count': tweet.favorite_count
    }
    try:
        tweet_dict['text'] = tweet._json['retweeted_status']['full_text']
    except:
        tweet_dict['text'] = tweet.full_text
        
    tweets_df = tweets_df.append(tweet_dict, ignore_index=True)

In [150]:
pd.set_option('max_colwidth',300)

In [158]:
tweets_df

Unnamed: 0,twitter_id,created_at,text,user_id,twitter_handle,is_retweet,retweet_count,favorite_count
0,1056996903785623552,2018-10-29 19:51:14,"This week, @NVIDIA researchers will present their paper at the Conference on Robot Learning #CoRL18, allowing a robot to perceive household objects and put smart picking within grasp. https://t.co/oIPJtSA6aj https://t.co/l24G9iDSWg",61559439,nvidia,False,54,0
1,1055958243028066304,2018-10-26 23:03:58,"Today is the debut of @Livermore_Lab's Sierra, one of the world’s fastest #supercomputers. Powered by NVIDIA Tesla V100 GPUs, learn what this impressive machine will do to help solve national security and energy challenges. https://t.co/WjgxVxu3B2 https://t.co/gduzYlyQd7",61559439,nvidia,False,53,0
2,1055890108451192832,2018-10-26 18:33:14,"SF-based startup, @motionloft, and #NVIDIAJetson are helping brick-and-mortar retailers measure foot traffic in more than 1,500 locations, offering valuable insights for retail and city planning. https://t.co/AHpaxpwMUI https://t.co/bowQaUQhQz",61559439,nvidia,False,24,0
3,1055889820499636224,2018-10-26 18:32:05,"Finding the perfect place to stay when you’re traveling can be a daunting task. To help users get the best suggestions, @Airbnb developers trained a #deeplearning algorithm on @NVIDIA #GPUs to supercharge the company's search rankings. https://t.co/wtpfwLmGR3 https://t.co/5JpCtCnjuD",61559439,nvidia,False,10,0
4,1055564757971812352,2018-10-25 21:00:24,".@aivatechnology, the company behind the world’s first non-human AI music composition, just released a brand new AI-generated rock song trained and generated on @NVIDIA #GPUs. https://t.co/eBxXxxGkqG https://t.co/wcCCOFTJn6",61559439,nvidia,False,92,0
5,1055502591658868737,2018-10-25 16:53:22,"#GTC18, the largest AI event it the DC area, brought together thousand of agency leaders, congressional staff, entrepreneurs, developers and startups. Learn top insights from the show on the massive opportunity of #AI: https://t.co/7MqDkEIYcp https://t.co/2zYcI1E541",61559439,nvidia,False,29,90
6,1055494200634949633,2018-10-25 16:20:02,NVIDIA and @RedHat are simplifying #AI development and manageability. @TonyPaikeday discusses the benefits and implications of the certification of DGX-1 for #RHEL: https://t.co/YnBQYRxwqz https://t.co/JyFNlMAqzT,61559439,nvidia,False,60,0
7,1055494021173243904,2018-10-25 16:19:19,Join us at #SC18 for a special address from NVIDIA CEO Jensen Huang to hear about the latest innovations in GPU-accelerated #supercomputing. RSVP today to save your spot. https://t.co/GOhM7yHZj3 https://t.co/s45FAnJr5r,61559439,nvidia,False,28,0
8,1055228806972096512,2018-10-24 22:45:27,We’re honored NVIDIA’s “I am AI” series is a finalist in the @DigidayAwards for Best Branding B2B campaign! Meet our Innovators who are a part of this amazing campaign! https://t.co/JqHLaaUYKr #IamAI https://t.co/nUd1tOOqnD,61559439,nvidia,False,16,0
9,1055228222000885762,2018-10-24 22:43:07,Agriculture is ripe for AI. Check out the blog recap of our #GTC18 DC panel explaining how AI and robotics will play an important role in this industry's future. https://t.co/32795qYq11 https://t.co/caLsdlwQ2E,61559439,nvidia,False,25,0


In [162]:
import pickle

In [163]:
with open(path.join(DATA_DIR, 'tweets_df.pkl'), 'wb') as f:
    pickle.dump(tweets_df, f)

__Problem:__ Tweepy returns a truncated version of the text of the tweets.

In [131]:
twitter_timeline[0]._json['retweeted_status']['full_text']

'This week, @NVIDIA researchers will present their paper at the Conference on Robot Learning #CoRL18, allowing a robot to perceive household objects and put smart picking within grasp. https://t.co/oIPJtSA6aj https://t.co/l24G9iDSWg'