# Test the `twitter_scraping` module

In [34]:
import tweepy
import pandas as pd
from configparser import ConfigParser
import daiquiri
import logging
import sys
import os.path as path
import pickle
from twitter_scraping.twitter_scraping import print_rate_limits, get_users_ids, \
    fetch_tweets, TwitterScraper

# Load IPython's autoreload extension; the reload mode itself is selected
# further down with `%autoreload 2`.
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Test functions

Create a Tweepy API object.

In [4]:
# Parse the local INI file holding the Twitter API credentials. `read` returns
# the list of files it parsed, which is what this cell displays.
cp = ConfigParser()
cp.read(filenames='../.secret/credentials.ini')

['../.secret/credentials.ini']

In [5]:
# Build the OAuth handler from the consumer key/secret stored in the
# credentials file, then attach the access token pair to it.
credentials = cp['emas_twitter_credentials']

auth = tweepy.OAuthHandler(
    credentials['consumer_key'],
    credentials['consumer_secret'],
)
auth.set_access_token(
    credentials['access_token'],
    credentials['access_token_secret'],
)

In [4]:
# Create the Tweepy API object, authenticated with the OAuth handler `auth`.
api = tweepy.API(auth_handler=auth)

Test functions in the `twitter_scraping.twitter_scraping` module.

In [5]:
# Print the rate-limit status (limit / remaining / reset) for this API object.
print_rate_limits(api)

/application/rate_limit_status
{'limit': 180, 'remaining': 179, 'reset': 1540483361}
/statuses/user_timeline
{'limit': 900, 'remaining': 895, 'reset': 1540483068}


In [6]:
# Resolve each handle to its numeric user id. `twitter_ids_dict` starts empty
# and is filled in place by `get_users_ids` (it is displayed in the next cell).
twitter_handles_list = ['dpalmisano', 'Intel', 'AMD', 'nvidia']
twitter_ids_dict = dict()

get_users_ids(api, twitter_handles_list, twitter_ids_dict)

Fetching 4 ids
Fetching id: @dpalmisano
Fetching id: @Intel
Fetching id: @AMD
Fetching id: @nvidia


In [7]:
# Inspect the handle -> user id mapping filled in by `get_users_ids`.
twitter_ids_dict

{'dpalmisano': 14656799, 'Intel': 2803191, 'AMD': 14861876, 'nvidia': 61559439}

In [8]:
# Column layout of the tweets table; the empty frame built from it is handed
# to `fetch_tweets` in the next cell.
tweets_cols = [
    'twitter_id', 'created_at', 'text', 'user_id',
    'twitter_handle', 'is_retweet', 'retweet_count', 'favorite_count',
]
tweets_df = pd.DataFrame(columns=tweets_cols)

In [9]:
# Fetch tweets for every user in `twitter_ids_dict`; the frame returned by
# `fetch_tweets` replaces the (initially empty) `tweets_df`.
tweets_df = fetch_tweets(api, twitter_ids_dict, tweets_df)

Fetching tweets: dpalmisano
since_id=None
Fetching tweets: Intel
since_id=None
Fetching tweets: AMD
since_id=None
Fetching tweets: nvidia
since_id=None


In [10]:
# Count the non-null values per column for each handle, as a quick check of
# how many tweets were fetched per account.
tweets_df.groupby('twitter_handle').count()

Unnamed: 0_level_0,twitter_id,created_at,text,user_id,is_retweet,retweet_count,favorite_count
twitter_handle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AMD,200,200,200,200,200,200,200
Intel,200,200,200,200,200,200,200
dpalmisano,197,197,197,197,197,197,197
nvidia,200,200,200,200,200,200,200


In [12]:
# Persist the fetched tweets frame to disk as a pickle.
tweets_pickle_path = '../data/data_tweepy/tweets_df.pkl'
with open(tweets_pickle_path, mode='wb') as pickle_file:
    pickle.dump(tweets_df, pickle_file)

## Test the `TwitterScraper` object

Define the log and data paths, then instantiate the logger.

In [6]:
# Directory that receives the `update_tweets.log` file configured below.
LOGS_DIR = '../logs/logs_tweepy/'
# JSON file passed to the scraper as `twitter_ids_dict_path`.
IDS_PATH = '../data/data_tweepy/twitter_ids_dict.json'
# Pickle file passed to the scraper as `data_path`.
DATA_PATH = '../data/data_tweepy/tweets_df.pkl'

In [7]:
# Log at INFO level to two destinations: stdout, and a text-formatted log file
# inside LOGS_DIR.
log_file_path = path.join(LOGS_DIR, 'update_tweets.log')
stdout_output = daiquiri.output.Stream(sys.stdout)
file_output = daiquiri.output.File(
    log_file_path,
    formatter=daiquiri.formatter.TEXT_FORMATTER,
)
daiquiri.setup(level=logging.INFO, outputs=(stdout_output, file_output))

# Logger handed to the `TwitterScraper` below.
logger = daiquiri.getLogger(__name__)

Instantiate the scraper.

In [43]:
# Mode 2: reload all modules before each execution, so edits to the
# `twitter_scraping` module are picked up without restarting the kernel.
%autoreload 2

# The scraper receives the logger plus three paths: the tweets pickle, the
# credentials INI file and the handle -> id JSON file.
twitter_scraper = TwitterScraper(
    logger,
    data_path=DATA_PATH,
    credentials_path='../.secret/credentials.ini',
    twitter_ids_dict_path=IDS_PATH
)

In [158]:
# Load previously saved tweets into the scraper; presumably read from
# DATA_PATH into `twitter_scraper.tweets_df` — TODO confirm.
twitter_scraper.load_tweets()

In [159]:
# Size of the scraper's tweets frame as (rows, columns).
twitter_scraper.tweets_df.shape

(1, 8)

In [160]:
# Refresh the scraper's handle -> id mapping; presumably this populates
# `twitter_scraper.twitter_ids_dict` — TODO confirm.
twitter_scraper.get_twitter_ids()

In [161]:
# Display the mapping as the cell's result instead of printing it, matching
# how the same dict is inspected elsewhere in this notebook.
twitter_scraper.twitter_ids_dict

{'dpalmisano': 14656799, 'Intel': 2803191, 'AMD': 14861876, 'nvidia': 61559439}


In [167]:
# Refresh the handle -> id mapping, then fetch tweets for those ids using the
# API object returned by the scraper's own `get_tweepy_api()`.
twitter_scraper.get_twitter_ids()

tweepy_api = twitter_scraper.get_tweepy_api()
twitter_scraper.fetch_tweets(tweepy_api, twitter_scraper.twitter_ids_dict)

2018-10-30 14:02:33,769 [16990] INFO     __main__: Loading tweets...
2018-10-30 14:02:33,790 [16990] INFO     __main__: Fetching tweets...
Fetching tweets: dpalmisano
since_id=None
Fetching tweets: Intel
since_id=None
Fetching tweets: AMD
since_id=None
Fetching tweets: nvidia
since_id=None
2018-10-30 14:02:40,525 [16990] INFO     __main__: Saving tweets...


In [168]:
# Count the non-null values per column for each handle in the scraper's frame.
twitter_scraper.tweets_df.groupby('twitter_handle').count()

Unnamed: 0_level_0,twitter_id,created_at,text,user_id,is_retweet,retweet_count,favorite_count
twitter_handle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AMD,200,200,200,200,200,200,200
Intel,200,200,200,200,200,200,200
dpalmisano,197,197,197,197,197,197,197
nvidia,200,200,200,200,200,200,200


In [170]:
# Rebuild an empty tweets frame with the standard column layout.
# NOTE(review): this repeats the column list defined earlier in the notebook.
tweets_cols = [
    'twitter_id', 'created_at', 'text', 'user_id',
    'twitter_handle', 'is_retweet', 'retweet_count', 'favorite_count',
]
tweets_df = pd.DataFrame(columns=tweets_cols)

In [171]:
# NOTE(review): disabled cell. If re-enabled it would overwrite the tweets
# pickle with the current `tweets_df` — confirm whether it is still needed.
# with open('../data/data_tweepy/tweets_df.pkl', 'wb') as f:
#     pickle.dump(tweets_df, f)

Test adding Twitter handles.

In [44]:
# Refresh the scraper's handle -> id mapping before inspecting it below.
twitter_scraper.get_twitter_ids()

In [48]:
# Inspect the scraper's current handle -> id mapping.
twitter_scraper.twitter_ids_dict

{'dpalmisano': 14656799,
 'Intel': 2803191,
 'AMD': 14861876,
 'nvidia': 61559439,
 'UKGamesWorkshop': 226546364}

In [46]:
# Persist the handle -> id mapping; presumably written to IDS_PATH — TODO confirm.
twitter_scraper.save_twitter_ids()

In [47]:
# Register a new handle with the scraper.
# NOTE(review): the execution counts (In [46] save, In [47] this cell, In [48]
# dict display) do not match the top-to-bottom cell order, so a fresh
# "Run All" would display and save the mapping before this handle is added —
# confirm the intended order.
twitter_scraper.add_handle('UKGamesWorkshop')