In [1]:
import json
import os
import pandas as pd
import time

from typing import Tuple

import s3fs
import tweepy

In [2]:
twitter_key_path = '/Users/evanmcneal/Desktop/Projects/athena/data_igsn/keys/twitter_keys.txt'
tweet_df_path = '/Users/evanmcneal/Desktop/Projects/athena/data_igsn/data/tweet_df.csv'
cities_df_path = '/Users/evanmcneal/Desktop/Projects/athena/data_igsn/data/cities.csv'

## Tweepy

In [3]:
with open(twitter_key_path) as json_file:
    keys = json.load(json_file)

In [4]:
API_KEY = keys["API_KEY"]
API_SECRET_KEY = keys["API_SECRET_KEY"]

ACCESS_TOKEN = keys["ACCESS_TOKEN"]
ACCESS_TOKEN_SECRET = keys["ACCESS_TOKEN_SECRET"]

In [5]:
# Authenticate to Twitter
auth = tweepy.OAuthHandler(API_KEY, API_SECRET_KEY)

auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

In [6]:
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True
    #parser=tweepy.parsers.JSONParser()
)

try:
    api.verify_credentials()
    print("Authentication OK")
except:
    print("Error during authentication")

Authentication OK


In [7]:
def create_tweet_df(
        #lat, 
        #long, 
        radius="100mi",
        most_recent_id=False, 
        max_id=False
        ) -> Tuple[pd.DataFrame, int]:
    
    tweet_tuples = []

    for tweet in tweepy.Cursor(
        api.search,
        q="*",
        #geocode=lat+","+long+","+radius,
        lang="en",
        result_type="popular",
        #since_id=most_recent_id,
        max_id=max_id,
        tweet_mode="extended",
        count = 100
    ).items():

        tweet_info = (
            tweet.full_text, 
            tweet.favorite_count, 
            tweet.retweet_count, 
            tweet.user.followers_count, 
            tweet.created_at,
            tweet.id
        )

        tweet_tuples.append(tweet_info)

    tweet_df = pd.DataFrame(
        tweet_tuples, 
        columns =[
            'Tweet', 
            'Num_Favorites', 
            'Num_Retweets', 
            'Num_Followers',
            'Created_At',
            'id'
        ])
    
    tweet_df.sort_values('Created_At', inplace = True)
    
    return tweet_df

In [8]:
#all_tweets_df = create_tweet_df()
all_tweets_df = pd.read_csv(tweet_df_path).sort_values('Created_At')

most_recent_id = all_tweets_df['id'].iloc[-1]
max_id = all_tweets_df['id'].iloc[0]

In [9]:
all_tweets_df.shape[0]

1309

In [10]:
cities = pd.read_csv(cities_df_path)
cities.head()

Unnamed: 0,City,State,WOEID,latitude,longitude
0,New York,New York,2459115,40.712728,-74.006015
1,Los Angeles,California,2442047,34.053691,-118.242767
2,Chicago,Illinois,2379574,41.875562,-87.624421
3,Houston,Texas,2424766,29.758938,-95.367697
4,Phoenix,Arizona,2471390,33.448437,-112.074142


In [11]:
# Just pulling popular

max_tweets = 10000
alert_interval = 1000

while all_tweets_df.shape[0] < max_tweets:
    
    new_tweets_df = create_tweet_df(most_recent_id=most_recent_id)

    if new_tweets_df.shape[0] == 0:
            print("No new tweets; sleeping for 15 minutes. Current count: ", all_tweets_df.shape[0])
            time.sleep(15*60)
    else:
        most_recent_id = new_tweets_df['id'].iloc[-1]

        all_tweets_df = all_tweets_df.append(new_tweets_df, ignore_index=True)
        
        all_tweets_df.drop_duplicates(inplace = True)

        total_num_tweets = all_tweets_df.shape[0]
        if total_num_tweets > alert_interval:
            alert_interval += total_num_tweets # Stupid way to do this, but ¯\_(ツ)_/¯
            print("Current number of tweets: ", total_num_tweets)

Current number of tweets:  1892
Current number of tweets:  3058


Rate limit reached. Sleeping for: 797


KeyboardInterrupt: 

In [None]:
# Pulling popular by city

error
alert_interval = 1000

for index, row in cities.iterrows():

    lat = str(row["latitude"])
    long = str(row["longitude"])
    
    new_tweets_df = create_tweet_df(lat, long)

    '''if new_tweets_df.shape[0] == 0:
        print("No new tweets; sleeping for 15 minutes. Current count: ", all_tweets_df.shape[0])
        time.sleep(15*60)
    else:
        most_recent_id = new_tweets_df['id'].iloc[-1]
        
        all_tweets_df = all_tweets_df.append(new_tweets_df, ignore_index=True)
        
        total_num_tweets = all_tweets_df.shape[0]
        if total_num_tweets > alert_interval:
            alert_interval += total_num_tweets # Stupid way to do this, but ¯\_(ツ)_/¯
            print("Current number of tweets: ", total_num_tweets)'''
    
    all_tweets_df = all_tweets_df.append(new_tweets_df, ignore_index=True)
    
    if index%10 == 0:
        print("Number of tweets: ", all_tweets_df.shape[0])

In [12]:
print("Length before duplicate drop: ", all_tweets_df.shape[0])

Length before duplicate drop:  3058


In [13]:
all_tweets_df.drop_duplicates(inplace = True)

In [14]:
# Should be the same?...
print("Length after duplicate drop: ", all_tweets_df.shape[0])

Length after duplicate drop:  2486


## AWS
`s3fs` plays ball with `pandas` behind the scenes, so provided you have generated your IAM User access keys, installed the `AWS CLI`, and run `aws configure`, you can just write to an existing S3 bucket like you would locally.

In [15]:
#tweet_df.to_csv("s3://twitterathena/tweet_df.csv", index=False)
all_tweets_df.to_csv(tweet_df_path, index=False)

In [None]:
#test = pd.read_csv("s3://twitterathena/tweet_df.csv")

In [None]:
#test