# Tweepy Tweet to DataFrame Downloader
Load tweets from the Twitter API with tweepy and turn them into a pandas DataFrame for data science and AI work.

### Setup

In [1]:
import tweepy
import pandas as pd
import re

In [2]:
# Execute the companion notebook where the API passwords are saved, so the
# credentials are loaded into this kernel instead of being written in this file.
# NOTE(review): presumably this defines consumer_key, consumer_secret,
# access_token and access_token_secret, which the authorization cell reads;
# none of them is assigned anywhere in this notebook -- confirm.
%run ./tweepy_passwds.ipynb

In [3]:
# Authorization: build the OAuth handler from the consumer credentials, then
# attach the user access token pair to it.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

# API client used by every request below. Both rate-limit flags are switched
# on; judging by the "Rate limit reached. Sleeping for: ..." line in the
# download output, tweepy then waits by itself when the limit is hit.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

In [4]:
# Cursor Handler
import time
def limit_handled(cursor, wait_seconds=15 * 60):
    """Yield every item of ``cursor``, pausing and retrying on rate limits.

    Parameters
    ----------
    cursor : iterable
        Source of items. Typically the iterator returned by
        ``tweepy.Cursor(...).items()``, but any iterable (list, generator, ...)
        is accepted because the built-in ``iter``/``next`` protocol is used
        rather than a ``.next()`` method.
    wait_seconds : int or float, optional
        How long to sleep after a ``tweepy.RateLimitError`` before asking the
        cursor for the same item again. Defaults to 15 minutes.

    Yields
    ------
    object
        The items produced by ``cursor``, in order.
    """
    iterator = iter(cursor)
    while True:
        try:
            item = next(iterator)
        # StopIteration is listed first so that normal exhaustion ends the
        # generator without any other handler being considered.
        except StopIteration:
            return
        except tweepy.RateLimitError:
            # send email / webhook / log
            print(f'sleeping for {wait_seconds / 60:g} min')
            time.sleep(wait_seconds)
            continue
        # Yield outside the try block: only errors raised while fetching the
        # next item are handled here, not ones thrown in by the consumer.
        yield item

Function to extract the tweets yielded by a Cursor object into a DataFrame

In [57]:
# extraction function

def extract_df(cursor):
    """Flatten an iterable of tweet status objects into a pandas DataFrame.

    Each status contributes one row made of:

    * every instance attribute whose value is exactly a ``str`` or an ``int``
      (``bool``, ``None``, nested objects, lists and dicts are skipped);
    * ``user`` and ``author``: ``status.user.screen_name`` and
      ``status.author.screen_name``;
    * ``timestamp``: ``status.created_at``;
    * ``full_retweet_txt``: the text of ``status.retweeted_status`` when the
      status has one, otherwise the text of the status itself.

    Parameters
    ----------
    cursor : iterable
        Iterable of status objects, e.g. the items of a ``tweepy.Cursor``.

    Returns
    -------
    pandas.DataFrame
        One row per status. The scalar-attribute columns come first, in the
        order in which they were first seen, followed by ``user``, ``author``,
        ``full_retweet_txt`` and ``timestamp``. A cell is NaN when that status
        did not carry the attribute as a ``str``/``int``. An empty ``cursor``
        gives an empty frame that still has the four derived columns.
    """
    derived_cols = ['user', 'author', 'full_retweet_txt', 'timestamp']
    allowed_types = (str, int)
    # A dict is used as an insertion-ordered set so the column order is the
    # same on every run (a plain set iterates in hash order).
    scalar_cols = {}
    tweets_data = []

    for status in cursor:
        row = {
            'user': status.user.screen_name,
            'author': status.author.screen_name,
            'timestamp': status.created_at,
            'full_retweet_txt': _status_text(status),
        }
        for key, value in vars(status).items():
            # Exact type match on purpose: bool is a subclass of int but is
            # not wanted here. Keys named like a derived column are skipped so
            # the header never contains the same name twice.
            if type(value) in allowed_types and key not in derived_cols:
                row[key] = value
                scalar_cols.setdefault(key, None)
        tweets_data.append(row)

    header_cols = list(scalar_cols) + derived_cols
    return pd.DataFrame(tweets_data, columns=header_cols)


def _status_text(status):
    """Return the retweeted tweet's text if available, else the status' own text."""
    candidates = (getattr(status, 'retweeted_status', None), status)
    # Prefer ``full_text`` on either object; fall back to ``text`` so statuses
    # that only carry ``text`` still produce a value.
    for attr in ('full_text', 'text'):
        for candidate in candidates:
            text = getattr(candidate, attr, None)
            if text is not None:
                return text
    return None

#### Save search query variables

In [None]:
# Search settings
# Number handed to the cursor's .items(...) call in the download cell.
num_tweets = 4_000
# Text sent as the search `q` argument.
# NOTE(review): the `-#tag`, `until:` and `min_retweets:` parts look like
# Twitter search operators -- confirm they are honoured by this endpoint.
query = "#votenoontherecall -#recallgavinnewsom until:2021-09-15 min_retweets:0"

## Download tweets and turn them into a DataFrame

In [51]:
# Search -> rate-limit-aware iteration -> DataFrame, written inside-out:
# the cursor pages through api.search for `query`, .items() caps the result at
# `num_tweets`, limit_handled wraps the iteration and extract_df builds the frame.
df = extract_df(
    limit_handled(
        tweepy.Cursor(api.search, q=query, tweet_mode='extended').items(num_tweets)
    )
)
# Preview the last five rows.
df.tail(5)

Rate limit reached. Sleeping for: 838


Unnamed: 0,in_reply_to_status_id,quoted_status_id,source_url,retweet_count,in_reply_to_status_id_str,in_reply_to_user_id_str,id,in_reply_to_screen_name,quoted_status_id_str,in_reply_to_user_id,favorite_count,id_str,full_text,source,lang,user,author,full_retweet_txt,timestamp
3995,,,https://mobile.twitter.com,4,,,1437899718403706883,,,,0,1437899718403706883,RT @FreedomInc5: #VoteNoOnTheRecall https://t....,Twitter Web App,und,Spacereportern1,Spacereportern1,#VoteNoOnTheRecall https://t.co/mASeCxzRkl,2021-09-14 22:03:02
3996,,,https://mobile.twitter.com,56,,,1437899687151943692,,,,0,1437899687151943692,RT @DogginTrump: I dont want to end up with th...,Twitter Web App,en,8675309_6,8675309_6,I dont want to end up with this asshole as our...,2021-09-14 22:02:54
3997,,,https://mobile.twitter.com,97,,,1437899670873911303,,,,0,1437899670873911303,RT @MiaBonta: Our message is clear! #VoteNoOnT...,Twitter Web App,en,8675309_6,8675309_6,Our message is clear! #VoteNoOnTheRecall @seiu...,2021-09-14 22:02:51
3998,1.437788e+18,,http://twitter.com/#!/download/ipad,0,1.4377876792786534e+18,1640929196.0,1437899668705349632,mmpadellan,,1640929000.0,1,1437899668705349632,@mmpadellan @i_am4tunate #VoteNoOnTheRecall,Twitter for iPad,und,Sherrie00,Sherrie00,@mmpadellan @i_am4tunate #VoteNoOnTheRecall,2021-09-14 22:02:50
3999,,,http://twitter.com/download/iphone,0,,,1437899666587193347,,,,0,1437899666587193347,#VoteNoOnTheRecall,Twitter for iPhone,und,purplebieber1D,purplebieber1D,#VoteNoOnTheRecall,2021-09-14 22:02:50


#### Clean Up Query to Save as CSV
Replace the whitespace in the query with underscores so the result can be used automatically as the name of the CSV file.

In [55]:
# Replace every whitespace character in the query with "_".
# The pattern is a raw string: in a normal literal '\s' is an invalid escape
# sequence, which Python warns about.
# NOTE(review): '#' and ':' are left in place (see the displayed result); ':'
# is not a legal file-name character on Windows -- confirm the target platform.
clean_query = re.sub(r'\s', '_', query)
clean_query

'#votenoontherecall_-#recallgavinnewsom_until:2021-09-15_min_retweets:0'

In [56]:
# Name the file after the cleaned query and the number of rows downloaded.
rows = len(df)
# Build the name once so the name displayed below is exactly the one written.
csv_name = f'{clean_query}_{rows}rows.csv'
df.to_csv(csv_name)
csv_name

'#votenoontherecall_-#recallgavinnewsom_until:2021-09-15_min_retweets:0_4000rows.csv'