In [2]:
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import tweepy
import json
import pandas as pd
import csv
import re
import string
import os
import time
from tqdm import tqdm

In [3]:
def get_credentials(file):
    """Load a JSON credentials file and return its parsed contents.

    Parameters
    ----------
    file : str
        Path to the JSON file. It is joined onto the current working
        directory, so a relative path is resolved against ``os.getcwd()``.

    Returns
    -------
    The object produced by ``json.load`` for that file.
    """
    credentials_file = os.path.join(os.getcwd(), file)
    with open(credentials_file, 'r') as handle:
        return json.load(handle)

def _tweet_to_row(tweet):
    """Flatten one tweepy status object into the 11 values stored per tweet.

    The order of the returned list matches the column order used by
    ``scrapetweets``.
    """
    user = tweet.user
    hashtags = tweet.entities['hashtags']
    try:
        # For a retweet, take the full text from the retweeted status.
        text = tweet.retweeted_status.full_text
    except AttributeError:  # Not a retweet
        text = tweet.full_text

    return [user.screen_name, user.description, user.location, user.friends_count,
            user.followers_count, user.statuses_count, user.created_at,
            tweet.created_at, tweet.retweet_count, text, hashtags]


def scrapetweets(search_words, date_since, numTweets, numRuns):
    """Search Twitter repeatedly and save all collected tweets to one CSV file.

    Parameters
    ----------
    search_words : str
        Search query. It is also split on ``'AND'`` (with ``#`` removed) to
        name the output sub-directory.
    date_since : str
        Passed to the search call as ``since``.
    numTweets : int
        Maximum number of tweets requested per run.
    numRuns : int
        Number of times the search is issued.

    Side effects
    ------------
    Reads API credentials from ``Data/twitter_credentials.json`` (relative to
    the working directory), writes
    ``Data/Tweets/<search terms>/<YYYYmmdd_HHMMSS>.csv`` and prints a summary.
    Returns ``None``.
    """
    # Kept local so the function does not depend on the notebook's import cell.
    from datetime import datetime

    columns = ['username', 'acctdesc', 'location', 'following',
               'followers', 'totaltweets', 'usercreatedts', 'tweetcreatedts',
               'retweetcount', 'text', 'hashtags']

    # os.path.join instead of a hard-coded backslash, so the path also
    # resolves on non-Windows systems.
    credential_path = os.path.join('Data', 'twitter_credentials.json')
    creds = get_credentials(credential_path)
    auth = tweepy.OAuthHandler(creds['CONSUMER_KEY'], creds['CONSUMER_SECRET'])
    auth.set_access_token(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'])

    api = tweepy.API(auth, wait_on_rate_limit=True)

    program_start = time.time()

    # Collect plain rows and build the DataFrame once at the end; appending
    # to a DataFrame row by row re-allocates on every insert.
    rows = []
    # NOTE(review): every run issues the identical query, so successive runs
    # may return overlapping tweets - confirm whether de-duplication is wanted.
    for _ in tqdm(range(0, numRuns)):
        tweets = tweepy.Cursor(api.search, q=search_words, lang="en",
                               since=date_since, tweet_mode='extended').items(numTweets)
        rows.extend(_tweet_to_row(tweet) for tweet in tweets)

    db_tweets = pd.DataFrame(rows, columns=columns)
    tweet_no = len(rows)

    # Timestamp in a readable, sortable format for the file name.
    to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S')

    # One sub-directory per query, named after its search terms.
    search = [term.replace('#', '').strip() for term in search_words.split('AND')]
    filename = os.path.join('Data', 'Tweets', '_'.join(search), to_csv_timestamp + '.csv')

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    db_tweets.to_csv(filename, index=False)

    program_end = time.time()
    print('Scraping has completed!')
    # Convert to minutes first, then round to two decimals.
    print('Total time taken to scrape is {} minutes.'.format(round((program_end - program_start) / 60, 2)))
    print('Allocated a number of {} tweets.'.format(tweet_no))

In [4]:
# Scrape settings: earliest tweet date, tweets requested per run, runs per query.
date_since = "2020-06-01"
numTweets, numRuns = 30, 10

# Hashtag queries; terms are combined with the AND operator.
biden_election = " AND ".join(("#biden", "#election"))
trump_election = " AND ".join(("#trump", "#election"))
biden_trump_election = " AND ".join(("#trump", "#biden", "#election"))

In [None]:
# Sweep all three hashtag queries back to back, then pause 900 seconds before
# the next sweep. The loop has no exit condition; interrupt the kernel to stop.
while True:
    for query in (biden_election, trump_election, biden_trump_election):
        scrapetweets(query, date_since, numTweets, numRuns)
    time.sleep(900)

100%|██████████| 10/10 [00:09<00:00,  1.10it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.15 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:11<00:00,  1.18s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.2 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:08<00:00,  1.21it/s]


Scraping has completed!
Total time taken to scrape is 0.13333333333333333 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:08<00:00,  1.20it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.13333333333333333 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:11<00:00,  1.11s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.18333333333333332 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:08<00:00,  1.13it/s]


Scraping has completed!
Total time taken to scrape is 0.15 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:08<00:00,  1.23it/s]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.13333333333333333 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:11<00:00,  1.14s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.18333333333333332 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:12<00:00,  1.20s/it]


Scraping has completed!
Total time taken to scrape is 0.2 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:12<00:00,  1.25s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.21666666666666667 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:11<00:00,  1.14s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.18333333333333332 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:11<00:00,  1.18s/it]


Scraping has completed!
Total time taken to scrape is 0.2 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:12<00:00,  1.24s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.2 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:11<00:00,  1.12s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.18333333333333332 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:12<00:00,  1.22s/it]


Scraping has completed!
Total time taken to scrape is 0.2 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:11<00:00,  1.19s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.2 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:14<00:00,  1.45s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.25 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:12<00:00,  1.25s/it]


Scraping has completed!
Total time taken to scrape is 0.2 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:12<00:00,  1.20s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.2 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:15<00:00,  1.57s/it]
  0%|          | 0/10 [00:00<?, ?it/s]

Scraping has completed!
Total time taken to scrape is 0.26666666666666666 minutes.
Allocated a number of 300 tweets.


100%|██████████| 10/10 [00:11<00:00,  1.16s/it]


Scraping has completed!
Total time taken to scrape is 0.2 minutes.
Allocated a number of 300 tweets.
