# Scraping Tweets v1 (2012 - All NYC teams)

### Importing/ Downloading packages

In [1]:
# Install packages - uncomment if required

#pip install tweepy
#pip install pandas


In [2]:
# Import libraries

import tweepy
import datetime
import time
import pandas as pd
from calendar import monthrange
import os
from os.path import exists

## Structure:
#### - Use the tweepy search_all_tweets function to find tweets that contain #Giants (to be replaced with the hashtag of whichever sports team we are searching for).
#### The search query requires that the tweet: is not a retweet (-is:retweet), is in English (lang:en), does not include links (-has:links) and is geo-tagged within 10km of the latitude/longitude coordinates of the Empire State building (40.7301366,-74.1831972) (point_radius:40.7301366,-74.1831972,10km). Note: the queries defined in 'team_search_queries' further down do not currently include the point_radius filter.
#### - This search function will be run for each day. This is because the search function returns the most recent x tweets relative to end_time - therefore, to get an equal number of tweets spread over the year, we need to specify the start_time and end_time for each day. The search is run iteratively over the daily start and end times stored in the dictionary 'daily_start_end_time'.
#### - The information from each month will be saved in a csv file. Therefore, for 2012 and 2013 there will be 24 output files per team. The reason for doing this is to reduce the risk of an error interrupting the code and having to start from the beginning.

### Twitter API Authentification

In [None]:
# Academic API bearer token.
# Read it from the TWITTER_BEARER_TOKEN environment variable so that the secret is never
# saved inside the notebook. If the variable is not set the token falls back to an empty
# string, which is the same placeholder value this cell used before.
bearer_token = os.environ.get("TWITTER_BEARER_TOKEN", "")

In [None]:
# Build the API client once for the whole notebook.
# When too many queries have been made the API reports a rate limit. With
# wait_on_rate_limit=True Tweepy waits until more requests are available instead of
# stopping the code.
client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)

#### 'daily_start_end_time': Create a list of the start_time and end_time for each day in each month (in 2012)


In [3]:
# Year being scraped. Changing this single value rebuilds the daily ranges for another year.
YEAR = 2012

# Month keys as strings ('1' = January ... '12' = December)
month_names = [str(month_number) for month_number in range(1, 13)]

# This dict will contain the start and end time for each day in each month:
# {month key: [[start_time, end_time], ...]} with one pair per day of that month
daily_start_end_time = {month_str: [] for month_str in month_names}

# enumerate(..., start=1) gives the numeric month (1-12) alongside its string key
for month, month_str in enumerate(month_names, start=1):

    # monthrange returns (weekday of the first day, number of days in the month);
    # only the number of days is needed here
    num_days = monthrange(YEAR, month)[1]

    # Loop through each day of the month and add its start_time and end_time
    for day in range(1, num_days + 1):

        # The start_time: midnight (UTC) at the beginning of the day
        start_time = datetime.datetime(YEAR, month, day, 0, 0, 0, 0, datetime.timezone.utc)

        # The end_time: 23:59:59 (UTC) on the same day
        # NOTE(review): if the API treats end_time as exclusive, tweets posted during the
        # last second of each day are not returned - confirm against the API documentation
        end_time = datetime.datetime(YEAR, month, day, 23, 59, 59, 0, datetime.timezone.utc)

        # Append to the month a list containing the start and end time for the day
        daily_start_end_time[month_str].append([start_time, end_time])

In [4]:
# Check the dictionary 'daily_start_end_time' is correct by displaying it.
# NOTE(review): the saved output of this cell shows 2013 dates although the cell that
# builds the dictionary uses 2012 - the output looks stale; re-run this cell to refresh it.
daily_start_end_time

{'1': [[datetime.datetime(2013, 1, 1, 0, 0, tzinfo=datetime.timezone.utc),
   datetime.datetime(2013, 1, 1, 23, 59, 59, tzinfo=datetime.timezone.utc)],
  [datetime.datetime(2013, 1, 2, 0, 0, tzinfo=datetime.timezone.utc),
   datetime.datetime(2013, 1, 2, 23, 59, 59, tzinfo=datetime.timezone.utc)],
  [datetime.datetime(2013, 1, 3, 0, 0, tzinfo=datetime.timezone.utc),
   datetime.datetime(2013, 1, 3, 23, 59, 59, tzinfo=datetime.timezone.utc)],
  [datetime.datetime(2013, 1, 4, 0, 0, tzinfo=datetime.timezone.utc),
   datetime.datetime(2013, 1, 4, 23, 59, 59, tzinfo=datetime.timezone.utc)],
  [datetime.datetime(2013, 1, 5, 0, 0, tzinfo=datetime.timezone.utc),
   datetime.datetime(2013, 1, 5, 23, 59, 59, tzinfo=datetime.timezone.utc)],
  [datetime.datetime(2013, 1, 6, 0, 0, tzinfo=datetime.timezone.utc),
   datetime.datetime(2013, 1, 6, 23, 59, 59, tzinfo=datetime.timezone.utc)],
  [datetime.datetime(2013, 1, 7, 0, 0, tzinfo=datetime.timezone.utc),
   datetime.datetime(2013, 1, 7, 23, 59, 59

In [7]:
# Filters appended to every team's hashtag: exclude retweets, exclude tweets with links,
# English-language tweets only
_common_filters = '-is:retweet -has:links lang:en'

# (team key, hashtag searched for) for each of the New York sports teams
_team_hashtags = [
    ('giants', '#Giants'),
    ('rangers', '#NYRangers'),
    ('islanders', '#NYIslanders'),
    ('devils', '#NJDevils'),
    ('knicks', '#nyknicks'),
    ('mets', '#Mets'),
    ('yankees', '#Yankees'),
]

# Dictionary mapping each team to its full twitter search query
team_search_queries = {
    team_name: '{} {}'.format(hashtag, _common_filters)
    for team_name, hashtag in _team_hashtags
}

## Main search function:
#### - Currently looks up at most 200 tweets per team per day (controlled by 'max_results' = 20 tweets per page and 'limit' = 10 pages)


### Checking to see if tweets for the month/team have already been scraped:

#### The if statement will check to see whether a csv file exists in the working directory for a month/team (e.g. 1_giants_tweets_2012). If it already exists then the code will not run for that month/team and will move until it gets to a month for which the data has not been scraped yet.

### Dealing with errors when there are no tweets on that day:

#### try, except: This aims to resolve the error that is produced when there is no tweet data for a specific date. The error is thrown when the code tries to convert the API output (response) into a user-friendly format but cannot, because there is no data to sort. If the data does not exist a KeyError is raised. When this happens the code moves on to the next response, using 'continue'.

In [None]:
# Size of the daily search: each day is fetched as at most PAGES_PER_DAY pages of
# TWEETS_PER_PAGE tweets (20 x 10 = at most 200 tweets per team per day)
TWEETS_PER_PAGE = 20
PAGES_PER_DAY = 10

# Columns of every monthly csv. Passing them to the DataFrame means a month without any
# tweets still produces a csv with a header row instead of a file with no columns.
TWEET_COLUMNS = ['author_id', 'tweet_id', 'text', 'created_at']

# Loop through each team
for team, query in team_search_queries.items():
    # Loop through each month and export the results to one csv per month/team.
    # month is the key to the dict, and times is a list of lists - in each is a start and end time
    for month, times in daily_start_end_time.items():

        ### Check to see if the output csv file exists in the working directory already

        # os.path.join uses the path separator of the current operating system, so the
        # path that is checked here is the same path the csv is written to below
        # (a hard-coded "\\" only matches the saved file on Windows)
        file_path = os.path.join(os.getcwd(), "{}_{}_2012_tweets.csv".format(month, team))

        # If the file already exists then move to the next month
        if exists(file_path):
            print("The file for month: {}, team: {} already exists".format(month, team))
            continue

        ### The file does not exist yet - run the API search for every day of the month
        team_tweets = []

        # Loop through each start_time and end_time
        for day_start, day_end in times:
            # NOTE: the query is case insensitive - so finds both #Giants as well as #giants - adding more noise
            for response in tweepy.Paginator(client.search_all_tweets,
                                             # Team hashtag, is not a retweet, does not have links, is in english
                                             query=query,
                                             # user_fields info is stored in the response under .includes['users']
                                             user_fields=['username', 'location'],
                                             tweet_fields=['created_at', 'text'],
                                             expansions='author_id',
                                             # start and end time come from the daily_start_end_time dictionary
                                             start_time=day_start,
                                             end_time=day_end,
                                             max_results=TWEETS_PER_PAGE,
                                             limit=PAGES_PER_DAY):

                # API only allows one request per second - so wait for 1 sec
                time.sleep(1)
                team_tweets.append(response)

        ### Transform the search results for the month into a usable format and save as csv
        result = []
        # Author details keyed by user id. They are collected here but are not written
        # to the csv.
        user_dict = {}

        # Loop through each response object
        for response in team_tweets:
            # A response without any tweets has no 'users' entry, so the lookup raises a
            # KeyError. Only this lookup is guarded, so that a KeyError coming from
            # anywhere else is not hidden.
            try:
                users = response.includes['users']
            except KeyError:
                print("Except 'response' clause executed")
                continue

            # Take all of the users, and put them into a dictionary of dictionaries
            for user in users:
                user_dict[user.id] = {'username': user.username,
                                      'location': user.location}

            # response.data is expected to be None when no tweets were returned;
            # "or []" makes the loop a no-op in that case instead of raising a TypeError
            for tweet in response.data or []:
                # Put all of the information we want to keep in a single dictionary for each tweet
                result.append({'author_id': tweet.author_id,
                               'tweet_id': tweet.id,
                               'text': tweet.text,
                               'created_at': tweet.created_at})

        # Change this list of dictionaries into a dataframe with a fixed column order
        monthly_df = pd.DataFrame(result, columns=TWEET_COLUMNS)

        # Save the dataframe for the month as a csv file, at the path that was checked above
        monthly_df.to_csv(file_path, sep=',', index=False)
