In [133]:
import requests
import os
import json
import pandas as pd
import datetime

# The bearer token is read from the environment so that the secret is never
# stored in the notebook or its version history. Before starting Jupyter run:
#   export BEARER_TOKEN='<your_bearer_token>'
# SECURITY: a literal token used to be hardcoded in this cell. Treat that token
# as compromised and revoke/regenerate it in the Twitter developer portal.
bearer_token = os.environ.get("BEARER_TOKEN")
if not bearer_token:
    # Do not abort: the later cells can still work from already-saved files.
    print("BEARER_TOKEN is not set; requests to the Twitter API cannot be authenticated.")

In [134]:
def bearer_oauth(r):
    """
    Attach bearer-token credentials to an outgoing request.

    Meant to be handed to ``requests`` as the ``auth`` callable: it sets the
    ``Authorization`` and ``User-Agent`` headers on ``r`` and returns ``r``.
    """
    auth_headers = {
        "Authorization": "Bearer {}".format(bearer_token),
        "User-Agent": "v2RecentSearchPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r

def connect_to_endpoint(url, params, timeout=30):
    """
    Perform an authenticated GET request and return the decoded JSON body.

    Args:
        url: Endpoint to query.
        params: Query-string parameters sent with the request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook on a stalled connection.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: If the status code is not 200; ``args`` holds the status
            code and the response text.
    """
    response = requests.get(url, auth=bearer_oauth, params=params, timeout=timeout)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()

In [138]:
class PlanItem:
    """
    One planned 24-hour query window against the Twitter recent-search endpoint.

    Attributes:
        from_date: Naive ``datetime`` marking the start of the window. It is
            sent to the API with a ``Z`` suffix, i.e. it is treated as UTC.
        to_date: ``from_date`` plus one day (the end of the window).
        dataframe: Tweets collected so far for this window, with a ``Keyword``
            column naming the query that produced each row.
    """

    def __init__(self, year, month, day, hour = 0):
        """
        Args:
            year, month, day: Calendar date on which the window starts.
            hour: Hour of the day on which the window starts (default 0).
        """
        self.from_date = datetime.datetime(year, month, day, hour)
        self.to_date = self.from_date + datetime.timedelta(days=1)
        # Accumulates the tweets of every keyword queried for this window
        self.dataframe = pd.DataFrame()

    def file_name(self):
        """Return the relative path of the JSON file that stores this window."""
        return self.from_date.strftime('data/tweets_%Y%m%d.json')

    def get_tweets(self, keyword):
        """
        Query the recent-search endpoint for ``keyword`` within this window
        and add the returned tweets to ``self.dataframe``.

        Only one page is requested (``max_results`` is 100), so at most 100
        tweets per keyword and window are collected.

        Args:
            keyword: Search query string; also stored in the ``Keyword``
                column of every row it produces.
        """
        search_url = "https://api.twitter.com/2/tweets/search/recent"
        query_params = {
            'query': keyword,
            'start_time': self.from_date.isoformat("T") + "Z",
            'end_time': self.to_date.isoformat("T") + "Z",
            'tweet.fields': 'author_id,created_at,geo',
            'max_results': 100}
        print(query_params)
        # Queries Twitter to get the keyword related tweets
        json_response = connect_to_endpoint(search_url, query_params)
        # A response without a 'data' entry (e.g. a query with no matches)
        # previously raised KeyError; treat it as "nothing to add" instead.
        tweets = json_response.get('data') or []
        if not tweets:
            return
        # Convert the response to a dataframe
        df = pd.DataFrame(tweets)
        df['Keyword'] = keyword
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        # Empty frames are left out so they cannot influence the result dtypes.
        frames = [frame for frame in (self.dataframe, df) if not frame.empty]
        self.dataframe = pd.concat(frames, ignore_index=True)

    def check_file_existence(self):
        """Return True when the JSON file for this window is already on disk."""
        # Uses self: the previous version read a global named `item`, which
        # only worked when the calling loop happened to use that exact name.
        return os.path.exists(self.file_name())

    def save(self):
        """Write the collected tweets to ``file_name()`` as JSON."""
        target = self.file_name()
        print(f'Saving {target}')
        # Create the output directory on first use so the write cannot fail
        # just because the folder is missing.
        folder = os.path.dirname(target)
        if folder:
            os.makedirs(folder, exist_ok=True)
        self.dataframe.to_json(target)

In [139]:
# Query plan: the first item starts at hour 12 on 2021-10-13; the remaining
# items start at hour 0 on each day from 2021-10-14 through 2021-10-19.
plan = [PlanItem(2021, 10, 13, 12)]
plan.extend(PlanItem(2021, 10, day_of_month) for day_of_month in range(14, 20))
# Search terms queried for every plan item.
# NOTE(review): 'APPL' may be a typo for the Apple ticker 'AAPL' — confirm before re-querying.
keywords = [
    'altcoin',
    'bitcoin',
    'coindesk',
    'cryptocurrency',
    'gold',
    'APPL',
    'GOOG',
    'YHOO',
]

In [141]:
# Runs every planned query, skipping the days whose results are already on disk
for item in plan:
    if not item.check_file_existence():
        # No saved file yet: fetch the tweets of each keyword, then persist them
        for keyword in keywords:
            item.get_tweets(keyword)
        item.save()
    else:
        # A saved file exists, so the Twitter API is not queried for this day
        print(f'File {item.file_name()} already exists.. omiting query')
        

File data/tweets_20211013.json already exists.. omiting query
File data/tweets_20211014.json already exists.. omiting query
File data/tweets_20211015.json already exists.. omiting query
File data/tweets_20211016.json already exists.. omiting query
File data/tweets_20211017.json already exists.. omiting query
File data/tweets_20211018.json already exists.. omiting query
File data/tweets_20211019.json already exists.. omiting query


In [142]:
# Reads the tweets JSON files and consolidates them into a single dataframe.
# All files are loaded first and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending inside the loop re-copied the growing
# frame on every iteration.
daily_frames = [pd.read_json(plan_item.file_name()) for plan_item in plan]
# pd.concat rejects an empty list, so an empty plan yields an empty frame.
complete_df = pd.concat(daily_frames, ignore_index=True) if daily_frames else pd.DataFrame()

In [143]:
# Spot-check 10 randomly chosen rows. No random_state is passed, so the rows
# displayed differ from one run to the next.
complete_df.sample(10)

Unnamed: 0,text,created_at,author_id,id,Keyword,geo
3623,@LayahHeilpern I want solid gold 5lb barbells,2021-10-17 23:59:37+00:00,1192307046600728576,1449887856093138944,gold,
3058,"RT @peter_robu: https://t.co/APzep42Zfa ""It se...",2021-10-16 21:43:18+00:00,958742665260163072,1449491166215516160,GOOG,
885,"se vc fala: ""aLtCoIn = ShItCoIn"", eu desconsid...",2021-10-14 23:52:45+00:00,1262388163529658368,1448798965261033472,altcoin,
923,Crypto has a boss &amp; its #mxs!\nOur ranking...,2021-10-14 23:59:47+00:00,1422982390708850688,1448800734204542976,bitcoin,
65,RT @B055Lady_Elle: $poodl\n\nIf you don't got ...,2021-10-14 11:56:38+00:00,3259527900,1448618748827426816,altcoin,
5413,RT @BluthCapital: The Four Horsemen of New Ven...,2021-10-19 23:05:10+00:00,37528064,1450598930870411264,GOOG,
3664,RT @AniTweetCity: This is still gold https://t...,2021-10-17 23:59:13+00:00,1435997355749351424,1449887755287285760,gold,
802,RT @austinahilton: THE LATEST CRYPTO NEWS TODA...,2021-10-14 23:59:29+00:00,580570225,1448800660531548160,altcoin,
2763,BTC H&amp;S targeting $80K - #BTCUSD TradingVi...,2021-10-16 23:58:34+00:00,1342256774464716800,1449525204795805696,cryptocurrency,
4427,RT @waxzyy_: And me i dey always think say all...,2021-10-18 23:59:35+00:00,4917064757,1450250236589481984,gold,


In [147]:
# Summary statistics of the consolidated frame. The columns reported below
# (author_id, id) are identifiers, so only `count` is really informative.
complete_df.describe()

Unnamed: 0,author_id,id
count,5571.0,5571.0
mean,8.610278e+17,1.44954e+18
std,6.060298e+17,688796800000000.0
min,10303.0,1.448546e+18
25%,2455263000.0,1.448801e+18
50%,1.178775e+18,1.449524e+18
75%,1.381319e+18,1.45024e+18
max,1.450611e+18,1.450613e+18


In [148]:
# Number of non-null entries in the `text` column
complete_df.text.count()

5571