# Extract Tweets

## Overview
This notebook extracts recent tweets from a user and from the accounts that user follows, and writes them to a CSV file.

## Pre-requisites
Before running the notebook, create a `.env` file containing the following variable:

```
BEARER_TOKEN=XXX
```

# Initial set up

The cells below set up your dependencies and environment variables.

In [11]:
# Install the packages listed in requirements.txt. Invoking pip through
# sys.executable runs it with the same Python interpreter as this kernel.
import sys
!{sys.executable} -m pip install -r requirements.txt

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m

In [1]:
import os
from dotenv import load_dotenv


# Load variables from a local `.env` file into the process environment.
load_dotenv()
token = os.environ.get("BEARER_TOKEN")

# Fail fast: without a token every later request would be sent with the
# header "Bearer None" and be rejected by the API with a less obvious error.
if not token:
    raise RuntimeError(
        "BEARER_TOKEN is not set. Add it to your .env file before running this notebook."
    )

print("Setup complete.")

Setup complete.


# Retrieve Twitter feeds

## Set up Twitter functions

In [135]:
import requests

# Twitter API v2 endpoints used by the fetch helpers in this cell.
# Recent-tweet search; queried with a `from:<username>` filter.
search_url = "https://api.twitter.com/2/tweets/search/recent"
# Accounts a user follows; `{}` is filled in with the user's `id`.
following_url = "https://api.twitter.com/2/users/{}/following"
# User lookup by handle; `{}` is filled in with the username.
lookup_username_url = "https://api.twitter.com/2/users/by/username/{}"

def bearer_oauth(r):
    """
    Auth hook passed to ``requests`` via ``auth=``: attaches the bearer token.

    Sets the ``Authorization`` header of the outgoing request ``r`` from the
    module-level ``token`` and returns the same request object.
    """
    authorization = f"Bearer {token}"
    r.headers["Authorization"] = authorization
    return r

def clean_tweets(tweets, username):
    """
    Flatten a tweet-search response into a list of simple records.

    For a tweet that carries ``referenced_tweets`` (presumably retweets or
    quotes - confirm against the API docs), the text of the referenced tweets
    found under ``includes.tweets`` replaces the tweet's own text, joined with
    single spaces. When none of the referenced tweets are present in the
    response, the tweet's own text is kept so the record is never left blank.

    Args:
        tweets: Decoded JSON response (dict). ``data`` and ``includes`` are
            both optional.
        username: Username stored on every returned record.

    Returns:
        A list of dicts with the keys ``tweet_id``, ``username`` and ``text``;
        an empty list when the response has no ``data`` key.
    """
    if 'data' not in tweets:
        return []

    # `includes` may be missing entirely, or present without a `tweets` entry.
    included = tweets.get('includes', {}).get('tweets', [])
    ref_tweets = {tweet['id']: tweet['text'] for tweet in included}

    results = []
    for t in tweets['data']:
        combined_text = [
            ref_tweets[rt['id']]
            for rt in t.get('referenced_tweets', [])
            if rt['id'] in ref_tweets
        ]
        # Fall back to the tweet's own text when it references nothing, or
        # when none of its referenced tweets were returned in `includes`.
        text = ' '.join(combined_text) if combined_text else t['text']
        results.append({'tweet_id': t['id'], 'username': username, 'text': text})

    return results

def fetch_tweets_by_username(username, timeout=30):
    """
    Fetch recent, non-reply tweets posted by ``username``.

    A single request is made to the recent-search endpoint with
    ``max_results`` set to 100; no further pages are requested.

    Args:
        username: Handle inserted into the ``from:`` search operator.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The list of records produced by ``clean_tweets`` for this user.

    Raises:
        Exception: With ``(status_code, response_text)`` when the API does
            not answer with HTTP 200.
    """
    params = {
        "query": "from:{} -is:reply".format(username),
        "max_results": 100,
        # Ask the API to include the referenced tweets so that
        # clean_tweets can substitute their text.
        "expansions": "referenced_tweets.id"
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(search_url, auth=bearer_oauth, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    data = response.json()
    return clean_tweets(data, username)

def fetch_user_by_username(username, timeout=30):
    """
    Look up a user object by username.

    Args:
        username: Handle inserted into the user-lookup URL.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The ``data`` object of the response (callers in this notebook read
        its ``id`` field).

    Raises:
        Exception: With ``(status_code, response_text)`` when the API does
            not answer with HTTP 200.
        KeyError: When the response is HTTP 200 but contains no ``data``.
    """
    url = lookup_username_url.format(username)
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, auth=bearer_oauth, timeout=timeout)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    resp = response.json()
    if 'data' not in resp:
        # NOTE(review): the API appears to answer 200 with an `errors` field
        # instead of `data` for unknown usernames - confirm. The exception
        # type stays KeyError, but now says which lookup failed and why.
        raise KeyError(
            "No user data returned for username {!r}: {}".format(username, resp.get('errors', resp))
        )
    return resp['data']

def fetch_following_by_username(username, timeout=30):
    """
    Fetch the accounts that ``username`` follows.

    The username is first resolved to a user via ``fetch_user_by_username``;
    one request is then made to the following endpoint with ``max_results``
    set to 500. No further pages are requested, so at most one page of
    accounts is returned.

    Args:
        username: Handle of the user whose followed accounts are wanted.
        timeout: Seconds to wait for the following request before giving up.
            Passed straight to ``requests.get``.

    Returns:
        The list under the response's ``data`` key, or an empty list when
        the response has no ``data``.

    Raises:
        Exception: With ``(status_code, response_text)`` when the API does
            not answer with HTTP 200.
    """
    user = fetch_user_by_username(username)
    url = following_url.format(user['id'])
    params = {
        'max_results': 500
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, auth=bearer_oauth, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    resp = response.json()
    # NOTE(review): `data` is presumably omitted when the user follows nobody
    # - confirm. Return an empty list instead of raising KeyError.
    return resp.get('data', [])

In [137]:
import pandas as pd

# Seed account: tweets are collected for this user and for every account
# returned by fetch_following_by_username.
username = "elonmusk"

user_following = fetch_following_by_username(username)

# Followed accounts first, the seed account itself last.
users_to_search = [followed['username'] for followed in user_following]
users_to_search.append(username)

total_users = len(users_to_search)
all_tweets = []

# One search request per account; `processed` counts from 1 for the progress line.
for processed, user in enumerate(users_to_search, start=1):
    user_tweets = fetch_tweets_by_username(user)
    all_tweets += user_tweets
    progress = round((processed / total_users) * 100, 2)
    print("Processed {}/{} users ({}%)".format(processed, total_users, progress))

# One row per collected tweet record; written to CSV, then previewed.
df = pd.DataFrame(all_tweets)
df.to_csv('user-tweets.csv')
df.head()

Processed 1/113 users (0.88%)
Processed 2/113 users (1.77%)
Processed 3/113 users (2.65%)
Processed 4/113 users (3.54%)
Processed 5/113 users (4.42%)
Processed 6/113 users (5.31%)
Processed 7/113 users (6.19%)
Processed 8/113 users (7.08%)
Processed 9/113 users (7.96%)
Processed 10/113 users (8.85%)
Processed 11/113 users (9.73%)
Processed 12/113 users (10.62%)
Processed 13/113 users (11.5%)
Processed 14/113 users (12.39%)
Processed 15/113 users (13.27%)
Processed 16/113 users (14.16%)
Processed 17/113 users (15.04%)
Processed 18/113 users (15.93%)
Processed 19/113 users (16.81%)
Processed 20/113 users (17.7%)
Processed 21/113 users (18.58%)
Processed 22/113 users (19.47%)
Processed 23/113 users (20.35%)
Processed 24/113 users (21.24%)
Processed 25/113 users (22.12%)
Processed 26/113 users (23.01%)
Processed 27/113 users (23.89%)
Processed 28/113 users (24.78%)
Processed 29/113 users (25.66%)
Processed 30/113 users (26.55%)
Processed 31/113 users (27.43%)
Processed 32/113 users (28.32%

Unnamed: 0,tweet_id,username,text
0,1511128715597471748,Grimezsz,😞 https://t.co/5Te0RGHgvg
1,1511297746661253120,thesheetztweetz,Breaking - Amazon $AMZN signed the biggest roc...
2,1511153708654120963,thesheetztweetz,The U.S. Air Force's 388th Fighter Wing tested...
3,1511137391263715331,thesheetztweetz,U.S. Space Force Brig. Gen. Stephen Purdy rece...
4,1511087590832758789,thesheetztweetz,"Due the vent valve issue, the launch director ..."
