In [1]:
import csv
import json
import os
from datetime import datetime

import lxml.html
import requests
import twitter

In [2]:
# Twitter API credentials are pulled from environment variables so that no
# secret is stored in the notebook. A variable that is not set yields None.
TWITTER_CONS_KEY = os.getenv('T_CONS_')
TWITTER_CONS_SEC = os.getenv('T_CONS_SECRET')
TWITTER_ACCESS_TOKEN = os.getenv('T_ACCESS_')
TWITTER_ACCESS_SEC = os.getenv('T_ACCESS_SECRET')

In [3]:
# Single API client reused by every request in this notebook.
# Later cells read `full_text` from each tweet — presumably the reason
# tweet_mode='extended' is requested here (confirm against python-twitter docs).
t = twitter.Api(
    consumer_key=TWITTER_CONS_KEY,
    consumer_secret=TWITTER_CONS_SEC,
    access_token_key=TWITTER_ACCESS_TOKEN,
    access_token_secret=TWITTER_ACCESS_SEC,
    tweet_mode='extended',
)

In [4]:
# Handle of the account whose timeline the cells below download.
# NOTE(review): the recorded outputs further down appear to come from a
# different account; re-run the notebook to refresh them.
screen_name = "NASA"

In [6]:
# Seed request: ask the API for up to 200 tweets from the account's timeline
# in a single call. The last tweet returned here is the starting point for
# paging further back.
first_200 = t.GetUserTimeline(screen_name=screen_name, count=200)

In [43]:
def get_tweets(first_200, screen_name, last_id, api=None, max_requests=900, page_size=200):
    """Page backwards through a user's timeline and collect every tweet found.

    Starting from an already-fetched batch, repeatedly requests the tweets
    older than the oldest one seen so far until the API returns an empty
    page or the request budget is used up.

    Args:
        first_200: Tweets already fetched; they form the start of the result.
            The sequence itself is not modified.
        screen_name: Handle of the account whose timeline is being read.
        last_id: ID of the oldest tweet in ``first_200``; paging resumes just
            below it.
        api: Object exposing ``GetUserTimeline(screen_name=, count=, max_id=)``.
            Defaults to the notebook-level client ``t``.
        max_requests: Upper bound on the number of paging requests.
        page_size: Number of tweets requested per page. Without an explicit
            ``count`` each request falls back to the API's default page size,
            which costs many more rate-limited calls for the same data.

    Returns:
        A new list: ``first_200`` followed by every older tweet retrieved,
        in the order the API returned them.
    """
    client = t if api is None else api
    all_tweets = list(first_200)

    for _ in range(max_requests):
        # max_id is inclusive, so subtract one to avoid re-fetching the
        # oldest tweet we already hold.
        page = client.GetUserTimeline(
            screen_name=screen_name,
            count=page_size,
            max_id=last_id - 1,
        )
        if not page:
            # An empty page means the timeline is exhausted.
            break
        all_tweets.extend(page)
        last_id = page[-1].id

    return all_tweets

In [45]:
# get_tweets takes (first_200, screen_name, last_id); paging continues from
# the oldest tweet of the seed batch.
all_tweets = get_tweets(first_200, screen_name, first_200[-1].id)

In [97]:
# Show the raw type and value of the two fields that need cleaning before
# analysis: the timestamp string and the HTML-wrapped client name.
print("The 'created_at' parameter is in the form of a %s and looks like this: %s." % (
    type(all_tweets[0].created_at),
    all_tweets[0].created_at,
))
print("The 'source' parameter is in the form of a %s and looks like this: %s." % (
    type(all_tweets[0].source),
    all_tweets[0].source,
))

The 'created_at' parameter is in the form of a <class 'str'> and looks like this: Sat Jun 30 19:44:09 +0000 2018.
The 'created_at' parameter is in the form of a <class 'str'> and looks like this: <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>.


In [103]:
def clean_source(source):
    """Return the visible text of a tweet's ``source`` field, without markup.

    The field arrives as an HTML anchor, e.g.
    ``<a href="..." rel="nofollow">Twitter for iPhone</a>``; only the text
    inside it ("Twitter for iPhone") is returned.

    Args:
        source: HTML string taken from a tweet's ``source`` attribute.

    Returns:
        The text content of the parsed document body, or an empty string
        when ``source`` is empty or None (there is nothing to parse).
    """
    if not source:
        return ''
    document = lxml.html.document_fromstring(source)
    return document.cssselect('body')[0].text_content()


def string_to_datetime(date_str, fmt='%a %b %d %H:%M:%S %z %Y'):
    """Parse a timestamp string into a ``datetime`` object.

    Args:
        date_str: Timestamp text, e.g. ``'Sat Jun 30 11:37:03 +0000 2018'``.
        fmt: ``strptime`` format describing ``date_str``. The default matches
            the ``created_at`` strings shown earlier in this notebook.

    Returns:
        The parsed ``datetime``; timezone-aware when ``fmt`` contains ``%z``.

    Raises:
        ValueError: If ``date_str`` does not match ``fmt``.
    """
    # Day and month names (%a, %b) are matched using the current locale.
    return datetime.strptime(date_str, fmt)


# Try both helpers on one known sample each and show the result next to
# the original value.
print('Our cleaned text; original was "<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>)":')
cleaned_example = clean_source('<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>')
print(cleaned_example)

print('***')

print('Our date string converted to an object; original was "Sat Jun 30 11:37:03 +0000 2018":')
parsed_example = string_to_datetime('Sat Jun 30 11:37:03 +0000 2018')
print(parsed_example)


Our cleaned text; original was "<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>)":
Twitter for iPhone
***
Our date string converted to an object; original was "Sat Jun 30 11:37:03 +0000 2018":
2018-06-30 11:37:03+00:00


In [101]:
def write_to_csv(tweets, filename):
    """Write one CSV row per tweet to ``<filename>.csv``.

    The file starts with a header row, followed by each tweet's id, full
    text, hashtags, urls, creation timestamp, favorite count, retweet count
    and cleaned source. An existing file of the same name is overwritten.

    Args:
        tweets: Iterable of tweet objects exposing the attributes above.
        filename: Output path without the ``.csv`` extension.
    """
    headers = ['id', 'full_text', 'hashtags', 'urls', 'created_at',
               'favorite_count', 'retweet_count', 'source']

    # newline='' leaves line endings to the csv module. UTF-8 is set
    # explicitly so non-ASCII tweet text does not depend on the platform's
    # default encoding. The with-block closes the file on exit.
    with open(filename + '.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(headers)

        for item in tweets:
            writer.writerow([item.id, item.full_text, item.hashtags, item.urls,
                             item.created_at, item.favorite_count, item.retweet_count,
                             clean_source(item.source)])

In [102]:
# Name the output after the account that was actually downloaded, so the
# file never mislabels its contents when screen_name changes.
write_to_csv(all_tweets, '%s_tweets' % screen_name.lower())

In [54]:
# Summarise the download: how many tweets, and the time span they cover.
# Index 0 is reported as the most recent tweet and index -1 as the oldest.
tweet_count = len(all_tweets)
print("We collected %d tweets." % tweet_count)

newest_created_at = all_tweets[0].created_at
oldest_created_at = all_tweets[-1].created_at
print("The most recent tweet in our collection was sent %s and the oldest tweet was sent %s." % (
    newest_created_at,
    oldest_created_at,
))

We collected 3199 tweets.
The most recent tweet in our collection was sent Sat Jun 30 19:44:09 +0000 2018 and the oldest tweet was sent Fri May 19 22:16:06 +0000 2017.


In [104]:
def create_dict(tweets):
    """Index tweets by ID, keeping a cleaned subset of each tweet's fields.

    Args:
        tweets: Iterable of tweet objects exposing ``id``, ``full_text``,
            ``hashtags``, ``urls``, ``created_at``, ``favorite_count``,
            ``retweet_count`` and ``source``.

    Returns:
        A dict keyed by the tweet ID as a string. Each value is a dict of
        the fields above, with ``created_at`` parsed by
        ``string_to_datetime`` and ``source`` reduced to plain text by
        ``clean_source``. If two tweets share an ID, the later one wins.
    """
    tweets_by_id = {}
    for item in tweets:
        tweets_by_id[str(item.id)] = {
            'id': item.id,
            'full_text': item.full_text,
            'hashtags': item.hashtags,
            'urls': item.urls,
            'created_at': string_to_datetime(item.created_at),
            'favorite_count': item.favorite_count,
            'retweet_count': item.retweet_count,
            'source': clean_source(item.source),
        }
    return tweets_by_id

In [105]:
# Build the ID-keyed lookup table from everything that was downloaded.
tweet_dict = create_dict(all_tweets)

In [106]:
# Inspect one entry; keys are the string form of the tweet ID.
# NOTE(review): this ID is hard-coded and raises KeyError if that tweet is
# not part of the current download.
tweet_dict['1013023608040513537']

{'id': 1013023608040513537,
 'full_text': 'Just spoke to King Salman of Saudi Arabia and explained to him that, because of the turmoil &amp; disfunction in Iran and Venezuela, I am asking that Saudi Arabia increase oil production, maybe up to 2,000,000 barrels, to make up the difference...Prices to high! He has agreed!',
 'hashtags': [],
 'urls': [],
 'created_at': datetime.datetime(2018, 6, 30, 11, 37, 3, tzinfo=datetime.timezone.utc),
 'favorite_count': 107963,
 'retweet_count': 31496,
 'source': 'Twitter for iPhone'}