# Downloading Tweets with Tweepy and Loading Archived Tweets from an NDJSON File

In [1]:
import tweepy
import json
from pathlib import Path

In [2]:
# Extra names this cell brings into the notebook namespace.
import logging
from tweepy import TweepError

# The API credentials live in a local JSON file rather than in the notebook.
key_file = 'keys.json'
with open(key_file) as f:
    keys = json.load(f)

# OAuth handler built from the consumer pair, then given the access pair.
auth = tweepy.OAuthHandler(
    keys["consumer_key"],
    keys["consumer_secret"],
)
auth.set_access_token(
    keys["access_token"],
    keys["access_token_secret"],
)

# Client used by the download cell; both rate-limit flags are switched on.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

In [3]:
# Page backwards through the account's timeline, 200 tweets per request,
# until at least 3200 tweets have been collected or the API returns an empty
# page. Each follow-up request passes max_id = (oldest id seen so far) - 1 so
# it continues just below the last tweet already collected.
# NOTE(review): 3200 presumably mirrors the endpoint's history cap -- confirm.
current_tweets = []
new_tweets = api.user_timeline(screen_name='realDonaldTrump', count=200)
while new_tweets:
    current_tweets.extend(new_tweets)
    oldest = current_tweets[-1].id - 1
    if len(current_tweets) >= 3200:
        break
    new_tweets = api.user_timeline(screen_name='realDonaldTrump', count=200, max_id=oldest)
    # An empty page ends the loop via the `while` condition. Without that
    # check `oldest` would never change and the same request would repeat
    # forever; an empty first page would fail on current_tweets[-1].
print("...%s tweets downloaded" % (len(current_tweets)) )

# Keep only the five fields used by the rest of the notebook, one list per tweet.
latest_tweets = [[tweet.id, tweet.created_at, tweet.source, tweet.retweet_count, tweet.text] for tweet in current_tweets]

...3201 tweets downloaded


In [4]:
# Read the archive file: every line is parsed as one JSON object, and the
# same five fields kept for the freshly downloaded tweets are extracted.
dest_path = 'data/realdonaldtrump.ndjson'
with open(dest_path) as f:
    old_trump_tweets = [
        [
            record['id'],
            record['created_at'],
            record['source'],
            record['retweet_count'],
            record['text'],
        ]
        for record in map(json.loads, f)
    ]
print("...%s tweets downloaded" % (len(old_trump_tweets)) )

...40241 tweets downloaded


In [5]:
# Rows fetched from the API come first, followed by the rows read from the archive file.
all_tweets = [*latest_tweets, *old_trump_tweets]
print('Shape: (', len(all_tweets), ', 5 )')

Shape: ( 43442 , 5 )


# Loading Tweets into DataFrame and Cleaning Text

In [6]:
import pandas as pd

# Column names follow the order in which the five fields were collected per tweet.
tweet_columns = ['id', 'created_at', 'source', 'retweet_count', 'text']
tweets_df = pd.DataFrame(data=all_tweets, columns=tweet_columns)
tweets_df.shape

(43442, 5)

In [7]:
# Strip every <...> tag from the source column, leaving only the text between
# the tags. regex=True is passed explicitly: the pattern is a regular
# expression, and pandas versions whose default is literal matching would
# otherwise leave the tags untouched without raising any error.
tweets_df['source'] = tweets_df.source.str.replace(r'<[^>]*>', '', regex=True)

# Convert each created_at value individually with pd.to_datetime.
# NOTE(review): the column appears to mix values coming from the API objects
# and strings read from the archive, hence the element-wise call -- confirm
# before switching to a single vectorised pd.to_datetime(...) call.
tweets_df['created_at'] = tweets_df.created_at.apply(pd.to_datetime)

In [9]:
import re
import contractions
def clean_tweet(tweet):
    """Normalise a raw tweet into lowercase text containing only letters and spaces.

    Steps, in order:
      1. expand contractions with ``contractions.fix``;
      2. replace retweet prefixes (``RT @user:``) with a space;
      3. replace remaining handles (``@user``) with a space;
      4. drop the ``#`` character (the hashtag word itself is kept);
      5. replace ``http://`` / ``https://`` links with a space;
      6. replace every character that is not an ASCII letter with a space;
      7. lowercase the result.

    Runs of spaces are not collapsed and the result is not stripped, so the
    output may contain leading, trailing or repeated spaces.

    Args:
        tweet: the raw tweet text (a ``str``).

    Returns:
        The cleaned, lowercased string.
    """
    # All patterns are raw strings so that "\w" reaches the regex engine
    # verbatim instead of being treated as an (invalid) string escape.
    # expand contractions
    tweet = contractions.fix(tweet)
    # remove retweet prefixes (RT @xxx:)
    tweet = re.sub(r"RT @[\w]*:", " ", tweet)
    # remove twitter handles (@xxx)
    tweet = re.sub(r"@[\w]*", " ", tweet)
    # remove only the '#' symbol; the hashtag text stays in the tweet
    tweet = re.sub(r'#', '', tweet)
    # remove URL links (httpxxx)
    tweet = re.sub(r"https?://[A-Za-z0-9./]*", " ", tweet)
    # remove special characters, numbers, punctuations
    tweet = re.sub(r"[^a-zA-Z]", " ", tweet)
    return tweet.lower()

In [10]:
# Run clean_tweet over every raw tweet text and store the result in a new column.
tweets_df['clean'] = tweets_df['text'].apply(clean_tweet)

In [14]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return analyzer.polarity_scores(text)['compound']


# The score is computed on the raw ``text`` column, not on the ``clean`` column.
tweets_df['sentiment'] = tweets_df.text.apply(_compound_score)

In [15]:
# Preview the first five rows (head() returns five rows by default).
tweets_df.head()

Unnamed: 0,id,created_at,source,retweet_count,text,clean,sentiment
0,1238046001744826369,2020-03-12 10:15:43,Twitter for iPhone,1241,“Nancy Pelosi all of a sudden doesn’t like the...,nancy pelosi all of a sudden does not like th...,0.0516
1,1238042153483411456,2020-03-12 10:00:25,Twitter for iPhone,3987,RT @charliekirk11: Facts:\n\nFederal tax dolla...,facts federal tax dollars will no longer g...,-0.8481
2,1238041635256238080,2020-03-12 09:58:22,Twitter for iPhone,6525,RT @JonathanTurley: Schumer's threat to the Co...,schumer s threat to the court that you will...,-0.6486
3,1238041481396588544,2020-03-12 09:57:45,Twitter for iPhone,2306,RT @flightcrew: MUST WATCH!! Best Trump Ad Eve...,must watch best trump ad ever fighte...,0.7482
4,1238040722391150592,2020-03-12 09:54:44,Twitter for iPhone,7136,RT @RyanAFournier: 77% of Americans are confid...,of americans are confident in the trump ...,0.6705


In [20]:
def sentiment_score_label(score, threshold=0.0):
    """Map a numeric sentiment score to a discrete label.

    Args:
        score: the numeric sentiment score to classify.
        threshold: half-width of the neutral band around zero. Scores strictly
            above ``threshold`` are positive and scores strictly below
            ``-threshold`` are negative. The default of 0.0 labels only an
            exact zero (or a NaN, which fails both comparisons) as neutral.

    Returns:
        1 for a positive score, -1 for a negative score, 0 otherwise.

    Raises:
        ValueError: if ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if score > threshold:
        return 1
    if score < -threshold:
        return -1
    return 0

In [21]:
# Turn every sentiment score into its -1 / 0 / 1 label.
sentiment_labels = tweets_df['sentiment'].apply(sentiment_score_label)

# Saving Tweet Data and Sentiment Labels

In [22]:
def save_tweets(tweets, path):
    """Pickle ``tweets`` into the file at ``path``.

    The file is opened in binary write mode, so existing content at ``path``
    is replaced. Returns ``None``.
    """
    import pickle

    with open(path, "wb") as sink:
        pickle.dump(tweets, sink)

In [23]:
# Persist the full tweet table and the label series as two separate pickle files.
for payload, destination in (
    (tweets_df, "data/Trump_Tweets_Data-43442.pkl"),
    (sentiment_labels, "data/Sentiment_Labels.pkl"),
):
    save_tweets(payload, path=destination)