In [14]:
import os
import tweepy
import nltk
import re
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from textblob import TextBlob
from pprint import pprint as pp

In [2]:
TWITTER_API_KEY = os.environ["TWITTER_API_KEY"]
TWITTER_API_SECRET = os.environ["TWITTER_API_SECRET"]
TWITTER_ACCESS_TOKEN = os.environ["TWITTER_ACCESS_TOKEN"]
TWITTER_ACCESS_TOKEN_SECRET = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

In [3]:
auth = tweepy.OAuthHandler(TWITTER_API_KEY, TWITTER_API_SECRET)
auth.set_access_token(TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)

In [9]:
stmr = PorterStemmer()
stop_words = stopwords.words("english")

In [45]:
# regex patterns
url_pattern = re.compile(r"https://[\w\d./]*")
emoji_code_pattern = re.compile(r"\\u[\w\d]*")
mentions_pattern = re.compile(r"@[\w\d]*")
hashtags_pattern = re.compile(r"#[\w\d]*")
other_pattern = re.compile(r"[🤷♀️]")

In [46]:
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [50]:
def get_sentiment_for_texts(texts: list):
    print('processing ', len(texts), ' items')
    res = []
    for text in texts:
        tweet_data = {"original": text}
        # * here we are doing the following
        # * * converting to lower case
        # * * applying stem operation to convert running to run etc.
        # * * excluding stop words (common words) such as the, I , a, an etc.
        _ = " ".join(
            [stmr.stem(word.lower()) for word in text.split() if word not in stop_words]
        )
        _ = re.sub(url_pattern,"",_)
        _ = re.sub(emoji_code_pattern,"",_)
        _ = re.sub(mentions_pattern,"",_)
        _ = re.sub(hashtags_pattern,"",_)
        _ = re.sub(other_pattern,"",_)
        tweet_data["_"] = deEmojify(_)
        tweet_data["sentiment"] = TextBlob(text).sentiment
        res.append(tweet_data)

    # * we could try doing all of these using comprehensions
    # * but would become hard to understand and maintain.
    # texts = [text.lower() for text in texts]
    # texts = [[stmr.stem(word) for word in text if word not in stop_words] for text in texts]

    return res

In [51]:
def get_tweets(query, count=300, lang="en"):

    # empty list to store parsed tweets
    tweets = []
    fetched_tweets = api.search(query, count=count, lang=lang, result_type="recent")
    # parsing tweets one by one
    for tweet in fetched_tweets:
        tweets.append(tweet.text)
    o = get_sentiment_for_texts(tweets)
    return o

In [57]:
pp(get_tweets(query="batman -rt", count=30, lang="en"))

processing  30  items
[{'_': 'how mani movi fan go want wonder woman kill joker talk masturb batman? '
       'how… ',
  'original': 'How many more movies are fans going to want where Wonder Woman '
              'kills and joker talks about masturbating Batman? How… '
              'https://t.co/3iauWVH5sD',
  'sentiment': Sentiment(polarity=0.5, subjectivity=0.5)},
 {'_': 'batman kick ass ',
  'original': 'batman kicking ass 😫',
  'sentiment': Sentiment(polarity=0.0, subjectivity=0.0)},
 {'_': ' you look familiar🤔',
  'original': '@60s_Batman You look familiar🤔',
  'sentiment': Sentiment(polarity=0.0, subjectivity=0.0)},
 {'_': 'new video up! reaction ch 1  “don’t count on it .”  dead. mother… ',
  'original': 'New video up! Reaction to Ch 1 of #ZackSnydersJusticeLeague   '
              '“Don’t Count On It #Batman.” #Superman  dead. Mother… '
              'https://t.co/CKUbHQWwGq',
  'sentiment': Sentiment(polarity=-0.014772727272727285, subjectivity=0.42727272727272725)},
 {'_': '