In [None]:
import json
import os
import re
import string
import unicodedata
from datetime import datetime

import nltk
import spacy
import tweepy
from gensim import corpora
from nltk.tokenize.toktok import ToktokTokenizer

### Setting up Twitter API Streaming

In [None]:
# Let's use the Twitter Stream API to get tweets in real time
# We save our tweets to a file called "cats.json"
# override tweepy.StreamListener to add logic to on_status and on_error 
class MyStreamListener(tweepy.StreamListener):
    """Stream listener that echoes every incoming status and appends it to "cats.json".

    Each status is stored as one JSON document per line.
    """

    def on_status(self, status):
        """Print the raw payload of `status` and append it to the output file."""
        payload = status._json
        print(payload)
        with open("cats.json", "a+") as out_file:
            out_file.write(json.dumps(payload))
            out_file.write('\n')

    def on_error(self, status_code):
        """Print the error code received from the stream and return False."""
        print("Error detected!")
        print(status_code)
        return False

In [None]:
def _copy_tweet_text(source, target, key):
    """Store the fullest text found in `source` under `target[key]`.

    Looks, in order, at ``full_text``, ``extended_tweet.full_text`` and
    ``text``. `target` is left untouched when none of them is present.
    """
    if "full_text" in source:
        target[key] = source["full_text"]
    elif "extended_tweet" in source:
        target[key] = source["extended_tweet"]["full_text"]
    elif "text" in source:
        target[key] = source["text"]


def clean_tweet(tweet):
    """Reduce a raw tweet dict to the fields used by the rest of the notebook.

    Args:
        tweet: dict holding a decoded tweet; must contain ``id``,
            ``user.screen_name``, ``created_at`` and ``lang``.

    Returns:
        dict with the keys ``id``, ``user``, ``created_at`` (naive datetime),
        ``lang``, ``is_en`` and ``is_rt``. ``text`` is present when the tweet
        carries any text field. For retweets ``rt_user``, ``rt_id`` and (when
        available) ``rt_text`` describe the retweeted status.

    Raises:
        KeyError: if one of the required fields is missing.
        ValueError: if ``created_at`` does not match the expected format.
    """
    processed_tweet = {}
    processed_tweet["id"] = tweet['id']
    processed_tweet["user"] = tweet['user']['screen_name']
    # The format only accepts the literal offset "+0000"; the result is a naive datetime.
    processed_tweet["created_at"] = datetime.strptime(tweet["created_at"], '%a %b %d %H:%M:%S +0000 %Y')
    processed_tweet["lang"] = tweet['lang']
    processed_tweet["is_en"] = tweet['lang'] == "en"

    _copy_tweet_text(tweet, processed_tweet, "text")

    if 'retweeted_status' in tweet:
        rt = tweet['retweeted_status']
        processed_tweet["is_rt"] = True
        processed_tweet["rt_user"] = rt['user']['screen_name']
        processed_tweet["rt_id"] = rt['id']
        _copy_tweet_text(rt, processed_tweet, "rt_text")
    else:
        processed_tweet["is_rt"] = False

    return processed_tweet

In [None]:
# Twitter API credentials. They are read from environment variables so that
# secrets never have to be typed into (and saved with) the notebook. A variable
# that is not set falls back to the empty placeholder string.
info = {"consumer_key": os.environ.get("TWITTER_CONSUMER_KEY", ""),
        "consumer_secret": os.environ.get("TWITTER_CONSUMER_SECRET", ""),
        "access_token": os.environ.get("TWITTER_ACCESS_TOKEN", ""),
        "access_secret": os.environ.get("TWITTER_ACCESS_SECRET", "")}

In [None]:
# Build the OAuth handler from the consumer key/secret and attach the access token pair from `info`
auth = tweepy.OAuthHandler(info['consumer_key'], info['consumer_secret'])
auth.set_access_token(info['access_token'], info['access_secret'])

# API client bound to the handler above.
# NOTE(review): the two wait_on_rate_limit* flags presumably make the client pause
# (and print a notice) when Twitter's rate limit is hit — confirm against the tweepy version in use.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

### Collecting Twitter Data

In [None]:
# Create the listener and attach it to a stream that reuses the API client's auth handler
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener)
# Start receiving statuses that match the tracked keyword "cats".
# NOTE(review): this call presumably blocks until the stream stops — interrupt the kernel to end collection.
myStream.filter(track=['cats'])

In [None]:
# Read back the raw tweets saved in "cats.json" (one JSON document per line)
with open("cats.json") as stream_f:
    tweet_data = [json.loads(line) for line in stream_f]

# Reduce every raw tweet to the fields used by the analysis (see clean_tweet)
filtered_data = [clean_tweet(raw_tweet) for raw_tweet in tweet_data]

In [None]:
# Preview the first few cleaned records instead of dumping the whole list into the output
filtered_data[:5]

In [None]:
# Collect the text of every English tweet on a single line.
# Retweets contribute the text of the retweeted status rather than their own.
tweet_text = [
    (record["rt_text"] if record["is_rt"] else record["text"]).replace("\n", " ")
    for record in filtered_data
    if record["is_en"]
]

In [None]:
# Preview the first few tweet texts instead of dumping the whole list into the output
tweet_text[:5]

### Preprocessing Data

In [None]:
# remove HTML links, mentions, hashtags, and special characters

# "http:" / "https:" followed by one or more "//" (or backslashes) and the run of
# URL characters after it. Written as a raw string so that \w, \d and \+ reach
# the regex engine unchanged instead of being invalid string escapes.
_LINK_REGEX = re.compile(r'((https?):((//)|(\\))+([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*)', re.DOTALL)


def strip_links(text):
    """Replace every http(s) link in `text` with a single space.

    Each match is substituted where it occurs, so a link that is a prefix of
    another one (``http://a.co`` vs ``http://a.co/b``) cannot leave part of
    the longer link behind.

    Args:
        text: string to clean.

    Returns:
        `text` with every matched link replaced by ``' '``.
    """
    return _LINK_REGEX.sub(' ', text)

def strip_mentions(text):
    """Remove @mentions from `text` and turn punctuation into spaces.

    Every punctuation character except '@' and '#' is replaced by a space, the
    text is split on whitespace, and words starting with '@' are dropped. The
    remaining words are re-joined with single spaces.

    '#' is deliberately left in place: blanking it here would turn "#cats"
    into the plain word "cats", so a later strip_hashtags() call could no
    longer recognise (and remove) the hashtag.

    Args:
        text: string to clean.

    Returns:
        The cleaned string.
    """
    protected = ('@', '#')
    for separator in string.punctuation:
        if separator not in protected:
            text = text.replace(separator, ' ')
    return ' '.join(word for word in text.split() if not word.startswith('@'))

def strip_hashtags(text):
    """Remove #hashtags from `text` and turn all other punctuation into spaces.

    Every punctuation character except '#' becomes a space; the text is then
    split on whitespace, words starting with '#' are discarded and the rest is
    re-joined with single spaces.
    """
    blank_out = str.maketrans({mark: ' ' for mark in string.punctuation if mark != '#'})
    tokens = text.translate(blank_out).split()
    return ' '.join(token for token in tokens if token[0] != '#')
        
def remove_special_characters(text, remove_digits=False):
    """Fold accented characters to ASCII and drop everything but letters, digits and whitespace.

    Args:
        text: string to clean.
        remove_digits: when True, digits are removed as well.

    Returns:
        The cleaned string.
    """
    # Fold accents first ("é" -> "e"); filtering first would delete the accented
    # letters outright and leave this step with nothing to do.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # A-Z, not A-z: the latter range also covers the characters [ \ ] ^ _ `
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [None]:
# Run every tweet through the link / mention / hashtag strippers, remove the
# retweet marker and finally drop any remaining special characters.
stripped_tweet_text = []
for elem in tweet_text:
    elem = strip_links(elem)
    elem = strip_mentions(elem)
    elem = strip_hashtags(elem)
    # Remove "RT" only as a standalone token; a plain str.replace would also
    # eat the letters inside words such as "SMART" or "ARTIST".
    elem = re.sub(r'\bRT\b', '', elem)
    elem = remove_special_characters(elem)
    stripped_tweet_text.append(elem)

In [None]:
# Preview the first few cleaned tweets instead of dumping the whole list into the output
stripped_tweet_text[:5]

### Sentiment Analysis

In [None]:
from textblob import TextBlob

# Print each cleaned tweet followed by its TextBlob sentiment
for cleaned_tweet in stripped_tweet_text:
    print(cleaned_tweet)
    print(TextBlob(cleaned_tweet).sentiment)
    print("----")

In [None]:
# Sentiment with VADER: create the analyser instance reused by the following cells
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [None]:
# Sanity check: score a single hand-written sentence and show the resulting scores
snt = analyser.polarity_scores('This is an examle of a happy tweet')
print(snt)

In [None]:
# Print each cleaned tweet followed by its VADER polarity scores
for cleaned_tweet in stripped_tweet_text:
    print(cleaned_tweet)
    scores = analyser.polarity_scores(cleaned_tweet)
    print(scores)
    print("----")

### Stemming/Lemmatization

In [None]:
# Stemming / Lemmatization
# Load spaCy's small English pipeline. The legacy parse/tag/entity keyword
# flags are dropped: they are not part of the documented spacy.load()
# signature, and newer spaCy releases reject unknown keyword arguments.
nlp = spacy.load('en_core_web_sm')

def simple_stemmer(text):
    """Return `text` with every whitespace-separated word replaced by its Porter stem."""
    stemmer = nltk.porter.PorterStemmer()
    stems = map(stemmer.stem, text.split())
    return ' '.join(stems)

def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma, joined by single spaces.

    Tokens whose lemma is the '-PRON-' placeholder keep their original text.
    """
    lemmas = []
    for token in nlp(text):
        lemma = token.lemma_
        lemmas.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(lemmas)

### Tokenizing and Corpus Creation

In [None]:
# Tokenizer and English stopword list shared by remove_stopwords()
tokenizer = ToktokTokenizer()
# Make sure the NLTK "stopwords" resource is available before reading it
nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')

def remove_stopwords(text, is_lower_case=False):
    """Return `text` without the tokens found in `stopword_list`.

    Args:
        text: string to filter.
        is_lower_case: when True the tokens are compared as they are; otherwise
            each token is lower-cased for the comparison only (the kept tokens
            retain their original casing).

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for token in tokenizer.tokenize(text):
        token = token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept.append(token)
    return ' '.join(kept)



In [None]:
# Build the corpus: for every cleaned tweet, remove stopwords, lemmatize,
# lower-case and split into a list of words
words_corpus = [
    lemmatize_text(remove_stopwords(cleaned_tweet)).lower().split()
    for cleaned_tweet in stripped_tweet_text
]
print(words_corpus)

# gensim dictionary built over the tokenized tweets
dictionary = corpora.Dictionary(words_corpus)
print(dictionary)

### Topic Modeling

In [None]:
from gensim import corpora, models

# Prune the dictionary in place using the no_below / no_above / keep_n thresholds
# (see gensim's Dictionary.filter_extremes for their exact meaning)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Bag-of-words representation of every tokenized tweet
corpus_bow = list(map(dictionary.doc2bow, words_corpus))

# Term Frequency - Inverse Document Frequency weighting of the bag-of-words corpus
tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]


In [None]:
#Only 3 topics this time for simplicity
num_topics = 3
# LDA training is stochastic, so a fixed random_state is passed to make re-runs comparable.
# NOTE(review): with several workers the result may still vary slightly between runs — confirm.
lda_model_tfidf = models.LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary,
                                      passes=2, workers=4, random_state=42)
# Print every topic with its index
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

### Part of Speech Tagging

In [None]:
# Example sentence that is run through the spaCy pipeline
sentence = 'London is the capital and most populous city of England and the United Kingdom'
# To analyse a collected tweet instead, use: sentence = stripped_tweet_text[0]
sentence_nlp = nlp(sentence)

In [None]:
from spacy import displacy
# Visualise the parsed sentence inline in the notebook; the options adjust
# the spacing ('distance') and the arrow stroke / width of the drawing
displacy.render(sentence_nlp, jupyter=True, 
                options={'distance': 110,
                         'arrow_stroke': 2,
                         'arrow_width': 8})

In [None]:
# Print every token of the sentence that carries an entity label, together with that label
print([(token, token.ent_type_) for token in sentence_nlp if token.ent_type_])

# Render the sentence inline using displaCy's 'ent' style
displacy.render(sentence_nlp, style='ent', jupyter=True)