In [1]:
import json
import nltk
# conda install -c conda-forge nltk
from nltk.tokenize.toktok import ToktokTokenizer
import spacy 
# conda install -c conda-forge spacy
from datetime import datetime
import tweepy
import re
import string
import unicodedata
from gensim import corpora
# conda install -c conda-forge gensim

### Sentiment analysis
from textblob import TextBlob
# conda install -c conda-forge textblob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# conda install -c conda-forge vaderSentiment

In [2]:
api_info = json.loads(open('../.secrets/twitter_api.json').read())

In [3]:
client = tweepy.Client(
    consumer_key       = api_info["api_key"],
    consumer_secret    = api_info["api_key_secret"],
    bearer_token       = api_info["bearer_token"],
    access_token       = api_info["access_token"],
    access_token_secret= api_info["access_token_secret"],
    wait_on_rate_limit = True
)

### Query recent tweets

In [4]:
fields = "created_at,lang,author_id,text,referenced_tweets" 
expansions = "attachments.media_keys,referenced_tweets.id,author_id"

In [5]:
ukraine_tweets = client.search_recent_tweets(
    query="ukraine",
    max_results=100,
    tweet_fields=fields,
    expansions=expansions
)

In [6]:
def process_tweets(search_response):
    results  = search_response.data
    inc_tweets = search_response.includes['tweets']
    inc_tweets_data = [tweet.data for tweet in inc_tweets]
    tweets = []
    tweet_data = {}
    for tweet in results:
        tweet_data = tweet.data
        tweet_data["is_rt"]   = False
        tweet_data["rt_id"]   = None
        tweet_data["rt_text"] = None
        ref_tw = tweet.get('referenced_tweets')
        if ref_tw:
            rt = [rt for rt in ref_tw if rt.get('type')=="retweeted"]
            if len(rt) > 0:
                rt = rt[0]
                tweet_data["is_rt"]   = True
                tweet_data["rt_id"]   = rt.data['id']  
                tweet_data["rt_text"] = [inc['text'] for inc in inc_tweets_data if inc['id']==rt.data['id']][0]
        tweets.append(tweet_data)
    return tweets

In [7]:
processed_tweets = process_tweets(ukraine_tweets)

In [8]:
len(processed_tweets)

100

In [17]:
# Save multiple pages of data from a query
# Alternative : https://docs.tweepy.org/en/stable/streamingclient.html
next_token = None
all_tweet_data = []
for i in range(10):
    if i % 1 == 0:
        print('getting page {} ...'.format(i))
    if next_token:
        ukraine_tweets = client.search_recent_tweets(
            query="ukraine",
            max_results=100,
            tweet_fields=fields,
            expansions=expansions,
            next_token = next_token
        )
        tweet_data_list = process_tweets(ukraine_tweets)
    else:
        ukraine_tweets = client.search_recent_tweets(
            query="ukraine",
            max_results=100,
            expansions=expansions,
            tweet_fields=fields,
        )
        tweet_data_list = process_tweets(ukraine_tweets)
    all_tweet_data += tweet_data_list
    next_token = ukraine_tweets[3]['next_token']

getting page 0 ...
getting page 1 ...
getting page 2 ...
getting page 3 ...
getting page 4 ...
getting page 5 ...
getting page 6 ...
getting page 7 ...
getting page 8 ...
getting page 9 ...


In [18]:
with open("../datasets/ukraine_tweets.json", "w") as outfile:
    json.dump(all_tweet_data, outfile, indent=4)

In [19]:
len(all_tweet_data)

1000

### Clean Tweets

In [20]:
def clean_tweet(tweet): 
    processed_tweet = tweet
    processed_tweet["id"] = tweet['id']
    processed_tweet["user"] = tweet['author_id']
    created_at = datetime.strptime(tweet["created_at"],"%Y-%m-%dT%H:%M:%S.%fZ")
    processed_tweet["created_at"] = created_at
    processed_tweet["lang"] = tweet['lang']
    
    if tweet['lang'] != "en":
        processed_tweet["is_en"] = False
    else: 
        processed_tweet["is_en"] = True
            
    return processed_tweet

In [27]:
tweet_data_list = json.loads(open('../datasets/ukraine_tweets.json').read())
len(tweet_data_list)

1000

In [28]:
# filter all of the raw tweets by turning them into clean_tweet objects
# the filtering is taken care of in the class function
filtered_data = []
for elem in tweet_data_list: 
    filtered_tweet = clean_tweet(elem)
    filtered_data.append(filtered_tweet)

In [29]:
len(filtered_data)

1000

In [30]:
# Re-serialize dates and save raw data
clean_tweets_json = []
for fd in filtered_data:
    fd['created_at'] = datetime.strftime(fd['created_at'],"%Y-%m-%dT%H:%M:%S.%fZ")
    clean_tweets_json.append(fd)
with open("../datasets/ukraine_tweets_clean.json", "w") as outfile:
    json.dump(clean_tweets_json, outfile, indent=4)

### Process data

In [31]:
filtered_data = json.loads(open('../datasets/ukraine_tweets_clean.json').read())

In [32]:
# create a list of all the tweet text 
# we filter out all tweets that are not English
tweet_text = []
for tweet in filtered_data:
    if tweet["is_en"]:
        if tweet.get("is_rt"): 
            tweet_text.append(tweet["rt_text"].replace("\n", " "))
        else:
            tweet_text.append(tweet["text"].replace("\n", " "))

In [33]:
# There are fewer than 1,000 since filtered for english tweets
len(tweet_text)

827

### Preprocessing Data

In [34]:
# remove HTML links, mentions, hashtags, and special characters

def strip_links(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ' ')    
    return text

def strip_mentions(text):
    entity_prefixes = ['@']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

def strip_hashtags(text):
    entity_prefixes = ['#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)
        
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [35]:
stripped_tweet_text = []
for elem in tweet_text:
    elem = strip_links(elem)
    elem = strip_mentions(elem)
    elem = strip_hashtags(elem)
    elem = elem.replace('RT', '')
    elem = remove_special_characters(elem)
    stripped_tweet_text.append(elem)

In [None]:
tweet_text[119]

In [None]:
stripped_tweet_text[119]

In [None]:
tweet_text[171]

In [None]:
stripped_tweet_text[171]

### Sentiment Analysis

In [None]:
for i,elem in enumerate(stripped_tweet_text):
    elem_textblob = TextBlob(elem)
    sent = elem_textblob.sentiment
    print (elem)
    print(sent)
    print ("----")

In [43]:
#find sentiment vader
analyser = SentimentIntensityAnalyzer()

In [125]:
help(analyser.polarity_scores)

Help on method polarity_scores in module vaderSentiment.vaderSentiment:

polarity_scores(text) method of vaderSentiment.vaderSentiment.SentimentIntensityAnalyzer instance
    Return a float for sentiment strength based on the input text.
    Positive values are positive valence, negative value are negative
    valence.



In [44]:
snt = analyser.polarity_scores('This is an examle of a happy tweet')
print(snt)

{'neg': 0.0, 'neu': 0.654, 'pos': 0.346, 'compound': 0.5719}


In [None]:
for elem in stripped_tweet_text:
    print (elem)
    print (analyser.polarity_scores(elem))
    print ("----")

### Stemming/Lemming

In [46]:
# Stemming / Lemming

### loading a spacy language model
# python -m spacy download en_core_web_sm
# https://spacy.io/models/en
nlp = spacy.load('en_core_web_sm') 

def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

### Tokenizing and Corpus Creation

In [None]:
### Run this the first time
nltk.download('stopwords')

In [47]:
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text



In [104]:
#Create corupus of all words
words_corpus = []
for elem in stripped_tweet_text:
    # remove stop words
    elem = remove_stopwords(elem)
    # lemmatize text
    elem = lemmatize_text(elem)
    words_corpus.append(elem.lower().split())
print(len(words_corpus))

dictionary = corpora.Dictionary(words_corpus)
print(len(dictionary))

827
3426


In [64]:
dictionary.num_docs, dictionary.num_pos

(827, 14858)

### Topic Modeling

In [50]:
help(dictionary.filter_extremes)

Help on method filter_extremes in module gensim.corpora.dictionary:

filter_extremes(no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None) method of gensim.corpora.dictionary.Dictionary instance
    Filter out tokens in the dictionary by their frequency.
    
    Parameters
    ----------
    no_below : int, optional
        Keep tokens which are contained in at least `no_below` documents.
    no_above : float, optional
        Keep tokens which are contained in no more than `no_above` documents
        (fraction of total corpus size, not an absolute number).
    keep_n : int, optional
        Keep only the first `keep_n` most frequent tokens.
    keep_tokens : iterable of str
        Iterable of tokens that **must** stay in dictionary after filtering.
    
    Notes
    -----
    This removes all tokens in the dictionary that are:
    
    #. Less frequent than `no_below` documents (absolute number, e.g. `5`) or 
    
    #. More frequent than `no_above` documents (fraction of th

In [146]:
dictionary = corpora.Dictionary(words_corpus)
dictionary.filter_extremes(no_below=125, no_above=0.2, keep_n=10000)

corpus_bow = [dictionary.doc2bow(text) for text in words_corpus]

# Term Frequency - Inverse Document Frequency

from gensim import corpora, models

# https://radimrehurek.com/gensim/models/ldamodel.html

tfidf = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf[corpus_bow]


In [None]:
# Only 2 topics this time for simplicity
num_topics = 2
lda_model_tfidf = models.LdaMulticore(corpus_tfidf, num_topics=num_topics, id2word=dictionary, passes=4, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

### Part of Speech Tagging

In [61]:
sentence = 'London is the capital and most populous city of England and the United Kingdom'
sentence_nlp = nlp(sentence)

In [62]:
from spacy import displacy
displacy.render(sentence_nlp, jupyter=True, 
                options={'distance': 110,
                         'arrow_stroke': 2,
                         'arrow_width': 8})

In [63]:
# print named entities in article
print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])

# visualize named entities
displacy.render(sentence_nlp, style='ent', jupyter=True)

[(London, 'GPE'), (England, 'GPE'), (the, 'GPE'), (United, 'GPE'), (Kingdom, 'GPE')]
