In [17]:
import collections
import itertools
import json
import os
import re

import matplotlib.pyplot as plt
import networkx
import nltk
import pandas as pd
import requests
import seaborn as sns
import tweepy
import yaml
from nltk.corpus import stopwords
from textblob import TextBlob

# Collecting tweets
We use tweepy, a Python client built on top of the Twitter API, to collect tweets.

In [18]:
# Parse the YAML configuration; the file handle is only needed for parsing.
with open("config.yaml") as file:
    keys = yaml.safe_load(file)

# Pull the four Twitter credentials out of the "search_tweets_api" section.
twitter_credentials = keys["search_tweets_api"]
consumer_key = twitter_credentials["consumer_key"]
consumer_secret = twitter_credentials["consumer_secret"]
access_token = twitter_credentials["access_token"]
access_token_secret = twitter_credentials["access_token_secret"]

In [24]:
def auth():
    """Create a tweepy API client from the notebook-level Twitter credentials.

    Reads ``consumer_key``, ``consumer_secret``, ``access_token`` and
    ``access_token_secret`` from the enclosing namespace.

    Returns:
        tweepy.API: client created with ``wait_on_rate_limit=True``.

    Raises:
        Exception: any error raised while building the handler or the client
            is reported on stdout and then re-raised unchanged.
    """
    try:
        # Named ``handler`` so the local does not shadow this function's name.
        handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
        handler.set_access_token(access_token, access_token_secret)
        return tweepy.API(handler, wait_on_rate_limit=True)
    except Exception:
        # Re-raise instead of falling through: returning an unbound ``api``
        # would replace the real error with an UnboundLocalError.
        print("An error occurred during the authentication")
        raise

In [21]:
def remove_url(txt):
    """Strip URLs and non-alphanumeric characters from a piece of text.

    Every ``scheme://...`` run and every character other than ASCII letters,
    digits, spaces and tabs is deleted; the remaining words are then joined
    with single spaces.

    Args:
        txt: the string to clean.

    Returns:
        str: the cleaned text with whitespace collapsed.
    """
    # Raw string: the pattern uses regex escapes (\w, \/, \S) that are not
    # valid Python string escapes and trigger warnings in a plain literal.
    cleaned = re.sub(r"([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt)
    return " ".join(cleaned.split())

Note that Twitter's search service is not meant to be an exhaustive source of Tweets: not all Tweets are indexed or made available via the search interface. We therefore treat the collected tweets as a random sample of users' sentiment rather than a complete record.

In [34]:
def search_by_hashtag(api, date_until, words, filename='tweets.csv'):
    """Search tweets matching a query and save them to a CSV file.

    Args:
        api: tweepy API client; its ``search_tweets`` method is paged through
            with ``tweepy.Cursor``.
        date_until: value forwarded to the search as ``until``.
        words: query string forwarded to the search as ``q``.
        filename: path of the CSV file to write. Defaults to ``'tweets.csv'``.

    Returns:
        pandas.DataFrame: one row per tweet with the columns ``id``,
        ``created_at``, ``username``, ``location``, ``following``,
        ``followers``, ``retweetcount`` and ``text``. The same frame is
        written to ``filename``.
    """
    columns = ['id', 'created_at', 'username', 'location', 'following',
               'followers', 'retweetcount', 'text']
    cursor = tweepy.Cursor(api.search_tweets, q=words, lang="en",
                           until=date_until, tweet_mode='extended')

    # Collect plain rows and build the frame once; appending to a DataFrame
    # row by row re-allocates it on every tweet.
    rows = []
    for tweet in cursor.items():
        # When the tweet carries a ``retweeted_status``, use that status's
        # text; otherwise fall back to the tweet's own text.
        try:
            text = tweet.retweeted_status.full_text
        except AttributeError:
            text = tweet.full_text

        rows.append([
            tweet.id,
            tweet.created_at,
            tweet.user.screen_name,
            tweet.user.location,
            tweet.user.friends_count,
            tweet.user.followers_count,
            tweet.retweet_count,
            remove_url(text),
        ])

    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(filename)
    return df

In [36]:
# Collect tweets for the hashtag up to the given date; the search writes
# its results to tweets.csv.
api = auth()
words = "#UkraineRussia"
date_until = "2022-02-24"
search_by_hashtag(api, date_until, words)

# Bind the collected tweets to `df` for the analysis cells below, which all
# read from it. keep_default_na=False keeps empty or "NA"-like tweet texts as
# strings instead of turning them into NaN.
df = pd.read_csv("tweets.csv", index_col=0, keep_default_na=False)

Rate limit reached. Sleeping for: 3
Rate limit reached. Sleeping for: 851
Rate limit reached. Sleeping for: 851
Rate limit reached. Sleeping for: 850
Rate limit reached. Sleeping for: 853


# Sentiment Analysis
## Microsoft Azure’s Text Analytics
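We first use Azure Cognitive Services to obtain a sentiment score for each tweet.
The returned score lies in the range [0, 1], where values close to 1 indicate positive sentiment and values close to 0 indicate negative sentiment.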

In [None]:
def connect_to_azure(data):
    """Return the Azure Text Analytics endpoints and the subscription key.

    Args:
        data: configuration mapping; ``data["azure"]["subscription_key"]``
            is read from it.

    Returns:
        tuple: ``(language_api_url, sentiment_url, subscription_key)``.
    """
    base_url = "https://urwarsentiment.cognitiveservices.azure.com/"
    endpoint_templates = (
        "{}text/analytics/v2.1/languages",
        "{}text/analytics/v2.1/sentiment",
    )
    # Expand both templates against the same base URL.
    language_endpoint, sentiment_endpoint = (
        template.format(base_url) for template in endpoint_templates
    )
    return language_endpoint, sentiment_endpoint, data["azure"]["subscription_key"]

def azure_header(subscription_key):
    """Build the request headers carrying the Azure subscription key.

    Args:
        subscription_key: value to send as ``Ocp-Apim-Subscription-Key``.

    Returns:
        dict: a single-entry header mapping.
    """
    headers = dict()
    headers["Ocp-Apim-Subscription-Key"] = subscription_key
    return headers

We need to further format the collected data. The final payload sent to the Azure endpoint should contain only what the API needs for each document: an identifier, the text, and its language.

In [None]:
def combine_lang_data(df):
    """Serialise tweet texts as JSON records for the Azure sentiment request.

    Args:
        df: DataFrame with a ``"text"`` column. It is not modified.

    Returns:
        str: a JSON array with one object per row, holding ``"id"`` (the
        row's position as a string), ``"text"`` and ``"language"``
        (always ``"en"``).
    """
    # Select with a list so we get a DataFrame; on a Series,
    # ``obj["language"] = "en"`` would append an element, not a column.
    text_lan = df[["text"]].copy()
    # Positional ids stay unique whatever the index of ``df`` looks like.
    text_lan.insert(0, "id", [str(position) for position in range(len(text_lan))])
    text_lan["language"] = "en"
    return text_lan.to_json(orient="records")

In [None]:
def add_document_format(json_lines):
    """Wrap JSON records in the ``{"documents": [...]}`` request envelope.

    Args:
        json_lines: JSON text holding an array of document records.

    Returns:
        dict: ``{"documents": <parsed records>}``.

    Raises:
        json.JSONDecodeError: if ``json_lines`` is not valid JSON.
    """
    # Parse with the JSON parser rather than ``ast.literal_eval``: JSON
    # ``null``/``true``/``false`` and ``\/`` escapes are not Python literals.
    return {"documents": json.loads(json_lines)}

In [None]:
def sentiment_scores(headers, sentiment_url, document_format, timeout=30):
    """POST the documents to the sentiment endpoint and return the parsed reply.

    Args:
        headers: HTTP headers to send with the request.
        sentiment_url: URL of the sentiment endpoint.
        document_format: JSON-serialisable request body.
        timeout: seconds to wait for the server before giving up.
            Defaults to 30.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: if the service answers with an error status.
        requests.Timeout: if no answer arrives within ``timeout`` seconds.
    """
    response = requests.post(
        sentiment_url, headers=headers, json=document_format, timeout=timeout)
    # Fail here with the HTTP status rather than later on a missing key in
    # the error payload.
    response.raise_for_status()
    return response.json()

In [None]:
# Endpoint URLs and key come from the loaded configuration.
# NOTE(review): assumes config.yaml has an "azure" section with a
# "subscription_key" entry, as connect_to_azure reads it - confirm.
language_api_url, sentiment_url, subscription_key = connect_to_azure(keys)
headers = azure_header(subscription_key)

# Build the request body from the tweet texts and score it.
json_lines = combine_lang_data(df)
document_format = add_document_format(json_lines)
sentiments = sentiment_scores(headers, sentiment_url, document_format)

# "documents" is a list with one entry per scored document, so collect each
# entry's score. pandas raises if the number of scores differs from the
# number of rows.
df["azure_polar"] = [document["score"] for document in sentiments["documents"]]

## TextBlob
TextBlob is an open-source Python library for processing textual data. It can evaluate both the polarity and the subjectivity of a text. The polarity score is a float in the range [-1.0, 1.0], and the subjectivity score is a float in the range [0.0, 1.0], where 0.0 is very objective and 1.0 is very subjective.

In [None]:
# Score every tweet text with TextBlob.
sentiment_objects = [TextBlob(tweet) for tweet in df["text"]]
blob_polar = [blob.sentiment.polarity for blob in sentiment_objects]
# The sentiment result exposes `subjectivity`; there is no `subj` attribute.
blob_subj = [blob.sentiment.subjectivity for blob in sentiment_objects]
df["blob_polar"] = blob_polar
df["blob_subj"] = blob_subj

In [None]:
# plt.subplots returns (figure, axes); two stacked panels give one axis each
# for polarity and subjectivity.
fig, (ax1, ax2) = plt.subplots(2, 1)

# NOTE(review): the polarity bins have a single wide bin from -0.25 to 0.25;
# presumably this groups near-neutral tweets - confirm it is intended.
df.hist(column="blob_polar", bins=[-1, -0.75, -0.5, -0.25, 0.25, 0.5, 0.75, 1], ax=ax1)
ax1.set_title("Sentiments from Tweets on Ukraine-Russia War")

df.hist(column="blob_subj", bins=[0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1], ax=ax2)
# Title the second panel itself rather than overwriting the first one's title.
ax2.set_title("Subjectivity from Tweets on Ukraine-Russia War")

# Keep the two titles from overlapping the neighbouring panel.
fig.tight_layout()
plt.show()