In [1]:
import sys
# !{sys.executable} -m pip install tweepy
# !{sys.executable} -m pip install textblob
# !{sys.executable} -m pip install wordcloud
# !{sys.executable} -m pip install nltk
# !{sys.executable} -m pip install twython

In [2]:
# Import the necessary libraies and methods
import json
import numpy as np
import pandas as pd
import re
import warnings

# import Variables that contains the user credentials to access Twitter API
from twitter_authentication import API_KEY, API_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET

# tweepy is the python client for the official Twitter API
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import API

# textblob is the python library for sentiment analysis
from textblob import TextBlob

# Visualisation
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# wordcloud is a python library for printing a cloud of words
from wordcloud import WordCloud, STOPWORDS

# nltk
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize

# set the matplotlib style
matplotlib.style.use('ggplot')

%matplotlib inline

In [3]:
#This line filter Twitter Streams to capture data by the keywords included in TAGS variable
TAGS = ['@PetroleumEcon', '@IHSMarkit', '@PWYPUSA', '@SPGlobalRatings', '@energyintel', '@michaelkamon', 
        '@pvmagazineusa', '@mwtnews', '@NRGInstitute', '@WorldOil', '@robinenergy', '@PortXLglobal', 
        '@UpstreamOnline', '@greentechmedia', '@PlattsOil', '@AyataAnalytics', '@OilandGasIQ', '@oil_recruitment', 
        '@Gaurav81184', '@ReemaHarfoushi', '@EIAgov', '@tfoxlaw', '@tapmanagement', '@boonepickens', '@EmilyPickrell', 
        '@Chevron', '@IERenergy', '@TheOilDrum', '@FPSONetwork', '@OilsandsEditor', '@Shell_NatGas', '@iraqoilreport', 
        '@jpolson9', '@BloombergNRG', '@oilandgasnews', '@OilGasMalaysia', '@UKERCHQ', '@OilGasCanada', '@ooga_hq', 
        '@OilandGasInvest', '@robinenergy', '@derek_brower', '@theoilprice', '@stevelevine', '@marcellusmin', 
        '@AmyAHarder', '@saeverley', '@davidshellblog', '@KateGalbraith', '@oilandgasuk', '@TerryMac999', '@BP_America', 
        '@RigData', '@LaOilGasAssoc', '@Shell', '@BGGroup', '@nelderini', '@PatrickOsgood', '@JonathanFahey', 
        '@wenkennedy', '@psdvi', '@EnergyInDepth', '@exxonmobil', '@twitoil', '@Saudi_Aramco', '@Total', 
        '@offshoremgzn', '@Chevron', '@OGJOnline', '@OilVoice', '@BP_plc', '@Rigzone', '@UpstreamOnline', '@OilandGasIQ', 
        '@WorldOil', '@worldoilngas', '@ExxonMobil', '@ExxonMobil_UK', '@ExxonMobil_EU', '@ExxonMobil_NG', '@ExxonMobil_ID', 
        '@exxonmobil_qa', '@exxonmobil_aus', '@ExxonMobil_AK', '@Shell', '@Shell_UKLtd', '@Shell_US', '@Shell_Canada', 
        '@Shell_Nigeria', '@Shell_India', '@eni', '@BP_UK']

# sort a list
def sortList(list):
    return list.sort(key=str.lower)

# set to lower case elements in a list
def lcase(list):
    l = [a.lower() for a in list]
    return l

# remove duplicates in a list
def remove_dup_inlist(onelist):
    return list(set(onelist))

TAGS = lcase(TAGS)
TAGS = remove_dup_inlist(TAGS)
sortList(TAGS)

print((len(TAGS), TAGS))

(85, ['@amyaharder', '@ayataanalytics', '@bggroup', '@bloombergnrg', '@boonepickens', '@bp_america', '@bp_plc', '@bp_uk', '@chevron', '@davidshellblog', '@derek_brower', '@eiagov', '@emilypickrell', '@energyindepth', '@energyintel', '@eni', '@exxonmobil', '@exxonmobil_ak', '@exxonmobil_aus', '@exxonmobil_eu', '@exxonmobil_id', '@exxonmobil_ng', '@exxonmobil_qa', '@exxonmobil_uk', '@fpsonetwork', '@gaurav81184', '@greentechmedia', '@ierenergy', '@ihsmarkit', '@iraqoilreport', '@jonathanfahey', '@jpolson9', '@kategalbraith', '@laoilgasassoc', '@marcellusmin', '@michaelkamon', '@mwtnews', '@nelderini', '@nrginstitute', '@offshoremgzn', '@ogjonline', '@oil_recruitment', '@oilandgasinvest', '@oilandgasiq', '@oilandgasnews', '@oilandgasuk', '@oilgascanada', '@oilgasmalaysia', '@oilsandseditor', '@oilvoice', '@ooga_hq', '@patrickosgood', '@petroleumecon', '@plattsoil', '@portxlglobal', '@psdvi', '@pvmagazineusa', '@pwypusa', '@reemaharfoushi', '@rigdata', '@rigzone', '@robinenergy', '@saeverl

In [4]:
# this handles Twitter authentication 
auth = OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

api = API(auth, wait_on_rate_limit=True)   # it will wait instead of failing, when rate limit is reached

In [5]:
MAX_TWEETS = 100
LANG = 'en'

# saving tweets by TAG
df_tweets = pd.DataFrame({'tag':[], 'text':[]}) # temporary empty dataframe
for tag in TAGS:
    # fetch tweets of a particular 'tag'
    query = tag
    tweets = api.search(q=query, count=MAX_TWEETS, lang=LANG)
    for tweet in tweets:
        df = pd.DataFrame({'tag':tag, 'text':tweet.text}, index=df_tweets.columns)
        df_tweets = df_tweets.append(df, ignore_index=True)

In [6]:
# show some tweets
df_tweets.head(10)

Unnamed: 0,tag,text
0,@amyaharder,RT @AmyAHarder: In early January I laid out ei...
1,@amyaharder,RT @AmyAHarder: In early January I laid out ei...
2,@amyaharder,@AmyAHarder Israel's red lines may not matter ...
3,@amyaharder,@AmyAHarder Israel's red lines may not matter ...
4,@amyaharder,RT @AmyAHarder: One conservative on EPA's effo...
5,@amyaharder,RT @AmyAHarder: One conservative on EPA's effo...
6,@amyaharder,RT @RLewis_UIowa: Interesting predictions -- &...
7,@amyaharder,RT @RLewis_UIowa: Interesting predictions -- &...
8,@amyaharder,RT @AmyAHarder: In early January I laid out ei...
9,@amyaharder,RT @AmyAHarder: In early January I laid out ei...


In [9]:
df_tweets.drop_duplicates(inplace=True)

In [10]:
# show some (unduplicated) tweets
df_tweets.head(10)

Unnamed: 0,tag,text
0,@amyaharder,RT @AmyAHarder: In early January I laid out ei...
2,@amyaharder,@AmyAHarder Israel's red lines may not matter ...
4,@amyaharder,RT @AmyAHarder: One conservative on EPA's effo...
6,@amyaharder,RT @RLewis_UIowa: Interesting predictions -- &...
16,@amyaharder,"RT @AmyAHarder: Six months into the year, here..."
22,@amyaharder,Interesting predictions -- &amp; progress repo...
24,@amyaharder,"@AmyAHarder For 2019, I would add Supreme Cour..."
26,@amyaharder,@AmyAHarder hi. In your opinion will epa rollb...
32,@amyaharder,RT @AmyAHarder: My latest Harder Line column: ...
34,@amyaharder,@AmyAHarder sees momentum for US carbon tax. “...


In [11]:
len(df_tweets['text'])

2678

In [None]:
# Preprocessing of tweets and cleaning special characters
# TODO

In [None]:
#add tweetos first part
for i in range(len(df_tweets['text'])):
    try:
        df_tweets['tweetos'][i] = df_tweets['text'].str.split(' ')[i][0]
    except AttributeError:    
        df_tweets['tweetos'][i] = 'other'

#Preprocessing tweetos. select tweetos contains 'RT @'
for i in range(len(df_tweets['text'])):
    if df_tweets['tweetos'].str.contains('@')[i]  == False:
        df_tweets['tweetos'][i] = 'other'
        
# remove URLs, RTs, and twitter handles
for i in range(len(df_tweets['text'])):
    df_tweets['text'][i] = " ".join([word for word in df_tweets['text'][i].split()
                                if 'http' not in word and '@' not in word and '<' not in word])

In [None]:
# df_tweets.loc[['tweetos', 'tag', 'text']][:5]
df_tweets[:5]

In [None]:
# tweets_string = list(set(df_tweets['text'].values))
# tweets_string

In [None]:
tweets['text'][1]

In [None]:
def get_tweet_sentiment(tweet):
        
        # create TextBlob object of passed tweet text
        analysis = TextBlob(self.trim_tweet(tweet))

        # set sentiment
        if analysis.sentiment.polarity > 0:
            return 'positive'
        elif analysis.sentiment.polarity == 0:
            return 'neutral'
        else:
            return 'negative'

In [None]:
pd.set_option("display.max_columns", 200)
df_tweets.text.values[:5]

In [None]:
df_tweets.tail().values

In [None]:
# visualization with wordcloud
def wordcloud(tweets, col):
    stopwords = set(STOPWORDS)
    wordcloud = WordCloud(background_color="white",stopwords=stopwords,random_state = 123).generate(" ".join([i for i in tweets[col]]))
    plt.figure(figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title("Common Words on Tweets of SuperMajors")
# wordcloud(tweets,'text')