In [1]:
import sys
# !{sys.executable} -m pip install tweepy
# !{sys.executable} -m pip install textblob
# !{sys.executable} -m pip install wordcloud
# !{sys.executable} -m pip install nltk
# !{sys.executable} -m pip install twython

In [29]:
# Import the necessary libraies and methods
import json
import numpy as np
import pandas as pd
import re
import warnings

# import Variables that contains the user credentials to access Twitter API
from twitter_authentication import API_KEY, API_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET

# tweepy is the python client for the official Twitter API
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
from tweepy import API

# textblob is the python library for sentiment analysis
from textblob import TextBlob

# Visualisation
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns

# wordcloud is a python library for printing a cloud of words
from wordcloud import WordCloud, STOPWORDS

# nltk
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import tokenize

# set the matplotlib style
matplotlib.style.use('ggplot')

%matplotlib inline

In [4]:
#This line filter Twitter Streams to capture data by the keywords included in TAGS variable
TAGS = ['@PetroleumEcon', '@IHSMarkit', '@PWYPUSA', '@SPGlobalRatings', '@energyintel', '@michaelkamon', 
        '@pvmagazineusa', '@mwtnews', '@NRGInstitute', '@WorldOil', '@robinenergy', '@PortXLglobal', 
        '@UpstreamOnline', '@greentechmedia', '@PlattsOil', '@AyataAnalytics', '@OilandGasIQ', '@oil_recruitment', 
        '@Gaurav81184', '@ReemaHarfoushi', '@EIAgov', '@tfoxlaw', '@tapmanagement', '@boonepickens', '@EmilyPickrell', 
        '@Chevron', '@IERenergy', '@TheOilDrum', '@FPSONetwork', '@OilsandsEditor', '@Shell_NatGas', '@iraqoilreport', 
        '@jpolson9', '@BloombergNRG', '@oilandgasnews', '@OilGasMalaysia', '@UKERCHQ', '@OilGasCanada', '@ooga_hq', 
        '@OilandGasInvest', '@robinenergy', '@derek_brower', '@theoilprice', '@stevelevine', '@marcellusmin', 
        '@AmyAHarder', '@saeverley', '@davidshellblog', '@KateGalbraith', '@oilandgasuk', '@TerryMac999', '@BP_America', 
        '@RigData', '@LaOilGasAssoc', '@Shell', '@BGGroup', '@nelderini', '@PatrickOsgood', '@JonathanFahey', 
        '@wenkennedy', '@psdvi', '@EnergyInDepth', '@exxonmobil', '@twitoil', '@Saudi_Aramco', '@Total', 
        '@offshoremgzn', '@Chevron', '@OGJOnline', '@OilVoice', '@BP_plc', '@Rigzone', '@UpstreamOnline', '@OilandGasIQ', 
        '@WorldOil', '@worldoilngas', '@ExxonMobil', '@ExxonMobil_UK', '@ExxonMobil_EU', '@ExxonMobil_NG', '@ExxonMobil_ID', 
        '@exxonmobil_qa', '@exxonmobil_aus', '@ExxonMobil_AK', '@Shell', '@Shell_UKLtd', '@Shell_US', '@Shell_Canada', 
        '@Shell_Nigeria', '@Shell_India', '@eni', '@BP_UK']

# sort a list
def sortList(list):
    return list.sort(key=str.lower)

# set to lower case elements in a list
def lcase(list):
    l = [a.lower() for a in list]
    return l

# remove duplicates in a list
def remove_dup_inlist(onelist):
    return list(set(onelist))

TAGS = lcase(TAGS)
TAGS = remove_dup_inlist(TAGS)
sortList(TAGS)

print((len(TAGS), TAGS))
# stream.filter(track=['@PetroleumEcon', '@IHSMarkit', '@michaelkamon'])

(85, ['@amyaharder', '@ayataanalytics', '@bggroup', '@bloombergnrg', '@boonepickens', '@bp_america', '@bp_plc', '@bp_uk', '@chevron', '@davidshellblog', '@derek_brower', '@eiagov', '@emilypickrell', '@energyindepth', '@energyintel', '@eni', '@exxonmobil', '@exxonmobil_ak', '@exxonmobil_aus', '@exxonmobil_eu', '@exxonmobil_id', '@exxonmobil_ng', '@exxonmobil_qa', '@exxonmobil_uk', '@fpsonetwork', '@gaurav81184', '@greentechmedia', '@ierenergy', '@ihsmarkit', '@iraqoilreport', '@jonathanfahey', '@jpolson9', '@kategalbraith', '@laoilgasassoc', '@marcellusmin', '@michaelkamon', '@mwtnews', '@nelderini', '@nrginstitute', '@offshoremgzn', '@ogjonline', '@oil_recruitment', '@oilandgasinvest', '@oilandgasiq', '@oilandgasnews', '@oilandgasuk', '@oilgascanada', '@oilgasmalaysia', '@oilsandseditor', '@oilvoice', '@ooga_hq', '@patrickosgood', '@petroleumecon', '@plattsoil', '@portxlglobal', '@psdvi', '@pvmagazineusa', '@pwypusa', '@reemaharfoushi', '@rigdata', '@rigzone', '@robinenergy', '@saeverl

In [10]:
# this handles Twitter authentication 
auth = OAuthHandler(API_KEY, API_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

api = API(auth)

In [45]:
MAX_TWEETS = 100
LANG = 'en'

df_tweets = pd.DataFrame({'tag':[], 'text':[]}) # temporary empty dataframe
for tag in TAGS:
    print('Extracting {} ...'.format(tag))
    # fetch tweets of a particular 'tag'
    query = tag
    tweets = api.search(q=query, count=MAX_TWEETS, lang=LANG)
    for tweet in tweets:
        df = pd.DataFrame({'tag':tag, 'text':tweet.text}, index=df_tweets.columns)
        df_tweets = df_tweets.append(df, ignore_index=True)
    print('                  added {} tweets'.format(len(tweets)))

Extracting @amyaharder ...
                  added 100 tweets
Extracting @ayataanalytics ...
                  added 3 tweets
Extracting @bggroup ...
                  added 0 tweets
Extracting @bloombergnrg ...
                  added 100 tweets
Extracting @boonepickens ...
                  added 81 tweets
Extracting @bp_america ...
                  added 100 tweets
Extracting @bp_plc ...
                  added 100 tweets
Extracting @bp_uk ...
                  added 100 tweets
Extracting @chevron ...
                  added 100 tweets
Extracting @davidshellblog ...
                  added 5 tweets
Extracting @derek_brower ...
                  added 65 tweets
Extracting @eiagov ...
                  added 100 tweets
Extracting @emilypickrell ...
                  added 0 tweets
Extracting @energyindepth ...
                  added 100 tweets
Extracting @energyintel ...
                  added 99 tweets
Extracting @eni ...
                  added 88 tweets
Extracting @exxonmobil ..

In [51]:
pd.set_option("display.max_columns", 200)
df_tweets.head().values

array([['@amyaharder',
        "@AmyAHarder Israel's red lines may not matter for the Saudi nuclear deal if South Korea gets the business which it… https://t.co/0UOXStGJj9"],
       ['@amyaharder',
        "@AmyAHarder Israel's red lines may not matter for the Saudi nuclear deal if South Korea gets the business which it… https://t.co/0UOXStGJj9"],
       ['@amyaharder',
        'RT @AmyAHarder: One conservative on EPA\'s efforts to repeal Obama-era methane rules: “I’ve given up on even paying attention to that." That…'],
       ['@amyaharder',
        'RT @AmyAHarder: One conservative on EPA\'s efforts to repeal Obama-era methane rules: “I’ve given up on even paying attention to that." That…'],
       ['@amyaharder',
        'RT @RLewis_UIowa: Interesting predictions -- &amp; progress report -- from @AmyAHarder via @axios https://t.co/VwLmI2wxCE']], dtype=object)

In [52]:
df_tweets.tail().values

array([['@worldoil',
        'RT @WorldOil: BREAKING: Scott Pruitt resigns as EPA chief. Report: https://t.co/RHsIwXc0q5'],
       ['@worldoil',
        'RT @WorldOil: Trade tensions loom over world’s fastest-growing fossil fuel. #OOTT #oilandgas #LNG  Find out more: https://t.co/SMRGwUhr2C'],
       ['@worldoil',
        'RT @WorldOil: Trade tensions loom over world’s fastest-growing fossil fuel. #OOTT #oilandgas #LNG  Find out more: https://t.co/SMRGwUhr2C'],
       ['@worldoil',
        'RT @WorldOil: .@eni completes development of the largest producing #offshore #gas field in #Libya. Details: https://t.co/d2F8SNgOuV'],
       ['@worldoil',
        'RT @WorldOil: .@eni completes development of the largest producing #offshore #gas field in #Libya. Details: https://t.co/d2F8SNgOuV']], dtype=object)

In [None]:
# visualization with wordcloud
def wordcloud(tweets, col):
    stopwords = set(STOPWORDS)
    wordcloud = WordCloud(background_color="white",stopwords=stopwords,random_state = 123).generate(" ".join([i for i in tweets[col]]))
    plt.figure(figsize=(20,10), facecolor='k')
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title("Common Words on Tweets of SuperMajors")
# wordcloud(tweets,'text')