In [1]:
#Module 5 project - Sentiment analysis of news companies twitter feed -BBC, CNN, CNBC, Bloomberg and FOX

In [2]:
import tweepy           # To consume Twitter's API
import pandas as pd     # To handle data
import numpy as np      # For number computing

# For plotting and visualization:
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer
from textblob import TextBlob

In [3]:
import json
def get_keys(path):
    with open(path) as f:
        return json.load(f)

In [4]:
# We import our access keys:
#from credentials import *    # This will allow us to use the keys as variables
keys = get_keys("/Users/eyambaita/.secret/api.json")

In [5]:
CONSUMER_KEY = keys['CONSUMER_KEY']
CONSUMER_SECRET = keys['CONSUMER_SECRET']
ACCESS_TOKEN = keys['ACCESS_TOKEN']
ACCESS_SECRET = keys['ACCESS_SECRET']

In [6]:
# API's setup:
def twitter_setup():
    """
    Utility function to setup the Twitter's API
    with our access keys provided.
    """
    # Authentication and access using keys:
    auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    auth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

    # Return API with authentication:
    api = tweepy.API(auth)
    return api

In [7]:
# We create an extractor object:
extractor = twitter_setup()

# We create a tweet list as follows:
tweets = extractor.user_timeline(screen_name="@BBCWorld", count=200)
tweets1 = extractor.user_timeline(screen_name="@CNN", count=200)
tweets2 = extractor.user_timeline(screen_name="@CNBC", count=200)
tweets3 = extractor.user_timeline(screen_name="@business", count=200)
tweets4 = extractor.user_timeline(screen_name="@FoxNews", count=200)
#print("Number of tweets extracted: {}.\n".format(len(tweets)))

# We print the most recent 5 tweets:
#print("5 recent tweets:\n")
for tweet in tweets[:5]:
    BBC=(tweet.text)
for tweet1 in tweets1[:5]:
    CNN=(tweet1.text)
for tweet2 in tweets2[:5]:
    CNBC=(tweet2.text)
for tweet3 in tweets3[:5]:
    BLOOM=(tweet3.text)
for tweet4 in tweets4[:5]:
    FOX=(tweet4.text)

In [8]:
data=pd.DataFrame(data=[tweet.text for tweet in tweets],columns =['BBC'])
data1 = pd.DataFrame(data=[tweet1.text for tweet1 in tweets1], columns=['CNN'])
data2 = pd.DataFrame(data=[tweet2.text for tweet2 in tweets2], columns=['CNBC'])
data3 = pd.DataFrame(data=[tweet3.text for tweet3 in tweets3], columns=['BLOOM'])
data4 = pd.DataFrame(data=[tweet4.text for tweet4 in tweets4], columns=['FOX'])
# We display the first 10 elements of the dataframe:
main_data_untouched =pd.concat([data, data1, data2, data3, data4], axis=1, join='outer', sort=True)
main_data = main_data_untouched
#main_data_untouched.head(3)
main_data.head(3)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX
0,RT @SallyBundockBBC: #Zuckerberg tries to conv...,The trans fats still hidden in many foods incr...,"When you think about London, you might think o...",Forget the IPO. The hippest strategy is backin...,BREAKING: Federal appeals court rules against ...
1,Argentina election: Voters dream of breaking c...,"""For the record, as far as I could tell, alien...",Global recession is a top concern among Asia P...,-Asia is bracing for Treasury’s report on curr...,Breaking News: Grand jury indicts captain of M...
2,Indian 'tiger poacher who ate sloth bear penis...,US soccer superstar striker Alex Morgan and he...,"“I’m not in favor of divorcing China, but I do...",The man allegedly behind the 1MDB heist parts ...,JUST IN: Police identify gunman who opened fir...


In [9]:
#Lowercasing
main_data['BBC'] = main_data['BBC'].apply(lambda x: " ".join(x.lower() for x in x.split()))
main_data['CNN'] = main_data['CNN'].apply(lambda x: " ".join(x.lower() for x in x.split()))
main_data['CNBC'] = main_data['CNBC'].apply(lambda x: " ".join(x.lower() for x in x.split()))
main_data['BLOOM'] = main_data['BLOOM'].apply(lambda x: " ".join(x.lower() for x in x.split()))
main_data['FOX'] = main_data['FOX'].apply(lambda x: " ".join(x.lower() for x in x.split()))
main_data.head(2)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX
0,rt @sallybundockbbc: #zuckerberg tries to conv...,the trans fats still hidden in many foods incr...,"when you think about london, you might think o...",forget the ipo. the hippest strategy is backin...,breaking: federal appeals court rules against ...
1,argentina election: voters dream of breaking c...,"""for the record, as far as i could tell, alien...",global recession is a top concern among asia p...,-asia is bracing for treasury’s report on curr...,breaking news: grand jury indicts captain of m...


In [10]:
## remove punctuation
main_data['BBC'] = main_data['BBC'].str.replace('[@]','')
main_data['CNN'] = main_data['CNN'].str.replace('[@]','')
main_data['CNBC'] = main_data['CNBC'].str.replace('[@]','')
main_data['BLOOM'] = main_data['BLOOM'].str.replace('[@]','')
main_data['FOX'] = main_data['FOX'].str.replace('[@]','')
main_data.head(2)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX
0,rt sallybundockbbc: #zuckerberg tries to convi...,the trans fats still hidden in many foods incr...,"when you think about london, you might think o...",forget the ipo. the hippest strategy is backin...,breaking: federal appeals court rules against ...
1,argentina election: voters dream of breaking c...,"""for the record, as far as i could tell, alien...",global recession is a top concern among asia p...,-asia is bracing for treasury’s report on curr...,breaking news: grand jury indicts captain of m...


In [11]:
#remove stopwords
stop = stopwords.words('english')
main_data['BBC'] = main_data['BBC'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
main_data['CNN'] = main_data['CNN'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
main_data['CNBC'] = main_data['CNBC'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
main_data['BLOOM'] = main_data['BLOOM'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
main_data['FOX'] = main_data['FOX'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
main_data.head(2)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX
0,rt sallybundockbbc: #zuckerberg tries convince...,trans fats still hidden many foods increased r...,"think london, might think tesco, largest retai...",forget ipo. hippest strategy backing tech comp...,breaking: federal appeals court rules trump ad...
1,argentina election: voters dream breaking cycl...,"""for record, far could tell, aliens never cont...",global recession top concern among asia pacifi...,-asia bracing treasury’s report currency manip...,breaking news: grand jury indicts captain miss...


In [12]:
#Stemming
st = PorterStemmer()
main_data['BBC'] = main_data['BBC'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
main_data['CNN'] = main_data['CNN'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
main_data['CNBC'] = main_data['CNBC'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
main_data['BLOOM'] = main_data['BLOOM'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
main_data['FOX'] = main_data['FOX'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
main_data.head(2)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX
0,rt sallybundockbbc: #zuckerberg tri convinc us...,tran fat still hidden mani food increas risk a...,"think london, might think tesco, largest retai...",forget ipo. hippest strategi back tech compani...,breaking: feder appeal court rule trump admini...
1,argentina election: voter dream break cycl cri...,"""for record, far could tell, alien never conta...",global recess top concern among asia pacif bus...,-asia brace treasury’ report currenc manipul -...,break news: grand juri indict captain missouri...


In [13]:
def senti(x):
    return (TextBlob(x).sentiment.polarity)  
###############################################
#Used function below instead

def analyze_sentiment(x):
    '''
    Utility function to classify the polarity of a tweet
    using textblob.
    '''
    analysis = TextBlob(x)
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0:
        return 0
    else:
        return -1

In [14]:
#As it can be observed there are two scores: 
#the first score is sentiment polarity which tells if the sentiment is positive or negative and 
#the second score is subjectivity score to tell how subjective is the text.
#The sentiment property returns a namedtuple of the form Sentiment(polarity, subjectivity). 
#The polarity score is a float within the range [-1.0, 1.0]. 
#The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

In [15]:
#main_data['senti_score_BBC'] = main_data['BBC'].apply(analyze_sentiment)
#main_data['senti_score_CNN'] = main_data['CNN'].apply(analyze_sentiment)
#main_data['senti_score_CNBC'] = main_data['CNBC'].apply(analyze_sentiment)
#main_data['senti_score_BLOOM'] = main_data['BLOOM'].apply(analyze_sentiment)
#main_data['senti_score_FOX'] = main_data['FOX'].apply(analyze_sentiment)
main_data['senti_score_BBC'] = main_data['BBC'].apply(senti)
main_data['senti_score_CNN'] = main_data['CNN'].apply(senti)
main_data['senti_score_CNBC'] = main_data['CNBC'].apply(senti)
main_data['senti_score_BLOOM'] = main_data['BLOOM'].apply(senti)
main_data['senti_score_FOX'] = main_data['FOX'].apply(senti)
main_data.head(2)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX,senti_score_BBC,senti_score_CNN,senti_score_CNBC,senti_score_BLOOM,senti_score_FOX
0,rt sallybundockbbc: #zuckerberg tri convinc us...,tran fat still hidden mani food increas risk a...,"think london, might think tesco, largest retai...",forget ipo. hippest strategi back tech compani...,breaking: feder appeal court rule trump admini...,0.0,-0.166667,0.0,0.0,0.0
1,argentina election: voter dream break cycl cri...,"""for record, far could tell, alien never conta...",global recess top concern among asia pacif bus...,-asia brace treasury’ report currenc manipul -...,break news: grand juri indict captain missouri...,0.0,-0.15,0.25,0.25,0.5


In [16]:
#list(main_data)

In [17]:
main_data=main_data[['BBC','senti_score_BBC','CNN','senti_score_CNN','CNBC','senti_score_CNBC','BLOOM','senti_score_BLOOM','FOX','senti_score_FOX']]

In [18]:
main_data.head(2)

Unnamed: 0,BBC,senti_score_BBC,CNN,senti_score_CNN,CNBC,senti_score_CNBC,BLOOM,senti_score_BLOOM,FOX,senti_score_FOX
0,rt sallybundockbbc: #zuckerberg tri convinc us...,0.0,tran fat still hidden mani food increas risk a...,-0.166667,"think london, might think tesco, largest retai...",0.0,forget ipo. hippest strategi back tech compani...,0.0,breaking: feder appeal court rule trump admini...,0.0
1,argentina election: voter dream break cycl cri...,0.0,"""for record, far could tell, alien never conta...",-0.15,global recess top concern among asia pacif bus...,0.25,-asia brace treasury’ report currenc manipul -...,0.25,break news: grand juri indict captain missouri...,0.5


In [19]:
#main_data.loc[main_data['senti_score_BBC'].idxmax()]
#print(main_data[['BBC']][main_data.senti_score_BBC == main_data.senti_score_BBC.max()])
hold = (main_data[['BBC']][main_data.senti_score_BBC == main_data.senti_score_BBC.max()])
hold1 = (main_data[['CNN']][main_data.senti_score_CNN == main_data.senti_score_CNN.max()])
hold2 = (main_data[['CNBC']][main_data.senti_score_CNBC == main_data.senti_score_CNBC.max()])
hold3 = (main_data[['BLOOM']][main_data.senti_score_BLOOM == main_data.senti_score_BLOOM.max()])
hold4 = (main_data[['FOX']][main_data.senti_score_FOX == main_data.senti_score_FOX.max()])
BBC_index = hold.index
CNN_index = hold1.index
CNBC_index = hold2.index
BLOOM_index = hold3.index
FOX_index = hold4.index
print(BBC_index)

Int64Index([52, 98, 173, 175], dtype='int64')


In [20]:
main_data_untouched.head(3)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX,senti_score_BBC,senti_score_CNN,senti_score_CNBC,senti_score_BLOOM,senti_score_FOX
0,rt sallybundockbbc: #zuckerberg tri convinc us...,tran fat still hidden mani food increas risk a...,"think london, might think tesco, largest retai...",forget ipo. hippest strategi back tech compani...,breaking: feder appeal court rule trump admini...,0.0,-0.166667,0.0,0.0,0.0
1,argentina election: voter dream break cycl cri...,"""for record, far could tell, alien never conta...",global recess top concern among asia pacif bus...,-asia brace treasury’ report currenc manipul -...,break news: grand juri indict captain missouri...,0.0,-0.15,0.25,0.25,0.5
2,indian 'tiger poacher ate sloth bear penises' ...,us soccer superstar striker alex morgan husban...,"“i’m favor divorc china, think recalibr rebal ...",man allegedli behind 1mdb heist part london li...,in: polic identifi gunman open fire insid cali...,-0.05,0.0,0.0,-0.4,0.0


In [21]:
Recommend_BBC = main_data_untouched.loc[BBC_index,'BBC']
print(Recommend_BBC)

52     hous rules: 'bullied' realiti tv star win payo...
98     botswana' elect could decid eleph diamond http...
173    renamo call mozambiqu elect cancel https://t.c...
175    north macedonia call snap elect eu talk setbac...
Name: BBC, dtype: object


In [22]:
Recommend_CNN = main_data_untouched.loc[CNN_index,'CNN']
print(Recommend_CNN)

108    "one best valu america free speech... we'r all...
Name: CNN, dtype: object


In [23]:
Recommend_CNBC = main_data_untouched.loc[CNBC_index,'CNBC']
print(Recommend_CNBC)

3      10 best univers world, accord u.s. news &amp; ...
191    ub upgrad coca-cola, say stabl earn growth per...
Name: CNBC, dtype: object


In [24]:
Recommend_BLOOM = main_data_untouched.loc[BLOOM_index,'CNBC']
print(Recommend_BLOOM)

111    rt grow_mag: buy item bulk might save money. h...
124    mark zuckerberg respond question regard leader...
194    5 thing know stock market open wednesday https...
Name: CNBC, dtype: object


In [25]:
Recommend_FOX = main_data_untouched.loc[FOX_index,'CNBC']
print(Recommend_FOX)

43    aoc grill zuckerberg facebook allow lie polit ...
88    good idea take person loan settl credit card d...
Name: CNBC, dtype: object
