In [1]:
# Module 5 project - Sentiment analysis of news companies' Twitter feeds: BBC, CNN, CNBC, Bloomberg and FOX

In [2]:
import tweepy           # To consume Twitter's API
import pandas as pd     # To handle data
import numpy as np      # For number computing

# For plotting and visualization:
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer, PorterStemmer
from textblob import TextBlob

In [3]:
# We import our access keys:
from credentials import *    # This will allow us to use the keys as variables

In [4]:
# API's setup:
def twitter_setup():
    """
    Build an authenticated client for Twitter's API.

    The four key names (CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN,
    ACCESS_SECRET) are looked up in the notebook namespace; they are
    expected to come from the star-import of the ``credentials`` module.

    Returns:
        The ``tweepy.API`` object created from the OAuth handler.
    """
    # OAuth handshake: application (consumer) keys first, then the user token.
    oauth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
    oauth.set_access_token(ACCESS_TOKEN, ACCESS_SECRET)

    # Hand back the API wrapper bound to that handler.
    return tweepy.API(oauth)

In [5]:
# Authenticated API client shared by every request in this cell.
extractor = twitter_setup()

# Request the latest timeline page (count=200) for each news account.
# The call order is unchanged: BBC, CNN, CNBC, Bloomberg, Fox.
fetch_timeline = extractor.user_timeline
tweets = fetch_timeline(screen_name="@BBCWorld", count=200)
tweets1 = fetch_timeline(screen_name="@CNN", count=200)
tweets2 = fetch_timeline(screen_name="@CNBC", count=200)
tweets3 = fetch_timeline(screen_name="@business", count=200)
tweets4 = fetch_timeline(screen_name="@FoxNews", count=200)

# Walk over the first five tweets of every outlet. Nothing is printed here:
# each pass overwrites the variable, so BBC/CNN/CNBC/BLOOM/FOX end up holding
# the text of the last tweet visited (and stay undefined for an empty timeline).
for tweet in tweets[:5]:
    BBC = tweet.text
for tweet1 in tweets1[:5]:
    CNN = tweet1.text
for tweet2 in tweets2[:5]:
    CNBC = tweet2.text
for tweet3 in tweets3[:5]:
    BLOOM = tweet3.text
for tweet4 in tweets4[:5]:
    FOX = tweet4.text

In [6]:
# One single-column frame per outlet, holding the text of every fetched tweet.
data = pd.DataFrame([status.text for status in tweets], columns=['BBC'])
data1 = pd.DataFrame([status.text for status in tweets1], columns=['CNN'])
data2 = pd.DataFrame([status.text for status in tweets2], columns=['CNBC'])
data3 = pd.DataFrame([status.text for status in tweets3], columns=['BLOOM'])
data4 = pd.DataFrame([status.text for status in tweets4], columns=['FOX'])

# Put the five columns side by side, aligned on the positional row index.
# With join='outer', an outlet with fewer tweets than the longest one gets
# missing values in its trailing rows.
outlet_frames = [data, data1, data2, data3, data4]
main_data = pd.concat(outlet_frames, axis=1, join='outer', sort=True)

# Preview the first 3 rows of the combined frame.
main_data.head(3)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX
0,"Kim Ji-young, Born 1982: Feminist film reignit...",Felicity Huffman and Lori Laughlin became the ...,This Bill Gates-backed start-up is fighting wo...,Foreign firms welcome China's new investment l...,BREAKING: Federal appeals court rules against ...
1,House Rules: 'Bullied' reality TV star wins pa...,A seven-story building in the shape of a picni...,Growth in major Asian economies set to slow mo...,Here are the details of SoftBank's $9.5 billio...,Breaking News: Grand jury indicts captain of M...
2,Hong Kong releases murder suspect who sparked ...,Amber Guyger's appellate attorney has filed pa...,“We have a lot of momentum.” From drone delive...,Emerging-market stocks need an end to the trad...,JUST IN: Police identify gunman who opened fir...


In [7]:
# Lowercasing
def _lowercase_text(text):
    """Lower-case every whitespace-separated token of ``text`` and re-join with single spaces.

    Non-string cells are returned untouched. The outer-join concat that builds
    ``main_data`` leaves missing values in shorter columns, and calling
    ``.split()`` on those would raise AttributeError.
    """
    if not isinstance(text, str):
        return text
    return " ".join(token.lower() for token in text.split())


# Apply the same normalisation to each outlet column. The columns are listed
# explicitly so derived columns (e.g. sentiment scores) are never touched.
for outlet in ['BBC', 'CNN', 'CNBC', 'BLOOM', 'FOX']:
    main_data[outlet] = main_data[outlet].apply(_lowercase_text)
main_data.head(2)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX
0,"kim ji-young, born 1982: feminist film reignit...",felicity huffman and lori laughlin became the ...,this bill gates-backed start-up is fighting wo...,foreign firms welcome china's new investment l...,breaking: federal appeals court rules against ...
1,house rules: 'bullied' reality tv star wins pa...,a seven-story building in the shape of a picni...,growth in major asian economies set to slow mo...,here are the details of softbank's $9.5 billio...,breaking news: grand jury indicts captain of m...


In [8]:
## remove punctuation (disabled: kept for reference, not executed)
#main_data['BBC'] = main_data['BBC'].str.replace(r'[^\w\s]', '', regex=True)
#main_data['CNN'] = main_data['CNN'].str.replace(r'[^\w\s]', '', regex=True)
#main_data['CNBC'] = main_data['CNBC'].str.replace(r'[^\w\s]', '', regex=True)
#main_data['BLOOM'] = main_data['BLOOM'].str.replace(r'[^\w\s]', '', regex=True)
#main_data['FOX'] = main_data['FOX'].str.replace(r'[^\w\s]', '', regex=True)
#main_data.head(2)

In [9]:
# Remove stopwords
stop = stopwords.words('english')
# Set copy of the stopword list for constant-time membership tests.
# ``stop`` itself stays a list for any other cell that uses it.
_stop_lookup = set(stop)


def _drop_stopwords(text):
    """Remove English stopwords from ``text`` and re-join the remaining tokens with single spaces.

    Tokens are compared exactly as produced by ``str.split()``, so a stopword
    with punctuation attached (e.g. "the,") is kept. Non-string cells (missing
    values from the outer-join concat) are returned untouched instead of
    raising AttributeError.
    """
    if not isinstance(text, str):
        return text
    return " ".join(token for token in text.split() if token not in _stop_lookup)


# Same filter for each outlet column; listed explicitly so derived columns are skipped.
for outlet in ['BBC', 'CNN', 'CNBC', 'BLOOM', 'FOX']:
    main_data[outlet] = main_data[outlet].apply(_drop_stopwords)
main_data.head(2)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX
0,"kim ji-young, born 1982: feminist film reignit...",felicity huffman lori laughlin became faces co...,bill gates-backed start-up fighting world hung...,foreign firms welcome china's new investment l...,breaking: federal appeals court rules trump ad...
1,house rules: 'bullied' reality tv star wins pa...,seven-story building shape picnic basket -- co...,growth major asian economies set slow expected...,details softbank's $9.5 billion rescue deal we...,breaking news: grand jury indicts captain miss...


In [10]:
# Stemming
st = PorterStemmer()


def _stem_text(text):
    """Porter-stem every whitespace-separated token of ``text`` and re-join with single spaces.

    Non-string cells (missing values from the outer-join concat) are returned
    untouched instead of raising AttributeError on ``.split()``.
    """
    if not isinstance(text, str):
        return text
    return " ".join(st.stem(token) for token in text.split())


# NOTE(review): this overwrites the columns in place, so re-running the cell
# stems the already-stemmed tokens again. Restart and run all for a clean result.
# NOTE(review): the sentiment step scores these stemmed strings. Stems such as
# "welcom" may no longer match a word-based sentiment lexicon. Confirm that
# stemming before scoring is intended.
for outlet in ['BBC', 'CNN', 'CNBC', 'BLOOM', 'FOX']:
    main_data[outlet] = main_data[outlet].apply(_stem_text)
main_data.head(2)

Unnamed: 0,BBC,CNN,CNBC,BLOOM,FOX
0,"kim ji-young, born 1982: feminist film reignit...",felic huffman lori laughlin becam face colleg ...,bill gates-back start-up fight world hunger ma...,foreign firm welcom china' new invest law http...,breaking: feder appeal court rule trump admini...
1,hous rules: 'bullied' realiti tv star win payo...,seven-stori build shape picnic basket -- compl...,"growth major asian economi set slow expected, ...",detail softbank' $9.5 billion rescu deal wewor...,break news: grand juri indict captain missouri...


In [11]:
def senti(x):
    """Return the TextBlob sentiment of the text ``x``.

    The value is the ``sentiment`` attribute of ``TextBlob(x)``. In this
    notebook it is displayed as a (polarity, subjectivity) pair.
    """
    blob = TextBlob(x)
    return blob.sentiment


In [12]:
# Score every BBC tweet with senti() and store the result as a new column.
bbc_sentiment = main_data['BBC'].apply(senti)
main_data['senti_score_BBC'] = bbc_sentiment
# Preview the first rows of the new score column.
main_data['senti_score_BBC'].head()

0    (0.03333333333333333, 0.06666666666666667)
1                                    (0.8, 0.4)
2                                    (0.0, 0.0)
3     (0.2787878787878788, 0.48484848484848486)
4                                   (0.0, 0.05)
Name: senti_score_BBC, dtype: object

In [13]:
# Column labels of the combined frame, as a plain list.
list(main_data.columns)

['BBC', 'CNN', 'CNBC', 'BLOOM', 'FOX', 'senti_score_BBC']

In [14]:
# As can be observed, each tweet gets two scores:
# the first is the sentiment polarity, which tells whether the sentiment is positive or negative, and
# the second is the subjectivity score, which tells how subjective the text is.