In [1]:
#importing all necessary packages
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
nltk.download('stopwords') #download stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chhavi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
nltk.download('vader_lexicon') #download VADER

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/chhavi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
data = pd.read_csv("financial_tweets.csv") #read csv files using Pandas

In [5]:
data.head() #show data set

Unnamed: 0,id,text,timestamp,source,symbols,company_names,url,verified
0,1.0197e+18,VIDEO: “I was in my office. I was minding my o...,Wed Jul 18 21:33:26 +0000 2018,GoldmanSachs,GS,The Goldman Sachs,https://twitter.com/i/web/status/1019696670777...,True
1,1.01971e+18,The price of lumber $LB_F is down 22% since hi...,Wed Jul 18 22:22:47 +0000 2018,StockTwits,M,Macy's,https://twitter.com/i/web/status/1019709091038...,True
2,1.01971e+18,Who says the American Dream is dead? https://t...,Wed Jul 18 22:32:01 +0000 2018,TheStreet,AIG,American,https://buff.ly/2L3kmc4,True
3,1.01972e+18,Barry Silbert is extremely optimistic on bitco...,Wed Jul 18 22:52:52 +0000 2018,MarketWatch,BTC,Bitcoin,https://twitter.com/i/web/status/1019716662587...,True
4,1.01972e+18,How satellites avoid attacks and space junk wh...,Wed Jul 18 23:00:01 +0000 2018,Forbes,ORCL,Oracle,http://on.forbes.com/6013DqDDU,True


In [7]:
pattern_1 = r'@[A-Za-z0-9]+' #drop @ from text
pattern_2 = r'https?://[A-Za-z0-9./]+' #drop urls from text
joint_pattern_1 = r'|'.join((pattern_1, pattern_2)) #join patterns 1 and 2
pattern_3 = r'[^a-zA-Z]' #remove all characters except a-z & A-Z
joint_pattern_2 = r'|'.join((joint_pattern_1,pattern_3)) #join patters 1,2 and 3

In [8]:
print(joint_pattern_2) #print final joint pattern 

@[A-Za-z0-9]+|https?://[A-Za-z0-9./]+|[^a-zA-Z]


In [9]:
porterstemmer = PorterStemmer()
all_tweets = [] #list for holding all clean tweets

for i in range(0,len(data['text'])): #parse through all tweets
    tweet = re.sub(joint_pattern_2, ' ', data['text'][i]) #sustitute joint pattern with blank space
    tweet = tweet.lower() #convert text to lowercase
    tweet = tweet.split() #convert to list
    #if word does not exist in the english stopword dictionary , stem it 
    tweet = [porterstemmer.stem(word) for word in tweet if not word in set(stopwords.words('english'))] 
    tweet = ' '.join(tweet) #join all words of a tweet separated by a space
    
    all_tweets.append(tweet) #add cleaned tweet to list

In [10]:
print(all_tweets) #print clean tweets

['video offic mind busi david solomon tell gs intern learn wa', 'price lumber lb f sinc hit ytd high maci turnaround still happen', 'say american dream dead', 'barri silbert extrem optimist bitcoin predict new crypto entrant go zero', 'satellit avoid attack space junk circl earth paid', 'david butler favorit fang stock realmoneysod alphabet facebook', 'miss convo one favorit thinker', 'u intellig document nelson mandela made public', 'senat want emerg alert go netflix spotifi etc', 'hedg fund manag marc larsi say bitcoin k possibl', 'u propos expedit appeal fight amp time warner purchas', 'roger feder uniqlo deal make one athlet earn endors', 'bond trader ahead jerom powel come inflat expect via', 'alcoa cut adjust ebitda forecast cite tariff share slide', 'custom urg boycott mgm resort casino file lawsuit mass shoot victim', 'gap tighten race trillion dollar valuat amazon hit billion via', 'presid trump endors brian kemp casey cagl georgia governor race', 'white hous struggl contain f

In [12]:
#analyse the sentiment of the text using NLTK library
def findSentiment (text): 
    analyzer = SentimentIntensityAnalyzer()
    polarity = analyzer.polarity_scores(text)
    
    print(text)
    print() 
    
    #print sentiment scores - compound,neg,pos,neu
    for i in sorted(polarity):
        print('{0} : {1}'.format(i,polarity[i]), end='') 
        print()
    print()
    
    #compound<=-0.2 negative sentiment
    #compound>=0.2 positive sentiment
    #compound<0.2 & compound>-0.2 neutral sentiment
    if(polarity['compound'] <= -0.2):  
        sentiment = 'negative'
    if(polarity['compound'] >= 0.2):
        sentiment = 'positive'
    if(polarity['compound'] >-0.2 and polarity['compound'] < 0.2):
        sentiment = 'neutral'
      
    return(sentiment) #return sentiment as string

In [13]:
findSentiment(all_tweets[10]) #find sentiment of 10th tweet

u propos expedit appeal fight amp time warner purchas

compound : -0.3818
neg : 0.271
neu : 0.729
pos : 0.0



'negative'

In [14]:
findSentiment(all_tweets[100]) #find sentiment of 100th tweet

short sale volum short interest aee cm exc sre

compound : 0.4588
neg : 0.0
neu : 0.727
pos : 0.273



'positive'

In [15]:
findSentiment(all_tweets[1000]) #find sentiment of 1000th tweet

notabl two hundr day move averag cross etr energi

compound : 0.0
neg : 0.0
neu : 1.0
pos : 0.0



'neutral'

In [16]:
findSentiment(all_tweets[10000]) #find sentiment of 10000th tweet

dig retail sale report mcd dnkn yum wen

compound : 0.0
neg : 0.0
neu : 1.0
pos : 0.0



'neutral'