In [4]:
%matplotlib inline

import gensim
import json
import nltk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pprint
import re
import string
import time

from collections import defaultdict
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

In [3]:
# !pip install gensim

## Data parsing

### Feature extraction from each tweet

In [6]:
tweets_file = open("../dataset/1502820001-tweets.txt", 'r')
lines = tweets_file.readlines()
print ("Number of tweets: %d" % len(lines))
tweets_file.close()

Number of tweets: 330


In [7]:
pprint.pprint(json.loads(lines[0].strip()))

{'contributors': None,
 'coordinates': None,
 'created_at': 'Tue Aug 15 17:58:26 +0000 2017',
 'entities': {'hashtags': [{'indices': [45, 52], 'text': 'patent'},
                           {'indices': [65, 68], 'text': 'IP'}],
              'symbols': [{'indices': [103, 108], 'text': 'GOOG'},
                          {'indices': [109, 112], 'text': 'FB'}],
              'urls': [{'display_url': 'iam-media.com/blog/Detail.as…',
                        'expanded_url': 'http://www.iam-media.com/blog/Detail.aspx?g=afc6cc58-706a-475d-906a-fd85bd1e49f1',
                        'indices': [113, 136],
                        'url': 'https://t.co/FiHWRiETq3'}],
              'user_mentions': [{'id': 108564136,
                                 'id_str': '108564136',
                                 'indices': [3, 16],
                                 'name': 'IAM',
                                 'screen_name': 'IAM_magazine'}]},
 'favorite_count': 0,
 'favorited': False,
 'geo': None,
 'id':

In [8]:
#TODO use all twitter files
data = defaultdict(dict)
i=0
for line in lines:

    tweet = json.loads(line.strip())
    if 'text' in tweet: # only messages contains 'text' field is a tweet
        ts = time.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')
        data[i]["time"] = time.mktime(ts)  
        data[i]["text"] = tweet['text']
    if 'urls' in tweet['entities']:
        #print tweet['entities']['urls']
        data[i]["urls"] = len(tweet['entities']['urls'])
    if 'hashtags' in tweet['entities']:
        data[i]["hashtags"] = len(tweet['entities']['hashtags'])
    i += 1

In [9]:
### which other signals could be useful? 
print (data[0])
print (data[1])

{'time': 1502834306.0, 'text': 'RT @IAM_magazine: Exclusive: In major Valley #patent move Google #IP head Allen Lo is joining Facebook $GOOG $FB https://t.co/FiHWRiETq3', 'urls': 1, 'hashtags': 2}
{'time': 1502834158.0, 'text': 'RT @arnabch01: #investors massive bubble in #tech be careful $AAPL $GOOG $MSFT $AMZN $FB $NFLX $TSLA $CSCO $INTC $NVDA $ZNGA $ORCL $JD $MU…', 'urls': 0, 'hashtags': 2}


In [15]:
#working with text
#tokenizer for tweets
tknzr = TweetTokenizer(strip_handles=True) #(strip_handles=True, reduce_len=True)
corpus = []
for i, info in data.items():  
    text = info['text'].lower()
    text = text.encode('utf-8').decode('ascii','ignore') # content of the tweet
    text = re.sub(r"http\S*", '', text) #remove urls
    text = re.sub(r"^rt", '', text) #remove rt
    text = text.replace('#', '') #remove hashtag
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    words = tknzr.tokenize(text)
    text = " ".join(words).encode('utf-8')

    if text not in corpus:
        corpus.append(text)
        data[i]['text'] = text
        data[i]['exclamations'] = words.count('!')
        data[i]['questions'] = words.count('?')
        data[i]['dollar'] = words.count('$')
        data[i]['num_words'] = len(text) 
    else:
        data.pop(i)
    

          

AttributeError: 'bytes' object has no attribute 'encode'

In [513]:
print (data[0])
print (data[1])
print (len(data))


{'urls': 1, 'text': 'iam_magazine exclusive in major valley patent move google ip head allen lo is joining facebook goog fb', 'hashtags': 2, 'dollar': 0, 'questions': 0, 'time': 1502845106.0, 'exclamations': 0, 'num_words': 102}
{'urls': 0, 'text': 'arnabch investors massive bubble in tech be careful aapl goog msft amzn fb nflx tsla csco intc nvda znga orcl jd mu', 'hashtags': 2, 'dollar': 0, 'questions': 0, 'time': 1502844958.0, 'exclamations': 0, 'num_words': 115}
203


### Understanding the data

In [514]:
df = pd.DataFrame.from_dict(data, orient='index')
df.describe()

Unnamed: 0,urls,hashtags,dollar,questions,time,exclamations,num_words
count,203.0,203.0,203.0,203.0,203.0,203.0,203.0
mean,0.70936,0.566502,0.0,0.0,1502820000.0,0.0,80.743842
std,0.486716,1.643849,0.0,0.0,67446.04,0.0,26.452337
min,0.0,0.0,0.0,0.0,1502506000.0,0.0,23.0
25%,0.0,0.0,0.0,0.0,1502841000.0,0.0,62.0
50%,1.0,0.0,0.0,0.0,1502843000.0,0.0,81.0
75%,1.0,0.0,0.0,0.0,1502845000.0,0.0,103.0
max,2.0,10.0,0.0,0.0,1502845000.0,0.0,131.0


In [515]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 203 entries, 0 to 326
Data columns (total 8 columns):
urls            203 non-null int64
text            203 non-null object
hashtags        203 non-null int64
dollar          203 non-null int64
questions       203 non-null int64
time            203 non-null float64
exclamations    203 non-null int64
num_words       203 non-null int64
dtypes: float64(1), int64(6), object(1)
memory usage: 14.3+ KB


## Feature Extraction from Text

### Text features based on frequencies

In [516]:
#remove duplicates
df = df.drop_duplicates(subset=['text'], keep=False)
df.describe()
df = df.reset_index(drop=True)

In [517]:
from sklearn.feature_extraction.text import CountVectorizer

word_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
sparse_matrix = word_vectorizer.fit_transform(df['text'])
frequencies = sum(sparse_matrix).toarray()[0]
words = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])
print words.describe()
words.head(10)

         frequency
count  1078.000000
mean      1.969388
std       3.433408
min       1.000000
25%       1.000000
50%       1.000000
75%       2.000000
max      49.000000


Unnamed: 0,frequency
aap,2
aapl,34
abnormalreturns,1
abound,1
accelerating,1
account,1
acquired,1
acquires,5
acquisition,1
action,1


### Smaller dictionary

In [518]:
word_vectorizer = CountVectorizer(analyzer='word', stop_words='english',min_df=2, max_df=3000)
sparse_matrix = word_vectorizer.fit_transform(df['text'])
frequencies = sum(sparse_matrix).toarray()[0]
words = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])
print words.describe()
words.head(10)

        frequency
count  346.000000
mean     3.994220
std      5.541932
min      2.000000
25%      2.000000
50%      2.000000
75%      4.000000
max     49.000000


Unnamed: 0,frequency
aap,2
aapl,34
acquires,5
ads,3
advances,2
advisors,2
affect,2
ago,2
ai,7
alny,2


### Finding structure in text

In [519]:
del words
#create data_samples
#data_samples= [t['text'] for t in data.values()]

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)


In [520]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

n_features = 250
n_components = 10
n_top_words = 10


### Counts
tf_vectorizer = CountVectorizer(min_df=2, max_df=1000, stop_words='english')
tf = tf_vectorizer.fit_transform(df.text)
tf_feature_names = tf_vectorizer.get_feature_names()
print tf_feature_names[:40]

[u'aap', u'aapl', u'acquires', u'ads', u'advances', u'advisors', u'affect', u'ago', u'ai', u'alny', u'alphabet', u'amazon', u'ameystone', u'amp', u'amzn', u'analysis', u'apple', u'apples', u'aprn', u'armonk', u'arnabch', u'arranged', u'ashburton', u'ask', u'asrockinfo', u'asset', u'augmented', u'august', u'auto', u'azure', u'ba', u'bac', u'bank', u'bargaining', u'barronsonline', u'big', u'bigdata', u'bitcf', u'blue', u'bond']


### TF-IDF as text features

In [521]:
## TF-IDF
tfidf_vectorizer = TfidfVectorizer(min_df=2,max_df=1000,stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(df.text)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

## Finding topics with LDA

In [522]:
lda = LatentDirichletAllocation(n_components=n_components, learning_method='online')
lda.fit(tf) ## fitting counts
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0: ibm box business international machines million position corporation china investors
Topic #1: snap buy ibm stocks fb fraud detailed analysis csco like
Topic #2: snap amzn box exploding mr marketing expense snaps amazon amp
Topic #3: box week spy xiv review dan lt alny timfundamentals deux
Topic #4: box snap check technologies chkp software point critical fitzgerald sees
Topic #5: goog arnabch ai ml robotics msft bigdata ibm lift rbc
Topic #6: box ibm goog googl ipo shares aapl know dropbox need
Topic #7: ibm ginni blue china yield killing strategic imperatives dividend vz
Topic #8: msft aapl amzn box ms tsla goog acquires gs nflx
Topic #9: amzn aapl fb nflx spy box amp goog data twtr


In [523]:
lda = LatentDirichletAllocation(n_components=n_components, learning_method='online')
lda.fit(tfidf) ## fitting tf-idf counts
print_top_words(lda, tfidf_feature_names, n_top_words)

Topic #0: ibm buy goog dividend snap right news vz good safest
Topic #1: chart ago volume paying rising ibm bargaining search default iphone
Topic #2: snap buy fb detailed analysis box exploding marketing expense snaps
Topic #3: ibm ginni blue box china arranged tuesday fraud week rev
Topic #4: box aapl ipo googl upcoming dropbox check need know goog
Topic #5: arnabch msft ai ml goog robotics snap bigdata iot amzn
Topic #6: amzn box amazon stock acquires msft ms new aapl gs
Topic #7: box business amp international position million expected machines somewhat delivery
Topic #8: shares box plc buys total amp making sold alphabet group
Topic #9: amzn snap think ibm risks hd stocks aprn market watson


## Simple sentiment analysis

In [524]:
positive = pd.read_csv('positive-words.txt', names=['a'])
positive =  set(positive['a'].tolist())

negative = pd.read_csv('negative-words.txt', names=['a'])
negative =  set(negative['a'].tolist())

In [525]:
count_positive = []
count_negative = []
for i, row in df.iterrows():
    commonp = set(row['text'].split()).intersection(positive) 
    count_positive.append(len(commonp))
    commonn = set(row['text'].split()).intersection(negative) 
    count_negative.append(len(commonn))


In [526]:
df['positive'] = count_positive
df['negative'] = count_negative

df.head(10)

Unnamed: 0,urls,text,hashtags,dollar,questions,time,exclamations,num_words,positive,negative
0,1,iam_magazine exclusive in major valley patent ...,2,0,0,1502845000.0,0,102,0,0
1,0,arnabch investors massive bubble in tech be ca...,2,0,0,1502845000.0,0,115,0,0
2,0,nyinvesting google goog is the embodiment of m...,6,0,0,1502845000.0,0,114,2,1
3,0,greenstocks timberr iwm spy tlt gs gld btc goo...,0,0,0,1502845000.0,0,106,0,0
4,1,bank of nova scotia buys shares of alphabet in...,0,0,0,1502845000.0,0,52,0,0
5,1,alphabet inc goog stake raised by north star a...,0,0,0,1502845000.0,0,65,0,0
6,1,themotleyfool the machines keep getting smarte...,0,0,0,1502845000.0,0,94,2,0
7,0,as alphabet goog valuation rose robshaw amp ju...,0,0,0,1502844000.0,0,88,0,0
8,1,warren averett asset management llc boosts pos...,0,0,0,1502844000.0,0,72,0,0
9,1,goog himx vuzi great article,0,0,0,1502844000.0,0,28,1,0


In [527]:
df.describe()

Unnamed: 0,urls,hashtags,dollar,questions,time,exclamations,num_words,positive,negative
count,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0,203.0
mean,0.70936,0.566502,0.0,0.0,1502820000.0,0.0,80.743842,0.413793,0.310345
std,0.486716,1.643849,0.0,0.0,67446.04,0.0,26.452337,0.649605,0.59473
min,0.0,0.0,0.0,0.0,1502506000.0,0.0,23.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,1502841000.0,0.0,62.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,1502843000.0,0.0,81.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,1502845000.0,0.0,103.0,1.0,0.5
max,2.0,10.0,0.0,0.0,1502845000.0,0.0,131.0,3.0,3.0


### Positive sentiment

In [528]:
df[(df['positive'] >0) & (df['negative']  == 0)]['text'].head(50)

6      themotleyfool the machines keep getting smarte...
9                           goog himx vuzi great article
10     robertrelder apples bargaining power rising go...
14     arnabch ai robotics bigdata genomics stemcell ...
15     applewatch to support both lte and nonlte mode...
20     beijing transit contactless mpayment system ex...
26     tweaktown pr asrockinfo introduces the x iot r...
29     pr asrockinfo introduces the x iot router for ...
41     stocktwits since its ipo home depot is actuall...
45     edborgato amzns same day pick up locations are...
49     would be amazed if jana partners manage to sel...
56           gs aapl amzn need to lead us higher spx dji
58     active traders try one of these free trading g...
59     xplr join us for play by play action on stocks...
60     amzn pzza restaurants are in a tech race to ma...
68     amzn part bmark offering guidance y y y y y y ...
73     hot options alert midday tuesday august bac dk...
77     there is a chance apple 

### Negative sentiment

In [529]:
df[(df['positive'] ==0) & (df['negative']  > 0)]['text'].head(20)

18     goog neonazi group moves to dark web after web...
30     arnabch will advances in ai ml robotics nanote...
33     arnabch hpc ai ml bigdata may soon enable geno...
39     discussing the retail landscape department sto...
48     sitrep risk on mrk ceo youre fired amzn gs leg...
54     amzn aap wmt amazon will probably go onto crus...
55     dont worry about how many shares you can buy c...
67     thestreet amazon will probably go onto crush a...
83     tsla sa another risk factor for tesla shorts d...
99     microsoft acquires cloudcomputing orchestratio...
115    international business machines ibm fall to no...
117             the blue cloud collapses i told u ibm so
121    jimcramer mariabartiromo so u wont ask ginni a...
122    seekingalpha ibm watson disappointment risks f...
123    ibm watson disappointment risks further downwa...
128    china big market thus saith ginni so far zero ...
131    marketsupchuck is ibms dividend yield killing ...
132    is ibms dividend yield k

In [530]:
print "Total tweets:", len(df)
print "Total tweets positive:",len(df[(df['positive'] >0) & (df['negative']  == 0)])
print "Total tweets negative:",len(df[(df['positive'] == 0) & (df['negative']  > 0)])
print "Tweets with no info:", len(df[(df['positive'] == 0) & (df['negative']  == 0)])
print "neutral tweets:", len(df[(df['positive'] >0) & (df['negative']  > 0)])

Total tweets: 203
Total tweets positive: 42
Total tweets negative: 25
Tweets with no info: 110
neutral tweets: 26


## Content Similarity using word embedings (Word2Vec)

In [531]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True, )

### Converting each tweet into a vector

In [532]:
from gensim import matutils

stop_words = set(stopwords.words('english'))

matrix = []
filtered = []
for i, row in data.items():
    filtered_text = [model[w] for w in row['text'].split() if w in model and w not in stop_words]
    filtered.append([w for w in row['text'].split() if w in model and w not in stop_words])
    if len(filtered_text):
        matrix.append(matutils.unitvec(np.array(filtered_text).mean(axis=0)))


In [533]:
#computing similarity between tweets

matrix = np.array(matrix)
sim = np.dot(matrix, matrix.transpose())
print sim

[[ 1.          0.672333    0.60287177 ...,  0.37708838  0.24777096
   0.41357933]
 [ 0.672333    1.          0.61664081 ...,  0.39533368  0.30348312
   0.38047786]
 [ 0.60287177  0.61664081  1.         ...,  0.40589874  0.23713568
   0.37125429]
 ..., 
 [ 0.37708838  0.39533368  0.40589874 ...,  1.          0.5730864
   0.52884238]
 [ 0.24777096  0.30348312  0.23713568 ...,  0.5730864   1.          0.64343482]
 [ 0.41357933  0.38047786  0.37125429 ...,  0.52884238  0.64343482  1.        ]]


In [534]:
#reshaping into a data frame
print sim.shape
dup = np.fill_diagonal(sim, 0)

simdf = pd.DataFrame(list(sim[np.triu_indices(sim.shape[1], 1)]))
simdf.describe()

(203, 203)


Unnamed: 0,0
count,20503.0
mean,0.392697
std,0.121137
min,-0.068056
25%,0.313049
50%,0.388658
75%,0.470452
max,1.0


### Get the most similar tweets for each sentiment

In [535]:
pos = 41
most_similar = np.argmax(sim[pos][:])
print "similarity:", sim[pos][most_similar]
print df.iloc[pos]['text']
print filtered[pos]
print df.iloc[most_similar]['text']
print filtered[most_similar]

similarity: 0.63219642533
stocktwits since its ipo home depot is actually outperforming amazon compare the green to the yellow line on this
['since', 'ipo', 'home', 'depot', 'actually', 'outperforming', 'amazon', 'compare', 'green', 'yellow', 'line']
barronstechblog amazon baird likes hulu win expanding tool set barrons tech trader daily amzn googl msft ibm
['amazon', 'baird', 'likes', 'hulu', 'win', 'expanding', 'tool', 'set', 'tech', 'trader', 'daily', 'msft', 'ibm']


In [536]:
neg = 55
most_similar = np.argmax(sim[neg][:])
print "similarity:", sim[neg][most_similar]
print df.iloc[neg]['text']
print filtered[neg]
print df.iloc[most_similar]['text']
print filtered[most_similar]

similarity: 0.70566082456
dont worry about how many shares you can buy concern yourself wthe return on those shares stocks amzn googl
['dont', 'worry', 'many', 'shares', 'buy', 'concern', 'wthe', 'return', 'shares', 'stocks']
retail never learns when buying stock you dont buy high and sell low thats what theyre doing right now with aapl good luck
['retail', 'never', 'learns', 'buying', 'stock', 'dont', 'buy', 'high', 'sell', 'low', 'thats', 'theyre', 'right', 'aapl', 'good', 'luck']
