import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pymongo import MongoClient
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram


def nmf_df(sym, k, coll):
    """Return a DataFrame of daily-averaged NMF latent features for one symbol.

    sym  -- stock ticker to search tweets for, e.g. 'AAPL'
    k    -- number of latent features (NMF components)
    coll -- pymongo collection of tweets with 'text' and 'created_at' fields
    """
    # Pull every tweet whose text mentions the symbol.
    data = list(coll.find({'text': {'$regex': sym}}))
    sents = [tweet['text'] for tweet in data]
    dates = [str(tweet['created_at']) for tweet in data]

    # TF-IDF vectorize the tweet text, then factor it into k latent features.
    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X = vectorizer.fit_transform(sents)
    model = NMF(n_components=k, init='random', random_state=0)
    latent_features = model.fit_transform(X)

    df = pd.DataFrame(latent_features)
    df.columns = ['lat' + str(n) for n in range(len(df.columns))]
    # Normalize each timestamp to midnight so tweets group by calendar day.
    df['date'] = pd.to_datetime(dates).normalize()

    # Average the latent features within each day.
    grouped_data = df.groupby('date').mean()
    grouped_data['sym'] = sym
    return grouped_data
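

# A minimal sketch (not in the original) of how the latent features could be
# interpreted: print the highest-weighted vocabulary terms for each NMF
# component. Assumes a fitted TfidfVectorizer and NMF model like the ones
# built inside nmf_df; `n_top` is an illustrative parameter.
def top_terms_per_component(vectorizer, model, n_top=10):
    terms = vectorizer.get_feature_names_out()
    for idx, component in enumerate(model.components_):
        top = component.argsort()[::-1][:n_top]
        print('lat%d: %s' % (idx, ', '.join(terms[i] for i in top)))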


def graph_it(df):
    """Plot GOOG closing prices against the four daily NMF features in df."""
    from yahoo_finance import Share

    sym = 'GOOG'
    start_date = '2016-04-03'
    end_date = '2016-04-07'

    # Note: the yahoo_finance package depends on a retired Yahoo API and may
    # no longer return data. It reports days newest-first, so reverse into
    # chronological order and parse the string closes into floats.
    yahoo_days = Share(sym).get_historical(start_date, end_date)
    y_money = [float(day['Close']) for day in reversed(yahoo_days)]

    # Hardcoded to the study window (x is the day of the month).
    dates = ['2016-04-04', '2016-04-05', '2016-04-06', '2016-04-07']
    x = [4, 5, 6, 7]

    fig, ax1 = plt.subplots()
    ax1.plot(x, y_money, color='b')
    # Make the y-axis label and tick labels match the line color.
    ax1.set_ylabel('Close', color='b')
    for tl in ax1.get_yticklabels():
        tl.set_color('b')

    # One twinned axis per latent feature, each in its own color. (The
    # original repeated this block by hand, re-plotting 'lat1' three times
    # and drawing every line in red; looping over lat0..lat3 fixes both.)
    for feature, color, label in [('lat0', 'r', 'Sent1'),
                                  ('lat1', 'g', 'Sent2'),
                                  ('lat2', 'y', 'Sent3'),
                                  ('lat3', 'c', 'Sent4')]:
        # Takes the first row for each date (df holds several symbols).
        y_sent = [df[feature].loc[day].values[0] for day in dates]
        ax = ax1.twinx()
        ax.plot(x, y_sent, color=color)
        ax.set_ylabel(label, color=color)
        for tl in ax.get_yticklabels():
            tl.set_color(color)

    plt.savefig('imgs/eda_nmf_quotes.png')
    plt.show()
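

# A hedged alternative (an assumption, not in the original): derive the
# plotted dates and values from the DataFrame itself for a given symbol
# instead of hardcoding them, so the plot generalizes beyond April 2016.
# Example: dates, y = sentiment_series(df, 'GOOG', 'lat0')
def sentiment_series(df, sym, feature):
    sub = df[df['sym'] == sym].sort_index()
    return list(sub.index), list(sub[feature].values)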


if __name__ == '__main__':
    # Connect to the local MongoDB instance holding the scraped tweets.
    mongo_client = MongoClient()
    db = mongo_client.get_tweets
    coll = db.tweets

    symbols = ['GOOG', 'AAPL']
    k = 4

    # Stack each symbol's daily latent-feature frame into one DataFrame.
    df = pd.DataFrame()
    for sym in symbols:
        grouped_data = nmf_df(sym, k, coll)
        df = pd.concat([df, grouped_data])
    graph_it(df)

# Scratch notes from exploring the data:
# df[df['sym'] == 'AAPL'].head()
# df.loc['2016-04-03']
# Mongo shell queries for sampling matching tweets:
# db.tweets.aggregate( [ {$match: {'text': { $in :[/GOOG/, /AAPL/] }}}, {$sample: { size: 1 }} ] )
# db.tweets.aggregate( [ {$match: {'text': { $in :[/GOOG/, /AAPL/] }, 'created_at': /Apr 03/}}, {$sample: { size: 1 }} ] )
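

# A hedged pymongo translation of the shell queries above (assumes the field
# names match the shell examples); $sample draws one random matching tweet.
def sample_tweet(coll, syms=('GOOG', 'AAPL')):
    pattern = '|'.join(syms)
    pipeline = [
        {'$match': {'text': {'$regex': pattern}}},
        {'$sample': {'size': 1}},
    ]
    return list(coll.aggregate(pipeline))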

# Earlier clustering exploration (left unused): cluster the TF-IDF matrix
# with KMeans, print the top terms per centroid, then build a dendrogram.
# kmeans = KMeans()
# kmeans.fit(X)
# print('cluster centers:')
# print(kmeans.cluster_centers_)
# top_centroids = kmeans.cluster_centers_.argsort()[:, -1:-11:-1]
# print('top features for each cluster:')
# for num, centroid in enumerate(top_centroids):
#     print('%d: %s' % (num, ', '.join(features[i] for i in centroid)))
#
# distxy = squareform(pdist(top_centroids, metric='cosine'))
# link = linkage(distxy, method='complete')
# dendro = dendrogram(link, color_threshold=1.5, leaf_font_size=9)
# # fix spacing to better view the dendrogram and its labels
# plt.subplots_adjust(top=.99, bottom=0.5, left=0.05, right=0.99)
# plt.show()
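

# A minimal runnable sketch (not in the original) of the clustering idea
# above: fit KMeans on a TF-IDF matrix, report the top terms per centroid,
# and draw a dendrogram over the centroids. `n_clusters` and `n_top` are
# illustrative; X and vectorizer would come from a fit like the one in
# nmf_df. Unlike the commented version, distances are taken over the
# centroid vectors themselves, and linkage gets the condensed matrix from
# pdist directly (squareform output would be misread as observations).
def cluster_and_plot(X, vectorizer, n_clusters=8, n_top=10):
    features = vectorizer.get_feature_names_out()
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(X)
    top_centroids = kmeans.cluster_centers_.argsort()[:, -1:-n_top - 1:-1]
    for num, centroid in enumerate(top_centroids):
        print('%d: %s' % (num, ', '.join(features[i] for i in centroid)))
    link = linkage(pdist(kmeans.cluster_centers_, metric='cosine'),
                   method='complete')
    dendrogram(link, color_threshold=1.5, leaf_font_size=9)
    # Fix spacing to better view the dendrogram and its labels.
    plt.subplots_adjust(top=.99, bottom=0.5, left=0.05, right=0.99)
    plt.show()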