# Single Company Experiment
## AAPL - Apple Inc. 

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [20]:
import sentisignal

## Configuration

In [6]:
# Notebook-wide experimental configuration.
# All paths are relative; they resolve against the kernel's working directory.
# Primary CSV handed to sentisignal.subsample_data.
data_primary = '../../data/csv/stocktwits_twitter_withretweets_daily.csv'
# Symbology CSV handed to sentisignal.subsample_data alongside the primary CSV.
data_symbology = '../../data/csv/symbology.csv'
# Directory for pickles; the loaders print "Loaded from pre-created pickle"
# when they reuse one.
data_pickles = '../../data/pickles/'
# Date range passed to both the sentiment and the finance loader.
# NOTE(review): whether end_date is inclusive is decided inside sentisignal — confirm.
start_date = '2014-01-01'
end_date = '2015-01-01'

## Data preprocessing and merging

In [17]:
# Load the AAPL sentiment and finance data for [start_date, end_date], using the
# paths from the configuration cell.
# NOTE(review): the positional booleans and the 'NaN' string are opaque from
# this notebook — confirm their meaning in sentisignal and prefer keyword
# arguments if the functions accept them.
data_sentiment = sentisignal.subsample_data(data_primary, data_symbology, data_pickles, start_date, end_date, 'SYMBOL', ['AAPL'], False)
# Use the configured pickle directory rather than repeating the literal path,
# so both loaders always share the same cache location.
data_finance = sentisignal.get_data_finance('yahoo', ['AAPL'], start_date, end_date, data_pickles, False, 'NaN')
# Preprocess both frames. The return values are discarded, so these calls
# presumably modify the frames in place — confirm; if so, re-running this cell
# without reloading may preprocess twice.
sentisignal.preprocess_data_sentiment(data_sentiment)
sentisignal.preprocess_data_finance(data_finance)
# Merge sentiment and finance into the single frame used by every later cell.
data_merged = sentisignal.merge_sentiment_finance(data_sentiment, data_finance, False, False, True)

Loaded from pre-created pickle
Loaded from pre-created pickle


## Descriptive statistics
### Distributions, auto-correlation

In [19]:
# Distribution check on the merged sentiment/finance frame.
# NOTE(review): the saved output of this cell is a NameError for `check_pdf`,
# and its execution count (19) precedes the `import sentisignal` cell (20), so
# the recorded output is probably stale — re-run from a fresh kernel to confirm
# that sentisignal actually exposes check_pdf.
sentisignal.check_pdf(data_merged)

NameError: name 'check_pdf' is not defined

## Correlation analysis

In [None]:
# Correlation analysis on the merged frame, before any rolling window is applied.
sentisignal.correlation_analysis(data_merged)

In [None]:
# Information surplus between LOG_RETURN and LOG_BULL_BEAR_RATIO on the merged frame.
# NOTE(review): 5 and 500 are unexplained positional arguments (presumably a
# lag/bin setting and an iteration/shuffle count) — confirm against
# sentisignal.information_surplus and move them to the configuration cell.
sentisignal.information_surplus(data_merged, 5, 'LOG_RETURN', 'LOG_BULL_BEAR_RATIO', 500)

In [None]:
# Rolling-window version of the merged frame; all later cells work on this.
# NOTE(review): 30 is the window argument (presumably 30 daily rows) and the
# aggregation used is decided inside sentisignal — confirm both.
data_merged_window = sentisignal.apply_rolling_window(data_merged, 30)

In [None]:
# Same correlation analysis as for data_merged, now on the rolling-window frame.
sentisignal.correlation_analysis(data_merged_window)

In [None]:
# Same information-surplus call as for data_merged (identical 5 / 500
# arguments), now on the rolling-window frame.
# NOTE(review): meaning of 5 and 500 is not visible here — confirm in sentisignal.
sentisignal.information_surplus(data_merged_window, 5, 'LOG_RETURN', 'LOG_BULL_BEAR_RATIO', 500)

In [None]:
# Summarise columns, dtypes and non-null counts of the rolling-window frame.
# Inspect data_merged_window directly: `df` is only assigned in the next cell,
# so `df.info()` here fails with NameError on a fresh kernel (Restart & Run All).
data_merged_window.info()

In [None]:
# K-means clustering of the rolling-window data, visualised in a 2-D PCA
# projection with the cluster regions drawn as a coloured background.
#
# NOTE(review): the four features are clustered without scaling; if their
# magnitudes differ a lot (message counts vs. log returns, presumably) the
# largest-valued columns will dominate both PCA and K-means — confirm whether
# standardising first is intended.
# NOTE(review): any NaN rows left by the rolling window would make the fits
# fail — confirm apply_rolling_window drops or fills them.

# Fixed seed so cluster assignments and the plot are reproducible across runs.
random_seed = 0

df = data_merged_window
# Builtin float/int replace the np.float/np.int aliases (removed in NumPy 1.24);
# the aliases pointed at these builtins, so the same columns are selected.
df_num = df.select_dtypes(include=[float, int])
df_num = df_num[['BULL_SCORED_MESSAGES', 'BEAR_SCORED_MESSAGES', 'LOG_RETURN', 'VOLATILITY']]
# Convert DataFrame to a NumPy array (.values replaces as_matrix(), which was
# removed in pandas 1.0 and returned the same array).
mat = df_num.values

# K-means on the full 4-feature matrix with sklearn's default cluster count.
km = sentisignal.KMeans(random_state=random_seed)
km.fit(mat)
# Cluster assignment label per row; not used further in this cell.
labels = km.labels_

data = mat

# Project onto the first two principal components and cluster again there;
# this second model is the one that gets plotted.
reduced_data = sentisignal.PCA(n_components=2).fit_transform(data)
kmeans = sentisignal.KMeans(init='k-means++', n_clusters=6, n_init=10, random_state=random_seed)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
# NOTE(review): the mesh has ((x_max - x_min) / h) * ((y_max - y_min) / h)
# points, so with unscaled data spanning a wide range it can become very
# large — confirm the value range or increase h.
h = .02

# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = sentisignal.np.meshgrid(sentisignal.np.arange(x_min, x_max, h), sentisignal.np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(sentisignal.np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
sentisignal.plt.figure(1)
sentisignal.plt.clf()
sentisignal.plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=sentisignal.plt.get_cmap('Set3'),
           aspect='auto', origin='lower')

# Overlay the observations as small black dots.
sentisignal.plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
sentisignal.plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
sentisignal.plt.title('K-means clustering on sentiment and finance metrics')
sentisignal.plt.xlim(x_min, x_max)
sentisignal.plt.ylim(y_min, y_max)
# PCA axes carry no direct units, so tick labels are hidden.
sentisignal.plt.xticks(())
sentisignal.plt.yticks(())
sentisignal.plt.show()
