# Final Notebook 002 - Data Analysis

## Import

In [1]:
# Project-local helper package; the analysis cells below call its functions.
import sentisignal
# Load IPython's autoreload extension and use mode 2 (reload modules before
# executing code), so edits to imported modules are picked up without a restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np 
import pylab as P 
import ast
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import math
import scipy.stats as s
import statsmodels.api as sm
import pprint
import seaborn as sns

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from datetime import datetime
from yahoo_finance import Share
from pandas_datareader import data, wb
from statsmodels.graphics.api import qqplot
from operator import itemgetter
from decimal import *
from sklearn.neighbors.kde import KernelDensity

# plt.style.use('ggplot')
# sns.set_style("darkgrid")
# sns.set_context("notebook")

## Configuration

In [3]:
# Input locations, all expressed relative to this notebook's directory.
_data_dir = '../../data/'
data_primary_path = _data_dir + 'csv/stocktwits_twitter_withretweets_daily.csv'
data_symbology_path = _data_dir + 'csv/symbology.csv'
data_pickles_path = _data_dir + 'pickles/'
data_nasdaq_company_list_path = _data_dir + 'nasdaq/companylist.csv'

# Date range; these strings are also embedded in the intermediate pickle file names.
start_date, end_date = '2012-01-01', '2016-01-01'

## Reload Data

In [4]:
# Read the symbology CSV into a DataFrame.
data_symbology = pd.read_csv(filepath_or_buffer=data_symbology_path)

### Use NASDAQ data to get top 100 companies by MarketCap

In [5]:
# Read the NASDAQ company listing, keep its first 100 rows, and collect the
# distinct ticker symbols found in those rows.
# NOTE(review): no sort is applied here, so "top 100 by market cap" only holds
# if companylist.csv is already ordered by market cap — confirm against the file.
data_nasdaq_company_list = pd.read_csv(data_nasdaq_company_list_path)
data_nasdaq_top_100_mkt_cap = data_nasdaq_company_list.iloc[:100]
data_nasdaq_top_100_mkt_cap_symbols = pd.unique(data_nasdaq_top_100_mkt_cap['Symbol'])

### Load results from intermediate pickle

In [6]:
# Load the merged, preprocessed frame from the intermediate pickle whose file
# name embeds the configured start and end dates.
data_nasdaq_top_100_preprocessed_merge = pd.read_pickle(''.join([
    data_pickles_path,
    'intermediate/',
    'data_nasdaq_top_100_preprocessed_merge_',
    start_date,
    '_',
    end_date
]))

### Load window results from intermediate pickle

In [7]:
# Load the '_window_30' variant of the merged frame from the intermediate
# pickle; the file name embeds the configured start and end dates.
data_nasdaq_top_100_preprocessed_merge_window_30 = pd.read_pickle(''.join([
    data_pickles_path,
    'intermediate/',
    'data_nasdaq_top_100_preprocessed_merge_',
    start_date,
    '_',
    end_date,
    '_window_30'
]))

## Data Analytics (No window)

### MI (Mutual Information)

In [None]:
# Disabled calls to sentisignal.information_surplus, kept for reference.
# "def shift" presumably means "default shift" — TODO confirm.
# data_nasdaq_top_100_preprocessed_mi_res = sentisignal.split_apply_combine(data_nasdaq_top_100_preprocessed_merge, 'SYMBOL', sentisignal.information_surplus, 10, 'PCA_SENTIMENT', 'PCA_FINANCE', -1, True)
# "def shift (validate)": same call as above except the final argument is False instead of True.
# data_nasdaq_top_100_preprocessed_mi_res_validate = sentisignal.split_apply_combine(data_nasdaq_top_100_preprocessed_merge, 'SYMBOL', sentisignal.information_surplus, 10, 'PCA_SENTIMENT', 'PCA_FINANCE', -1, False)

In [None]:
# Net information surplus for PCA_SENTIMENT vs PCA_FINANCE on the un-windowed
# frame, with 'SYMBOL' passed to split_apply_combine as the split key.
# NOTE(review): the roles of the positional 10 and -1 are defined inside
# sentisignal and are not visible here — confirm before changing them.
# NOTE: the other MI cells in this section assign to the same result names, so
# only the most recently executed cell's results remain in the kernel.
data_nasdaq_top_100_preprocessed_merge_mi_res_net = sentisignal.split_apply_combine(
    data_nasdaq_top_100_preprocessed_merge,
    'SYMBOL',
    sentisignal.net_information_surplus,
    10,
    'PCA_SENTIMENT',
    'PCA_FINANCE',
    -1
)

# Constrain the raw result, then plot the constrained version.
data_nasdaq_top_100_preprocessed_merge_mi_res_net_constrained = sentisignal.constrain_mi_res(
    data_nasdaq_top_100_preprocessed_merge_mi_res_net
)
sentisignal.plot_inf_res(
    data_nasdaq_top_100_preprocessed_merge_mi_res_net_constrained
)

In [None]:
# Net information surplus for BULL_MINUS_BEAR vs LOG_RETURN on the un-windowed
# frame, with 'SYMBOL' passed to split_apply_combine as the split key.
# NOTE(review): the roles of the positional 10 and -1 are defined inside
# sentisignal and are not visible here — confirm before changing them.
# NOTE: the other MI cells in this section assign to the same result names, so
# only the most recently executed cell's results remain in the kernel.
data_nasdaq_top_100_preprocessed_merge_mi_res_net = sentisignal.split_apply_combine(
    data_nasdaq_top_100_preprocessed_merge,
    'SYMBOL',
    sentisignal.net_information_surplus,
    10,
    'BULL_MINUS_BEAR',
    'LOG_RETURN',
    -1
)

# Constrain the raw result, then plot the constrained version.
data_nasdaq_top_100_preprocessed_merge_mi_res_net_constrained = sentisignal.constrain_mi_res(
    data_nasdaq_top_100_preprocessed_merge_mi_res_net
)
sentisignal.plot_inf_res(
    data_nasdaq_top_100_preprocessed_merge_mi_res_net_constrained
)

In [None]:
# Net information surplus for RTISf vs VOLUME on the un-windowed frame, with
# 'SYMBOL' passed to split_apply_combine as the split key.
# NOTE(review): 'RTISf' is the only mixed-case column name used by these cells;
# verify it matches a column of the merged frame exactly.
# NOTE(review): the roles of the positional 10 and -1 are defined inside
# sentisignal and are not visible here — confirm before changing them.
# NOTE: the other MI cells in this section assign to the same result names, so
# only the most recently executed cell's results remain in the kernel.
data_nasdaq_top_100_preprocessed_merge_mi_res_net = sentisignal.split_apply_combine(
    data_nasdaq_top_100_preprocessed_merge,
    'SYMBOL',
    sentisignal.net_information_surplus,
    10,
    'RTISf',
    'VOLUME',
    -1
)

# Constrain the raw result, then plot the constrained version.
data_nasdaq_top_100_preprocessed_merge_mi_res_net_constrained = sentisignal.constrain_mi_res(
    data_nasdaq_top_100_preprocessed_merge_mi_res_net
)
sentisignal.plot_inf_res(
    data_nasdaq_top_100_preprocessed_merge_mi_res_net_constrained
)

In [None]:
# Net information surplus for BULL_MINUS_BEAR vs VOLATILITY on the un-windowed
# frame, with 'SYMBOL' passed to split_apply_combine as the split key.
# NOTE(review): the roles of the positional 10 and -1 are defined inside
# sentisignal and are not visible here — confirm before changing them.
# NOTE: the other MI cells in this section assign to the same result names, so
# only the most recently executed cell's results remain in the kernel.
data_nasdaq_top_100_preprocessed_merge_mi_res_net = sentisignal.split_apply_combine(
    data_nasdaq_top_100_preprocessed_merge,
    'SYMBOL',
    sentisignal.net_information_surplus,
    10,
    'BULL_MINUS_BEAR',
    'VOLATILITY',
    -1
)

# Constrain the raw result, then plot the constrained version.
data_nasdaq_top_100_preprocessed_merge_mi_res_net_constrained = sentisignal.constrain_mi_res(
    data_nasdaq_top_100_preprocessed_merge_mi_res_net
)
sentisignal.plot_inf_res(
    data_nasdaq_top_100_preprocessed_merge_mi_res_net_constrained
)

### PMI (Pointwise Mutual Information)
#### Form a time series of mutual information values from single events per company

In [None]:
# Kernel-based PMI between PCA_SENTIMENT and PCA_FINANCE on the merged frame.
# The return value is discarded and the plot below reads a 'PMI_0' column from
# the same frame, so kernel_pmi_func presumably adds that column in place —
# TODO confirm in sentisignal.
# NOTE(review): the roles of the positional 1.0 and 0 are not visible here.
# Earlier variant, kept for reference:
# pmi_func(data_nasdaq_top_100_preprocessed_merge, 'PCA_SENTIMENT', 'PCA_FINANCE')
sentisignal.kernel_pmi_func(
    data_nasdaq_top_100_preprocessed_merge,
    'PCA_SENTIMENT',
    'PCA_FINANCE',
    1.0,
    0
)

# Plot the PMI_0 column against the frame's index.
plt.plot(data_nasdaq_top_100_preprocessed_merge.loc[:, 'PMI_0'])
plt.show()

In [None]:
# Print the DataFrame.info() summary of the merged frame.
data_nasdaq_top_100_preprocessed_merge.info()

In [None]:
# Daily PMI via net_daily_pmi_info_surplus, with ['SYMBOL'] as the split key.
# NOTE(review): this call passes a trailing True, whereas the later cell that
# builds ..._daily_pmi_net calls the same function without it — confirm which
# signature is correct.
# NOTE: the next cell assigns to the same result name using
# daily_pmi_info_surplus instead.
data_nasdaq_top_100_preprocessed_merge_daily_pmi = sentisignal.split_apply_combine(
    data_nasdaq_top_100_preprocessed_merge,
    ['SYMBOL'],
    sentisignal.net_daily_pmi_info_surplus,
    2,
    'PCA_SENTIMENT',
    'PCA_FINANCE',
    True
)

# Preview the first rows.
data_nasdaq_top_100_preprocessed_merge_daily_pmi.head()

In [8]:
# Daily PMI via daily_pmi_info_surplus with the final flag set to True, using
# ['SYMBOL'] as the split key. The recorded output for this cell shows
# shift_PCA_SENTIMENT_0/-1/-2 columns with matching PMI_* and pmi_is_* columns.
# NOTE(review): the role of the positional 2 is defined inside sentisignal
# (the output suggests it is the number of shifts) — confirm.
data_nasdaq_top_100_preprocessed_merge_daily_pmi = sentisignal.split_apply_combine(
    data_nasdaq_top_100_preprocessed_merge,
    ['SYMBOL'],
    sentisignal.daily_pmi_info_surplus,
    2,
    'PCA_SENTIMENT',
    'PCA_FINANCE',
    True
)

# Preview the first rows.
data_nasdaq_top_100_preprocessed_merge_daily_pmi.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SOURCE,SYMBOL,TIMESTAMP_UTC,BULLISH_INTENSITY,BEARISH_INTENSITY,BULL_MINUS_BEAR,BULL_SCORED_MESSAGES,BEAR_SCORED_MESSAGES,BULL_BEAR_MSG_RATIO,TOTAL_SCANNED_MESSAGES,...,PCA_FINANCE,shift_PCA_SENTIMENT_0,shift_PCA_SENTIMENT_-1,shift_PCA_SENTIMENT_-2,PMI_0,pmi_is_0,PMI_-1,pmi_is_-1,PMI_-2,pmi_is_-2
SYMBOL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAL,0,stocktwits+twitter_withretweets,AAL,2012-01-03T05:00:00Z,0,1.5,-1.5,0,1,0,2,...,-1335582.492857,-60.638367,-62.202768,-60.167295,2.214825,0,2.310193,4.305897,2.279029,2.898871
AAL,1,stocktwits+twitter_withretweets,AAL,2012-01-04T05:00:00Z,0,0.0,0.0,0,0,0,1,...,-1726699.494002,-62.202768,-60.167295,-59.350832,2.306997,0,2.277701,-1.269854,2.396625,3.885071
AAL,2,stocktwits+twitter_withretweets,AAL,2012-01-05T05:00:00Z,0,0.0,0.0,0,0,0,2,...,-2175218.47013,-60.167295,-59.350832,-62.598267,2.268318,0,2.396422,5.647534,2.455259,8.241401
AAL,3,stocktwits+twitter_withretweets,AAL,2012-01-06T05:00:00Z,0,0.0,0.0,0,0,0,3,...,-2727558.457286,-59.350832,-62.598267,-61.232706,2.391257,0,2.443692,2.19278,2.196028,-8.164316
AAL,4,stocktwits+twitter_withretweets,AAL,2012-01-09T05:00:00Z,0,0.0,0.0,0,0,0,1,...,-2048155.473899,-62.598267,-61.232706,-61.664165,2.441872,0,2.189982,-10.315458,2.218019,-9.167274


In [9]:
# Same computation as the previous daily-PMI cell but with the final flag set
# to False. The recorded output for this cell shows shift_PCA_SENTIMENT_0/1/2
# columns (positive shifts), and its first displayed row has index 2.
# NOTE(review): the role of the positional 2 is defined inside sentisignal
# (the output suggests it is the number of shifts) — confirm.
data_nasdaq_top_100_preprocessed_merge_daily_pmi_valid = sentisignal.split_apply_combine(
    data_nasdaq_top_100_preprocessed_merge,
    ['SYMBOL'],
    sentisignal.daily_pmi_info_surplus,
    2,
    'PCA_SENTIMENT',
    'PCA_FINANCE',
    False
)

# Preview the first rows.
data_nasdaq_top_100_preprocessed_merge_daily_pmi_valid.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SOURCE,SYMBOL,TIMESTAMP_UTC,BULLISH_INTENSITY,BEARISH_INTENSITY,BULL_MINUS_BEAR,BULL_SCORED_MESSAGES,BEAR_SCORED_MESSAGES,BULL_BEAR_MSG_RATIO,TOTAL_SCANNED_MESSAGES,...,PCA_FINANCE,shift_PCA_SENTIMENT_0,shift_PCA_SENTIMENT_1,shift_PCA_SENTIMENT_2,PMI_0,pmi_is_0,PMI_1,pmi_is_1,PMI_2,pmi_is_2
SYMBOL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
AAL,2,stocktwits+twitter_withretweets,AAL,2012-01-05T05:00:00Z,0.0,0,0.0,0,0,0,2,...,-2175218.47013,-60.167295,-62.202768,-60.638367,2.455259,0,2.396422,-2.396372,2.268318,-7.613908
AAL,3,stocktwits+twitter_withretweets,AAL,2012-01-06T05:00:00Z,0.0,0,0.0,0,0,0,3,...,-2727558.457286,-59.350832,-60.167295,-62.202768,2.196028,0,2.443692,11.277856,2.391257,8.890135
AAL,4,stocktwits+twitter_withretweets,AAL,2012-01-09T05:00:00Z,0.0,0,0.0,0,0,0,1,...,-2048155.473899,-62.598267,-59.350832,-60.167295,2.218019,0,2.189982,-1.264065,2.441872,10.092479
AAL,5,stocktwits+twitter_withretweets,AAL,2012-01-10T05:00:00Z,2.2,0,2.2,1,0,0,1,...,-714120.526253,-61.232706,-62.598267,-59.350832,2.279029,0,2.209472,-3.052063,2.181927,-4.260674
AAL,6,stocktwits+twitter_withretweets,AAL,2012-01-11T05:00:00Z,0.0,0,0.0,0,0,0,1,...,-1481736.519819,-61.664165,-61.232706,-62.598267,2.284907,0,2.277701,-0.315343,2.203675,-3.555139


In [None]:
# Net daily PMI via net_daily_pmi_info_surplus, with ['SYMBOL'] as the split key.
# NOTE(review): the role of the positional 2 is defined inside sentisignal and
# is not visible here — confirm before changing it.
data_nasdaq_top_100_preprocessed_merge_daily_pmi_net = sentisignal.split_apply_combine(
    data_nasdaq_top_100_preprocessed_merge,
    ['SYMBOL'],
    sentisignal.net_daily_pmi_info_surplus,
    2,
    'PCA_SENTIMENT',
    'PCA_FINANCE'
)

# Preview the first rows.
data_nasdaq_top_100_preprocessed_merge_daily_pmi_net.head()

In [None]:
# Date tick helpers. They are created here but not attached to the axis below.
years = mdates.YearLocator()            # one tick per year
months = mdates.MonthLocator()          # one tick per month
yearsFmt = mdates.DateFormatter('%Y')   # four-digit year labels

# Short alias for the merged frame; the plot reads its 'PMI_0' column.
df = data_nasdaq_top_100_preprocessed_merge

# One line per SYMBOL group: DATE on the x-axis, PMI_0 on the y-axis.
fig, ax = plt.subplots(figsize=(15, 5))
for key, grp in df.groupby('SYMBOL'):
    ax.plot(
        grp['DATE'].reset_index(drop=True),
        grp['PMI_0'],
        label=key
    )

# `ax` is the only axes of the figure just created, so the Axes methods below
# act on the same axes the pyplot state machine would target.
ax.legend(loc='best')
ax.set_ylabel('PMI')
fig.autofmt_xdate()
plt.show()

#### Shift finance data and calculate PMI (per company)


## Data Analytics (with window)

### MI (Mutual Information)

In [None]:
# Net information surplus for PCA_SENTIMENT vs PCA_FINANCE on the window-30
# frame, with 'SYMBOL' passed to split_apply_combine as the split key.
# The original label here read "def shift" (presumably "default shift" — TODO confirm).
# NOTE(review): the roles of the positional 10 and -1 are defined inside
# sentisignal and are not visible here — confirm before changing them.
data_nasdaq_top_100_preprocessed_window_30_mi_res_net = sentisignal.split_apply_combine(
    data_nasdaq_top_100_preprocessed_merge_window_30,
    'SYMBOL',
    sentisignal.net_information_surplus,
    10,
    'PCA_SENTIMENT',
    'PCA_FINANCE',
    -1
)

# Constrain the raw result, then plot the constrained version.
data_nasdaq_top_100_preprocessed_window_30_mi_res_net_constrained = sentisignal.constrain_mi_res(
    data_nasdaq_top_100_preprocessed_window_30_mi_res_net
)
sentisignal.plot_inf_res(
    data_nasdaq_top_100_preprocessed_window_30_mi_res_net_constrained
)

In [None]:
# Net information surplus for BULL_MINUS_BEAR vs LOG_RETURN on the window-30
# frame, with 'SYMBOL' passed to split_apply_combine as the split key.
# The original label here read "def shift" (presumably "default shift" — TODO confirm).
# NOTE(review): the roles of the positional 10 and -1 are defined inside
# sentisignal and are not visible here — confirm before changing them.
# NOTE: this reuses the result names assigned by the preceding windowed MI cell.
data_nasdaq_top_100_preprocessed_window_30_mi_res_net = sentisignal.split_apply_combine(
    data_nasdaq_top_100_preprocessed_merge_window_30,
    'SYMBOL',
    sentisignal.net_information_surplus,
    10,
    'BULL_MINUS_BEAR',
    'LOG_RETURN',
    -1
)

# Constrain the raw result, then plot the constrained version.
data_nasdaq_top_100_preprocessed_window_30_mi_res_net_constrained = sentisignal.constrain_mi_res(
    data_nasdaq_top_100_preprocessed_window_30_mi_res_net
)
sentisignal.plot_inf_res(
    data_nasdaq_top_100_preprocessed_window_30_mi_res_net_constrained
)