# Single Company Experiment
## GOOG - Google 

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import sentisignal

## Configuration

In [4]:
# Notebook-wide experimental configuration: input CSVs, pickle directory,
# and the date range the experiment covers.
data_primary, data_symbology, data_pickles = (
    '../../data/csv/stocktwits_twitter_withretweets_daily.csv',
    '../../data/csv/symbology.csv',
    '../../data/pickles/',
)
start_date, end_date = '2013-01-01', '2015-01-01'

## Data preprocessing and merging

In [5]:
# get data: GOOG sentiment rows subsampled from the primary CSV, and GOOG
# price data fetched from Yahoo, both limited to start_date..end_date
data_sentiment = sentisignal.subsample_data(data_primary, data_symbology, data_pickles, start_date, end_date, 'SYMBOL', ['GOOG'], False)
# pass the configured pickle directory instead of repeating the literal path,
# so changing data_pickles in the configuration cell affects both loaders
# NOTE(review): the trailing False / 'NaN' arguments are positional and their
# meaning is not visible in this notebook -- confirm against sentisignal
data_finance = sentisignal.get_data_finance('yahoo', ['GOOG'], start_date, end_date, data_pickles, False, 'NaN')
# preprocess data (return values are not captured, so these presumably modify
# the frames in place -- confirm)
sentisignal.preprocess_data_sentiment(data_sentiment)
sentisignal.preprocess_data_finance(data_finance)
# merge data
# NOTE(review): the three boolean flags are positional; confirm their meaning
data_merged = sentisignal.merge_sentiment_finance(data_sentiment, data_finance, False, False, True)

Subsampling data from csv
Scraping and saving data from Yahoo


In [6]:
# Summarise the merged frame: entry count plus dtype and non-null count per column
data_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 504 entries, 0 to 503
Data columns (total 34 columns):
SOURCE                           504 non-null object
SYMBOL_x                         504 non-null object
TIMESTAMP_UTC                    504 non-null object
BULLISH_INTENSITY                504 non-null float64
BEARISH_INTENSITY                504 non-null float64
BULL_MINUS_BEAR                  504 non-null float64
BULL_SCORED_MESSAGES             504 non-null int64
BEAR_SCORED_MESSAGES             504 non-null int64
BULL_BEAR_MSG_RATIO              504 non-null float64
TOTAL_SCANNED_MESSAGES           504 non-null int64
EXCHANGE                         504 non-null object
SECTOR                           504 non-null object
INDUSTRY                         504 non-null object
TITLE                            504 non-null object
DATE                             504 non-null datetime64[ns]
LOG_BULL_RETURN                  503 non-null float64
LOG_BEAR_RETURN                  503 

## Descriptive statistics
### Probability distribution functions (with histograms)

In [7]:
# sentisignal.check_pdf(data_merged)

### Autocorrelation plots (x-axis: lag, y-axis: autocorrelation)

In [25]:
# sentisignal.check_acf(data_merged)

In [8]:
# sentisignal.adf_test(data_merged)

## Correlation analysis

In [9]:
# Correlation analysis on the un-windowed merged data; the result is kept in
# res_corr and indexed as res_corr[column_a][column_b] further down.
# The list printed by this call only contains pairs at or above 0.4, so the
# second argument looks like a reporting threshold -- confirm.
res_corr = sentisignal.correlation_analysis(
    data_merged,
    0.4,
    data_sentiment,
    data_finance)

[['BEAR_SCORED_MESSAGES', 'VOLUME', 0.66442740825812729],
 ['BEAR_SCORED_MESSAGES', 'LOG_VOLUME_DIFF', 0.41447175398727426],
 ['TOTAL_SCANNED_MESSAGES', 'LOG_VOLUME_DIFF', 0.4104126246743815],
 ['BEAR_SCORED_MESSAGES', 'VOLATILITY', 0.40957781204368654]]


In [10]:
# Print selected correlation coefficients from res_corr.
# Single-argument print(...) produces the same output under Python 2 and
# Python 3, unlike the bare print statement, which is a SyntaxError on Python 3.
# intensity
print(res_corr['BULL_MINUS_BEAR']['LOG_RETURN'])
print(res_corr['BULL_MINUS_BEAR']['VOLUME'])
# volume
print(res_corr['LOG_BULLISHNESS']['LOG_RETURN'])
print(res_corr['LOG_BULLISHNESS']['VOLUME'])


0.209598509401
0.0649609322671
0.291530780271
-0.179875921294


## Information surplus gain (Mutual Information)

### Determine optimal bin size

In [11]:
# Report the bin count that sentisignal.sturges_bin returns for the merged data.
# The label is built with str.format and printed as a single argument so the
# line is identical ("Sturge:  <value>") under Python 2 and Python 3.
print("Sturge:  {0}".format(sentisignal.sturges_bin(data_merged)))
# print "Doanes: ", sentisignal.doane_bin(data_merged['BULL_MINUS_BEAR'].ix[1:])
# Doane's rule is applied inside the function when a bin size of -1 is passed to it

Sturge:  10.0


In [57]:
# Information surplus of BULL_MINUS_BEAR against LOG_RETURN, un-windowed data.
g_sent_price_0 = sentisignal.information_surplus(
    data_merged,
    5,                  # the displayed result has SHIFT rows 0 through -5
    'BULL_MINUS_BEAR',
    'LOG_RETURN',
    -1,                 # presumably the bin size; -1 is noted earlier as triggering Doane's rule -- confirm
    True)               # NOTE(review): meaning of this flag is not visible here
# Bare last expression so the notebook renders the result table.
g_sent_price_0

Unnamed: 0,INFORMATION_SURPLUS_DIFF,INFORMATION_SURPLUS_PCT,MUTUAL INFORMATION,SHIFT
0,0.0,0.0,0.072652,0
1,0.007327,10.084493,0.079979,-1
2,-0.003577,-4.923868,0.069075,-2
3,-0.018022,-24.805955,0.05463,-3
4,-0.018068,-24.869109,0.054584,-4
5,-0.026946,-37.089048,0.045706,-5


In [58]:
# Pickle the un-windowed BULL_MINUS_BEAR / LOG_RETURN result. The middle row
# repeats the arguments of the information_surplus call that produced it; the
# final 0 sits where the windowed runs pass window_size.
sentisignal.save_information_surplus(
    data_pickles, g_sent_price_0, 'GOOG', start_date, end_date,
    5, 'BULL_MINUS_BEAR', 'LOG_RETURN', -1, True,
    0)

Saved to pickle: ../../data/pickles/info_surp_res/GOOG_2013-01-01_2015-01-01_BULL_MINUS_BEAR_LOG_RETURN_-1_True.p


In [59]:
# Information surplus of BULL_MINUS_BEAR against VOLUME, un-windowed data.
g_sent_vol_0 = sentisignal.information_surplus(
    data_merged,
    5,                  # the displayed result has SHIFT rows 0 through -5
    'BULL_MINUS_BEAR',
    'VOLUME',
    -1,                 # presumably the bin size; -1 is noted earlier as triggering Doane's rule -- confirm
    True)               # NOTE(review): meaning of this flag is not visible here
# Bare last expression so the notebook renders the result table.
g_sent_vol_0

Unnamed: 0,INFORMATION_SURPLUS_DIFF,INFORMATION_SURPLUS_PCT,MUTUAL INFORMATION,SHIFT
0,0.0,0.0,0.062318,0
1,0.00254,4.075143,0.064857,-1
2,0.000465,0.745418,0.062782,-2
3,0.002977,4.777689,0.065295,-3
4,0.01758,28.209417,0.079897,-4
5,0.009514,15.266883,0.071832,-5


In [60]:
# Pickle the un-windowed BULL_MINUS_BEAR / VOLUME result. The middle row
# repeats the arguments of the information_surplus call that produced it; the
# final 0 sits where the windowed runs pass window_size.
sentisignal.save_information_surplus(
    data_pickles, g_sent_vol_0, 'GOOG', start_date, end_date,
    5, 'BULL_MINUS_BEAR', 'VOLUME', -1, True,
    0)

Saved to pickle: ../../data/pickles/info_surp_res/GOOG_2013-01-01_2015-01-01_BULL_MINUS_BEAR_VOLUME_-1_True.p


In [61]:
# Information surplus of LOG_BULLISHNESS against LOG_RETURN, un-windowed data.
g_vol_price_0 = sentisignal.information_surplus(
    data_merged,
    5,                  # the displayed result has SHIFT rows 0 through -5
    'LOG_BULLISHNESS',
    'LOG_RETURN',
    -1,                 # presumably the bin size; -1 is noted earlier as triggering Doane's rule -- confirm
    True)               # NOTE(review): meaning of this flag is not visible here
# Bare last expression so the notebook renders the result table.
g_vol_price_0

Unnamed: 0,INFORMATION_SURPLUS_DIFF,INFORMATION_SURPLUS_PCT,MUTUAL INFORMATION,SHIFT
0,0.0,0.0,0.107668,0
1,-0.039331,-36.529906,0.068337,-1
2,-0.066607,-61.86358,0.041061,-2
3,-0.06787,-63.036956,0.039797,-3
4,-0.062894,-58.414946,0.044774,-4
5,-0.051981,-48.279346,0.055686,-5


In [62]:
# Pickle the un-windowed LOG_BULLISHNESS / LOG_RETURN result. The middle row
# repeats the arguments of the information_surplus call that produced it; the
# final 0 sits where the windowed runs pass window_size.
sentisignal.save_information_surplus(
    data_pickles, g_vol_price_0, 'GOOG', start_date, end_date,
    5, 'LOG_BULLISHNESS', 'LOG_RETURN', -1, True,
    0)

Saved to pickle: ../../data/pickles/info_surp_res/GOOG_2013-01-01_2015-01-01_LOG_BULLISHNESS_LOG_RETURN_-1_True.p


In [63]:
# Information surplus of LOG_BULLISHNESS against VOLUME, un-windowed data.
g_vol_vol_0 = sentisignal.information_surplus(
    data_merged,
    5,                  # the displayed result has SHIFT rows 0 through -5
    'LOG_BULLISHNESS',
    'VOLUME',
    -1,                 # presumably the bin size; -1 is noted earlier as triggering Doane's rule -- confirm
    True)               # NOTE(review): meaning of this flag is not visible here
# Bare last expression so the notebook renders the result table.
g_vol_vol_0

Unnamed: 0,INFORMATION_SURPLUS_DIFF,INFORMATION_SURPLUS_PCT,MUTUAL INFORMATION,SHIFT
0,0.0,0.0,0.078091,0
1,-0.025441,-32.579199,0.05265,-1
2,-0.027191,-34.819632,0.0509,-2
3,-0.0069,-8.835591,0.071191,-3
4,0.005485,7.023387,0.083576,-4
5,-0.007264,-9.30176,0.070827,-5


In [64]:
# Pickle the un-windowed LOG_BULLISHNESS / VOLUME result. The middle row
# repeats the arguments of the information_surplus call that produced it; the
# final 0 sits where the windowed runs pass window_size.
sentisignal.save_information_surplus(
    data_pickles, g_vol_vol_0, 'GOOG', start_date, end_date,
    5, 'LOG_BULLISHNESS', 'VOLUME', -1, True,
    0)

Saved to pickle: ../../data/pickles/info_surp_res/GOOG_2013-01-01_2015-01-01_LOG_BULLISHNESS_VOLUME_-1_True.p


## Apply rolling window
### Average the statistics over a 30-day window and repeat the experiments

In [65]:
# Rolling-window length; the section heading describes it as 30 days.
# Also passed to save_information_surplus for the windowed results.
window_size = 30
# Result of applying the rolling window to the merged data; the remaining
# cells repeat the earlier experiments on this frame.
data_merged_window = sentisignal.apply_rolling_window(data_merged, window_size)

In [66]:
# Correlation analysis on the rolling-window data, with 0.8 in place of the
# 0.4 used for the un-windowed run. The return value is deliberately left
# unassigned so the notebook displays it as the cell output.
sentisignal.correlation_analysis(
    data_merged_window,
    0.8,
    data_sentiment,
    data_finance)

[['BEAR_SCORED_MESSAGES', 'VOLUME', 0.90028523760563117]]


Unnamed: 0,BULLISH_INTENSITY,BEARISH_INTENSITY,BULL_MINUS_BEAR,BULL_SCORED_MESSAGES,BEAR_SCORED_MESSAGES,BULL_BEAR_MSG_RATIO,TOTAL_SCANNED_MESSAGES,LOG_BULL_RETURN,LOG_BEAR_RETURN,LOG_BULLISHNESS,...,TOTAL_SENTIMENT_MESSAGES_DIFF,OPEN,HIGH,LOW,CLOSE,VOLUME,ADJ CLOSE,LOG_RETURN,VOLATILITY,LOG_VOLUME_DIFF
BULLISH_INTENSITY,1.0,0.089085,0.732997,0.52846,-0.277469,0.561426,0.386828,0.0432108,-0.104851,0.721048,...,-0.100244,0.0730884,0.0733136,0.0720995,0.0742022,-0.160868,0.447534,0.266802,0.129372,-0.00141779
BEARISH_INTENSITY,0.089085,1.0,-0.612229,0.0831482,-0.403981,0.210762,-0.223881,-0.216114,-0.155791,0.1843,...,-0.140698,-0.411671,-0.411691,-0.416091,-0.414692,-0.412209,0.453028,-0.375819,-0.0316738,-0.236022
BULL_MINUS_BEAR,0.732997,-0.612229,1.0,0.362725,0.0556327,0.301741,0.459977,0.181897,0.0231623,0.446528,...,0.0165115,0.33917,0.339363,0.341403,0.342118,0.153814,0.0458749,0.468462,0.124332,0.160071
BULL_SCORED_MESSAGES,0.52846,0.0831482,0.362725,1.0,-0.0953881,0.940267,0.719401,-0.0325845,-0.062368,0.67715,...,0.0828636,-0.159305,-0.158487,-0.159477,-0.157449,-0.0833878,0.0191531,0.28,-0.0558126,0.074426
BEAR_SCORED_MESSAGES,-0.277469,-0.403981,0.0556327,-0.0953881,1.0,-0.404806,0.523943,0.394885,0.531549,-0.666604,...,0.431761,0.466509,0.46851,0.466147,0.467035,0.900285,-0.730001,0.184126,0.492521,0.608053
BULL_BEAR_MSG_RATIO,0.561426,0.210762,0.301741,0.940267,-0.404806,1.0,0.463153,-0.17471,-0.24051,0.818557,...,-0.108449,-0.345733,-0.345929,-0.34583,-0.344443,-0.394183,0.247728,0.139438,-0.261695,-0.193368
TOTAL_SCANNED_MESSAGES,0.386828,-0.223881,0.459977,0.719401,0.523943,0.463153,1.0,0.27271,0.253891,0.215768,...,0.379144,0.329176,0.331565,0.327968,0.33095,0.547449,-0.314105,0.462762,0.467799,0.568114
LOG_BULL_RETURN,0.0432108,-0.216114,0.181897,-0.0325845,0.394885,-0.17471,0.27271,1.0,0.481144,-0.141459,...,0.575425,0.203589,0.204693,0.204622,0.204785,0.382473,-0.294434,0.178416,0.155633,0.32403
LOG_BEAR_RETURN,-0.104851,-0.155791,0.0231623,-0.062368,0.531549,-0.24051,0.253891,0.481144,1.0,-0.379214,...,0.363175,0.121543,0.123894,0.122366,0.123962,0.542298,-0.63017,0.239226,0.186176,0.424897
LOG_BULLISHNESS,0.721048,0.1843,0.446528,0.67715,-0.666604,0.818557,0.215768,-0.141459,-0.379214,1.0,...,-0.220649,-0.238547,-0.239073,-0.237954,-0.236803,-0.532145,0.550217,0.240318,-0.245991,-0.197979


In [67]:
# Information surplus of BULL_MINUS_BEAR against LOG_RETURN, rolling-window data.
g_sent_price_30 = sentisignal.information_surplus(
    data_merged_window,
    5,                  # the displayed result has SHIFT rows 0 through -5
    'BULL_MINUS_BEAR',
    'LOG_RETURN',
    -1,                 # presumably the bin size; -1 is noted earlier as triggering Doane's rule -- confirm
    True)               # NOTE(review): meaning of this flag is not visible here
# Bare last expression so the notebook renders the result table.
g_sent_price_30

Unnamed: 0,INFORMATION_SURPLUS_DIFF,INFORMATION_SURPLUS_PCT,MUTUAL INFORMATION,SHIFT
0,0.0,0.0,0.34534,0
1,0.01858,5.380074,0.363919,-1
2,0.017119,4.957084,0.362458,-2
3,0.034937,10.11681,0.380277,-3
4,0.039813,11.528534,0.385152,-4
5,0.04244,12.289486,0.38778,-5


In [68]:
# Pickle the rolling-window BULL_MINUS_BEAR / LOG_RETURN result. The middle row
# repeats the arguments of the information_surplus call that produced it; the
# final argument is the window size, which shows up in the saved file name.
sentisignal.save_information_surplus(
    data_pickles, g_sent_price_30, 'GOOG', start_date, end_date,
    5, 'BULL_MINUS_BEAR', 'LOG_RETURN', -1, True,
    window_size)

Saved to pickle: ../../data/pickles/info_surp_res/GOOG_2013-01-01_2015-01-01_30_BULL_MINUS_BEAR_LOG_RETURN_-1_True.p


In [69]:
# Information surplus of BULL_MINUS_BEAR against VOLUME, rolling-window data.
g_sent_vol_30 = sentisignal.information_surplus(
    data_merged_window,
    5,                  # the displayed result has SHIFT rows 0 through -5
    'BULL_MINUS_BEAR',
    'VOLUME',
    -1,                 # presumably the bin size; -1 is noted earlier as triggering Doane's rule -- confirm
    True)               # NOTE(review): meaning of this flag is not visible here
# Bare last expression so the notebook renders the result table.
g_sent_vol_30

Unnamed: 0,INFORMATION_SURPLUS_DIFF,INFORMATION_SURPLUS_PCT,MUTUAL INFORMATION,SHIFT
0,0.0,0.0,0.492611,0
1,0.00154,0.312707,0.494151,-1
2,-0.005513,-1.119157,0.487098,-2
3,0.007841,1.591638,0.500451,-3
4,0.010355,2.102136,0.502966,-4
5,0.012326,2.502223,0.504937,-5


In [70]:
# Pickle the rolling-window BULL_MINUS_BEAR / VOLUME result. The middle row
# repeats the arguments of the information_surplus call that produced it; the
# final argument is the window size, which shows up in the saved file name.
sentisignal.save_information_surplus(
    data_pickles, g_sent_vol_30, 'GOOG', start_date, end_date,
    5, 'BULL_MINUS_BEAR', 'VOLUME', -1, True,
    window_size)

Saved to pickle: ../../data/pickles/info_surp_res/GOOG_2013-01-01_2015-01-01_30_BULL_MINUS_BEAR_VOLUME_-1_True.p


In [71]:
# Information surplus of LOG_BULLISHNESS against LOG_RETURN, rolling-window data.
g_vol_price_30 = sentisignal.information_surplus(
    data_merged_window,
    5,                  # the displayed result has SHIFT rows 0 through -5
    'LOG_BULLISHNESS',
    'LOG_RETURN',
    -1,                 # presumably the bin size; -1 is noted earlier as triggering Doane's rule -- confirm
    True)               # NOTE(review): meaning of this flag is not visible here
# Bare last expression so the notebook renders the result table.
g_vol_price_30

Unnamed: 0,INFORMATION_SURPLUS_DIFF,INFORMATION_SURPLUS_PCT,MUTUAL INFORMATION,SHIFT
0,0.0,0.0,0.284695,0
1,-0.009831,-3.453136,0.274864,-1
2,-0.005527,-1.941547,0.279167,-2
3,0.005188,1.822319,0.289883,-3
4,-0.01105,-3.881347,0.273645,-4
5,-0.01425,-5.005536,0.270444,-5


In [72]:
# Pickle the rolling-window LOG_BULLISHNESS / LOG_RETURN result. The middle row
# repeats the arguments of the information_surplus call that produced it; the
# final argument is the window size, which shows up in the saved file name.
sentisignal.save_information_surplus(
    data_pickles, g_vol_price_30, 'GOOG', start_date, end_date,
    5, 'LOG_BULLISHNESS', 'LOG_RETURN', -1, True,
    window_size)

Saved to pickle: ../../data/pickles/info_surp_res/GOOG_2013-01-01_2015-01-01_30_LOG_BULLISHNESS_LOG_RETURN_-1_True.p


In [73]:
# Information surplus of LOG_BULLISHNESS against VOLUME, rolling-window data.
g_vol_vol_30 = sentisignal.information_surplus(
    data_merged_window,
    5,                  # the displayed result has SHIFT rows 0 through -5
    'LOG_BULLISHNESS',
    'VOLUME',
    -1,                 # presumably the bin size; -1 is noted earlier as triggering Doane's rule -- confirm
    True)               # NOTE(review): meaning of this flag is not visible here
# Bare last expression so the notebook renders the result table.
g_vol_vol_30

Unnamed: 0,INFORMATION_SURPLUS_DIFF,INFORMATION_SURPLUS_PCT,MUTUAL INFORMATION,SHIFT
0,0.0,0.0,0.572947,0
1,-0.003633,-0.634103,0.569314,-1
2,0.007543,1.316513,0.58049,-2
3,-0.001602,-0.279623,0.571345,-3
4,0.020523,3.582035,0.59347,-4
5,0.025062,4.374306,0.598009,-5


In [74]:
# Pickle the rolling-window LOG_BULLISHNESS / VOLUME result. The middle row
# repeats the arguments of the information_surplus call that produced it; the
# final argument is the window size, which shows up in the saved file name.
sentisignal.save_information_surplus(
    data_pickles, g_vol_vol_30, 'GOOG', start_date, end_date,
    5, 'LOG_BULLISHNESS', 'VOLUME', -1, True,
    window_size)

Saved to pickle: ../../data/pickles/info_surp_res/GOOG_2013-01-01_2015-01-01_30_LOG_BULLISHNESS_VOLUME_-1_True.p


In [None]:
# df = data_merged_window
# features = ['LOG_BULLISHNESS', 'LOG_RETURN', 'VOLATILITY']
# sentisignal.kmeans(df, features)