Load in our labeled data

In [1]:
import pandas as pd
from dateutil import parser

dataset = 'data/output/vader/vader_bitcoin.xlsx'
output_dataset_name = 'ML_Data/BTC_vader.csv'
price_data = 'data/price/BTC.xlsx'
dates = 'data/dates.csv'

df = pd.read_excel(dataset)
df.columns = ['date', 'comment', 'label', 'compound_score']
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

In [2]:
del df['comment']
del df['compound_score']

In [3]:
df.head()

Unnamed: 0,date,label
0,2019-05-04,0
1,2019-05-04,1
2,2019-05-04,0
3,2019-05-04,0
4,2019-05-04,1


Loop through all comments and create a new DataFrame that hosts the number of positives and negatives for each day.

In [4]:
rows = []
prices = []
grouped = df.groupby('date')
for date, group in grouped:
    positives = len(group[group['label'] == 1])
    negatives = len(group[group['label'] == 0])
        
    new_row = {
        'date': date,
        'positives': positives,
        'negatives': negatives,
        'ratio': positives / (positives + negatives)
    }
    
    rows.append(new_row)

In [5]:
new_df = pd.DataFrame(rows)

In [6]:
new_df.tail()

Unnamed: 0,date,negatives,positives,ratio
457,2019-04-30,18,8,0.307692
458,2019-05-01,18,11,0.37931
459,2019-05-02,39,21,0.35
460,2019-05-03,43,20,0.31746
461,2019-05-04,21,11,0.34375


# Join with dates, so we make sure we have all dates

In [7]:
df_dates = pd.read_csv(dates)
df_dates['date'] = pd.to_datetime(df_dates['date'])
df_dates.tail(10)

Unnamed: 0,date
478,2018-01-10
479,2018-01-09
480,2018-01-08
481,2018-01-07
482,2018-01-06
483,2018-01-05
484,2018-01-04
485,2018-01-03
486,2018-01-02
487,2018-01-01


In [8]:
new_df = pd.merge(df_dates, new_df, on='date', how='outer')

In [9]:
new_df.fillna(0, inplace=True)

In [10]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio
0,2019-05-03,43.0,20.0,0.31746
1,2019-05-02,39.0,21.0,0.35
2,2019-05-01,18.0,11.0,0.37931
3,2019-04-30,18.0,8.0,0.307692
4,2019-04-29,15.0,7.0,0.318182
5,2019-04-28,30.0,9.0,0.230769
6,2019-04-27,25.0,17.0,0.404762
7,2019-04-26,32.0,23.0,0.418182
8,2019-04-25,29.0,27.0,0.482143
9,2019-04-24,24.0,23.0,0.489362


Loop through new DataFrame and add 2 lags to each day. First we reverse order of DataFrame.

In [11]:
new_df = new_df.iloc[::-1]

lags = range(1, 8)

In [12]:
new_df = new_df.assign(**{
    '{} (t-{})'.format(col, t): new_df[col].shift(t)
    for t in lags
    for col in new_df
})

In [13]:
del new_df['date (t-1)']
del new_df['date (t-2)']
del new_df['date (t-3)']
del new_df['date (t-4)']
del new_df['date (t-5)']
del new_df['date (t-6)']
del new_df['date (t-7)']

In [14]:
new_df.fillna(0, inplace=True)
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,ratio (t-4),negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7)
488,2019-05-04,21.0,11.0,0.34375,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
487,2018-01-01,69.0,38.0,0.35514,21.0,11.0,0.34375,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
486,2018-01-02,75.0,37.0,0.330357,69.0,38.0,0.35514,21.0,11.0,0.34375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
485,2018-01-03,24.0,25.0,0.510204,75.0,37.0,0.330357,69.0,38.0,0.35514,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
484,2018-01-04,25.0,14.0,0.358974,24.0,25.0,0.510204,75.0,37.0,0.330357,...,0.34375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
483,2018-01-05,52.0,36.0,0.409091,25.0,14.0,0.358974,24.0,25.0,0.510204,...,0.35514,21.0,11.0,0.34375,0.0,0.0,0.0,0.0,0.0,0.0
482,2018-01-06,44.0,19.0,0.301587,52.0,36.0,0.409091,25.0,14.0,0.358974,...,0.330357,69.0,38.0,0.35514,21.0,11.0,0.34375,0.0,0.0,0.0
481,2018-01-07,15.0,17.0,0.53125,44.0,19.0,0.301587,52.0,36.0,0.409091,...,0.510204,75.0,37.0,0.330357,69.0,38.0,0.35514,21.0,11.0,0.34375
480,2018-01-08,73.0,45.0,0.381356,15.0,17.0,0.53125,44.0,19.0,0.301587,...,0.358974,24.0,25.0,0.510204,75.0,37.0,0.330357,69.0,38.0,0.35514
479,2018-01-09,17.0,14.0,0.451613,73.0,45.0,0.381356,15.0,17.0,0.53125,...,0.409091,25.0,14.0,0.358974,24.0,25.0,0.510204,75.0,37.0,0.330357


In [15]:
new_df = new_df.sort_values(['date'],ascending=[False])

In [16]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,ratio (t-4),negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7)
488,2019-05-04,21.0,11.0,0.34375,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2019-05-03,43.0,20.0,0.31746,39.0,21.0,0.35,18.0,11.0,0.37931,...,0.318182,30.0,9.0,0.230769,25.0,17.0,0.404762,32.0,23.0,0.418182
1,2019-05-02,39.0,21.0,0.35,18.0,11.0,0.37931,18.0,8.0,0.307692,...,0.230769,25.0,17.0,0.404762,32.0,23.0,0.418182,29.0,27.0,0.482143
2,2019-05-01,18.0,11.0,0.37931,18.0,8.0,0.307692,15.0,7.0,0.318182,...,0.404762,32.0,23.0,0.418182,29.0,27.0,0.482143,24.0,23.0,0.489362
3,2019-04-30,18.0,8.0,0.307692,15.0,7.0,0.318182,30.0,9.0,0.230769,...,0.418182,29.0,27.0,0.482143,24.0,23.0,0.489362,23.0,22.0,0.488889
4,2019-04-29,15.0,7.0,0.318182,30.0,9.0,0.230769,25.0,17.0,0.404762,...,0.482143,24.0,23.0,0.489362,23.0,22.0,0.488889,25.0,17.0,0.404762
5,2019-04-28,30.0,9.0,0.230769,25.0,17.0,0.404762,32.0,23.0,0.418182,...,0.489362,23.0,22.0,0.488889,25.0,17.0,0.404762,16.0,10.0,0.384615
6,2019-04-27,25.0,17.0,0.404762,32.0,23.0,0.418182,29.0,27.0,0.482143,...,0.488889,25.0,17.0,0.404762,16.0,10.0,0.384615,19.0,7.0,0.269231
7,2019-04-26,32.0,23.0,0.418182,29.0,27.0,0.482143,24.0,23.0,0.489362,...,0.404762,16.0,10.0,0.384615,19.0,7.0,0.269231,9.0,12.0,0.571429
8,2019-04-25,29.0,27.0,0.482143,24.0,23.0,0.489362,23.0,22.0,0.488889,...,0.384615,19.0,7.0,0.269231,9.0,12.0,0.571429,27.0,11.0,0.289474


# Merge with price data

In [17]:
df_price = pd.read_excel(price_data, skiprows=[0])
df_price['date'] = pd.to_datetime(df_price['Date'])

new_df = pd.merge(new_df, df_price, how='inner', on='date')

In [20]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7),Date,High,Low,Upper Bound,Lower Bound,Close
0,2019-05-04,21.0,11.0,0.34375,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5/4/19,5849.5,5709.740234,7311.875,4282.305175,5794.140137
1,2019-05-03,43.0,20.0,0.31746,39.0,21.0,0.35,18.0,11.0,0.37931,...,0.404762,32.0,23.0,0.418182,5/3/19,5901.359863,5596.149902,7376.699829,4197.112426,5840.080078
2,2019-05-02,39.0,21.0,0.35,18.0,11.0,0.37931,18.0,8.0,0.307692,...,0.418182,29.0,27.0,0.482143,5/2/19,5891.899902,5484.779785,7364.874878,4113.584839,5753.379883
3,2019-05-01,18.0,11.0,0.37931,18.0,8.0,0.307692,15.0,7.0,0.318182,...,0.482143,24.0,23.0,0.489362,5/1/19,5531.149902,5387.290039,6913.937378,4040.467529,5500.720215
4,2019-04-30,18.0,8.0,0.307692,15.0,7.0,0.318182,30.0,9.0,0.230769,...,0.489362,23.0,22.0,0.488889,4/30/19,5417.169922,5346.700195,6771.462403,4010.025146,5389.540039
5,2019-04-29,15.0,7.0,0.318182,30.0,9.0,0.230769,25.0,17.0,0.404762,...,0.488889,25.0,17.0,0.404762,4/29/19,5372.22998,5224.640137,6715.287475,3918.480103,5350.640137
6,2019-04-28,30.0,9.0,0.230769,25.0,17.0,0.404762,32.0,23.0,0.418182,...,0.404762,16.0,10.0,0.384615,4/28/19,5299.509766,5162.810059,6624.387207,3872.107544,5237.959961
7,2019-04-27,25.0,17.0,0.404762,32.0,23.0,0.418182,29.0,27.0,0.482143,...,0.384615,19.0,7.0,0.269231,4/27/19,5313.700195,5218.419922,6642.125244,3913.814941,5272.450195
8,2019-04-26,32.0,23.0,0.418182,29.0,27.0,0.482143,24.0,23.0,0.489362,...,0.269231,9.0,12.0,0.571429,4/26/19,5282.970215,5192.450195,6603.712769,3894.337646,5229.47998
9,2019-04-25,29.0,27.0,0.482143,24.0,23.0,0.489362,23.0,22.0,0.488889,...,0.571429,27.0,11.0,0.289474,4/25/19,5361.939941,5104.180176,6702.424926,3828.135132,5234.089844


In [76]:
new_df['label'] = new_df['Close'] > new_df['Close'].shift(-1)
new_df['label'] = new_df['label'].astype(int)

In [79]:
new_df.head(25)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,negatives (t-7),positives (t-7),ratio (t-7),Date,High,Low,Upper Bound,Lower Bound,Close,label
0,2019-05-04,21.0,11.0,0.34375,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5/4/19,5849.5,5709.740234,7311.875,4282.305175,5794.140137,0
1,2019-05-03,43.0,20.0,0.31746,39.0,21.0,0.35,18.0,11.0,0.37931,...,32.0,23.0,0.418182,5/3/19,5901.359863,5596.149902,7376.699829,4197.112426,5840.080078,1
2,2019-05-02,39.0,21.0,0.35,18.0,11.0,0.37931,18.0,8.0,0.307692,...,29.0,27.0,0.482143,5/2/19,5891.899902,5484.779785,7364.874878,4113.584839,5753.379883,1
3,2019-05-01,18.0,11.0,0.37931,18.0,8.0,0.307692,15.0,7.0,0.318182,...,24.0,23.0,0.489362,5/1/19,5531.149902,5387.290039,6913.937378,4040.467529,5500.720215,1
4,2019-04-30,18.0,8.0,0.307692,15.0,7.0,0.318182,30.0,9.0,0.230769,...,23.0,22.0,0.488889,4/30/19,5417.169922,5346.700195,6771.462403,4010.025146,5389.540039,1
5,2019-04-29,15.0,7.0,0.318182,30.0,9.0,0.230769,25.0,17.0,0.404762,...,25.0,17.0,0.404762,4/29/19,5372.22998,5224.640137,6715.287475,3918.480103,5350.640137,1
6,2019-04-28,30.0,9.0,0.230769,25.0,17.0,0.404762,32.0,23.0,0.418182,...,16.0,10.0,0.384615,4/28/19,5299.509766,5162.810059,6624.387207,3872.107544,5237.959961,0
7,2019-04-27,25.0,17.0,0.404762,32.0,23.0,0.418182,29.0,27.0,0.482143,...,19.0,7.0,0.269231,4/27/19,5313.700195,5218.419922,6642.125244,3913.814941,5272.450195,1
8,2019-04-26,32.0,23.0,0.418182,29.0,27.0,0.482143,24.0,23.0,0.489362,...,9.0,12.0,0.571429,4/26/19,5282.970215,5192.450195,6603.712769,3894.337646,5229.47998,0
9,2019-04-25,29.0,27.0,0.482143,24.0,23.0,0.489362,23.0,22.0,0.488889,...,27.0,11.0,0.289474,4/25/19,5361.939941,5104.180176,6702.424926,3828.135132,5234.089844,1


In [80]:
new_df.tail(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,negatives (t-7),positives (t-7),ratio (t-7),Date,High,Low,Upper Bound,Lower Bound,Close,label
480,2018-01-10,36.0,29.0,0.446154,17.0,14.0,0.451613,73.0,45.0,0.381356,...,24.0,25.0,0.510204,1/10/18,14919.490234,13450.540039,18649.362793,10087.905029,14919.490234,1
481,2018-01-09,17.0,14.0,0.451613,73.0,45.0,0.381356,15.0,17.0,0.53125,...,75.0,37.0,0.330357,1/9/18,15390.280273,14221.549805,19237.850341,10666.162354,14468.5,0
482,2018-01-08,73.0,45.0,0.381356,15.0,17.0,0.53125,44.0,19.0,0.301587,...,69.0,38.0,0.35514,1/8/18,16302.919922,13902.30957,20378.649903,10426.732178,14976.169922,0
483,2018-01-07,15.0,17.0,0.53125,44.0,19.0,0.301587,52.0,36.0,0.409091,...,21.0,11.0,0.34375,1/7/18,17184.810547,15791.129883,21481.013184,11843.347412,16228.160156,0
484,2018-01-06,44.0,19.0,0.301587,52.0,36.0,0.409091,25.0,14.0,0.358974,...,0.0,0.0,0.0,1/6/18,17252.849609,16286.570313,21566.062011,12214.927735,17172.300781,1
485,2018-01-05,52.0,36.0,0.409091,25.0,14.0,0.358974,24.0,25.0,0.510204,...,0.0,0.0,0.0,1/5/18,17126.949219,14832.360352,21408.686524,11124.270264,16954.779297,1
486,2018-01-04,25.0,14.0,0.358974,24.0,25.0,0.510204,75.0,37.0,0.330357,...,0.0,0.0,0.0,1/4/18,15408.660156,14244.669922,19260.825195,10683.502442,15180.080078,1
487,2018-01-03,24.0,25.0,0.510204,75.0,37.0,0.330357,69.0,38.0,0.35514,...,0.0,0.0,0.0,1/3/18,15435.009766,14579.709961,19293.762207,10934.782471,15156.620117,1
488,2018-01-02,75.0,37.0,0.330357,69.0,38.0,0.35514,21.0,11.0,0.34375,...,0.0,0.0,0.0,1/2/18,15306.129883,12934.160156,19132.662354,9700.620117,14754.129883,1
489,2018-01-01,69.0,38.0,0.35514,21.0,11.0,0.34375,0.0,0.0,0.0,...,0.0,0.0,0.0,1/1/18,13921.530273,12877.669922,17401.912841,9658.252442,13444.879883,0


In [629]:
new_df = new_df.drop(['High', 'Low', 'Upper bound', 'Lower Bound', 'Close', 'Date'], axis=1)

In [630]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7),label
0,2019-05-04,21.0,11.0,0.34375,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2019-05-03,43.0,20.0,0.31746,39.0,21.0,0.35,18.0,11.0,0.37931,...,30.0,9.0,0.230769,25.0,17.0,0.404762,32.0,23.0,0.418182,0
2,2019-05-02,39.0,21.0,0.35,18.0,11.0,0.37931,18.0,8.0,0.307692,...,25.0,17.0,0.404762,32.0,23.0,0.418182,29.0,27.0,0.482143,1
3,2019-05-01,18.0,11.0,0.37931,18.0,8.0,0.307692,15.0,7.0,0.318182,...,32.0,23.0,0.418182,29.0,27.0,0.482143,24.0,23.0,0.489362,1
4,2019-04-30,18.0,8.0,0.307692,15.0,7.0,0.318182,30.0,9.0,0.230769,...,29.0,27.0,0.482143,24.0,23.0,0.489362,23.0,22.0,0.488889,1
5,2019-04-29,15.0,7.0,0.318182,30.0,9.0,0.230769,25.0,17.0,0.404762,...,24.0,23.0,0.489362,23.0,22.0,0.488889,25.0,17.0,0.404762,1
6,2019-04-28,30.0,9.0,0.230769,25.0,17.0,0.404762,32.0,23.0,0.418182,...,23.0,22.0,0.488889,25.0,17.0,0.404762,16.0,10.0,0.384615,1
7,2019-04-27,25.0,17.0,0.404762,32.0,23.0,0.418182,29.0,27.0,0.482143,...,25.0,17.0,0.404762,16.0,10.0,0.384615,19.0,7.0,0.269231,0
8,2019-04-26,32.0,23.0,0.418182,29.0,27.0,0.482143,24.0,23.0,0.489362,...,16.0,10.0,0.384615,19.0,7.0,0.269231,9.0,12.0,0.571429,1
9,2019-04-25,29.0,27.0,0.482143,24.0,23.0,0.489362,23.0,22.0,0.488889,...,19.0,7.0,0.269231,9.0,12.0,0.571429,27.0,11.0,0.289474,0


In [631]:
new_df.to_csv(output_dataset_name)