Load in our labeled data

In [769]:
import pandas as pd
from dateutil import parser

df = pd.read_excel('data/output/vader/vader_bitcoin.xlsx')
df.columns = ['date', 'comment', 'label', 'compound_score']

df['date'] = pd.to_datetime(df['date'], dayfirst=True)

In [770]:
del df['comment']
del df['compound_score']

In [771]:
df.head()

Unnamed: 0,date,label
0,2019-05-04,0
1,2019-05-04,1
2,2019-05-04,0
3,2019-05-04,0
4,2019-05-04,1


Loop through all comments and create a new DataFrame that hosts the number of positives and negatives for each day.

In [772]:
rows = []
prices = []
grouped = df.groupby('date')
for date, group in grouped:
    positives = len(group[group['label'] == 1])
    negatives = len(group[group['label'] == 0])
        
    new_row = {
        'date': date,
        'positives': positives,
        'negatives': negatives,
        'ratio': positives / (positives + negatives)
    }
    
    rows.append(new_row)

In [773]:
new_df = pd.DataFrame(rows)

In [774]:
new_df.tail()

Unnamed: 0,date,negatives,positives,ratio
457,2019-04-30,18,8,0.307692
458,2019-05-01,18,11,0.37931
459,2019-05-02,39,21,0.35
460,2019-05-03,43,20,0.31746
461,2019-05-04,21,11,0.34375


In [775]:
new_df = new_df.sort_values(['date'],ascending=[False])

In [776]:
new_df.head(5)

Unnamed: 0,date,negatives,positives,ratio
461,2019-05-04,21,11,0.34375
460,2019-05-03,43,20,0.31746
459,2019-05-02,39,21,0.35
458,2019-05-01,18,11,0.37931
457,2019-04-30,18,8,0.307692


Loop through new DataFrame and add 2 lags to each day

In [777]:
lags = range(1, 3)

In [778]:
new_df = new_df.assign(**{
    '{} (t-{})'.format(col, t): new_df[col].shift(t)
    for t in lags
    for col in new_df
})

In [779]:
del new_df['date (t-1)']
del new_df['date (t-2)']

In [780]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2)
461,2019-05-04,21,11,0.34375,,,,,,
460,2019-05-03,43,20,0.31746,21.0,11.0,0.34375,,,
459,2019-05-02,39,21,0.35,43.0,20.0,0.31746,21.0,11.0,0.34375
458,2019-05-01,18,11,0.37931,39.0,21.0,0.35,43.0,20.0,0.31746
457,2019-04-30,18,8,0.307692,18.0,11.0,0.37931,39.0,21.0,0.35
456,2019-04-29,15,7,0.318182,18.0,8.0,0.307692,18.0,11.0,0.37931
455,2019-04-28,30,9,0.230769,15.0,7.0,0.318182,18.0,8.0,0.307692
454,2019-04-27,25,17,0.404762,30.0,9.0,0.230769,15.0,7.0,0.318182
453,2019-04-26,32,23,0.418182,25.0,17.0,0.404762,30.0,9.0,0.230769
452,2019-04-25,29,27,0.482143,32.0,23.0,0.418182,25.0,17.0,0.404762


In [781]:
new_df.fillna(0, inplace=True)

In [788]:
df_price = pd.read_excel('data/price/BTC.xlsx')
df_price['date'] = pd.to_datetime(df_price['Date'])

new_df = pd.merge(new_df, df_price, how='inner', on='date')

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,High_y,Low_y,Upper Bound_y,Lower Bound_y,Date_y,High,Low,Upper Bound,Lower Bound,Date
0,2019-05-04,21,11,0.34375,0.0,0.0,0.0,0.0,0.0,0.0,...,5240.189941,4938.0,10480.379882,0,05-04-19,5240.189941,4938.0,10480.379882,0,05-04-19
1,2019-05-03,43,20,0.31746,21.0,11.0,0.34375,0.0,0.0,0.0,...,3893.52002,3720.48999,7787.04004,0,05-03-19,3893.52002,3720.48999,7787.04004,0,05-03-19
2,2019-05-02,39,21,0.35,43.0,20.0,0.31746,21.0,11.0,0.34375,...,3447.879883,3415.669922,6895.759766,0,05-02-19,3447.879883,3415.669922,6895.759766,0,05-02-19
3,2019-05-01,18,11,0.37931,39.0,21.0,0.35,43.0,20.0,0.31746,...,3926.919922,3841.129883,7853.839844,0,05-01-19,3926.919922,3841.129883,7853.839844,0,05-01-19
4,2019-04-30,18,8,0.307692,18.0,11.0,0.37931,39.0,21.0,0.35,...,5417.169922,5346.700195,10834.339844,0,30-04-19,5417.169922,5346.700195,10834.339844,0,30-04-19


In [789]:
new_df['label'] = new_df['High'].shift() > new_df['High']
new_df['label'] = new_df['label'].astype(int)

In [790]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,Low_y,Upper Bound_y,Lower Bound_y,Date_y,High,Low,Upper Bound,Lower Bound,Date,label
0,2019-05-04,21,11,0.34375,0.0,0.0,0.0,0.0,0.0,0.0,...,4938.0,10480.379882,0,05-04-19,5240.189941,4938.0,10480.379882,0,05-04-19,0
1,2019-05-03,43,20,0.31746,21.0,11.0,0.34375,0.0,0.0,0.0,...,3720.48999,7787.04004,0,05-03-19,3893.52002,3720.48999,7787.04004,0,05-03-19,1
2,2019-05-02,39,21,0.35,43.0,20.0,0.31746,21.0,11.0,0.34375,...,3415.669922,6895.759766,0,05-02-19,3447.879883,3415.669922,6895.759766,0,05-02-19,1
3,2019-05-01,18,11,0.37931,39.0,21.0,0.35,43.0,20.0,0.31746,...,3841.129883,7853.839844,0,05-01-19,3926.919922,3841.129883,7853.839844,0,05-01-19,0
4,2019-04-30,18,8,0.307692,18.0,11.0,0.37931,39.0,21.0,0.35,...,5346.700195,10834.339844,0,30-04-19,5417.169922,5346.700195,10834.339844,0,30-04-19,0
5,2019-04-29,15,7,0.318182,18.0,8.0,0.307692,18.0,11.0,0.37931,...,5224.640137,10744.45996,0,29-04-19,5372.22998,5224.640137,10744.45996,0,29-04-19,1
6,2019-04-28,30,9,0.230769,15.0,7.0,0.318182,18.0,8.0,0.307692,...,5162.810059,10599.019532,0,28-04-19,5299.509766,5162.810059,10599.019532,0,28-04-19,1
7,2019-04-27,25,17,0.404762,30.0,9.0,0.230769,15.0,7.0,0.318182,...,5218.419922,10627.40039,0,27-04-19,5313.700195,5218.419922,10627.40039,0,27-04-19,0
8,2019-04-26,32,23,0.418182,25.0,17.0,0.404762,30.0,9.0,0.230769,...,5192.450195,10565.94043,0,26-04-19,5282.970215,5192.450195,10565.94043,0,26-04-19,1
9,2019-04-25,29,27,0.482143,32.0,23.0,0.418182,25.0,17.0,0.404762,...,5104.180176,10723.879882,0,25-04-19,5361.939941,5104.180176,10723.879882,0,25-04-19,0


In [797]:
#del new_df['Date']
del new_df['Date_x']

In [798]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,Lower Bound_x,High_y,Low_y,Upper Bound_y,Lower Bound_y,High,Low,Upper Bound,Lower Bound,label
0,2019-05-04,21,11,0.34375,0.0,0.0,0.0,0.0,0.0,0.0,...,0,5240.189941,4938.0,10480.379882,0,5240.189941,4938.0,10480.379882,0,0
1,2019-05-03,43,20,0.31746,21.0,11.0,0.34375,0.0,0.0,0.0,...,0,3893.52002,3720.48999,7787.04004,0,3893.52002,3720.48999,7787.04004,0,1
2,2019-05-02,39,21,0.35,43.0,20.0,0.31746,21.0,11.0,0.34375,...,0,3447.879883,3415.669922,6895.759766,0,3447.879883,3415.669922,6895.759766,0,1
3,2019-05-01,18,11,0.37931,39.0,21.0,0.35,43.0,20.0,0.31746,...,0,3926.919922,3841.129883,7853.839844,0,3926.919922,3841.129883,7853.839844,0,0
4,2019-04-30,18,8,0.307692,18.0,11.0,0.37931,39.0,21.0,0.35,...,0,5417.169922,5346.700195,10834.339844,0,5417.169922,5346.700195,10834.339844,0,0
5,2019-04-29,15,7,0.318182,18.0,8.0,0.307692,18.0,11.0,0.37931,...,0,5372.22998,5224.640137,10744.45996,0,5372.22998,5224.640137,10744.45996,0,1
6,2019-04-28,30,9,0.230769,15.0,7.0,0.318182,18.0,8.0,0.307692,...,0,5299.509766,5162.810059,10599.019532,0,5299.509766,5162.810059,10599.019532,0,1
7,2019-04-27,25,17,0.404762,30.0,9.0,0.230769,15.0,7.0,0.318182,...,0,5313.700195,5218.419922,10627.40039,0,5313.700195,5218.419922,10627.40039,0,0
8,2019-04-26,32,23,0.418182,25.0,17.0,0.404762,30.0,9.0,0.230769,...,0,5282.970215,5192.450195,10565.94043,0,5282.970215,5192.450195,10565.94043,0,1
9,2019-04-25,29,27,0.482143,32.0,23.0,0.418182,25.0,17.0,0.404762,...,0,5361.939941,5104.180176,10723.879882,0,5361.939941,5104.180176,10723.879882,0,0


In [799]:
new_df.to_csv('dm_ready.csv')