Load the labeled comment data.

In [176]:
import pandas as pd
# Read the labelled comments (one row per comment: date, comment, price, label)
# and preview the first rows.
df = pd.read_csv(filepath_or_buffer='test-dm.csv')
df.head(n=5)

Unnamed: 0,date,comment,price,label
0,04-05-19,..$5780 holds and $6732 becomes a distinct pos...,5849,1
1,04-05-19,..thumbs down if you nasty bears love getting ...,5849,0
2,04-05-19,Mysterious Crypto Whale Unveils Meteoric Bitco...,5849,1
3,04-05-19,Probably be down 20% by Monday,5849,1
4,04-05-19,Short BTC for maßsive profits!,5849,0


Group the comments by date and build a new DataFrame that holds, for each day, the number of positive and negative comments, the share of positives, and the price.

In [177]:
# Collapse the per-comment frame into one record per date: how many comments
# were labelled 1 (positive) / 0 (negative), the share of positives, and the
# price taken from the first row of that date.
#
# NOTE(review): groupby orders the date strings lexicographically. The dates
# look like dd-mm-yy (TODO confirm); if so, the order stops being chronological
# once the data spans more than one month, which would misalign the lag
# features built from this frame.
rows = []
grouped = df.groupby('date')
for date, group in grouped:
    positives = int((group['label'] == 1).sum())
    negatives = int((group['label'] == 0).sum())
    total = positives + negatives

    # The price is stored as text that may contain thousands separators.
    # Convert it to a real number here; otherwise later comparisons on this
    # column are lexicographic ('10000' < '5849'). A malformed price raises
    # ValueError instead of silently flowing through as a string.
    raw_price = group['price'].iloc[0]
    price = pd.to_numeric(str(raw_price).replace(',', ''))

    rows.append({
        'date': date,
        'positives': positives,
        'negatives': negatives,
        # A date whose labels are all neither 0 nor 1 has no defined ratio.
        'ratio': positives / total if total else float('nan'),
        'price': price,
    })

In [178]:
# One row per date, built in a single step from the collected record dicts.
new_df = pd.DataFrame(data=rows)

In [179]:
# Preview the per-date aggregate.
new_df.head(n=5)

Unnamed: 0,date,negatives,positives,price,ratio
0,01-05-19,13,16,5500,0.551724
1,02-05-19,33,27,6000,0.45
2,03-05-19,31,32,5900,0.507937
3,04-05-19,4,28,5849,0.875


Add two lagged copies (t-1 and t-2) of each column to the new DataFrame, so that every day also carries the values of the two preceding rows.

In [180]:
# Row offsets to lag by: 1 and 2 (i.e. t-1 and t-2).
N_LAGS = 2
lags = range(1, N_LAGS + 1)

In [181]:
# For every lag t and every existing column, append a copy of that column
# shifted down by t rows, named "<column> (t-<t>)". The first t rows of each
# shifted column are NaN because they have no earlier row to take a value from.
new_df = new_df.assign(**dict(
    ('{} (t-{})'.format(name, step), new_df[name].shift(step))
    for step in lags
    for name in new_df.columns
))

In [182]:
# Lagging every column also produced lagged copies of 'date' and 'price'.
# Remove them, leaving only the lagged negatives/positives/ratio columns.
# The names are derived from `lags` so this stays correct if the number of
# lags changes (the names were previously hard-coded for t-1 and t-2).
unwanted_lag_columns = [
    '{} (t-{})'.format(col, t)
    for t in lags
    for col in ('date', 'price')
]
new_df = new_df.drop(columns=unwanted_lag_columns)

In [183]:
# Inspect the frame now that the lag columns have been added.
new_df.head(n=5)

Unnamed: 0,date,negatives,positives,price,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2)
0,01-05-19,13,16,5500,0.551724,,,,,,
1,02-05-19,33,27,6000,0.45,13.0,16.0,0.551724,,,
2,03-05-19,31,32,5900,0.507937,33.0,27.0,0.45,13.0,16.0,0.551724
3,04-05-19,4,28,5849,0.875,31.0,32.0,0.507937,33.0,27.0,0.45


In [184]:
# Reverse the row order so the last row comes first; the running-maximum
# label computed next depends on this order.
# .copy() detaches the result from the original frame so that adding columns
# to it afterwards does not write into a slice (SettingWithCopyWarning).
new_df = new_df.iloc[::-1].copy()

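Add a label to each row indicating whether the price increased or decreased: 1 if the row's price is below the highest price among the rows above it (the frame was reversed in the previous cell), otherwise 0.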

In [189]:
# Label a row 1 when its price is strictly below the running maximum of the
# prices seen so far in the current row order, otherwise 0.
#
# The comparison is done on numeric values: the 'price' column may still hold
# text, and comparing strings is lexicographic ('10000' < '5849').
#
# NOTE(review): the running maximum spans every preceding row, not only the
# adjacent one - confirm this matches the intended "price increased or
# decreased" label.
numeric_price = pd.to_numeric(new_df['price'])
new_df['label'] = (numeric_price < numeric_price.cummax()).astype(int)

In [194]:
# Replace every remaining NaN with 0 (the lag columns of rows that have no
# earlier row to take a value from).
new_df = new_df.fillna(value=0)

In [195]:
# Final check of the labelled, lagged frame before it is written out.
new_df.head(n=5)

Unnamed: 0,date,negatives,positives,price,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),label
3,04-05-19,4,28,5849,0.875,31.0,32.0,0.507937,33.0,27.0,0.45,0
2,03-05-19,31,32,5900,0.507937,33.0,27.0,0.45,13.0,16.0,0.551724,0
1,02-05-19,33,27,6000,0.45,13.0,16.0,0.551724,0.0,0.0,0.0,0
0,01-05-19,13,16,5500,0.551724,0.0,0.0,0.0,0.0,0.0,0.0,1


In [196]:
# Write the prepared frame to disk; the row index is written as the first column.
new_df.to_csv(path_or_buf='dm_ready.csv')