Load in our labeled data

In [85]:
import pandas as pd
from dateutil import parser

dataset = 'ML_Data/dj_classified_pos.xlsx'
output_dataset_name = 'ML_Data/DJI_pos.csv'
price_data = 'data/price/DJI.xlsx'

df = pd.read_excel(dataset)
df.columns = ['date', 'comment', 'label', 'compound_score']
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

In [86]:
del df['comment']
del df['compound_score']

In [87]:
df.head()

Unnamed: 0,date,label
0,2019-05-03,0
1,2019-05-03,1
2,2019-05-03,1
3,2019-05-03,0
4,2019-05-03,0


Loop through all comments and create a new DataFrame that hosts the number of positives and negatives for each day.

In [88]:
rows = []
prices = []
grouped = df.groupby('date')
for date, group in grouped:
    positives = len(group[group['label'] == 1])
    negatives = len(group[group['label'] == 0])
        
    new_row = {
        'date': date,
        'positives': positives,
        'negatives': negatives,
        'ratio': positives / (positives + negatives)
    }
    
    rows.append(new_row)

In [89]:
new_df = pd.DataFrame(rows)

In [90]:
new_df.tail()

Unnamed: 0,date,negatives,positives,ratio
329,2019-04-29,5,2,0.285714
330,2019-04-30,5,3,0.375
331,2019-05-01,11,12,0.521739
332,2019-05-02,11,12,0.521739
333,2019-05-03,7,8,0.533333


In [91]:
new_df = new_df.sort_values(['date'],ascending=[False])

In [92]:
new_df.head(5)

Unnamed: 0,date,negatives,positives,ratio
333,2019-05-03,7,8,0.533333
332,2019-05-02,11,12,0.521739
331,2019-05-01,11,12,0.521739
330,2019-04-30,5,3,0.375
329,2019-04-29,5,2,0.285714


Loop through new DataFrame and add 2 lags to each day

In [93]:
lags = range(1, 8)

In [94]:
new_df = new_df.assign(**{
    '{} (t-{})'.format(col, t): new_df[col].shift(t)
    for t in lags
    for col in new_df
})

In [95]:
del new_df['date (t-1)']
del new_df['date (t-2)']
del new_df['date (t-3)']
del new_df['date (t-4)']
del new_df['date (t-5)']
del new_df['date (t-6)']
del new_df['date (t-7)']

In [96]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,ratio (t-4),negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7)
333,2019-05-03,7,8,0.533333,,,,,,,...,,,,,,,,,,
332,2019-05-02,11,12,0.521739,7.0,8.0,0.533333,,,,...,,,,,,,,,,
331,2019-05-01,11,12,0.521739,11.0,12.0,0.521739,7.0,8.0,0.533333,...,,,,,,,,,,
330,2019-04-30,5,3,0.375,11.0,12.0,0.521739,11.0,12.0,0.521739,...,,,,,,,,,,
329,2019-04-29,5,2,0.285714,5.0,3.0,0.375,11.0,12.0,0.521739,...,0.533333,,,,,,,,,
328,2019-04-26,14,9,0.391304,5.0,2.0,0.285714,5.0,3.0,0.375,...,0.521739,7.0,8.0,0.533333,,,,,,
327,2019-04-25,18,7,0.28,14.0,9.0,0.391304,5.0,2.0,0.285714,...,0.521739,11.0,12.0,0.521739,7.0,8.0,0.533333,,,
326,2019-04-24,7,5,0.416667,18.0,7.0,0.28,14.0,9.0,0.391304,...,0.375,11.0,12.0,0.521739,11.0,12.0,0.521739,7.0,8.0,0.533333
325,2019-04-23,14,7,0.333333,7.0,5.0,0.416667,18.0,7.0,0.28,...,0.285714,5.0,3.0,0.375,11.0,12.0,0.521739,11.0,12.0,0.521739
324,2019-04-22,6,5,0.454545,14.0,7.0,0.333333,7.0,5.0,0.416667,...,0.391304,5.0,2.0,0.285714,5.0,3.0,0.375,11.0,12.0,0.521739


In [97]:
new_df.fillna(0, inplace=True)

In [98]:
df_price = pd.read_excel(price_data, skiprows=[0])
df_price['date'] = pd.to_datetime(df_price['Date'])

new_df = pd.merge(new_df, df_price, how='inner', on='date')

In [99]:
df_price.head()

Unnamed: 0,Date,High,Low,Upper bound,Lower Bound,Close,date
0,1/3/17,19938.529297,19775.929688,24923.161621,14831.947266,19881.759766,2017-01-03
1,1/4/17,19956.140625,19878.830078,24945.175781,14909.122558,19942.160156,2017-01-04
2,1/5/17,19948.599609,19811.119141,24935.749511,14858.339356,19899.289063,2017-01-05
3,1/6/17,19999.630859,19834.080078,24999.538574,14875.560058,19963.800781,2017-01-06
4,1/9/17,19943.779297,19887.380859,24929.724121,14915.535644,19887.380859,2017-01-09


In [100]:
new_df['label'] = new_df['Close'].shift() > new_df['Close']
new_df['label'] = new_df['label'].astype(int)

In [101]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,negatives (t-7),positives (t-7),ratio (t-7),Date,High,Low,Upper bound,Lower Bound,Close,label
0,2019-05-03,7,8,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5/3/19,26534.960938,26370.089844,33168.701172,19777.567383,26504.949219,0
1,2019-05-02,11,12,0.521739,7.0,8.0,0.533333,0.0,0.0,0.0,...,0.0,0.0,0.0,5/2/19,26454.689453,26180.359375,33068.361816,19635.269531,26307.789063,1
2,2019-05-01,11,12,0.521739,11.0,12.0,0.521739,7.0,8.0,0.533333,...,0.0,0.0,0.0,5/1/19,26689.390625,26426.380859,33361.738281,19819.785644,26430.140625,0
3,2019-04-30,5,3,0.375,11.0,12.0,0.521739,11.0,12.0,0.521739,...,0.0,0.0,0.0,4/30/19,26614.039063,26419.470703,33267.548829,19814.603027,26592.910156,0
4,2019-04-29,5,2,0.285714,5.0,3.0,0.375,11.0,12.0,0.521739,...,0.0,0.0,0.0,4/29/19,26602.539063,26520.75,33253.173829,19890.5625,26554.390625,1
5,2019-04-26,14,9,0.391304,5.0,2.0,0.285714,5.0,3.0,0.375,...,0.0,0.0,0.0,4/26/19,26543.560547,26392.550781,33179.450684,19794.413086,26543.330078,1
6,2019-04-25,18,7,0.28,14.0,9.0,0.391304,5.0,2.0,0.285714,...,0.0,0.0,0.0,4/25/19,26536.480469,26310.279297,33170.600586,19732.709473,26462.080078,1
7,2019-04-24,7,5,0.416667,18.0,7.0,0.28,14.0,9.0,0.391304,...,7.0,8.0,0.533333,4/24/19,26680.580078,26582.859375,33350.725097,19937.144531,26597.050781,0
8,2019-04-23,14,7,0.333333,7.0,5.0,0.416667,18.0,7.0,0.28,...,11.0,12.0,0.521739,4/23/19,26695.960938,26503.560547,33369.951172,19877.67041,26656.390625,0
9,2019-04-22,6,5,0.454545,14.0,7.0,0.333333,7.0,5.0,0.416667,...,11.0,12.0,0.521739,4/22/19,26553.050781,26458.609375,33191.313476,19843.957031,26511.050781,1


In [102]:
del new_df['Date']
#del new_df['Date_x']

In [103]:
new_df = new_df.drop(['High', 'Low', 'Upper bound', 'Lower Bound', 'Close'], axis=1)

In [104]:
new_df.head()

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7),label
0,2019-05-03,7,8,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2019-05-02,11,12,0.521739,7.0,8.0,0.533333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2019-05-01,11,12,0.521739,11.0,12.0,0.521739,7.0,8.0,0.533333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2019-04-30,5,3,0.375,11.0,12.0,0.521739,11.0,12.0,0.521739,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2019-04-29,5,2,0.285714,5.0,3.0,0.375,11.0,12.0,0.521739,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [105]:
new_df.to_csv(output_dataset_name)