Load in our labeled data

In [284]:
import pandas as pd
from dateutil import parser

dataset = 'ML_Data/tes_classified_pos.xlsx'
output_dataset_name = 'ML_Data/TSLA_pos.csv'
price_data = 'data/price/TSLA.xlsx'
dates = 'data/dates.csv'

df = pd.read_excel(dataset)
df.columns = ['date', 'comment', 'label', 'compound_score']
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

In [285]:
del df['comment']
del df['compound_score']

In [286]:
df.head()

Unnamed: 0,date,label
0,2019-05-03,1
1,2019-05-03,0
2,2019-05-03,0
3,2019-05-03,0
4,2019-05-03,1


Loop through all comments and create a new DataFrame that hosts the number of positives and negatives for each day.

In [287]:
rows = []
prices = []
grouped = df.groupby('date')
for date, group in grouped:
    positives = len(group[group['label'] == 1])
    negatives = len(group[group['label'] == 0])
        
    new_row = {
        'date': date,
        'positives': positives,
        'negatives': negatives,
        'ratio': positives / (positives + negatives)
    }
    
    rows.append(new_row)

In [288]:
new_df = pd.DataFrame(rows)

In [289]:
new_df.tail()

Unnamed: 0,date,negatives,positives,ratio
311,2019-04-29,22,11,0.333333
312,2019-04-30,15,12,0.444444
313,2019-05-01,32,15,0.319149
314,2019-05-02,26,23,0.469388
315,2019-05-03,19,20,0.512821


# Join with dates, so we make sure we have all dates

In [290]:
df_dates = pd.read_csv(dates)
df_dates['date'] = pd.to_datetime(df_dates['date'])
df_dates.tail(10)

Unnamed: 0,date
478,2018-01-10
479,2018-01-09
480,2018-01-08
481,2018-01-07
482,2018-01-06
483,2018-01-05
484,2018-01-04
485,2018-01-03
486,2018-01-02
487,2018-01-01


In [291]:
test = pd.merge(df_dates, new_df, on='date', how='outer')

In [292]:
test.head(10)

Unnamed: 0,date,negatives,positives,ratio
0,2018-01-02,2.0,0.0,0.0
1,2018-01-03,4.0,1.0,0.2
2,2018-01-04,6.0,2.0,0.25
3,2018-01-05,4.0,2.0,0.333333
4,2018-01-08,3.0,2.0,0.4
5,2018-01-09,1.0,1.0,0.5
6,2018-01-10,0.0,1.0,1.0
7,2018-01-11,3.0,0.0,0.0
8,2018-01-17,1.0,0.0,0.0
9,2018-01-22,1.0,0.0,0.0


Loop through new DataFrame and add 2 lags to each day

In [210]:
lags = range(1, 8)

In [211]:
new_df = new_df.assign(**{
    '{} (t-{})'.format(col, t): new_df[col].shift(t)
    for t in lags
    for col in new_df
})

In [212]:
del new_df['date (t-1)']
del new_df['date (t-2)']
del new_df['date (t-3)']
del new_df['date (t-4)']
del new_df['date (t-5)']
del new_df['date (t-6)']
del new_df['date (t-7)']

In [213]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,ratio (t-4),negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7)
0,2018-01-02,2,0,0.0,,,,,,,...,,,,,,,,,,
1,2018-01-03,4,1,0.2,2.0,0.0,0.0,,,,...,,,,,,,,,,
2,2018-01-04,6,2,0.25,4.0,1.0,0.2,2.0,0.0,0.0,...,,,,,,,,,,
3,2018-01-05,4,2,0.333333,6.0,2.0,0.25,4.0,1.0,0.2,...,,,,,,,,,,
4,2018-01-08,3,2,0.4,4.0,2.0,0.333333,6.0,2.0,0.25,...,0.0,,,,,,,,,
5,2018-01-09,1,1,0.5,3.0,2.0,0.4,4.0,2.0,0.333333,...,0.2,2.0,0.0,0.0,,,,,,
6,2018-01-10,0,1,1.0,1.0,1.0,0.5,3.0,2.0,0.4,...,0.25,4.0,1.0,0.2,2.0,0.0,0.0,,,
7,2018-01-11,3,0,0.0,0.0,1.0,1.0,1.0,1.0,0.5,...,0.333333,6.0,2.0,0.25,4.0,1.0,0.2,2.0,0.0,0.0
8,2018-01-17,1,0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,...,0.4,4.0,2.0,0.333333,6.0,2.0,0.25,4.0,1.0,0.2
9,2018-01-22,1,0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,...,0.5,3.0,2.0,0.4,4.0,2.0,0.333333,6.0,2.0,0.25


In [214]:
new_df.fillna(0, inplace=True)

In [215]:
new_df = new_df.sort_values(['date'],ascending=[False])

In [216]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,ratio (t-4),negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7)
315,2019-05-03,19,20,0.512821,26.0,23.0,0.469388,32.0,15.0,0.319149,...,0.333333,65.0,37.0,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842
314,2019-05-02,26,23,0.469388,32.0,15.0,0.319149,15.0,12.0,0.444444,...,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5
313,2019-05-01,32,15,0.319149,15.0,12.0,0.444444,22.0,11.0,0.333333,...,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429
312,2019-04-30,15,12,0.444444,22.0,11.0,0.333333,65.0,37.0,0.362745,...,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667
311,2019-04-29,22,11,0.333333,65.0,37.0,0.362745,87.0,46.0,0.345865,...,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429
310,2019-04-26,65,37,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842,...,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429,11.0,9.0,0.45
309,2019-04-25,87,46,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5,...,0.666667,3.0,4.0,0.571429,11.0,9.0,0.45,17.0,6.0,0.26087
308,2019-04-24,59,42,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429,...,0.571429,11.0,9.0,0.45,17.0,6.0,0.26087,12.0,9.0,0.428571
307,2019-04-23,16,16,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667,...,0.45,17.0,6.0,0.26087,12.0,9.0,0.428571,18.0,19.0,0.513514
306,2019-04-22,22,13,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429,...,0.26087,12.0,9.0,0.428571,18.0,19.0,0.513514,7.0,9.0,0.5625


Unnamed: 0,date
0,2019-05-03
1,2019-05-02
2,2019-05-01
3,2019-04-30
4,2019-04-29
5,2019-04-28
6,2019-04-27
7,2019-04-26
8,2019-04-25
9,2019-04-24


In [232]:
new_df = pd.merge(new_df, df_dates, on='date', how='outer')

In [233]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,ratio (t-4),negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7)
0,2019-05-03,19.0,20.0,0.512821,26.0,23.0,0.469388,32.0,15.0,0.319149,...,0.333333,65.0,37.0,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842
1,2019-05-02,26.0,23.0,0.469388,32.0,15.0,0.319149,15.0,12.0,0.444444,...,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5
2,2019-05-01,32.0,15.0,0.319149,15.0,12.0,0.444444,22.0,11.0,0.333333,...,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429
3,2019-04-30,15.0,12.0,0.444444,22.0,11.0,0.333333,65.0,37.0,0.362745,...,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667
4,2019-04-29,22.0,11.0,0.333333,65.0,37.0,0.362745,87.0,46.0,0.345865,...,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429
5,2019-04-26,65.0,37.0,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842,...,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429,11.0,9.0,0.45
6,2019-04-25,87.0,46.0,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5,...,0.666667,3.0,4.0,0.571429,11.0,9.0,0.45,17.0,6.0,0.26087
7,2019-04-24,59.0,42.0,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429,...,0.571429,11.0,9.0,0.45,17.0,6.0,0.26087,12.0,9.0,0.428571
8,2019-04-23,16.0,16.0,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667,...,0.45,17.0,6.0,0.26087,12.0,9.0,0.428571,18.0,19.0,0.513514
9,2019-04-22,22.0,13.0,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429,...,0.26087,12.0,9.0,0.428571,18.0,19.0,0.513514,7.0,9.0,0.5625


In [191]:
df_price = pd.read_excel(price_data, skiprows=[0])
df_price['date'] = pd.to_datetime(df_price['Date'])

#new_df = pd.merge(new_df, df_price, how='left', on='date')

In [194]:
df_price.tail(10)

Unnamed: 0,Date,High,Low,Upper bound,Lower bound,Close,date
577,4/22/19,269.679993,262.480011,337.099991,196.860008,262.75,2019-04-22
578,4/23/19,265.600006,255.75,332.000007,191.8125,263.899994,2019-04-23
579,4/24/19,265.320007,258.0,331.650009,193.5,258.660004,2019-04-24
580,4/25/19,259.0,246.070007,323.75,184.552505,247.630005,2019-04-25
581,4/26/19,246.679993,231.130005,308.349991,173.347504,235.139999,2019-04-26
582,4/29/19,243.979996,232.169998,304.974995,174.127499,241.470001,2019-04-29
583,4/30/19,244.210007,237.0,305.262509,177.75,238.690002,2019-04-30
584,5/1/19,240.0,231.5,300.0,173.625,234.009995,2019-05-01
585,5/2/19,247.130005,237.720001,308.912506,178.290001,244.100006,2019-05-02
586,5/3/19,256.609985,243.490005,320.762481,182.617504,255.03,2019-05-03


In [203]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,ratio (t-4),negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7)
315,2019-05-03,19,20,0.512821,26.0,23.0,0.469388,32.0,15.0,0.319149,...,0.333333,65.0,37.0,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842
314,2019-05-02,26,23,0.469388,32.0,15.0,0.319149,15.0,12.0,0.444444,...,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5
313,2019-05-01,32,15,0.319149,15.0,12.0,0.444444,22.0,11.0,0.333333,...,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429
312,2019-04-30,15,12,0.444444,22.0,11.0,0.333333,65.0,37.0,0.362745,...,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667
311,2019-04-29,22,11,0.333333,65.0,37.0,0.362745,87.0,46.0,0.345865,...,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429
310,2019-04-26,65,37,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842,...,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429,11.0,9.0,0.45
309,2019-04-25,87,46,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5,...,0.666667,3.0,4.0,0.571429,11.0,9.0,0.45,17.0,6.0,0.26087
308,2019-04-24,59,42,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429,...,0.571429,11.0,9.0,0.45,17.0,6.0,0.26087,12.0,9.0,0.428571
307,2019-04-23,16,16,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667,...,0.45,17.0,6.0,0.26087,12.0,9.0,0.428571,18.0,19.0,0.513514
306,2019-04-22,22,13,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429,...,0.26087,12.0,9.0,0.428571,18.0,19.0,0.513514,7.0,9.0,0.5625


In [198]:
pd.merge(df_price, new_df, on='date', how='outer')

In [201]:
test.tail(10)

Unnamed: 0,Date,High,Low,Upper bound,Lower bound,Close,date,negatives,positives,ratio,...,ratio (t-4),negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7)
577,4/22/19,269.679993,262.480011,337.099991,196.860008,262.75,2019-04-22,22.0,13.0,0.371429,...,0.26087,12.0,9.0,0.428571,18.0,19.0,0.513514,7.0,9.0,0.5625
578,4/23/19,265.600006,255.75,332.000007,191.8125,263.899994,2019-04-23,16.0,16.0,0.5,...,0.45,17.0,6.0,0.26087,12.0,9.0,0.428571,18.0,19.0,0.513514
579,4/24/19,265.320007,258.0,331.650009,193.5,258.660004,2019-04-24,59.0,42.0,0.415842,...,0.571429,11.0,9.0,0.45,17.0,6.0,0.26087,12.0,9.0,0.428571
580,4/25/19,259.0,246.070007,323.75,184.552505,247.630005,2019-04-25,87.0,46.0,0.345865,...,0.666667,3.0,4.0,0.571429,11.0,9.0,0.45,17.0,6.0,0.26087
581,4/26/19,246.679993,231.130005,308.349991,173.347504,235.139999,2019-04-26,65.0,37.0,0.362745,...,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429,11.0,9.0,0.45
582,4/29/19,243.979996,232.169998,304.974995,174.127499,241.470001,2019-04-29,22.0,11.0,0.333333,...,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667,3.0,4.0,0.571429
583,4/30/19,244.210007,237.0,305.262509,177.75,238.690002,2019-04-30,15.0,12.0,0.444444,...,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429,5.0,10.0,0.666667
584,5/1/19,240.0,231.5,300.0,173.625,234.009995,2019-05-01,32.0,15.0,0.319149,...,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5,22.0,13.0,0.371429
585,5/2/19,247.130005,237.720001,308.912506,178.290001,244.100006,2019-05-02,26.0,23.0,0.469388,...,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842,16.0,16.0,0.5
586,5/3/19,256.609985,243.490005,320.762481,182.617504,255.03,2019-05-03,19.0,20.0,0.512821,...,0.333333,65.0,37.0,0.362745,87.0,46.0,0.345865,59.0,42.0,0.415842


In [173]:
new_df['label'] = new_df['Close'].shift() > new_df['Close']
new_df['label'] = new_df['label'].astype(int)

In [174]:
new_df.head(10)

Unnamed: 0,date,Date_x,High_x,Low_x,Upper bound_x,Lower bound_x,Close_x,negatives,positives,ratio,...,Low_y,Upper bound_y,Lower bound_y,Close_y,Date,High,Low,Upper bound,Lower bound,Close
0,2017-01-03,1/3/17,220.330002,210.960007,275.412503,158.220005,216.990005,,,,...,210.960007,275.412503,158.220005,216.990005,1/3/17,220.330002,210.960007,275.412503,158.220005,216.990005
1,2017-01-04,1/4/17,228.0,214.309998,285.0,160.732498,226.990005,,,,...,214.309998,285.0,160.732498,226.990005,1/4/17,228.0,214.309998,285.0,160.732498,226.990005
2,2017-01-05,1/5/17,227.479996,221.949997,284.349995,166.462498,226.75,,,,...,221.949997,284.349995,166.462498,226.75,1/5/17,227.479996,221.949997,284.349995,166.462498,226.75
3,2017-01-06,1/6/17,230.309998,225.449997,287.887497,169.087498,229.009995,,,,...,225.449997,287.887497,169.087498,229.009995,1/6/17,230.309998,225.449997,287.887497,169.087498,229.009995
4,2017-01-09,1/9/17,231.919998,228.0,289.899997,171.0,231.279999,,,,...,228.0,289.899997,171.0,231.279999,1/9/17,231.919998,228.0,289.899997,171.0,231.279999
5,2017-01-10,1/10/17,232.0,226.889999,290.0,170.167499,229.869995,,,,...,226.889999,290.0,170.167499,229.869995,1/10/17,232.0,226.889999,290.0,170.167499,229.869995
6,2017-01-11,1/11/17,229.979996,226.679993,287.474995,170.009995,229.729996,,,,...,226.679993,287.474995,170.009995,229.729996,1/11/17,229.979996,226.679993,287.474995,170.009995,229.729996
7,2017-01-12,1/12/17,230.699997,225.580002,288.374996,169.185001,229.589996,,,,...,225.580002,288.374996,169.185001,229.589996,1/12/17,230.699997,225.580002,288.374996,169.185001,229.589996
8,2017-01-13,1/13/17,237.850006,229.589996,297.312507,172.192497,237.75,,,,...,229.589996,297.312507,172.192497,237.75,1/13/17,237.850006,229.589996,297.312507,172.192497,237.75
9,2017-01-17,1/17/17,239.960007,234.369995,299.950009,175.777496,235.580002,,,,...,234.369995,299.950009,175.777496,235.580002,1/17/17,239.960007,234.369995,299.950009,175.777496,235.580002


In [175]:
del new_df['Date']
#del new_df['Date_x']

In [176]:
new_df = new_df.drop(['High', 'Low', 'Upper bound', 'Lower bound', 'Close'], axis=1)

In [170]:
new_df.head()

Unnamed: 0,date,Date_x,High_x,Low_x,Upper bound_x,Lower bound_x,Close_x,negatives,positives,ratio,...,Low_y,Upper bound_y,Lower bound_y,Close_y,Date_y,High_y,Low_y.1,Upper bound_y.1,Lower bound_y.1,Close_y.1
0,2017-01-03,1/3/17,220.330002,210.960007,275.412503,158.220005,216.990005,,,,...,210.960007,275.412503,158.220005,216.990005,1/3/17,220.330002,210.960007,275.412503,158.220005,216.990005
1,2017-01-04,1/4/17,228.0,214.309998,285.0,160.732498,226.990005,,,,...,214.309998,285.0,160.732498,226.990005,1/4/17,228.0,214.309998,285.0,160.732498,226.990005
2,2017-01-05,1/5/17,227.479996,221.949997,284.349995,166.462498,226.75,,,,...,221.949997,284.349995,166.462498,226.75,1/5/17,227.479996,221.949997,284.349995,166.462498,226.75
3,2017-01-06,1/6/17,230.309998,225.449997,287.887497,169.087498,229.009995,,,,...,225.449997,287.887497,169.087498,229.009995,1/6/17,230.309998,225.449997,287.887497,169.087498,229.009995
4,2017-01-09,1/9/17,231.919998,228.0,289.899997,171.0,231.279999,,,,...,228.0,289.899997,171.0,231.279999,1/9/17,231.919998,228.0,289.899997,171.0,231.279999


In [105]:
new_df.to_csv(output_dataset_name)