Load in our labeled data

In [1]:
import pandas as pd
from dateutil import parser

dataset = 'data/output/vader/vader_tesla.xlsx'
output_dataset_name = 'ML_Data/TSLA_vader.csv'
price_data = 'data/price/TSLA.xlsx'

df = pd.read_excel(dataset)
df.columns = ['date', 'comment', 'label', 'compound_score']
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

In [2]:
del df['comment']
del df['compound_score']

In [3]:
df.head()

Unnamed: 0,date,label
0,2019-05-03,1
1,2019-05-03,0
2,2019-05-03,0
3,2019-05-03,1
4,2019-05-03,1


Loop through all comments and create a new DataFrame that hosts the number of positives and negatives for each day.

In [4]:
rows = []
prices = []
grouped = df.groupby('date')
for date, group in grouped:
    positives = len(group[group['label'] == 1])
    negatives = len(group[group['label'] == 0])
        
    new_row = {
        'date': date,
        'positives': positives,
        'negatives': negatives,
        'ratio': positives / (positives + negatives)
    }
    
    rows.append(new_row)

In [5]:
new_df = pd.DataFrame(rows)

In [6]:
new_df.tail()

Unnamed: 0,date,negatives,positives,ratio
311,2019-04-29,20,13,0.393939
312,2019-04-30,15,12,0.444444
313,2019-05-01,33,14,0.297872
314,2019-05-02,24,25,0.510204
315,2019-05-03,24,15,0.384615


In [7]:
new_df = new_df.sort_values(['date'],ascending=[False])

In [8]:
new_df.head(5)

Unnamed: 0,date,negatives,positives,ratio
315,2019-05-03,24,15,0.384615
314,2019-05-02,24,25,0.510204
313,2019-05-01,33,14,0.297872
312,2019-04-30,15,12,0.444444
311,2019-04-29,20,13,0.393939


Loop through new DataFrame and add 2 lags to each day

In [9]:
lags = range(1, 8)

In [10]:
new_df = new_df.assign(**{
    '{} (t-{})'.format(col, t): new_df[col].shift(t)
    for t in lags
    for col in new_df
})

In [11]:
del new_df['date (t-1)']
del new_df['date (t-2)']
del new_df['date (t-3)']
del new_df['date (t-4)']
del new_df['date (t-5)']
del new_df['date (t-6)']
del new_df['date (t-7)']

KeyError: 'date (t-8)'

In [12]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,ratio (t-4),negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7)
315,2019-05-03,24,15,0.384615,,,,,,,...,,,,,,,,,,
314,2019-05-02,24,25,0.510204,24.0,15.0,0.384615,,,,...,,,,,,,,,,
313,2019-05-01,33,14,0.297872,24.0,25.0,0.510204,24.0,15.0,0.384615,...,,,,,,,,,,
312,2019-04-30,15,12,0.444444,33.0,14.0,0.297872,24.0,25.0,0.510204,...,,,,,,,,,,
311,2019-04-29,20,13,0.393939,15.0,12.0,0.444444,33.0,14.0,0.297872,...,0.384615,,,,,,,,,
310,2019-04-26,64,38,0.372549,20.0,13.0,0.393939,15.0,12.0,0.444444,...,0.510204,24.0,15.0,0.384615,,,,,,
309,2019-04-25,92,41,0.308271,64.0,38.0,0.372549,20.0,13.0,0.393939,...,0.297872,24.0,25.0,0.510204,24.0,15.0,0.384615,,,
308,2019-04-24,69,32,0.316832,92.0,41.0,0.308271,64.0,38.0,0.372549,...,0.444444,33.0,14.0,0.297872,24.0,25.0,0.510204,24.0,15.0,0.384615
307,2019-04-23,20,12,0.375,69.0,32.0,0.316832,92.0,41.0,0.308271,...,0.393939,15.0,12.0,0.444444,33.0,14.0,0.297872,24.0,25.0,0.510204
306,2019-04-22,21,14,0.4,20.0,12.0,0.375,69.0,32.0,0.316832,...,0.372549,20.0,13.0,0.393939,15.0,12.0,0.444444,33.0,14.0,0.297872


In [13]:
new_df.fillna(0, inplace=True)

In [14]:
df_price = pd.read_excel(price_data, skiprows=[0])
df_price['date'] = pd.to_datetime(df_price['Date'])

new_df = pd.merge(new_df, df_price, how='inner', on='date')

In [15]:
df_price.head()

Unnamed: 0,Date,High,Low,Upper bound,Lower bound,Close,date
0,1/3/17,220.330002,210.960007,275.412503,158.220005,216.990005,2017-01-03
1,1/4/17,228.0,214.309998,285.0,160.732498,226.990005,2017-01-04
2,1/5/17,227.479996,221.949997,284.349995,166.462498,226.75,2017-01-05
3,1/6/17,230.309998,225.449997,287.887497,169.087498,229.009995,2017-01-06
4,1/9/17,231.919998,228.0,289.899997,171.0,231.279999,2017-01-09


In [16]:
new_df['label'] = new_df['Close'].shift() > new_df['Close']
new_df['label'] = new_df['label'].astype(int)

In [17]:
new_df.head(10)

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,negatives (t-7),positives (t-7),ratio (t-7),Date,High,Low,Upper bound,Lower bound,Close,label
0,2019-05-03,24,15,0.384615,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5/3/19,256.609985,243.490005,320.762481,182.617504,255.03,0
1,2019-05-02,24,25,0.510204,24.0,15.0,0.384615,0.0,0.0,0.0,...,0.0,0.0,0.0,5/2/19,247.130005,237.720001,308.912506,178.290001,244.100006,1
2,2019-05-01,33,14,0.297872,24.0,25.0,0.510204,24.0,15.0,0.384615,...,0.0,0.0,0.0,5/1/19,240.0,231.5,300.0,173.625,234.009995,1
3,2019-04-30,15,12,0.444444,33.0,14.0,0.297872,24.0,25.0,0.510204,...,0.0,0.0,0.0,4/30/19,244.210007,237.0,305.262509,177.75,238.690002,0
4,2019-04-29,20,13,0.393939,15.0,12.0,0.444444,33.0,14.0,0.297872,...,0.0,0.0,0.0,4/29/19,243.979996,232.169998,304.974995,174.127499,241.470001,0
5,2019-04-26,64,38,0.372549,20.0,13.0,0.393939,15.0,12.0,0.444444,...,0.0,0.0,0.0,4/26/19,246.679993,231.130005,308.349991,173.347504,235.139999,1
6,2019-04-25,92,41,0.308271,64.0,38.0,0.372549,20.0,13.0,0.393939,...,0.0,0.0,0.0,4/25/19,259.0,246.070007,323.75,184.552505,247.630005,0
7,2019-04-24,69,32,0.316832,92.0,41.0,0.308271,64.0,38.0,0.372549,...,24.0,15.0,0.384615,4/24/19,265.320007,258.0,331.650009,193.5,258.660004,0
8,2019-04-23,20,12,0.375,69.0,32.0,0.316832,92.0,41.0,0.308271,...,24.0,25.0,0.510204,4/23/19,265.600006,255.75,332.000007,191.8125,263.899994,0
9,2019-04-22,21,14,0.4,20.0,12.0,0.375,69.0,32.0,0.316832,...,33.0,14.0,0.297872,4/22/19,269.679993,262.480011,337.099991,196.860008,262.75,1


In [18]:
del new_df['Date']
#del new_df['Date_x']

In [19]:
new_df = new_df.drop(['High', 'Low', 'Upper bound', 'Lower bound', 'Close'], axis=1)

In [20]:
new_df.head()

Unnamed: 0,date,negatives,positives,ratio,negatives (t-1),positives (t-1),ratio (t-1),negatives (t-2),positives (t-2),ratio (t-2),...,negatives (t-5),positives (t-5),ratio (t-5),negatives (t-6),positives (t-6),ratio (t-6),negatives (t-7),positives (t-7),ratio (t-7),label
0,2019-05-03,24,15,0.384615,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2019-05-02,24,25,0.510204,24.0,15.0,0.384615,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,2019-05-01,33,14,0.297872,24.0,25.0,0.510204,24.0,15.0,0.384615,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,2019-04-30,15,12,0.444444,33.0,14.0,0.297872,24.0,25.0,0.510204,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2019-04-29,20,13,0.393939,15.0,12.0,0.444444,33.0,14.0,0.297872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [21]:
new_df.to_csv(output_dataset_name)