In [23]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [21]:
# Minor cleaning: read the headline dataset, list every column it contains,
# then remove the 'Unnamed: 0', 'Headlines' and 'date' columns.
data = pd.read_csv('headlinesNLPdata.csv')
print(data.columns)  # full column list, printed before anything is dropped
data = data.drop(columns=['Unnamed: 0', 'Headlines', 'date'])
data.head()

Index(['Unnamed: 0', 'Headlines', 'ticker', 'date', 'return', 'direction',
       'headline_length', 'word_count', 'day_0', 'day_1', 'day_2', 'day_3',
       'day_4', 'day_5', 'day_6', 'neg_sentiment', 'neu_sentiment',
       'pos_sentiment', 'compound_sentiment', 'finbert_neg', 'finbert_neu',
       'finbert_pos', '01', '02', '03', '05', '06', '10', '19', '1h',
       'announces', 'bank', 'beat', 'beats', 'capital', 'ceo', 'covid', 'deal',
       'declares', 'dividend', 'earnings', 'energy', 'eps', 'estimates', 'ffo',
       'financial', 'gainers', 'group', 'guidance', 'high', 'launches', 'line',
       'losers', 'misses', 'new', 'notes', 'offering', 'pandemic', 'prices',
       'q2', 'reports', 'results', 'revenue', 'sales', 'says', 'sees',
       'shares', 'strong', 'study', 'technologies', 'therapeutics', 'vaccine'],
      dtype='object')


Unnamed: 0,ticker,return,direction,headline_length,word_count,day_0,day_1,day_2,day_3,day_4,...,revenue,sales,says,sees,shares,strong,study,technologies,therapeutics,vaccine
0,DSS,-0.08875,0,78,11,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BTBT,-0.243243,0,58,9,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,NURO,-0.071111,0,66,8,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,SCOR,0.066434,1,65,8,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,GBX,0.155157,1,39,5,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Creating dummy variables for the tickers:

In [22]:
# One-hot encode the ticker symbol: every distinct ticker becomes its own
# integer 0/1 column named 'tick_<SYMBOL>', replacing the 'ticker' column.
data = pd.get_dummies(
    data,
    columns=['ticker'],
    prefix='tick',
    dtype=int,
)
data.head()

Unnamed: 0,return,direction,headline_length,word_count,day_0,day_1,day_2,day_3,day_4,day_5,...,tick_ZM,tick_ZNTL,tick_ZS,tick_ZTCOF,tick_ZTO,tick_ZTR,tick_ZTS,tick_ZURVY,tick_ZYME,tick_ZYXI
0,-0.08875,0,78,11,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.243243,0,58,9,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.071111,0,66,8,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.066434,1,65,8,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.155157,1,39,5,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


Separating features:

In [24]:
# Separating features (X) from the regression target (y = 'return').
# 'direction' is removed from the features together with 'return': in the
# previewed rows it is 1 exactly when 'return' is positive, so it appears to be
# derived from the target and would leak it into the model.
# NOTE(review): confirm how 'direction' was built; it is still available in
# `data` if it is needed as a classification target.
X = data.drop(columns=['return', 'direction'])
y = data['return']

Standardizing numerical features:

In [25]:
# Standardizing numeric features (all features should be numeric anyways).
# Columns are picked with the generic 'number' selector instead of a hard-coded
# ['int64', 'float64'] list, so numeric columns of any width are standardized
# (e.g. int32 dummies created with `dtype=int` where the default int is 32-bit)
# rather than being silently left unscaled.
numeric_cols = X.select_dtypes(include='number').columns
scaler = StandardScaler()
# Scale into a copy so the unscaled feature frame X is left intact.
X_scaled = X.copy()
# NOTE: the scaler is fitted on every row of X, so X_scaled reflects the mean
# and standard deviation of the whole dataset.
X_scaled[numeric_cols] = scaler.fit_transform(X[numeric_cols])

Splitting our data into train and test sets:

In [26]:
# Split into train and test sets 80/20.
# The split is taken from the unscaled features X, and the scaler is then
# fitted on the training rows only. Fitting it on all rows before splitting
# would let the test rows influence the mean/std used to scale the training
# data (data leakage) and make the test score optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Copy the splits so the column assignments below write to independent frames.
X_train = X_train.copy()
X_test = X_test.copy()

# Learn the scaling statistics from the training rows, then apply those same
# statistics to the test rows. `scaler` is rebound to this train-only fit.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


Checking whether any categorical features remain:

In [29]:
# Checking for categorical data: first print the dtype of every column, then
# print a preview of only those columns whose dtype is object or category.
print(data.dtypes)
print(
    data.select_dtypes(include=['object', 'category']).head()
)

return             float64
direction            int64
headline_length      int64
word_count           int64
day_0                int64
                    ...   
tick_ZTR             int64
tick_ZTS             int64
tick_ZURVY           int64
tick_ZYME            int64
tick_ZYXI            int64
Length: 3918, dtype: object
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


There are no remaining categorical columns; our dataset consists entirely of numeric features.

Checking the ranges of numeric features:

In [30]:
# Checking numeric feature ranges: keep only the 'min' and 'max' rows of the
# summary statistics and transpose so each feature is one row.
data.describe().loc[['min', 'max']].T


Unnamed: 0,min,max
return,-0.581818,3.18136
direction,0.000000,1.00000
headline_length,18.000000,160.00000
word_count,3.000000,21.00000
day_0,0.000000,1.00000
...,...,...
tick_ZTR,0.000000,1.00000
tick_ZTS,0.000000,1.00000
tick_ZURVY,0.000000,1.00000
tick_ZYME,0.000000,1.00000


The features span a mixture of ranges, which is why we use StandardScaler to rescale each one to a mean of 0 and a standard deviation of 1.