In [23]:
from catboost import CatBoostClassifier
from itertools import chain
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud

import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import seaborn as sns
import time
import warnings

%matplotlib inline
np.random.seed(2018)
stop = set(stopwords.words('english'))
py.init_notebook_mode(connected=True)
warnings.filterwarnings('ignore')

In [2]:
# turn off analytics for training, or running out of memory.
ANALYTICS = False

### Load Data

In [3]:
market_train_df = pd.read_csv('marketdata.csv')
news_train_df = pd.read_csv('newsdata.csv')

### Market Data

The data includes a subset of US-listed instruments. The set of included instruments changes daily and is determined based on the amount traded and the availability of information. This means that there may be instruments that enter and leave this subset of data. There may therefore be gaps in the data provided, and this does not necessarily imply that that data does not exist (those rows are likely not included due to the selection criteria).

The marketdata contains a variety of returns calculated over different timespans. All of the returns in this set of marketdata have these properties:

Returns are always calculated either open-to-open (from the opening time of one trading day to the open of another) or close-to-close (from the closing time of one trading day to the open of another).
Returns are either raw, meaning that the data is not adjusted against any benchmark, or market-residualized (Mktres), meaning that the movement of the market as a whole has been accounted for, leaving only movements inherent to the instrument.
Returns can be calculated over any arbitrary interval. Provided here are 1 day and 10 day horizons.
Returns are tagged with 'Prev' if they are backwards looking in time, or 'Next' if forwards looking.

In [4]:
print(market_train_df.shape)

(4072956, 17)


In [5]:
if ANALYTICS:
    market_train_df.head()

In [6]:
if ANALYTICS:
    market_train_df.describe()

From the price history of 5 randomly picked assets below, we learn that securities could have dramatically different lengths of history in this data set. Learning from securities with longer history is probably more meaningful.

In [7]:
if ANALYTICS:
    data = []
    for asset in np.random.choice(market_train_df['assetName'].unique(), 5):
        df = market_train_df[(market_train_df['assetName'] == asset)]

        data.append(go.Scatter(
            x = df['time'].dt.strftime(date_format='%Y-%m-%d').values,
            y = df['close'].values,
            name = asset
        ))
    layout = go.Layout(dict(title = "Price History of 5 Randomly Picked Assets",
                      yaxis = dict(title = 'Price (USD)'),
                      ),legend=dict(orientation="h"))
    py.iplot(dict(data=data, layout=layout), filename='basic-line')

How does the universe look? Do we see the impact of major news on the market?

In [8]:
if ANALYTICS:
    data = []
    for i in [0.1, 0.25, 0.5, 0.75, 0.9]:
        df = market_train_df.groupby('time')['close'].quantile(i).reset_index()

        data.append(go.Scatter(
            x = df['time'].dt.strftime(date_format='%Y-%m-%d').values,
            y = df['close'].values,
            name = f'{i} quantile'
        ))
    layout = go.Layout(dict(title = "Closing Prices in Quantiles",
                      yaxis = dict(title = 'Price (USD)'),
                      ), legend=dict(orientation="h"),
        annotations=[
            dict(
                x='2007-02-01 00:00:00+0000',
                y=75,
                xref='x',
                yref='y',
                text='<b>Housing crisis</b>',
                showarrow=True,
                font=dict(
                    size=12,
                    color='#ffffff'
                ),
                align='center',
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor='#636363',
                ax=0,
                ay=-30,
                borderpad=4,
                bgcolor='#0093D1',
                opacity=0.8
            ),
            dict(
                x='2008-09-01 22:00:00+0000',
                y=70,
                xref='x',
                yref='y',
                text='<b>Collapse of Lehman Brothers</b>',
                showarrow=True,
                font=dict(
                    size=12,
                    color='#ffffff'
                ),
                align='center',
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor='#636363',
                ax=0,
                ay=-70,
                borderpad=4,
                bgcolor='#0093D1',
                opacity=0.8
            ),
            dict(
                x='2011-08-01 22:00:00+0000',
                y=75,
                xref='x',
                yref='y',
                text='<b>Black Monday</b>',
                showarrow=True,
                font=dict(
                    size=12,
                    color='#ffffff'
                ),
                align='center',
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor='#636363',
                ax=0,
                ay=-100,
                borderpad=4,
                bgcolor='#0093D1',
                opacity=0.8
            ),
            dict(
                x='2015-06-01 00:00:00+0000',
                y=100,
                xref='x',
                yref='y',
                text='<b>Stock market selloff</b>',
                showarrow=True,
                font=dict(
                    size=12,
                    color='#ffffff'
                ),
                align='center',
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor='#636363',
                ax=0,
                ay=-60,
                borderpad=4,
                bgcolor='#0093D1',
                opacity=0.8
            ),
            dict(
                x='2015-09-01 00:00:00+0000',
                y=100,
                xref='x',
                yref='y',
                text='<b>Oil prices crash</b>',
                showarrow=True,
                font=dict(
                    size=12,
                    color='#ffffff'
                ),
                align='center',
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor='#636363',
                ax=0,
                ay=-100,
                borderpad=4,
                bgcolor='#0093D1',
                opacity=0.8
            ),
            dict(
                x='2016-06-23 00:00:00+0000',
                y=100,
                xref='x',
                yref='y',
                text='<b>Brexit</b>',
                showarrow=True,
                font=dict(
                    size=12,
                    color='#ffffff'
                ),
                align='center',
                arrowhead=2,
                arrowsize=1,
                arrowwidth=2,
                arrowcolor='#636363',
                ax=0,
                ay=-30,
                borderpad=4,
                bgcolor='#0093D1',
                opacity=0.8
            )
        ])
    py.iplot(dict(data=data, layout=layout), filename='basic-line')

We see that some events only left a small dent on stock prices, while others had more profound impact on the market. 

### Data Integrity Check

In [9]:
market_train_df['price_diff'] = market_train_df['close'] - market_train_df['open']

In [10]:
if ANALYTICS:
    grouped = market_train_df.groupby('time').agg({'price_diff': ['std', 'min', 'max']}).reset_index()
    grouped['price_diff'].describe()

In [11]:
if ANALYTICS:
    g = grouped.sort_values(('price_diff', 'std'), ascending=False)[:10]
    g['min_text'] = 'Maximum price drop: ' + (g['price_diff']['min']).astype(str)
    trace = go.Scatter(
        x = g['time'].dt.strftime(date_format='%Y-%m-%d').values,
        y = g['price_diff']['std'].values,
        mode='markers',
        marker=dict(
            size = g['price_diff']['std'].values,
            color = g['price_diff']['std'].values,
            colorscale='Portland',
            showscale=True
        ),
        text = g['min_text'].values
    )
    data = [trace]

    layout= go.Layout(
        autosize= True,
        title= 'Top 10 Outliers of 1-Day Price Change by Month',
        hovermode= 'closest',
        yaxis=dict(
            title= 'std',
            ticklen= 5,
            gridwidth= 2,
        ),
        showlegend= False
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig,filename='top10_std_chg')

There's no way that the price of a stock can drop by $9948.99 in a day. The others also look suspicious. Let's first find out what securities they are and then verify them through other source. 

In [12]:
if ANALYTICS:
    market_train_df.sort_values('price_diff')[:10]

Put a ceiling and floor on these outliers with mean open or close price. 

In [13]:
market_train_df['close_to_open'] =  np.abs(market_train_df['close'] / market_train_df['open'])

In [14]:
if ANALYTICS:
    print(f"Number of securities whose prices doubled in one day: {(market_train_df['close_to_open'] >= 2).sum()}.")
    print(f"Number of securities whose prices fell by more then 50%: {(market_train_df['close_to_open'] <= 0.5).sum()}.")

In [15]:
market_train_df['assetName_mean_open'] = market_train_df.groupby('assetName')['open'].transform('mean')
market_train_df['assetName_mean_close'] = market_train_df.groupby('assetName')['close'].transform('mean')

# if open price is too far from mean open price for this company, replace it. Otherwise replace close price.
for i, row in market_train_df.loc[market_train_df['close_to_open'] >= 2].iterrows():
    if np.abs(row['assetName_mean_open'] - row['open']) > np.abs(row['assetName_mean_close'] - row['close']):
        market_train_df.iloc[i,5] = row['assetName_mean_open']
    else:
        market_train_df.iloc[i,4] = row['assetName_mean_close']
        
for i, row in market_train_df.loc[market_train_df['close_to_open'] <= 0.5].iterrows():
    if np.abs(row['assetName_mean_open'] - row['open']) > np.abs(row['assetName_mean_close'] - row['close']):
        market_train_df.iloc[i,5] = row['assetName_mean_open']
    else:
        market_train_df.iloc[i,4] = row['assetName_mean_close']

Now let's try to rebuild the graph and make sure all the outliers are taken care.

In [16]:
if ANALYTICS:
    market_train_df['price_diff'] = market_train_df['close'] - market_train_df['open']
    grouped = market_train_df.groupby(['time']).agg({'price_diff': ['std', 'min']}).reset_index()
    g = grouped.sort_values(('price_diff', 'std'), ascending=False)[:10]
    g['min_text'] = 'Maximum price drop: ' + (-1 * np.round(g['price_diff']['min'], 2)).astype(str)
    trace = go.Scatter(
        x = g['time'].dt.strftime(date_format='%Y-%m-%d').values,
        y = g['price_diff']['std'].values,
        mode='markers',
        marker=dict(
            size = g['price_diff']['std'].values * 5,
            color = g['price_diff']['std'].values,
            colorscale='Portland',
            showscale=True
        ),
        text = g['min_text'].values
    )
    data = [trace]

    layout= go.Layout(
        autosize= True,
        title= 'Top 10 Outliers of 1-Day Price Change by Month',
        hovermode= 'closest',
        yaxis=dict(
            title= 'Std',
            ticklen= 5,
            gridwidth= 2,
        ),
        showlegend= False
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig,filename='scatter2010')

In [17]:
market_train_df.drop(['price_diff', 'assetName_mean_open', 'assetName_mean_close'], axis=1, inplace=True)

Now let's take a look at the target variable.

In [18]:
if ANALYTICS:
    data = []
    for i in [0.1, 0.25, 0.5, 0.75, 0.9]:
        df = market_train_df.groupby('time')['returnsOpenNextMktres10'].quantile(i).reset_index()

        data.append(go.Scatter(
            x = df['time'].dt.strftime(date_format='%Y-%m-%d').values,
            y = df['returnsOpenNextMktres10'].values,
            name = f'{i} quantile'
        ))
    layout = go.Layout(dict(title = "returnsOpenNextMktres10 by Quantiles",
                      yaxis = dict(title = 'Daily Return'),
                      ), legend=dict(orientation="h"))
    py.iplot(dict(data=data, layout=layout), filename='basic-line')

We see a wide dispersion of 10-day forward-looking market residual returns during major market events. Other times, the returns fluctuate around long-term mean.

In [19]:
if ANALYTICS:
    data = []
    for col in ['returnsOpenNextMktres10',
            'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
           'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
           'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
           'returnsClosePrevMktres10', 'returnsOpenPrevMktres10']:
        df = market_train_df.groupby('time')[col].mean().reset_index()
        data.append(go.Scatter(
            x = df['time'].dt.strftime(date_format='%Y-%m-%d').values,
            y = df[col].values,
            name = col
        ))

    layout = go.Layout(dict(title = "Average Returns Through Time (Before)",
                      yaxis = dict(title = 'Daily Return'), 
                      legend=dict(orientation="h")))
    py.iplot(dict(data=data, layout=layout), filename='basic-line')

I see a lot of people throw away data prior to 2009 because the returns were quite extreme during the financial crisis. I'm not sure whether that's the best thing to do. In my opinion, a better way to mitigate the impact of extreme data points around 2008 is to clip the data with a reasonable range.

In [20]:
market_train_df['returnsOpenNextMktres10'] = market_train_df['returnsOpenNextMktres10'].clip(-0.2, 0.2)
market_train_df['returnsClosePrevRaw1'] = market_train_df['returnsClosePrevRaw1'].clip(-0.1, 0.1)
market_train_df['returnsOpenPrevRaw1'] = market_train_df['returnsOpenPrevRaw1'].clip(-0.1, 0.1)
market_train_df['returnsClosePrevMktres1'] = market_train_df['returnsClosePrevMktres1'].clip(-0.1, 0.1)
market_train_df['returnsOpenPrevMktres1'] = market_train_df['returnsOpenPrevMktres1'].clip(-0.1, 0.1)
market_train_df['returnsClosePrevRaw10'] = market_train_df['returnsClosePrevRaw10'].clip(-0.2, 0.2)
market_train_df['returnsOpenPrevRaw10'] = market_train_df['returnsOpenPrevRaw10'].clip(-0.2, 0.2)
market_train_df['returnsClosePrevMktres10'] = market_train_df['returnsClosePrevMktres10'].clip(-0.2, 0.2)
market_train_df['returnsOpenPrevMktres10'] = market_train_df['returnsOpenPrevMktres10'].clip(-0.2, 0.2)

In [21]:
if ANALYTICS:
    data = []
    for col in ['returnsOpenNextMktres10', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
           'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
           'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
           'returnsClosePrevMktres10', 'returnsOpenPrevMktres10']:
        df = market_train_df.groupby('time')[col].mean().reset_index()
        data.append(go.Scatter(
            x = df['time'].dt.strftime(date_format='%Y-%m-%d').values,
            y = df[col].values,
            name = col
        ))

    layout = go.Layout(dict(title = "Average Returns Through Time (After)",
                      yaxis = dict(title = 'Price (USD)'),
                      legend=dict(orientation="h")))
    py.iplot(dict(data=data, layout=layout), filename='basic-line')

The target variable looks much more reasonable now. This prevents extreme values from distorting our model. Let's look at the other return variables.

### News Data

In [None]:
if ANALYTICS:
    news_train_df.head()

In [None]:
if ANALYTICS:
    print(news_train_df.shape)

Let's see the wordcloud of the first 100000 headlines.

In [None]:
if ANALYTICS:
    text = ' '.join(news_train_df['headline'].str.lower().values[:100000])
    wordcloud = WordCloud(max_font_size=None, stopwords=stop, background_color='white',
                          width=1200, height=1000).generate(text)
    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud)
    plt.title('Top words in headline')
    plt.axis("off")
    plt.show()

Disproportionate urgency code:

In [None]:
if ANALYTICS:
    (news_train_df['urgency'].value_counts() / 1000000).plot('bar');
    plt.xticks(rotation=0);
    plt.title('Urgency counts (million)');

In [None]:
if ANALYTICS:
    sns.violinplot(x=news_train_df['wordCount'])

Most of the news articles have 2500 or less words.

In [None]:
if ANALYTICS:
    sns.violinplot(x=news_train_df['sentenceCount'])

Most of the news articles have less than 100 sentences.

In [None]:
if ANALYTICS:
    news_train_df['provider'].value_counts().head(10)

It isn't surprising that Reuters is the largest news provider in this dataset.

In [None]:
if ANALYTICS:
    (news_train_df['headlineTag'].value_counts() / 1000)[:10].plot('barh');
    plt.title('headlineTag counts (thousands)');

Most of the news articles don't have a headline tag.

In [None]:
if ANALYTICS:
    pal = sns.cubehelix_palette(3, rot=-.5, dark=.3)
    sns.violinplot(data=news_train_df[['sentimentNegative', 'sentimentNeutral', 'sentimentPositive']], palette=pal, inner="points")

The sentiments are bipolar, with a large portion concentrated around neutrality and a small portion clustered around the apex.

In [None]:
if ANALYTICS:
    for i, j in zip([-1, 0, 1], ['negative', 'neutral', 'positive']):
        df_sentiment = news_train_df.loc[news_train_df['sentimentClass'] == i, 'assetName']
        print(f'Top mentioned companies for {j} sentiment are:')
        print(df_sentiment.value_counts().head(5))
        print('')

We see that a number of companies such as Apple, Citigroup, Barclays show up in more than one sentiment class. It makes sense as these large companies get more attention from the press. From time to time, they could receive very different sentiments.

## Modelling

The VM doesn't have enough memory. Temporarily shrinking the dataset. 

In [None]:
market_train_df = market_train_df.tail(3000000)
news_train_df = news_train_df.tail(6000000)

For market data, we extract more features from the original dataset directly. 

In [22]:
def extract_market_features(market_train_df):
    # downcast to reduce memory footprint
    market_train_df['volume'] = market_train_df['volume'].astype(np.float32)
    market_train_df['open'] = market_train_df['open'].astype(np.float32)
    market_train_df['close'] = market_train_df['close'].astype(np.float32)
    market_train_df['returnsClosePrevRaw1'] = market_train_df['returnsClosePrevRaw1'].astype(np.float32)
    market_train_df['returnsClosePrevRaw10'] = market_train_df['returnsClosePrevRaw10'].astype(np.float32)
    market_train_df['returnsClosePrevMktres1'] = market_train_df['returnsClosePrevMktres1'].astype(np.float32)
    market_train_df['returnsClosePrevMktres10'] = market_train_df['returnsClosePrevMktres10'].astype(np.float32)
    market_train_df['returnsOpenPrevRaw1'] = market_train_df['returnsOpenPrevRaw1'].astype(np.float32)
    market_train_df['returnsOpenPrevRaw10'] = market_train_df['returnsOpenPrevRaw10'].astype(np.float32)
    market_train_df['returnsOpenPrevMktres1'] = market_train_df['returnsOpenPrevMktres1'].astype(np.float32)
    market_train_df['returnsOpenPrevMktres10'] = market_train_df['returnsOpenPrevMktres10'].astype(np.float32)
    
    market_train_df['close_to_open'] = market_train_df['close'] / market_train_df['open']
    market_train_df['volume_to_mean'] = (market_train_df['volume'] / market_train_df['volume'].mean()).astype(np.float32)
    market_train_df['returns_close_to_open_prev_raw1'] = market_train_df['returnsClosePrevRaw1'] / market_train_df['returnsOpenPrevRaw1']
    market_train_df['returns_close_to_open_prev_raw10'] = market_train_df['returnsClosePrevRaw10'] / market_train_df['returnsOpenPrevRaw10']
    market_train_df['returns_close_to_open_prev_mktres1'] = market_train_df['returnsClosePrevMktres1'] / market_train_df['returnsOpenPrevMktres1']
    market_train_df['returns_close_to_open_prev_mktres10'] = market_train_df['returnsClosePrevMktres10'] / market_train_df['returnsOpenPrevMktres10']
    market_train_df['returns_prev_open_raw1_to_open_raw10'] = market_train_df['returnsOpenPrevRaw1'] / market_train_df['returnsOpenPrevRaw10']
    market_train_df['returns_prev_close_raw1_to_close_raw10'] = market_train_df['returnsClosePrevRaw1'] / market_train_df['returnsClosePrevRaw10']
    market_train_df['returns_prev_open_mktres1_to_open_mktres10'] = market_train_df['returnsOpenPrevMktres1'] / market_train_df['returnsOpenPrevMktres10']
    market_train_df['returns_prev_close_mktres1_to_close_mktres10'] = market_train_df['returnsClosePrevMktres1'] / market_train_df['returnsClosePrevMktres10']
    
    return market_train_df

* Convert "headline" from string to vector representation using TF-IDF.
* Create an one-hot representation of "subjects" simply by counting the top 20 subjects ordered by term frequency. 

In [24]:
def transform_headline(df, tfidf_vectorizer=None):
    if tfidf_vectorizer:
        headlines = tfidf_vectorizer.transform(df['headline']).toarray()
    else:
        tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 1), 
                                           min_df=10, max_features=20, dtype=np.float32)
        headlines = tfidf_vectorizer.fit_transform(df['headline']).toarray()
    
    for i in range(20):
        column_name = 'headline' + str(i+1)
        if (i < headlines.shape[1]):
            df[column_name] = headlines[:,i]
        else:
            df[column_name] = 0

    df.drop('headline', axis=1, inplace=True)
    
    return df, tfidf_vectorizer

def transform_subjects(df, cnt_vectorizer=None):
    if cnt_vectorizer:
        subjects = cnt_vectorizer.transform(df['subjects']).toarray()
    else:
        cnt_vectorizer = CountVectorizer(lowercase=True, ngram_range=(1, 1), min_df=10, 
                                         max_features=20, dtype=bool)
        subjects = cnt_vectorizer.fit_transform(df['subjects']).toarray()
    
    for i in range(20):
        column_name = 'subject' + str(i+1)
        if (i < subjects.shape[1]):
            df[column_name] = subjects[:,i]
        else:
            df[column_name] = 0
    
    df.drop('subjects', axis=1, inplace=True)
    
    return df, cnt_vectorizer

For news data, however, we may have multiple news articles associated with a stock on a given day. We collection a number of key statistics from these articles and aggregate them. 

In [25]:
def extract_news_features(news_train_df, tfidf_vectorizer=None, cnt_vectorizer=None):
    news_cols_agg = {
        'urgency': ['count'],
        'takeSequence': ['max'],
        'bodySize': ['mean', 'std'],
        'wordCount': ['mean', 'std'],
        'sentenceCount': ['mean', 'std'],
        'companyCount': ['mean', 'std'],
        'marketCommentary': ['mean'],
        'relevance': ['mean', 'std'],
        'sentimentNegative': ['min', 'max', 'mean', 'std'],
        'sentimentNeutral': ['min', 'max', 'mean', 'std'],
        'sentimentPositive': ['min', 'max', 'mean', 'std'],
        'sentimentWordCount': ['min', 'max', 'mean', 'std'],
        'noveltyCount12H': ['mean', 'std'],
        'noveltyCount24H': ['mean', 'std'],
        'noveltyCount3D': ['mean', 'std'],
        'noveltyCount5D': ['mean', 'std'],
        'noveltyCount7D': ['mean', 'std'],
        'volumeCounts12H': ['mean', 'std'],
        'volumeCounts24H': ['mean', 'std'],
        'volumeCounts3D': ['mean', 'std'],
        'volumeCounts5D': ['mean', 'std'],
        'volumeCounts7D': ['mean', 'std'],
        'headline1': ['mean'],
        'headline2': ['mean'],
        'headline3': ['mean'],
        'headline4': ['mean'],
        'headline5': ['mean'],
        'headline6': ['mean'],
        'headline7': ['mean'],
        'headline8': ['mean'],
        'headline9': ['mean'],
        'headline10': ['mean'],
        'headline11': ['mean'],
        'headline12': ['mean'],
        'headline13': ['mean'],
        'headline14': ['mean'],
        'headline15': ['mean'],
        'headline16': ['mean'],
        'headline17': ['mean'],
        'headline18': ['mean'],
        'headline19': ['mean'],
        'headline20': ['mean'],
        'subject1': ['mean'],
        'subject2': ['mean'],
        'subject3': ['mean'],
        'subject4': ['mean'],
        'subject5': ['mean'],
        'subject6': ['mean'],
        'subject7': ['mean'],
        'subject8': ['mean'],
        'subject9': ['mean'],
        'subject10': ['mean'],
        'subject11': ['mean'],
        'subject12': ['mean'],
        'subject13': ['mean'],
        'subject14': ['mean'],
        'subject15': ['mean'],
        'subject16': ['mean'],
        'subject17': ['mean'],
        'subject18': ['mean'],
        'subject19': ['mean'],
        'subject20': ['mean']
    }
    
    # text data -> vector representation
    news_train_df, tfidf_vectorizer = transform_headline(news_train_df, tfidf_vectorizer)
    news_train_df, cnt_vectorizer = transform_subjects(news_train_df, cnt_vectorizer)
    
    # Fix asset codes (str -> list)
    news_train_df['assetCodes'] = news_train_df['assetCodes'].str.findall(f"'([\w\./]+)'")    
    
    # Expand assetCodes
    assetCodes_expanded = list(chain(*news_train_df['assetCodes']))
    assetCodes_index = news_train_df.index.repeat( news_train_df['assetCodes'].apply(len) )

    assert len(assetCodes_index) == len(assetCodes_expanded)
    df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})

    # Create expandaded news (will repeat every assetCodes' row)
    news_cols = ['time', 'assetCodes'] + sorted(news_cols_agg.keys())
    news_train_df_expanded = pd.merge(df_assetCodes, news_train_df[news_cols], left_on='level_0', right_index=True, suffixes=(['','_old']))

    # Free memory
    del news_train_df, df_assetCodes

    # Aggregate numerical news features
    news_train_df_aggregated = news_train_df_expanded.groupby(['time', 'assetCode']).agg(news_cols_agg)
    
    # Free memory
    del news_train_df_expanded

    # Convert to float32 to save memory
    news_train_df_aggregated = news_train_df_aggregated.apply(np.float32)

    # Flat columns
    news_train_df_aggregated.columns = ['_'.join(col).strip() for col in news_train_df_aggregated.columns.values]
    
    return news_train_df_aggregated, tfidf_vectorizer, cnt_vectorizer

In [26]:
def merge_market_news(market_train_df, news_train_df):
    market_train_df = market_train_df.join(news_train_df, on=['time', 'assetCode'])
    
    return market_train_df

In [33]:
def label_encode(series, min_count):
    vc = series.value_counts()
    le = {c:i for i, c in enumerate(vc.index[vc >= min_count])}
    return le

def get_xy(market_train_df, news_train_df, le=None, tfidf_vectorizer=None, cnt_vectorizer=None):
    # Split date into before and after 22h (the time used in train data)
    # E.g: 2007-03-07 23:26:39+00:00 -> 2007-03-08 00:00:00+00:00 (next day)
    #      2009-02-25 21:00:50+00:00 -> 2009-02-25 00:00:00+00:00 (current day)
    news_train_df['time'] = (pd.to_datetime(news_train_df['time']) - np.timedelta64(22,'h')).dt.ceil('1D')

    # Round time of market_train_df to 0h of curret day
    market_train_df['time'] = pd.to_datetime(market_train_df['time']).dt.floor('1D')

    market_train_df = extract_market_features(market_train_df)
    news_train_df, tfidf_vectorizer, cnt_vectorizer = extract_news_features(news_train_df, 
                                                                            tfidf_vectorizer, 
                                                                            cnt_vectorizer)
    market_train_df = merge_market_news(market_train_df, news_train_df)

    # If not label-encoder... encode assetCode
    if le is None:
        le_assetCode = label_encode(market_train_df['assetCode'], min_count=10)
        le_assetName = label_encode(market_train_df['assetName'], min_count=5)
    else:
        # 'unpack' label encoders
        le_assetCode, le_assetName = le
        
    market_train_df['assetCode'] = market_train_df['assetCode'].map(le_assetCode).fillna(-1).astype(np.int16)
    market_train_df['assetName'] = market_train_df['assetName'].map(le_assetName).fillna(-1).astype(np.int16)
    market_train_df['dayofweek'] = market_train_df.time.dt.dayofweek.astype(np.int8)
    market_train_df['month'] = market_train_df.time.dt.month.astype(np.int8)
    
    try:
        y = market_train_df['returnsOpenNextMktres10'].astype(np.float32)
        market_train_df.drop(columns=['returnsOpenNextMktres10'], inplace=True)
    except:
        y = None
    
    try:
        time = market_train_df['time']
        market_train_df.drop(columns='time', inplace=True)
    except:
        time = None
    
    try:
        universe = market_train_df['universe']
        market_train_df.drop(columns=['universe'], inplace=True)
    except:
        universe = None
    
    return market_train_df, y, time, universe, (le_assetCode, le_assetName), tfidf_vectorizer, cnt_vectorizer

We then merge market data and news data, and remove the original dataframes to reduce memory footprint.

In [34]:
market_train_df, y, time, universe, le, tfidf_vectorizer, cnt_vectorizer = get_xy(market_train_df, news_train_df)

del news_train_df
gc.collect()

print(market_train_df.shape, y.shape)

ValueError: np.nan is an invalid document, expected byte or unicode string.

Determine what features to use for training and normalized training data: (No need to normalize data for tree based model)

In [29]:
feature_cols = market_train_df.columns.tolist()
# feature_cols = [c for c in X.columns if c not in ['time', 'universe', 'returnsOpenNextMktres10']]

# X = X[feature_cols]

# Normalize X values
# mins = np.nanmin(X[X != np.inf], axis=0)
# maxs = np.nanmax(X[X != np.inf], axis=0)
# rng = maxs - mins
# X = 1 - ((maxs - X) / rng)

In [None]:
market_train_df.describe()

Create training and validation datasets and add time and universe to validation dataset in order to calculate custom scores later. 

**XGBClassifier**

In [None]:
# up = y >= 0
# up = up.values
# r = y.values
# u = universe
# day = time.dt.date

# Scaling of X values
# It is good to keep these scaling values for later
# mins = np.min(X, axis=0)
# maxs = np.max(X, axis=0)
# rng = maxs - mins
# X = 1 - ((maxs - X) / rng)

# X_train, X_test, up_train, up_test, r_train, r_test, u_train, u_test, d_train, d_test\
# = train_test_split(X, up, r, u, day, test_size=0.25, random_state=99)

In [None]:
# Create XGB Classifier Model and fit to data
# from xgboost import XGBClassifier
# import time
# from sklearn.metrics import accuracy_score

# xgb_model = XGBClassifier(n_jobs=4,n_estimators=200,max_depth=8,eta=0.1)

# t = time.time()
# print('Training XGB Model')
# xgb_model.fit(X_train, up_train)
# print(f'Done, time = {time.time() - t}')

# accuracy_score(xgb_model.predict(X_test), up_test)

In [None]:
# Print XGB UP Confidence Graph
# confidence_test = xgb_model.predict_proba(X_test)[:,1]*2 -1
# plt.hist(confidence_test, bins='auto')
# plt.title("XGB predicted confidence")
# plt.show()

In [None]:
# Calculate Final Score Metric for XGB
# r_test = r_test.clip(-1,1) # get rid of outliers. Where do they come from??
# x_t_i = confidence_test * r_test * u_test
# data = {'day' : d_test, 'x_t_i' : x_t_i}
# df = pd.DataFrame(data)
# x_t = df.groupby('day').sum().values.flatten()
# mean = np.mean(x_t)
# std = np.std(x_t)
# score_test = mean / std
# print(f'XGBoost Up score: {score_test}')

In [None]:
# Feature Importance
# plt.figure(num=None, figsize=(10, 10), dpi=80, facecolor='w', edgecolor='k')
# plt.bar(range(len(xgb_model.feature_importances_)), xgb_model.feature_importances_)
# plt.title("XGB Feature Importance")
# plt.xticks(range(len(xgb_model.feature_importances_)), feature_cols, rotation='vertical');

**CatBoost Classifier**

In [None]:
# cat_model = CatBoostClassifier(thread_count=4, n_estimators=200, max_depth=10, 
#                                eta=0.1, loss_function='Logloss' , verbose=10)

# t = time.time()
# print('Fitting CatBoost Model')
# cat_model.fit(X_train, up_train)
# print(f'cat Done, time = {time.time() - t}')

In [None]:
# from sklearn.metrics import accuracy_score
# catconfidence_test = cat_model.predict_proba(X_test)[:,1]*2 -1

# print(accuracy_score(catconfidence_test>0,up_test))
# plt.hist(catconfidence_test, bins='auto')
# plt.title("CatBoost predicted confidence")
# plt.show()

In [None]:
# x_t_icat = catconfidence_test * r_test * u_test
# data2 = {'day' : d_test, 'x_t_icat' : x_t_icat}
# df2 = pd.DataFrame(data2)
# x_tcat = df2.groupby('day').sum().values.flatten()
# mean = np.mean(x_tcat)
# std = np.std(x_tcat)
# score_testcat = mean / std
# print(f'CatBoost score: {score_testcat}')

In [None]:
# # free up memory
# del X_train, X_test, up_train, up_test, r_train, r_test, u_train, u_test, d_train, d_test
# gc.collect()

**LightGBM Regressor**

In [None]:
# train_index, test_index = train_test_split(market_train_df.index, test_size=0.2)

Use LightGBM to build a regression model:

In [None]:
def score_function(preds, valid_data):
    time = valid_data.params['time']
    universe = valid_data.params['universe']
    labels = valid_data.get_label()

    x_t = preds * labels * universe
    x_t_sum = x_t.groupby(time).sum()
    score = x_t_sum.mean() / x_t_sum.std()
    
    return 'score', score, True

def evaluate_model(X, y, train_index, test_index, params):
    data_train = lgb.Dataset(X.loc[train_index].values, y.loc[train_index], feature_name=feature_cols, free_raw_data=False)
    data_test = lgb.Dataset(X.loc[test_index].values, y.loc[test_index], feature_name=feature_cols, free_raw_data=False)
    data_test.params = {
        'time': time.loc[test_index].factorize()[0],
        'universe': universe.loc[test_index].values
    }
    evals_result = {}
    
    model = lgb.train(params, train_set=data_train, num_boost_round=2000,
                      valid_sets=(data_test,), valid_names=('test',),
                      verbose_eval=50, early_stopping_rounds=100,
                      feval=score_function, evals_result=evals_result)

    df_result = pd.DataFrame(evals_result['test'])
    
    return df_result, model

#### Hyperparameter tuning

In [None]:
def run_bayesian_optimization():
    # optimize params in these ranges
    spaces = [
        (0.05, 0.22), #learning_rate
        (800, 2000), #num_leaves
        (500, 3000), #min_data_in_leaf
        (6, 15), # max_depth
        (0.5, 1.0), # bagging_fraction
        (1, 5), # bagging_freq
        (0.5, 1.0), # feature_fraction
        (200, 800), #max_bin
        (0.3, 0.9), # lambda_l1
        (0.3, 0.9) # lambda_l2
    ]

    def f(x):
        lgb_param = {
            'boosting': 'gbdt', 
            'objective': 'regression_l1',
            'metric': 'None',
            'learning_rate': x[0],
            'num_leaves': x[1],
            'min_data_in_leaf': x[2],
            'max_depth': x[3],
            'bagging_fraction': x[4],
            'bagging_freq': x[5],
            'feature_fraction': x[6],
            'max_bin': x[7],
            'lambda_l1': x[8],
            'lambda_l2': x[9],
            'seed': 2018
        }

        df_result, model = evaluate_model(market_train_df, y, train_index,test_index, lgb_param)

        return -df_result['score'].values[-1]

    # run optimization
    from skopt import gp_minimize
    res = gp_minimize(
        f, spaces,
        acq_func="EI",
        n_calls=15)

    # print tuned params
    print(res.x)

    # plot tuning process
    from skopt.plots import plot_convergence
    plot_convergence(res)

In [None]:
# run_bayesian_optimization()

In [None]:
lgb_params = {
    'boosting': 'gbdt', 
    'objective': 'regression_l1',
    'metric': 'None',
    'learning_rate': 0.22,
    'num_leaves': 2000,
    'min_data_in_leaf': 500,
    'max_depth': 15,
    'bagging_fraction': 1.0,
    'bagging_freq': 1,
    'feature_fraction': 1.0,
    'max_bin': 200,
    'lambda_l1': 0.30,
    'lambda_l2': 0.90,
    'seed': 2018
}

In [None]:
# df_result, model = evaluate_model(market_train_df, y, train_index, test_index, lgb_params)

In [None]:
# ax = df_result.plot(figsize=(12, 8))
# ax.scatter(df_result['score'].idxmax(), df_result['score'].max())
# ax.set_xlabel("iteration")
# ax.set_ylabel("score")

In [None]:
# num_boost_round, valid_score = df_result['score'].idxmax()+1, df_result['score'].max()
# print(lgb_params)
# print(f'Best score was {valid_score:.5f} on round {num_boost_round}')

We will use this optimized hyperparameter to train the full model later. 

In [None]:
# fig, ax = plt.subplots(1, 2, figsize=(16, 20))
# lgb.plot_importance(model, ax=ax[0], title='Feature Importance (split)', xlabel='Feature importance', ylabel='Features',
#                     importance_type='split', max_num_features=114)
# lgb.plot_importance(model, ax=ax[1], title='Feature Importance (gain)', xlabel='Feature importance', ylabel='Features',
#                     importance_type='gain', max_num_features=114)
# fig.tight_layout()

In [None]:
# lgb.plot_tree(model, figsize=(16, 16))

In [None]:
# free up memory
del time, universe
gc.collect()

## Train Full Model

#### Cat Boost Classifier

In [None]:
# # Final Model
# catModel = CatBoostClassifier(thread_count=4, n_estimators=200, max_depth=10, eta=0.1, loss_function='Logloss' , verbose=10)

# t = time.time()
# print('Training Model')
# catModel.fit(market_train_df, up)
# print(f'Done, time = {time.time() - t}')

# del up
# gc.collect()

# # data_train_full = lgb.Dataset(market_train_df, y, feature_name=feature_cols)
# del market_train_df, y
# gc.collect()
# # lgbModel = lgb.train(lgb_params, train_set=data_train_full, num_boost_round=num_boost_round)
# # del data_train_full
# # gc.collect()

In [None]:
# def make_predictions(market_obs_df, news_obs_df, predictions_template_df, le):
#     X, y, time, universe, le, tfidf_vectorizer, cnt_vectorizer = get_xy(market_obs_df, news_obs_df, le,
#                                                                         tfidf_vectorizer, cnt_vectorizer) 
    
#     catPrediction = np.clip(catModel.predict_proba(X)[:,1]*2 -1, -1, 1)
# #     lgbPrediction = np.clip(lgbModel.predict(X), -1, 1)
#     
# #     prediction = (catPrediction + lgbPrediction) / 2
#     prediction = catPrediction
#     predictions_template_df.confidenceValue = prediction

In [None]:
# days = env.get_prediction_days()

# for (market_obs_df, news_obs_df, predictions_template_df) in days:
#     make_predictions(market_obs_df, news_obs_df, predictions_template_df, le)
#     env.predict(predictions_template_df)

In [None]:
# env.write_submission_file()

**LightGBM Regressor**

In [None]:
num_boost_round = 1500

In [None]:
train_index, test_index = train_test_split(market_train_df.index, test_size=0.0)

In [None]:
data_train_full = lgb.Dataset(market_train_df.loc[train_index].values, y.loc[train_index], feature_name=feature_cols, free_raw_data=False)
del market_train_df, y
gc.collect()
model = lgb.train(lgb_params, train_set=data_train_full, num_boost_round=num_boost_round)
del data_train_full
gc.collect()

In [None]:
def make_predictions(market_obs_df, news_obs_df, predictions_template_df, le, tfidf_vectorizer, cnt_vectorizer):
    X, y, time, universe, le, tfidf_vectorizer, cnt_vectorizer = get_xy(market_obs_df, news_obs_df, le, 
                                                                        tfidf_vectorizer, cnt_vectorizer)
    # X_new = X[feature_cols].values
    # X_new = 1 - ((maxs - X_new) / rng)
    
    predictions_template_df.confidenceValue = np.clip(model.predict(X), -1, 1)

In [None]:
days = env.get_prediction_days()

for (market_obs_df, news_obs_df, predictions_template_df) in days:
    make_predictions(market_obs_df, news_obs_df, predictions_template_df, le, tfidf_vectorizer, cnt_vectorizer)
    env.predict(predictions_template_df)

In [None]:
env.write_submission_file()