# 1. Introduction

I borrowed ideas, and sometimes copied code directly, from the following kernels:

EDA, outliers: https://www.kaggle.com/artgor/eda-feature-engineering-and-everything

NN: https://www.kaggle.com/christofhenkel/market-data-nn-baseline#

LSTM: https://www.kaggle.com/pablocastilla, https://www.kaggle.com/sergeykalutsky/lstm-model-on-market-data#,  https://www.kaggle.com/ashkaan/lstm-baseline# 

News processing: https://www.kaggle.com/bguberfain/a-simple-model-using-the-market-and-news-data# 

**Disclaimer:** currently the model's performance is not perfect.

**ToDo:**
 
1. Experiment with resampling news over different periods, e.g. 10 days. Currently many market rows are joined with **empty news**.
1. Train on random time windows instead of sampling single records.
1. Try technical indicators on market data: MACD, RSI etc. 
1. Work with residuals instead of raw data


In [None]:
#####################################
# Libraries
#####################################
# Common libs
import pandas as pd
import numpy as np
import sys
import os
import os.path
import random
from pathlib import Path
from time import time
from itertools import chain

# Image processing
import imageio
import skimage
import skimage.io
import skimage.transform
#from skimage.transform import rescale, resize, downscale_local_mean

# Charts
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn as sns


# ML
import scipy
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
#from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
#from sklearn.impute import SimpleImputer
from sklearn.preprocessing import QuantileTransformer,StandardScaler, MinMaxScaler,OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
#from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.sequence import TimeseriesGenerator
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten, MaxPool2D, Dropout, BatchNormalization, LSTM, Embedding
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, ReduceLROnPlateau
from keras.utils import to_categorical
import tensorflow

#####################################
# Settings
#####################################
# Chart look and feel
plt.style.use('seaborn')

# Fixed seeds for NumPy and TensorFlow so repeated runs are comparable
np.random.seed(42)
tensorflow.set_random_seed(42)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it
# here presumably only affects child processes - confirm this is intended.
os.environ['PYTHONHASHSEED'] = '42'

# Make printed data frames easier to read: 4 decimals with thousands separator,
# up to 100 columns, 200 characters per line
pd.set_option('display.float_format', '{:,.4f}'.format)
pd.options.display.max_columns = 100
pd.options.display.width = 200

# Show which input data sets are available to the kernel
print(os.listdir("../input"))

In [None]:
# This competition settings
# Create the competition environment; the training data is requested from it below.
# NOTE(review): the kaggle.competitions module is presumably only available inside
# the Kaggle kernel runtime - confirm before running elsewhere.
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()

In [None]:
# Read the data
# The environment returns the market frame and the news frame as a pair
market, news = env.get_training_data()

# Cast the time column of both frames to the same dtype so they can be joined on it.
# NOTE(review): 'datetime64[D, UTC]' is not a standard dtype string - presumably it is
# meant to truncate timestamps to whole days in UTC; confirm what the installed
# pandas version actually does with it.
market['time'] = market['time'].astype('datetime64[D, UTC]')
#market.set_index(['time', 'assetCode'], inplace=True, drop=False)
news['time'] = news['time'].astype('datetime64[D, UTC]')
#news.set_index(['time', 'assetCode'],inplace=True, drop=False)

# 2. Market data EDA



## General view of market data

In [None]:
# Peek at the last rows of the market data
market.tail()

In [None]:
# Look at column types and non-null counts of the market data
market.info()

In [None]:
# Look at min-max, mean, std and quartiles of the numeric market columns
market.describe()

In [None]:
# How many total records and assets are in the data
# nunique() counts the asset names that actually occur; it works for any column dtype
# and does not depend on how unique() treats unused categories.
nassets = market.assetName.nunique()
# count() only counts rows where close is not missing
nrows = market.close.count()
print("Total count: %d records of %d assets" % (nrows, nassets))

## Look at label values


In [None]:
# Raw label values in row order, to eyeball their range and spot outliers
label_values = market['returnsOpenNextMktres10']
label_axes = label_values.plot(figsize=(12, 5))
label_axes.set_title('Label values: returnsOpenNextMktres10')
label_axes.set_xlabel('Observation no')
label_axes.set_ylabel('returnsOpenNextMktres10')
plt.show()

# Summary statistics of the label, including the 1% and 99% percentiles
label_values.describe(percentiles=[0.01, 0.99])
#market.returnsOpenNextMktres10.describe()


As we can see, most of the labels lie between -0.2 and 0.2, and there are outliers.

In [None]:
# Distribution of the label; values are clipped to [-1, 1] first so that
# outliers do not stretch the x-axis
sns.distplot(market.returnsOpenNextMktres10.clip(-1,1))
plt.show()

## Price and volume chart of random asset.

In [None]:
def plot_random_asset(market):
    """
    Plot close price, close-open difference and volume of one sampled asset.

    The asset is sampled with a fixed random_state, so the same data always
    shows the same asset.
    @param market: market df with time, assetCode, close, open and volume columns
    """
    # Take the asset code of one sampled market row
    asset = market.assetCode.sample(1, random_state=24).iloc[0]
    # set_index returns a new frame indexed by time, so the caller's df is untouched
    asset_market = market[market['assetCode'] == asset].set_index('time', drop=False)

    # Three stacked charts sharing the time axis
    f, axs = plt.subplots(3, 1, sharex=True, figsize=(12, 8))

    # Close price
    asset_market.close.plot(ax=axs[0])
    axs[0].set_ylabel("Price")

    # Volatility (close-open)
    (asset_market.close - asset_market.open).plot(color='green', ax=axs[1])
    axs[1].set_ylabel("Volatility")

    # Volume
    asset_market.volume.plot(ax=axs[2], color='darkred')
    axs[2].set_ylabel("Volume")

    # Title above all three charts; leave room for it after tight_layout
    f.suptitle("Asset: %s" % asset, fontsize=22)
    plt.tight_layout()
    plt.subplots_adjust(top=0.93)
    plt.show()

plot_random_asset(market)

# 3. News data EDA

## General look

In [None]:
# Peek at the last rows of the news data
news.tail()

In [None]:
# See column types and non-null counts of the news data
news.info()

In [None]:
# Number of news rows. DataFrame.size would be rows * columns, which overstates
# the count by a factor of the column count.
nnews = len(news)
# Number of categories of the assetName column
nassets = len(news.assetName.cat.categories)
print("Total %d news about %d assets" % (nnews, nassets))

## Positivity and negativity
Let's see which attitude prevails in news.

In [None]:
    # Barplot on negative, neutral and positive columns.
    # Each bar is the mean of one sentiment column over all news rows.
    news[['sentimentNegative', 'sentimentNeutral','sentimentPositive']].mean().plot(kind='bar')
    plt.title("News positivity chart")
    plt.show()

The neutral and positive scores are slightly higher on average. So, according to the news, the market looks roughly flat with a slight upward tendency.

# 4.  Preprocess the data
We are going to use a data generator to feed the model. The generator yields data to the model batch by batch. For each batch we do the following steps:
preprocess news and market data separately, then join them and yield the result, so that it reaches the **model.fit_generator** method.

## Split to train, validation and test

We are using indices with time only. Full features and labels will be prepared in generator per batch to save memory.

In [None]:
toy = True

def train_test_val_split(market, sample_size=None):
    """
    Sample market rows after 2009 and split them chronologically.
    Only the time and assetCode columns are kept; the row index still refers
    to the full market df, so features can be looked up later per batch.
    @param market: full market df
    @param sample_size: number of rows to sample; by default 100000 when the
        module-level toy flag is set, otherwise 1000000
    @return: train, validation, test df. Short variant - time and asset columns only
    """
    if sample_size is None:
        sample_size = 100000 if toy else 1000000
    # Work with data after 2009
    market_idx = market.loc[market.time > '2009', ['time', 'assetCode']]
    # sample() raises when asked for more rows than exist, so cap the request
    market_idx = market_idx.sample(min(sample_size, len(market_idx)))
    # Split to train, validation and test.
    # shuffle=False keeps the time order: the latest rows become the test set and
    # the latest 10% of the remaining rows become the validation set.
    # (No random_state is passed: it has no effect without shuffling.)
    market_idx = market_idx.sort_values(by=['time'])
    market_train_idx, market_test_idx = train_test_split(market_idx, shuffle=False)
    market_train_idx, market_val_idx = train_test_split(market_train_idx, test_size=0.1, shuffle=False)
    return(market_train_idx, market_val_idx, market_test_idx)

# Split
market_train_idx, market_val_idx, market_test_idx = train_test_val_split(market)

# Plot train/val/test size. x and y are passed by keyword because newer seaborn
# versions no longer accept them positionally.
sns.barplot(x=['Train', 'Validation', 'Test'],
            y=[market_train_idx.index.size, market_val_idx.index.size, market_test_idx.index.size])
plt.title('Train, validation, test split.')
plt.ylabel('Count')
plt.show()

market_train_idx.tail()

## Market preprocessor
Prepare market batch for generator - scale numeric columns, encode categorical etc.

In [None]:
class MarketPrepro:
    """
    Clean, encode and scale market rows for the model.
    Should be fit on train data and used on all train, validation, test
    """
    # Features
    time_cols=['year', 'week', 'day', 'dayofweek']
    numeric_cols = ['volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1',
                    'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 'returnsClosePrevMktres10',
                    'returnsOpenPrevMktres10']
    feature_cols = ['assetCode_encoded']  + time_cols + numeric_cols

    # Labels
    label_cols = ['returnsOpenNextMktres10']

    def __init__(self):
        self.cats={}
        # Encoder state lives on the instance. A list defined on the class would be
        # shared (and mutated) by every MarketPrepro object.
        # Asset codes in order of first appearance; position + 1 is the code's index.
        self.assetcode_encoded = []
        # Number of distinct train asset codes + 1, set by encode_asset(is_train=True)
        self.assetcode_train_count = 0

    def fit(self, market_train_df):
        """
        Fit the scaler and the asset encoder on given train df.
        The given df is not modified.
        @param market_train_df: train data to fit on
        """
        # Clean bad data. We fit on train dataset and it's ok to remove bad data.
        # fix_train works on its own copy of the rows it keeps.
        market_train_df = self.fix_train(market_train_df)

        # Extract day, week, year from time
        market_train_df = self.prepare_time_cols(market_train_df)

        # Fit for numeric and time
        self.numeric_scaler = StandardScaler()
        self.numeric_scaler.fit(market_train_df[self.numeric_cols + self.time_cols].astype(float))

        # Fit asset encoding
        market_train_df = self.encode_asset(market_train_df, True)

    def fix_train(self, train_df):
        """
        Remove bad data. For train dataset only
        @param train_df: market rows; not modified
        @return: cleaned copy, possibly with fewer rows
        """
        # Remove strange cases with close/open ratio > 2
        max_ratio  = 2
        ratio = (train_df['close'] / train_df['open']).abs()
        # Explicit copy: safe_fix assigns columns in place
        train_df = train_df[ratio <= max_ratio].copy()
        # Fix outliers etc like for test set
        train_df = self.safe_fix(train_df)
        return(train_df)

    def safe_fix(self, df):
        """
        Fill na, fix outliers. Safe for test dataset, no rows removed.
        Note: the numeric columns of df are overwritten in place.
        @param df: market rows with assetCode and the numeric columns
        @return: the same df object with fixed numeric columns
        """
        # Fill nans with the next valid value of the same asset (backfill).
        # NOTE(review): backfill takes values from later rows - confirm that this
        # look-ahead is acceptable for the use case.
        df[self.numeric_cols] = df.groupby('assetCode')[self.numeric_cols].bfill()
        # Whatever is still missing (nothing later to fill from) becomes 0
        df[self.numeric_cols] = df[self.numeric_cols].fillna(0)
        # Fix outliers: clip every column to its own 1%..99% range within this df
        lower = df[self.numeric_cols].quantile(0.01)
        upper = df[self.numeric_cols].quantile(0.99)
        df[self.numeric_cols] = df[self.numeric_cols].clip(lower, upper, axis=1)
        return(df)

    def get_X(self,df):
        """
        Preprocess and return X without y
        Requires fit() to have been called. The given df is not modified.
        @param df: market rows
        @return: df with feature_cols only, same rows as the input
        """
        df = df.copy()
        # Fix bad data without removing rows
        df = self.safe_fix(df)

        # Add day, week, year
        df = self.prepare_time_cols(df)
        # Encode assetCode
        df = self.encode_asset(df, is_train=False)
        # Scale numeric and time features with the scaler fit on train data
        df[self.numeric_cols+self.time_cols] = self.numeric_scaler.transform(df[self.numeric_cols+self.time_cols].astype(float))

        # Return X
        return df[self.feature_cols]

    def get_y(self, df):
        """
        Binary label: 1.0 where the label column is >= 0, else 0.0
        """
        y=(df[self.label_cols] >=0).astype(float)
        return y

    def encode_asset(self, df, is_train):
        """
        Add the assetCode_encoded column: the 1-based position of the asset code in
        the list of codes seen so far, divided by (assetcode_train_count + 1).
        Codes not seen before are appended to the list, also when is_train is False.
        @param df: df with assetCode column; the new column is added in place
        @param is_train: if True, assetcode_train_count is reset from this df
        @return: the same df object
        """
        if is_train:
            self.assetcode_train_count = len(df['assetCode'].unique())+1

        # Plain objects, so a categorical column maps to numbers like any other
        codes = df['assetCode'].astype(object)

        # Register new codes in order of first appearance
        known = set(self.assetcode_encoded)
        for code in codes.unique():
            if code not in known:
                self.assetcode_encoded.append(code)
                known.add(code)

        # One dictionary lookup per row instead of a list scan per row
        position = {code: i + 1 for i, code in enumerate(self.assetcode_encoded)}
        df['assetCode_encoded'] = codes.map(position) / (self.assetcode_train_count + 1)
        return(df)

    def prepare_time_cols(self, df):
        """
        Extract time parts, they are important for time series
        @param df: df with a datetime time column; not modified
        @return: copy of df with year, day, week and dayofweek columns added
        """
        df = df.copy()
        time_parts = df['time'].dt
        df['year'] = time_parts.year
        # Maybe remove month because week of year can handle the same info
        df['day'] = time_parts.day
        # Week of year. Series.dt.week does not exist in newer pandas versions, where
        # isocalendar() provides the same ISO week number; fall back to .week otherwise.
        if hasattr(time_parts, 'isocalendar'):
            df['week'] = time_parts.isocalendar().week.astype('float64')
        else:
            df['week'] = time_parts.week
        df['dayofweek'] = time_parts.dayofweek
        return(df)

# Create instance for global usage
market_prepro = MarketPrepro()
print('market_prepro created')

### News preprocessor
Prepare news batch for generator.
Asset can have many news per day, so group them by asset, day and aggregate. Then normalize numerical values. News aggregation part is based on this kernel: https://www.kaggle.com/bguberfain/a-simple-model-using-the-market-and-news-data#

In [None]:
class NewsPrepro:
    """
    Aggregate news by day and asset. Normalize numeric values.
    Neither fit() nor get_X() nor aggregate_news() modifies the df it is given.
    """
    # News column -> list of aggregations computed per (time, assetCode) group
    news_cols_agg = {
        'urgency': ['min', 'count'],
        'takeSequence': ['max'],
        'bodySize': ['min', 'max', 'mean', 'std'],
        'wordCount': ['min', 'max', 'mean', 'std'],
        'sentenceCount': ['min', 'max', 'mean', 'std'],
        'companyCount': ['min', 'max', 'mean', 'std'],
        'marketCommentary': ['min', 'max', 'mean', 'std'],
        'relevance': ['min', 'max', 'mean', 'std'],
        'sentimentNegative': ['min', 'max', 'mean', 'std'],
        'sentimentNeutral': ['min', 'max', 'mean', 'std'],
        'sentimentPositive': ['min', 'max', 'mean', 'std'],
        'sentimentWordCount': ['min', 'max', 'mean', 'std'],
        'noveltyCount12H': ['min', 'max', 'mean', 'std'],
        'noveltyCount24H': ['min', 'max', 'mean', 'std'],
        'noveltyCount3D': ['min', 'max', 'mean', 'std'],
        'noveltyCount5D': ['min', 'max', 'mean', 'std'],
        'noveltyCount7D': ['min', 'max', 'mean', 'std'],
        'volumeCounts12H': ['min', 'max', 'mean', 'std'],
        'volumeCounts24H': ['min', 'max', 'mean', 'std'],
        'volumeCounts3D': ['min', 'max', 'mean', 'std'],
        'volumeCounts5D': ['min', 'max', 'mean', 'std'],
        'volumeCounts7D': ['min', 'max', 'mean', 'std']
            }
    news_cols_numeric = set(news_cols_agg.keys()) - set(['assetCode', 'time'])

    def fit(self, news_train_df):
        """
        Fit the scaler on the aggregated train news and remember the feature columns.
        @param news_train_df: train data to fit on
        """
        # Aggregation (works on its own copy of the needed columns)
        news_train_df_agg = self.aggregate_news(news_train_df)
        news_train_df_agg.fillna(0, inplace=True)

        #Fit scaler
        self.numeric_scaler = StandardScaler()
        self.numeric_scaler.fit(news_train_df_agg)
        # Save news feature cols
        self.feature_cols = list(news_train_df_agg.columns.values)

    def get_X(self, df):
        """
        Aggregate and scale the given news. Requires fit() to have been called
        unless the aggregated result is empty.
        @param df: raw news rows
        @return: df indexed by (time, assetCode) with the scaled aggregated columns
        """
        # Aggregate by time, asset code
        news_df = self.aggregate_news(df)
        # Normalize, fillna etc. Don't remove rows.
        news_df.fillna(0, inplace=True)
        if not news_df.empty:
            news_df_numeric = news_df._get_numeric_data().astype(float)
            news_df[news_df_numeric.columns] = self.numeric_scaler.transform(news_df_numeric)
        return(news_df)

    def aggregate_news(self, df):
        """
        Expand every news row to one row per asset code it mentions, then
        aggregate the numeric columns per (time, assetCode).
        @param df: raw news rows with time, assetCodes and the columns of news_cols_agg
        @return: df indexed by (time, assetCode), columns named <column>_<aggregation>
        """
        # Work on a private copy of just the needed columns, so the caller's df
        # keeps its original assetCodes strings and time values
        news_cols = ['time', 'assetCodes'] + sorted(self.news_cols_agg)
        news_part = df[news_cols].copy()

        # Fix asset codes (str -> list): every quoted token becomes one list item
        news_part['assetCodes'] = news_part['assetCodes'].str.findall(r"'([\w\./]+)'")

        if not news_part.empty:
            # Leave only days in time
            # NOTE(review): 'datetime64[D, UTC]' is not a standard dtype string - confirm
            # that it really truncates to days on the pandas version in use.
            news_part['time'] = news_part['time'].astype('datetime64[D, UTC]')
            # Repeat each row label once per asset code of that row
            assetCodes_index = news_part.index.repeat(news_part['assetCodes'].apply(len))
        else:
            assetCodes_index = news_part.index

        #Expand assetCodes
        assetCodes_expanded = list(chain.from_iterable(news_part['assetCodes']))
        assert len(assetCodes_index) == len(assetCodes_expanded)
        df_assetCodes = pd.DataFrame({'level_0': assetCodes_index, 'assetCode': assetCodes_expanded})

        # Create expanded news (will repeat every assetCodes' row)
        df_expanded = pd.merge(df_assetCodes, news_part, left_on='level_0', right_index=True, suffixes=('', '_old'))

        # Aggregate numerical news features
        df_aggregated = df_expanded.groupby(['time', 'assetCode']).agg(self.news_cols_agg)

        # Flat columns: ('urgency', 'min') -> 'urgency_min'
        df_aggregated.columns = ['_'.join(col).strip() for col in df_aggregated.columns.values]

        return df_aggregated

# Create instance for global usage
news_prepro = NewsPrepro()
print('news_prepro created')

## Join market and news
Generator, tests and submission will call this facade to request joined market&news data.

In [None]:
class JoinedPreprocessor:
    """
    Facade over the market and the news preprocessor: fits both and returns
    market features joined with the aggregated news features.
    """
    def __init__(self, market_prepro, news_prepro):
        """
        @param market_prepro: market preprocessor (fit, get_X, get_y, fix_train, feature_cols)
        @param news_prepro: news preprocessor (fit, get_X, feature_cols)
        """
        self.market_prepro = market_prepro
        self.news_prepro = news_prepro

    def fit(self, market_train_idx, market, news):
        """
        Fit both preprocessors on the train rows.
        @param market_train_idx: df with time column, indexed like market, selecting the train rows
        @param market: full market df
        @param news: full news df
        """
        # Full market rows for the train index labels
        market_train_df = market.loc[market_train_idx.index]
        self.market_prepro.fit(market_train_df)
        # We select news in train time interval
        # NOTE(review): the merge repeats a news row once per train row with the same
        # time - confirm this duplication is intended.
        news_train_df = news.merge(market_train_idx, on=['time'])
        self.news_prepro.fit(news_train_df)

    def get_X(self, market_df, news_df):
        """
        @param market_df: market rows with time and assetCode columns
        @param news_df: raw news rows
        @return: df with market feature columns followed by news feature columns
        """
        # Preprocess market X with the injected preprocessor
        market_X = self.market_prepro.get_X(market_df)
        # Put the join keys back, get_X returns feature columns only
        market_X['time'] = market_df['time']
        market_X['assetCode'] = market_df['assetCode']

        #news_X will have index [time, assetCode]
        news_X = self.news_prepro.get_X(news_df)
        # Join market columns to the news index, which is time, assetCode.
        # Some assets have no news at all, so left join and 0 nans.
        # (right_on must not be given together with right_index.)
        X = market_X.merge(news_X, how='left', left_on=['time', 'assetCode'], right_index=True)

        # Some market data can be without news, fill nans
        X.fillna(0, inplace=True)
        # Return features market + news from joined df
        features = X[self.market_prepro.feature_cols + self.news_prepro.feature_cols]
        return(features)

    def get_y(self, market_df):
        """
        Labels for the given market rows, as produced by the market preprocessor
        """
        return(self.market_prepro.get_y(market_df))

    def get_Xy(self, market_df, news_df):
        """
        @return: tuple (features, labels) for the given market and news rows
        """
        return(self.get_X(market_df, news_df), self.get_y(market_df))

    def fix_train(self, market_df, news_df):
        """
        Clean train data. Here we can remove bad rows
        @return: tuple (cleaned market df, news df unchanged)
        """
        return(self.market_prepro.fix_train(market_df), news_df)
    
# Market and news preprocessor instance
prepro = JoinedPreprocessor(market_prepro, news_prepro)
# fit() looks up the full market rows itself, so the short index df (time, assetCode)
# is enough. Passing full market rows would drag every market column through the
# news merge inside fit().
# NOTE(review): assumes no market column shares a name with an aggregated news column - confirm.
prepro.fit(market_train_idx, market, news)
print('Preprocessor created, it is fit')

### Look at market and news X, y
We can preprocess a sample and check scaled X, y.  At this point there is no look back window for LSTM, it will be calculated in generator later.

In [None]:
def get_merged_Xy(idx):
    """
    Build preprocessed features and labels for the given index rows.
    @param idx: df with time column, indexed like market
    @return: one df with the feature columns followed by the label column
    """
    # Full market rows for the requested index labels
    sample_market = market.loc[idx.index]
    # News with the same time values as the requested rows
    sample_news = news.merge(idx, on=['time'])
    features, labels = prepro.get_Xy(sample_market, sample_news)
    return pd.concat([features, labels], axis=1)

# Look at statistics of preprocessed sample
get_merged_Xy(market_test_idx.sample(10000)).describe()

## Data generator
Keras standard approach to generate batches for **model.fit_generator()**. 
Get market and news data together here.

**Open question:** how best to organize the data?
Ideally we could have one trained model per asset. But there are more than 3K assets - no resources for such a big train. There also are new unseen assets in future data. Still have no clear idea how to handle this.

**Current implementation:** the sampled data is sorted by assetCode and time, the way a human trader looks at candlestick charts one by one.



In [None]:
class JoinedGenerator:
    """
    Keras standard approach to generage batches for model.fit_generator() call.
    """
    def __init__(self, prepro, market, news, index_df):
        """
        @param preprocessor: market and news join preprocessor
        @param market: full loaded market df
        @param news: full loaded news df
        @param index_df: df with assetCode and time of train or validation market data. Batches will be taken from them.
        """
        self.market = market
        self.prepro = prepro
        self.news = news
        self.index_df = index_df

    def flow_lstm(self, batch_size, is_train, look_back, look_back_step):
        """
        Generate batch data for LSTM NN
        Each cycle in a loop we yield a batch for one training step in epoch.
        @param batch_size: number of market rows taken per batch, before windowing
        @param is_train: if True, bad rows are removed from the batch
        @param look_back: window length in rows
        @param look_back_step: take every look_back_step-th row inside a window
        """
        while True:
            # Get market indices of random assets, sorted by assetCode, time.
            batch_index_df = self.get_random_assets_idx(batch_size)

            # Get X, y data for this batch, containing market and news, but without look back yet
            X, y = self.get_batch(batch_index_df, is_train)
            # Add look back data to X, y
            X, y = self.with_look_back(X,y,look_back,look_back_step)
            yield X,y

    def get_random_assets_idx(self, batch_size):
        """
        Get random asset and it's last market data indices.
        Repeat for next asset until we reach batch_size.
        @return: rows of index_df sorted by assetCode, time
        """
        asset_codes = self.index_df['assetCode'].unique().tolist()

        # Collect the per-asset pieces and concatenate once at the end
        parts = []
        collected = 0
        while True:
            # Draw an asset without replacement
            asset = np.random.choice(asset_codes)
            asset_codes.remove(asset)
            # Latest rows of that asset, at most as many as still missing in the batch
            asset_index_df = self.index_df[self.index_df.assetCode == asset].tail(batch_size - collected)
            parts.append(asset_index_df)
            collected += asset_index_df.index.size
            # Stop when the batch is full or no asset is left
            if collected >= batch_size or not asset_codes:
                break

        return pd.concat(parts).sort_values(by=['assetCode', 'time'])

    def get_batch(self, batch_idx, is_train):
        """
        Get batch of market-news data withoutlook back yet.
        @param batch_idx: df with time column, indexed like market
        @param is_train: if True, bad market rows are removed first
        @return: tuple (X, y) as returned by the preprocessor
        """
        market_df = self.market.loc[batch_idx.index]
        # Select subset of news for future merge by assetCode and time.
        # Uses the news and preprocessor given to this generator, not module globals.
        news_df = self.news.merge(batch_idx, on=['time'])
        # Remove bad rows, clean the data. It's ok for train.
        if is_train:
            market_df, news_df = self.prepro.fix_train(market_df, news_df)
        # Join market and news using preprocessor
        X = self.prepro.get_X(market_df, news_df)
        y = self.prepro.get_y(market_df)
        return(X, y)

    # convert an array of values into a dataset matrix
    def with_look_back(self, X, y, look_back, look_back_step):
        """
        Add look back window values to prepare dataset for LSTM
        Window i holds rows i .. i+look_back-1 (every look_back_step-th of them);
        its label is the label of the last row of the window.
        @param X: features df
        @param y: labels df, or None when only X is needed (prediction)
        @return: (X windows, y) as numpy arrays, or only the X windows if y is None.
                 Empty arrays for an empty X.
        """
        # Nothing to window. Without this guard the shrunken step below would be 0
        # and slicing with step 0 raises.
        if len(X) == 0:
            if y is None:
                return np.array([])
            return np.array([]), np.array([])

        X_processed, y_processed = [], []
        # Fix last window in batch, can be not full
        if look_back > len(X):
            look_back = len(X)
            look_back_step = min(look_back_step, look_back)

        for i in range(0,len(X)-look_back+1):
            # Add lookback to X
            x_window = X.values[i:(i+look_back):look_back_step, :]
            X_processed.append(x_window)
            # If input is X only, we'll not output y
            if y is None: continue
            # Add lookback to y
            y_window = y.values[i+look_back-1, :]
            y_processed.append(y_window)
        # Return Xy for train/test or X for prediction
        if(y is not None): return np.array(X_processed), np.array(y_processed)
        else: return np.array(X_processed)
    
# Train data generator instance: batches are drawn from the train index rows
join_generator = JoinedGenerator(prepro, market, news, market_train_idx)

# Validation data generator instance: same preprocessor and data, validation index rows
val_generator = JoinedGenerator(prepro, market, news, market_val_idx)
print('Generators created')

# Manual smoke test: draw one batch and look at its shape
# X,y=next(join_generator.flow_lstm(20,True,10,2))
# print(X.shape)

# 5. Base LSTM model for market and news


## Define the model

In [None]:
class ModelFactory:
    """
    Generate different models. Actually only one of them is used in the kernel, 
    this factory is for experiments when debugging.
    """
    # LSTM look back window size
    look_back=90
    # In windows size look back each look_back_step days
    look_back_step=10

    @staticmethod
    def lstm_128():
        """
        Build and compile a three layer LSTM binary classifier.
        Declared static, so it can be called on the class as well as on an instance.
        @return: compiled Keras model
        """
        model = Sequential()
        # Add an input layer market + news
        input_size = len(market_prepro.feature_cols) + len(news_prepro.feature_cols)
        # input_shape=(timesteps, input features); None leaves the number of timesteps open
        model.add(LSTM(units=128, return_sequences=True, input_shape=(None,input_size)))
        model.add(LSTM(units=64, return_sequences=True ))
        # Last LSTM layer returns only its final output, which feeds the dense layer
        model.add(LSTM(units=32, return_sequences=False))

        # Add an output layer: one sigmoid unit for the binary label
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

        return(model)

model = ModelFactory.lstm_128()
model.summary()


## Train market and news model

In [None]:
weights_file='best_weights.h5'

# We'll stop training if no improvement after some epochs
earlystopper = EarlyStopping(patience=5, verbose=1)

# Save the best model during the traning (weights only; monitor left at the Keras default)
checkpointer = ModelCheckpoint(weights_file
    #,monitor='val_acc'
    ,verbose=1
    ,save_best_only=True
    ,save_weights_only=True)

#reduce_lr = ReduceLROnPlateau(factor=0.2, patience=3, min_lr=0.001)
# min_lr must be below the optimizer's starting learning rate, otherwise the callback
# can never lower it. The model is compiled with optimizer='adam', whose Keras default
# learning rate is 0.001, so the old min_lr=0.001 made this callback a no-op.
reduce_lr = ReduceLROnPlateau(factor=0.1, patience=2, min_lr=1e-5)

# Set fit parameters
# Rule of thumb: steps_per_epoch = TotalTrainingSamples / TrainingBatchSize
#                validation_steps = TotalvalidationSamples / ValidationBatchSize
if toy:
    # Quick run for debugging: few steps and a shorter look back window
    batch_size=1000
    validation_batch_size=1000
    steps_per_epoch=5
    validation_steps=2
    epochs=5
    ModelFactory.look_back=30
    ModelFactory.look_back_step=5
else:
    batch_size=1000
    validation_batch_size=1000
    steps_per_epoch=20
    validation_steps=5
    epochs=20

print(f'Toy:{toy}, epochs:{epochs}, steps per epoch: {steps_per_epoch}, validation steps:{validation_steps}')
print(f'Batch_size:{batch_size}, validation batch size:{validation_batch_size}')

# Fit. Train batches have bad rows removed (is_train=True), validation batches do not.
training = model.fit_generator(join_generator.flow_lstm(batch_size=batch_size 
            , is_train=True 
            , look_back=ModelFactory.look_back 
            , look_back_step=ModelFactory.look_back_step) 
        , epochs=epochs 
        , validation_data=val_generator.flow_lstm(batch_size=validation_batch_size
            , is_train=False
            , look_back=ModelFactory.look_back
            , look_back_step=ModelFactory.look_back_step) 
        , steps_per_epoch=steps_per_epoch 
        , validation_steps=validation_steps 
        , callbacks=[earlystopper, checkpointer, reduce_lr])
# Load best weights saved
model.load_weights(weights_file)

## Evaluate market model

### Loss function by epoch 

In [None]:
# Two charts side by side: loss on the left, accuracy on the right.
# Each entry: subplot position, train metric key, validation metric key,
# chart title, legend labels, y-axis label.
history_panels = [
    (121, 'loss', 'val_loss', "Loss and validation loss", ["Loss", "Validation loss"], "Loss"),
    (122, 'acc', 'val_acc', "Acc and validation acc", ["Acc", "Validation acc"], "Accuracy"),
]

plt.figure(1, figsize=(8,3))
for panel_position, train_key, val_key, panel_title, panel_legend, panel_ylabel in history_panels:
    plt.subplot(panel_position)
    # One point per epoch for the train and the validation metric
    plt.plot(training.history[train_key])
    plt.plot(training.history[val_key])
    plt.title(panel_title)
    plt.legend(panel_legend)
    plt.xlabel("Epoch")
    plt.ylabel(panel_ylabel)
plt.suptitle('Training history', fontsize=16)
plt.show()


### Predict on test data

In [None]:
def predict_on_test():
    """
    Predict on the latest test rows and plot predictions against the true label values.
    """
    # Predict on last test data; look_back extra rows are needed to build the windows
    pred_size=100
    look_back = ModelFactory.look_back
    pred_idx = market_test_idx.tail(pred_size + look_back)
    market_df = market.loc[pred_idx.index]
    news_df = news.merge(pred_idx, on=['time'])
    # Get preprocessed X; labels are taken straight from market_df below
    X_test = prepro.get_X(market_df, news_df)
    # Add there look back rows for LSTM
    X_test = join_generator.with_look_back(X_test, None, look_back=look_back, look_back_step=ModelFactory.look_back_step)

    # Predict: the sigmoid output in [0, 1] is rescaled to [-1, 1]
    y_pred = model.predict(X_test)*2-1

    # Every window is labelled by its last row, so the predictions belong to the
    # last len(y_pred) market rows. Plot only those, to keep both series aligned.
    truth = market_df['returnsOpenNextMktres10'].values[-len(y_pred):]

    # Plot: both series on the left, each one separately on the right
    ax1 = plt.subplot2grid((2, 2), (0, 0), rowspan=2)
    ax1.plot(truth, linestyle='none', marker='.', color='darkblue')
    ax1.plot(y_pred, linestyle='none', marker='.', color='darkorange')
    ax1.legend(["Ground truth","Predicted"])
    ax1.set_title("Both")
    # The x-axis is the position of the observation, not a training epoch
    ax1.set_xlabel("Observation no")
    ax2 = plt.subplot2grid((2, 2), (0, 1), colspan=1,rowspan=1)
    ax2.plot(truth, linestyle='none', marker='.', color='darkblue')
    ax2.set_title("Ground truth")
    ax3 = plt.subplot2grid((2, 2), (1, 1), colspan=1,rowspan=1)
    ax3.plot(y_pred, linestyle='none', marker='.', color='darkorange')
    ax3.set_title("Predicted")
    plt.tight_layout()
    plt.show()

predict_on_test()

### Predict on random asset

In [None]:
def predict_random_asset():
    """
    Get random asset from test set, predict on it, plot ground truth and predicted value
    """
    # Get any asset (fixed random_state: the same asset for the same data)
    asset = market_test_idx['assetCode'].sample(1, random_state=24).values[0]
    # All market rows of that asset. The original index is kept: an index that is also
    # named 'time' would make the merges on the time column ambiguous.
    market_df = market.loc[market.assetCode == asset]
    news_df = news.merge(market_df, on=['time'])

    # Preprocess market and news
    X,y = prepro.get_Xy(market_df, news_df)
    X,y = join_generator.with_look_back(X, y, look_back=ModelFactory.look_back, look_back_step=ModelFactory.look_back_step)

    # Prediction: the sigmoid output in [0, 1] is rescaled to [-1, 1]
    y_pred = model.predict(X)*2-1
    # Every window is labelled by its last row, so the predictions belong to the
    # last len(y_pred) market rows; use their dates as index.
    pred_times = market_df['time'].iloc[-len(y_pred):]
    y_pred = pd.DataFrame(y_pred, index = pred_times.dt.date)

    # Ground truth indexed by time, for plotting only
    truth = market_df.set_index('time')['returnsOpenNextMktres10']

    # Plot
    plt.plot(truth, linestyle='none', marker='.', color='darkblue')
    plt.plot(y_pred, linestyle='none', marker='.', color='darkorange')
    plt.xticks(rotation=45)
    plt.title(asset)
    plt.legend(["Ground truth", "predicted"])
    plt.show()

predict_random_asset()

In [None]:
# def get_score():
#     """
#     Calculation of actual metric that is used to calculate final score
#     @param r: returnsOpenNextMktres10
#     @param u: universe
#     where rti is the 10-day market-adjusted leading return for day t for instrument i, and uti is a 0/1 universe variable (see the data description for details) that controls whether a particular asset is included in scoring on a particular day.    
#     """
#     # Get test sample to calculate score on
#     idx = market_test_idx.sample(1000)
#     market_df = market.loc[idx.index]
#     news_df = news.merge(idx, on=['time'])
    
#     # Prepare X, y
#     X_test, y_test = prepro.get_Xy(market_df, news_df)
#     X_test, y_test = join_generator.with_look_back(X_test, y_test, ModelFactory.look_back, ModelFactory.look_back_step)
#     y_test = pd.DataFrame(y_test)

#     # Accuracy metric
#     confidence = model.predict(X_test)*2-1   
#     look_back=ModelFactory.look_back
#     r=market_df['returnsOpenNextMktres10']#.values[look_back:]
#     u=market_df['universe']#.values[look_back:]
# #     print(df.size)
#     print(len(confidence))
# #     print(r.size())
# #     print(u.size())
#     #print('df: %s, confidence: %s, r: %s, u: %s ' % (df.count(), confidence.count(), r.count(), u.count()))
#     # calculation of actual metric that is used to calculate final score
#     r = r.clip(-1,1) # get rid of outliers. Where do they come from??
#     x_t_i = confidence.values * r * u
#     print(x_t_i.iloc[0])
#     d = (market_df['time'].dt.day).values[look_back:]
# #     print('d.count() = %s, x_t_i count = %s' % (d.count(), x_t_i.count()))
#     data = {'day' : d, 'x_t_i' : x_t_i}
#     df = pd.DataFrame(data)
#     x_t = df.groupby('day').sum().values.flatten()
#     mean = np.mean(x_t)
#     std = np.std(x_t)
#     score = mean / std
#     print(score)
    


In [None]:
def calc_acc():
    """
    Print the sign accuracy of the model on a random test sample and plot
    the distribution of true returns against the predicted confidence.

    Draws 1000 random rows from ``market`` restricted to the index of
    ``market_test_idx``, joins ``news`` to ``market_test_idx`` on 'time',
    runs both through ``prepro.get_Xy`` and the look-back generator, and
    compares the sign of the prediction (rescaled as ``prediction*2-1``)
    with the sign of ``returnsOpenNextMktres10``.

    @return: None. Results are printed and shown as a histogram.
    """
    # Random evaluation sample and the news rows sharing a timestamp with the test index
    sampled_market = market.loc[market_test_idx.index].sample(1000)
    matching_news = news.merge(market_test_idx, on=['time'])

    # Features/targets, then the look-back windows expected by the LSTM
    window = ModelFactory.look_back
    features, targets = prepro.get_Xy(sampled_market, matching_news)
    features, targets = join_generator.with_look_back(
        features,
        targets,
        look_back=window,
        look_back_step=ModelFactory.look_back_step,
    )
    targets = pd.DataFrame(targets)

    # Ground truth returns, skipping the rows consumed by the first window
    # NOTE(review): assumes with_look_back yields one sample per row starting
    # at row look_back-1, so labels and predictions line up - confirm
    true_returns = sampled_market['returnsOpenNextMktres10'].iloc[window - 1:]

    # Model output rescaled as prediction*2-1 to get a signed confidence
    confidence = pd.DataFrame(model.predict(features)) * 2 - 1

    # Accuracy of the predicted direction (sign) only
    sign_accuracy = accuracy_score(true_returns >= 0, confidence >= 0)
    print("Accuracy: %f" % sign_accuracy)
    print('Predictions size: ', len(confidence.values))
    print('y_test size:', len(targets.values))

    # Overlay both distributions; the confidence is what goes into the submission
    plt.hist(true_returns.values, bins='auto', alpha=0.3)
    plt.hist(confidence.values, bins='auto', alpha=0.3, color='darkorange')
    plt.legend(['Ground truth', 'Predicted'])
    plt.xlabel("Confidence")
    plt.ylabel("Count")
    plt.title("predicted confidence")
    plt.show()

# Call accuracy calculation and plot    
calc_acc()

# Submission

In [None]:
def make_predictions(market_obs_df, news_obs_df, predictions_template_df):
    """
    Predict confidence for one day and write it into
    predictions_template_df['confidenceValue'].
    @param market_obs_df: market_obs_df returned from env
    @param news_obs_df: news_obs_df returned from env
    @param predictions_template_df: predictions_template_df returned from env.
    @return: None. predictions_template_df is updated in place instead.
    """
    # Turn the raw observations into model features
    features = prepro.get_X(market_obs_df, news_obs_df)

    # Build look-back windows for the LSTM; targets are unknown at prediction
    # time, so None is passed in their place
    windows = join_generator.with_look_back(
        features,
        None,
        look_back=ModelFactory.look_back,
        look_back_step=ModelFactory.look_back_step,
    )

    # Rescale the model output as prediction*2-1 to get the confidence column
    raw_output = model.predict(windows)
    confidence_df = pd.DataFrame(raw_output * 2 - 1, columns=['confidence'])

    # Column-wise concat aligns on the index; anything left without a value
    # (in any column) is filled with 0
    merged = pd.concat([predictions_template_df, confidence_df], axis=1).fillna(0)
    predictions_template_df.confidenceValue = merged['confidence']

In [None]:
##########################
# Submission code

# Every day's raw inputs are kept so a problematic day can be replayed later
days_saved_data = []

# Bookkeeping consumed by the plotting cell below
predicted_days = []
predicted_times = []
last_predictions_template_df = None

# The environment hands out the data one prediction day at a time
days = env.get_prediction_days()
last_year = None
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    # Keep the inputs for later debugging
    days_saved_data.append((market_obs_df, news_obs_df, predictions_template_df))

    # Timestamp of the first market row identifies the day being predicted
    first_timestamp = market_obs_df.iloc[0].time
    predicted_days.append(first_timestamp.strftime('%Y-%m-%d'))
    time_start = time()

    # Log once per calendar year to show progress
    cur_year = first_timestamp.strftime('%Y')
    if cur_year != last_year:
        print(f'Predicting {cur_year}...')
        last_year = cur_year

    # Fill the template in place, then hand it back to the environment.
    # The env.predict call must not be skipped for any day.
    make_predictions(market_obs_df, news_obs_df, predictions_template_df)
    env.predict(predictions_template_df)

    # Remember the latest template and how long this day took
    last_predictions_template_df = predictions_template_df
    predicted_times.append(time() - time_start)

In [None]:
# Plot execution time per predicted day.
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11 and
# accepts only `data` positionally from 0.12 on, so the positional form fails
# on current seaborn. The keyword form works on old and new versions alike.
sns.barplot(x=np.array(predicted_days), y=np.array(predicted_times))
plt.title("Execution time per day")
plt.xlabel("Day")
plt.ylabel("Execution time, seconds")
plt.show()

# Plot predicted confidence for the last day returned by the environment
last_predictions_template_df.plot(linestyle='none', marker='.', color='darkorange')
plt.title("Predicted confidence for last observed day: %s" % predicted_days[-1])
plt.xlabel("Observation No.")
plt.ylabel("Confidence")
plt.show()

In [None]:
# All days are predicted, so the submission file can be produced now.
# !!! This must run only after the prediction loop has finished.
env.write_submission_file()

# List the files in the working directory whose name contains '.csv'
csv_files = [entry for entry in os.listdir('.') if '.csv' in entry]
print(csv_files)