In [None]:
from kaggle.competitions import twosigmanews
# Create the competition environment; the same `env` object is used later to
# iterate over prediction days and to write the submission file.
env = twosigmanews.make_env()

# Fetch the market and news training frames from the environment.
(market_train_df, news_train_df) = env.get_training_data()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.dates as mdates
# Two panels: 10-day return series on the left, traded volume on the right.
fig, ax = plt.subplots(1, 2, figsize=(20, 5))

# Tick helpers. Only `monthsFmt` is attached to an axis in this cell; the
# locators and `yearsFmt` are created here but not used below.
#TODO: what is locator used for?
years = mdates.YearLocator()   # every year
months = mdates.MonthLocator()  # every month
yearsFmt = mdates.DateFormatter('%Y')
monthsFmt = mdates.DateFormatter('%Y-%m')

# Apple rows with 2014-01-01 <= time <= 2015-01-01 (both bounds inclusive).
is_apple = market_train_df['assetCode'] == 'AAPL.O'
in_window = (market_train_df['time'] >= '2014-01-01') & (market_train_df['time'] <= '2015-01-01')
mkt_apple = market_train_df[is_apple & in_window]

# Left panel: one line per return column, labelled with the column name.
for return_col in ('returnsClosePrevMktres10', 'returnsOpenPrevMktres10', 'returnsOpenNextMktres10'):
    ax[0].plot(mkt_apple['time'], mkt_apple[return_col], label=return_col)

# Right panel: traded volume.
ax[1].plot(mkt_apple['time'], mkt_apple['volume'], label='volume')

# Shared cosmetics, applied panel by panel.
for panel, panel_title in zip(ax, ('Apple Shares', 'Apple Shares Volume')):
    panel.xaxis.set_major_formatter(monthsFmt)
    panel.set_xlabel('Date')
    panel.legend(loc='best')
    panel.set_title(panel_title)

# Last expression of the cell: summary statistics shown as the cell output.
mkt_apple['returnsOpenPrevMktres10'].describe()


In [None]:
from wordcloud import WordCloud
from nltk.corpus import stopwords
# Apple-related news in the 2014 window. The match is a plain substring test
# (regex=False), so any asset-code string that contains 'AAPL.O' qualifies.
apple_news = news_train_df[
    news_train_df['assetCodes'].str.contains('AAPL.O', regex=False)
    & (news_train_df['time'] >= '2014-01-01')
    & (news_train_df['time'] <= '2015-01-01')
]

# Stories per (calendar day, sentiment class). `assign` builds a new frame, so
# the filtered slice is never written to (the original assigned into the slice).
# Days on which a class has no stories are filled with 0 instead of NaN, and
# all three classes are forced to exist as columns.
apple_news_sent = (
    apple_news.assign(time=pd.to_datetime(apple_news['time']).dt.date)
    .groupby(['time', 'sentimentClass'])['sourceTimestamp'].count()
    .unstack()
    .reindex(columns=[-1, 0, 1])
    .fillna(0)
)

fig, ax = plt.subplots(1, 2, figsize=(20, 5))

# Bars are scaled by a constant so they stay visible next to the price lines;
# this does not change the ratio between the classes.
# Fix: every segment now starts where the previous *scaled* segments end. The
# original used unscaled, non-cumulative bottoms, so the segments overlapped.
BAR_SCALE = 5
stack_bottom = np.zeros(len(apple_news_sent))
for sentiment, bar_label, bar_color in ((-1, 'Negative', 'red'), (0, 'Neutral', 'blue'), (1, 'Positive', 'green')):
    bar_height = BAR_SCALE * apple_news_sent[sentiment].values
    ax[0].bar(apple_news_sent.index, bar_height, label=bar_label, color=bar_color, bottom=stack_bottom)
    stack_bottom = stack_bottom + bar_height

# Overlay prices from `mkt_apple`, which is defined in an earlier cell.
ax[0].plot(mkt_apple['time'], mkt_apple['close'], label='close price')
ax[0].plot(mkt_apple['time'], mkt_apple['open'], label='open price')
ax[0].xaxis.set_major_formatter(monthsFmt)
# Positional limits work across matplotlib versions (ymin/ymax are keyword aliases).
ax[0].set_ylim(0, 1000)
ax[0].legend(loc='best')
ax[0].set_title('Apple News')
ax[0].set_xlabel('Time')

# Word cloud over the lower-cased headlines of the same filtered news rows
# (the filter is no longer recomputed).
stop = set(stopwords.words('english'))
text = ' '.join(apple_news['headline'].str.lower().values[-1000000:])
wordcloud = WordCloud(max_font_size=None, stopwords=stop, background_color='white',
                      width=1200, height=1000).generate(text)
ax[1].imshow(wordcloud)
ax[1].set_title('Top words in headline')
plt.show()


In [None]:
#code to test stationarity of apple share returns for next 10 days
from statsmodels.tsa.stattools import adfuller
fig, ax = plt.subplots(1, 2, figsize=(20, 5))


def test_stationarity(time_series, ax, tag):
    """Plot rolling statistics of a series and print an augmented Dickey-Fuller report.

    Parameters
    ----------
    time_series : series to examine; it is plotted as-is and passed to ``adfuller``.
    ax : matplotlib axes that receives the three curves.
    tag : label for the raw series, also used as the axes title.

    The rolling mean and standard deviation use a 250-observation window
    (roughly one year of daily values). If the printed test statistic is below
    a critical value, the null hypothesis can be rejected and the series is
    treated as stationary. Relies on the notebook-level ``monthsFmt`` formatter.
    """
    window = time_series.rolling(window=250)
    # Drawing order is mean, std, raw series (this fixes the colour assignment).
    for curve, curve_label in ((window.mean(), 'mean'), (window.std(), 'std'), (time_series, tag)):
        ax.plot(curve, label=curve_label)
    ax.xaxis.set_major_formatter(monthsFmt)
    ax.legend(loc='best')
    ax.set(title=tag, xlabel='Time')

    print ('Results of Dickey-Fuller Test:')
    adf_result = adfuller(time_series, autolag='AIC')
    critical_values = adf_result[4]
    # First four entries of the result, followed by one row per critical value.
    row_labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    row_values = list(adf_result[0:4])
    for level in critical_values:
        row_labels.append('Critical Value (%s)' % level)
        row_values.append(critical_values[level])
    print (pd.Series(row_values, index=row_labels))
# Full Apple history, indexed by `time` so the series can be plotted directly.
mkt_apple = market_train_df.loc[market_train_df['assetCode'] == 'AAPL.O'].set_index('time')

# Left panel: next-10-day return column; right panel: open price.
stationarity_checks = (
    (ax[0], 'returnsOpenNextMktres10', 'returnsOpenNextMktres10'),
    (ax[1], 'open', 'open price'),
)
for panel, column_name, panel_tag in stationarity_checks:
    test_stationarity(mkt_apple[column_name], panel, panel_tag)

In [None]:
# Make the open price stationary by subtracting an exponentially weighted
# moving average (half-life 12) from the log of the price.
apple_rows = market_train_df['assetCode'] == 'AAPL.O'
mkt_apple = market_train_df[apple_rows].set_index('time')

# The series is log-transformed here; the untransformed alternative would be
# mkt_apple['open'] itself.
ts_log = np.log(mkt_apple['open'])
expwighted_avg = ts_log.ewm(halflife=12).mean()
ts_log_ewma_diff = ts_log - expwighted_avg

fig, ax = plt.subplots(1, 2, figsize=(20, 5))
# Left: log price with its EWMA.
for curve, curve_label in ((ts_log, 'open price'), (expwighted_avg, 'EWMA')):
    ax[0].plot(curve, label=curve_label)
ax[0].legend(loc='best')
ax[0].set(title='open price', xlabel='Time')
# Right: stationarity check of the residual.
test_stationarity(ts_log_ewma_diff, ax[1], 'diff of open and ewma')

In [None]:
# First-order difference of the log price; the NaN introduced by the shift is dropped.
mkt_apple_shift = (ts_log - ts_log.shift()).dropna()
fig, ax = plt.subplots(1, 1, figsize=(20, 5))
test_stationarity(mkt_apple_shift, ax, 'stationarity with differencing')

In [None]:
#applying the ARIMA model, first determine p and q values using the autocorrelation and partial autocorrelation function
from statsmodels.tsa.stattools import acf, pacf
lag_acf = acf(mkt_apple_shift, nlags=20)
lag_pacf = pacf(mkt_apple_shift, nlags=20, method='ols')

# Dashed guide lines are drawn at 0 and at +/- 1.96 / sqrt(N).
band = 1.96 / np.sqrt(len(mkt_apple_shift))

fig, ax = plt.subplots(1, 2, figsize=(20, 5))
correlograms = (
    (ax[0], lag_acf, 'Autocorrelation Function'),
    (ax[1], lag_pacf, 'Partial Autocorrelation Function'),
)
for panel, lag_values, panel_title in correlograms:
    panel.plot(lag_values)
    for guide in (0, -band, band):
        panel.axhline(y=guide, linestyle='--', color='gray')
    panel.set_title(panel_title)

In [None]:
from statsmodels.tsa.arima_model import ARIMA
# Fit three ARIMA variants on the log price and compare each fit against the
# differenced series. Orders are (p, d, q).
fig, ax = plt.subplots(2, 2, figsize=(20, 10))
arima_variants = (
    ((1, 1, 0), ax[0][0]),  # AR model
    ((0, 1, 1), ax[0][1]),  # MA model
    ((1, 1, 1), ax[1][0]),  # combined AR + MA model
)
for order, panel in arima_variants:
    model = ARIMA(ts_log, order=order)
    results_AR = model.fit(disp=-1)
    panel.plot(mkt_apple_shift)
    panel.plot(results_AR.fittedvalues, color='red')
    panel.set_title('RSS: %.4f' % sum((results_AR.fittedvalues - mkt_apple_shift) ** 2))

# Back to the original scale, using the last fit above (order (1, 1, 1)):
# cumulate the fitted differences, add the first log price, then exponentiate.
predictions_ARIMA_diff = pd.Series(results_AR.fittedvalues, copy=True)
predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum()
# `.iloc[0]` replaces the removed `.ix[0]` indexer; the first element is wanted by position.
predictions_ARIMA_log = pd.Series(ts_log.iloc[0], index=ts_log.index)
predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum, fill_value=0)
predictions_ARIMA = np.exp(predictions_ARIMA_log)

ax[1][1].plot(mkt_apple['open'])
ax[1][1].plot(predictions_ARIMA)
ax[1][1].set_title('RMSE: %.4f' % np.sqrt(sum((predictions_ARIMA - mkt_apple['open']) ** 2) / len(mkt_apple['open'])))

In [None]:
# Closing price across all assets: one line per quantile, computed per `time` value.
fig, ax = plt.subplots(1, 1, figsize=(20, 5))
close_by_time = market_train_df.groupby('time')['close']
for quant in (0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95):
    ax.plot(close_by_time.quantile(quant), label='quantile%s' % quant)
ax.legend(loc='best')
ax.set_title('closing price')


In [None]:
# Number of missing values in each column of the market training frame.
market_train_df.isna().sum()

In [None]:
# Close minus open for every row, then its spread (std) and minimum per `time` value.
market_train_df['price_diff'] = market_train_df['close'].sub(market_train_df['open'])
grouped = market_train_df.groupby('time')['price_diff'].agg(['std', 'min'])

fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.plot(grouped['std'])
ax.set_ylabel("std")
ax.set_title("SD Price change in a day")

In [None]:
# Some rows have an implausible close/open ratio; repair them with the
# per-asset mean price.
market_train_df['close_to_open'] = market_train_df['close'] / market_train_df['open']
market_train_df['asset_mean_open'] = market_train_df.groupby('assetName')['open'].transform('mean')
market_train_df['asset_mean_close'] = market_train_df.groupby('assetName')['close'].transform('mean')

# A row is suspect when the price at least doubled or at least halved within the row.
suspect = (market_train_df['close_to_open'] >= 2) | (market_train_df['close_to_open'] <= 0.5)

# Replace whichever of open/close lies further from the asset's own mean.
# This is the same rule the two row-by-row loops applied, done as one masked assignment.
open_gap = (market_train_df['asset_mean_open'] - market_train_df['open']).abs()
close_gap = (market_train_df['asset_mean_close'] - market_train_df['close']).abs()
open_is_worse = open_gap > close_gap
fix_open = suspect & open_is_worse
fix_close = suspect & ~open_is_worse
market_train_df.loc[fix_open, 'open'] = market_train_df.loc[fix_open, 'asset_mean_open']
market_train_df.loc[fix_close, 'close'] = market_train_df.loc[fix_close, 'asset_mean_close']
# NOTE(review): 'close_to_open' is not recomputed after the repair (same as
# before this change) — confirm whether the stale ratio is intended.

# Re-plot the spread of close - open after the repair.
market_train_df['price_diff'] = market_train_df['close'] - market_train_df['open']
grouped = market_train_df.groupby('time')['price_diff'].agg(['std', 'min'])
fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.plot(grouped['std'])
ax.set(ylabel="std", title="SD of Price change in a day")

In [None]:
# Mean of each return column per `time` value, for rows from 2011-01-01 onwards.
fig, ax = plt.subplots(1, 1, figsize=(20, 5))
market_train_df_sampled = market_train_df[market_train_df['time'] >= '2011-01-01']
sampled_by_time = market_train_df_sampled.groupby('time')
trend_columns = (
    'returnsClosePrevRaw1',
    'returnsClosePrevMktres1',
    'returnsClosePrevRaw10',
    'returnsClosePrevMktres10',
    'returnsOpenNextMktres10',
)
for col in trend_columns:
    grouped = sampled_by_time[col].mean()
    ax.plot(grouped, label=col)
ax.legend(loc='best')
ax.set_title('Trend of Mean Values')


In [None]:
# Left: number of distinct asset codes for every `time` value.
asset_training_by_day = market_train_df.groupby('time')['assetCode'].nunique()
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
ax[0].plot(asset_training_by_day)
# The series has one point per `time` value, so the title says "per day"
# (it previously read "per year").
ax[0].set(ylabel="Unique Assets", title="Unique Assets per day")

# Right: the ten asset codes with the largest total traded volume.
assets_by_vol = market_train_df.groupby('assetCode')['volume'].sum()
assets_by_vol = assets_by_vol.sort_values(ascending=False)[0:10]
ax[1].pie(assets_by_vol.values, labels=assets_by_vol.index, autopct='%1.1f%%')
ax[1].set(title="Highest Trading volumes per asset")

In [None]:
# Rows whose `returnsOpenNextMktres10` is exactly zero (no growth, no decrease).
is_flat = market_train_df['returnsOpenNextMktres10'] == 0
print (len(market_train_df[is_flat]))

In [None]:
# Show the NaN count of every column that has any, then fill each Mktres
# return column from the raw return column at the same list position.
null_columns = market_train_df.columns[market_train_df.isnull().any()]
print(market_train_df[null_columns].isnull().sum())

column_market = ['returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
                 'returnsClosePrevMktres10', 'returnsOpenPrevMktres10']
column_raw = ['returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
              'returnsClosePrevRaw10', 'returnsOpenPrevRaw10']
for mktres_col, raw_col in zip(column_market, column_raw):
    market_train_df[mktres_col] = market_train_df[mktres_col].fillna(market_train_df[raw_col])

In [None]:
# Histogram of a 10,000-row random sample for each Mktres return column.
fig, ax = plt.subplots(2, 2, figsize=(20, 5))
for i, col in enumerate(column_market):
    panel = ax[i // 2][i % 2]
    panel.hist(market_train_df[col].sample(10000).values)
    panel.set(title=col)

# Remove outliers from the return columns: keep a row only if every return
# column lies in [-2, 2].
# Fix: the original assigned each filtered frame to a misspelled name
# ('makrket_train_df') and restarted from the unfiltered frame on every
# iteration, so no rows were ever removed. The conditions are now combined
# into one mask and applied to market_train_df.
column_return = column_market + column_raw + ['returnsOpenNextMktres10']
within_range = pd.Series(True, index=market_train_df.index)
for col in column_return:
    within_range &= (market_train_df[col] >= -2) & (market_train_df[col] <= 2)
market_train_df = market_train_df.loc[within_range]

#Remove strange data: Here we remove data with unknown asset name or asset codes with strange behavior.
#For more details, see here: https://www.kaggle.com/nareyko/market-return-estimation-and-bad-data-detection
market_train_df = market_train_df[~market_train_df['assetCode'].isin(['PGN.N','EBRYY.OB'])]

In [None]:
# Function to remove outliers from news data
def remove_outliers(data_frame, column_list, low=0.02, high=0.98):
    """Clip each listed column to its own [low, high] quantile range.

    Parameters
    ----------
    data_frame : frame to process; its columns are overwritten in place.
    column_list : names of the columns to clip.
    low, high : quantile levels used as the lower and upper clipping bounds.

    Returns
    -------
    The same ``data_frame`` object that was passed in.
    """
    for column in column_list:
        bounds = data_frame[column].quantile([low, high])
        data_frame[column] = data_frame[column].clip(lower=bounds[low], upper=bounds[high])
    return data_frame
# News columns whose values are clipped to their 2%-98% quantile range.
columns_outlier = [
    'takeSequence', 'bodySize', 'sentenceCount', 'wordCount',
    'sentimentWordCount', 'firstMentionSentence',
    'noveltyCount12H', 'noveltyCount24H', 'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D',
    'volumeCounts12H', 'volumeCounts24H', 'volumeCounts3D', 'volumeCounts5D', 'volumeCounts7D',
]
print('Clipping news outliers ...')
news_train_df = remove_outliers(news_train_df, columns_outlier)

In [None]:
news_train_df.head()

In [None]:
# News columns that are carried into feature preparation.
columns_news = [
    'firstCreated', 'relevance', 'sentimentClass',
    'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
    'noveltyCount24H', 'noveltyCount7D', 'volumeCounts24H', 'volumeCounts7D',
    'assetCodes', 'sourceTimestamp', 'assetName', 'audiences',
    'urgency', 'takeSequence', 'bodySize', 'companyCount',
    'sentenceCount', 'firstMentionSentence', 'time',
]
# Integer id for every asset code seen in training, in order of first appearance.
asset_code_dict = {code: position for position, code in enumerate(market_train_df['assetCode'].unique())}
def data_prep(market_df,news_df):
    """Join per-day, per-asset news aggregates onto the market rows.

    Parameters
    ----------
    market_df : market frame with a datetime ``time`` column and ``assetCode``.
        The helper columns ``asset_mean_open`` / ``asset_mean_close`` are
        dropped when present; prediction-day frames do not have them.
    news_df : news frame containing at least the columns in ``columns_news``.

    Returns
    -------
    A new frame with one row per market row, the news columns averaged per
    (``firstCreated`` date, first asset code), an ``assetCodeT`` integer id and
    remaining NaN replaced by 0. The input frames are not modified.

    Relies on the notebook-level ``columns_news`` and ``asset_code_dict``.
    """
    # Build new frames instead of mutating the caller's (possibly sliced) input.
    market_df = market_df.assign(date=market_df['time'].dt.date)
    market_df = market_df.drop(columns=['time', 'asset_mean_open', 'asset_mean_close'],
                               errors='ignore')

    # Explicit copy so the column assignments below do not write into a view.
    news_df = news_df[columns_news].copy()
    news_df['sourceTimestamp'] = news_df['sourceTimestamp'].dt.hour
    news_df['firstCreated'] = news_df['firstCreated'].dt.date

    # NOTE(security): `eval` executes the cell text as Python. It is only
    # acceptable while these columns come from the trusted competition data.
    asset_code_lists = news_df['assetCodes'].map(lambda x: list(eval(x)))
    news_df['assetCodesLen'] = asset_code_lists.map(len)
    # Only the first code of each story is kept as the join key.
    news_df['assetCodes'] = asset_code_lists.map(lambda codes: codes[0])
    news_df['asset_sentiment_count'] = news_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')
    # Fix: use the frame passed in; the original read the global training news
    # frame here, which is wrong for prediction-day news.
    news_df['len_audiences'] = news_df['audiences'].map(lambda x: len(eval(x)))
    print(news_df.head())

    # Average the news columns per (day, asset) and left-join onto the market rows.
    kcol = ['firstCreated', 'assetCodes']
    news_df = news_df.groupby(kcol, as_index=False).mean()
    market_df = pd.merge(market_df, news_df, how='left', left_on=['date', 'assetCode'],
                         right_on=['firstCreated', 'assetCodes'])
    del news_df
    market_df['assetCodeT'] = market_df['assetCode'].map(asset_code_dict)
    market_df = market_df.drop(columns=['firstCreated', 'assetCodes', 'assetName']).fillna(0)
    return market_df
# Replace the market training frame with the merged market + news feature frame.
market_train_df = data_prep(market_train_df, news_train_df)
# Preview of the first rows, shown as the cell output.
market_train_df.head()

In [None]:
# Numeric model inputs: market columns first, then the aggregated news columns.
num_columns = [
    'volume', 'close', 'open',
    'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
    'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
    'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
    'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
    'close_to_open', 'sourceTimestamp', 'urgency', 'companyCount',
    'takeSequence', 'bodySize', 'sentenceCount',
    'relevance', 'sentimentClass', 'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
    'noveltyCount24H', 'noveltyCount7D', 'volumeCounts24H', 'volumeCounts7D',
    'assetCodesLen', 'asset_sentiment_count', 'len_audiences',
]
cat_columns = ['assetCodeT']
feature_columns = num_columns + cat_columns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# The scaler is fit on every row, i.e. before the train/validation split below.
# MinMaxScaler is imported as the alternative scaler.
data_scaler = StandardScaler()
market_train_df[num_columns] = data_scaler.fit_transform(market_train_df[num_columns])
# Fresh 0..n-1 index so the index values can serve as row ids for the split.
market_train_df = market_train_df.reset_index(drop=True)
# Random train-test split
train_indices, val_indices = train_test_split(
    market_train_df.index.values, test_size=0.1, random_state=92)

def get_input(market_train, indices):
    """Extract model inputs and scoring helpers for the given row labels.

    Returns a 5-tuple ``(X, y, r, u, d)``:
    X - array of the ``feature_columns`` values,
    y - binary label array, 0 where ``returnsOpenNextMktres10`` < 0 and 1 otherwise,
    r - array of the ``returnsOpenNextMktres10`` values,
    u - the ``universe`` column as a Series,
    d - the ``date`` column as a Series.

    Relies on the notebook-level ``feature_columns`` list.
    """
    target = market_train.loc[indices, 'returnsOpenNextMktres10']
    r = target.values
    # Any value that is not strictly negative is labelled 1.
    y = np.where(r < 0, 0, 1)
    X = market_train.loc[indices, feature_columns].values
    u = market_train.loc[indices, 'universe']
    d = market_train.loc[indices, 'date']
    return X, y, r, u, d

# r, u and d are used to calculate the scoring metric
# Training-side inputs, selected by the random split's training row ids.
X_train,y_train,r_train,u_train,d_train = get_input(market_train_df, train_indices)
# Validation-side inputs, selected by the held-out row ids.
X_val,y_val,r_val,u_val,d_val = get_input(market_train_df, val_indices)

In [None]:
#parameter search takes long time
import lightgbm as lgb
'''
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

tune_params = {'n_estimators': [200,500,1000,2500,5000],
              'max_depth': sp_randint(4,12),
              'colsample_bytree':sp_uniform(loc=0.8, scale=0.15),
              'min_child_samples':sp_randint(60,120),
              'subsample': sp_uniform(loc=0.75, scale=0.25),
              'reg_lambda':[1e-3, 1e-2, 1e-1, 1]}

fit_params = {'early_stopping_rounds':40,
              'eval_metric': 'accuracy',
              'eval_set': [(X_train, y_train), (X_val, y_val)],
              'verbose': 20,
              'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_power)]}
              
lgb_clf = lgb.LGBMClassifier(n_jobs=4, objective='binary',random_state=1)
gs = RandomizedSearchCV(estimator=lgb_clf, 
                        param_distributions=tune_params, 
                        n_iter=40,
                        scoring='f1',
                        cv=5,
                        refit=True,
                        random_state=1,
                        verbose=True)
'''
def learning_rate_power(current_round):
    """Learning-rate schedule: geometric decay with a lower bound.

    Returns ``0.19000424246380565 * 0.995 ** current_round``, but never less
    than 0.01.
    """
    start_rate, floor_rate, decay = 0.19000424246380565, 0.01, 0.995
    decayed = start_rate * np.power(decay, current_round)
    return floor_rate if floor_rate > decayed else decayed

# Keyword arguments forwarded to every LightGBM `fit` call in this notebook.
# Both the training and the validation split are evaluated, and the learning
# rate is reset each round by `learning_rate_power`.
fit_params = {'early_stopping_rounds':40,
              'eval_metric': 'accuracy',
              'eval_set': [(X_train, y_train), (X_val, y_val)],
              'verbose': 20,
              'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_power)]}

# Model hyper-parameters; also reused for the ensemble members further down.
opt_params = {'n_estimators':500,
              'boosting_type': 'dart',
              'objective': 'binary',
              'num_leaves':2452,
              'min_child_samples':212,
              'reg_lambda':0.01}

# The objective comes from opt_params ('binary'). The constructor used to be
# given objective='multiclass', which set_params() then overrode; the
# contradictory value is gone and the effective parameters are unchanged.
lgb_clf = lgb.LGBMClassifier(n_jobs=4, random_state=100, **opt_params)
lgb_clf.fit(X_train, y_train,**fit_params)

In [None]:
# Feature importances of the fitted classifier, largest first.
features_imp = pd.DataFrame({
    'features': list(feature_columns),
    'importance': lgb_clf.feature_importances_,
}).sort_values(by='importance', ascending=False).reset_index()

# Horizontal bars for the 15 most important features, top one drawn highest.
top_features = features_imp.loc[:14]
y_plot = -np.arange(15)
plt.figure(figsize=(10, 6))
plt.barh(y_plot, top_features['importance'].values)
plt.yticks(y_plot, top_features['features'])
plt.xlabel('Feature importance')
plt.title('Features importance')
plt.tight_layout()


In [None]:
# This code is inspired from this kernel: https://www.kaggle.com/skooch/lgbm-w-random-split-2
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.externals.joblib import Parallel, delayed
from sklearn.base import clone

# Twenty (name, estimator) pairs that differ only in their random seed.
# `opt_params` is applied after construction, so its values take precedence
# over the constructor arguments for any parameter given in both places.
clfs = [
    ('lgbm%i' % member,
     lgb.LGBMClassifier(learning_rate=0.1, random_state=1200 + member, silent=True,
                        n_jobs=4, n_estimators=2500).set_params(**opt_params))
    for member in range(20)
]

def split_data(X, y, test_percentage=0.2, seed=None):
    """Randomly split ``X`` and ``y`` into a training part and a test part.

    Parameters
    ----------
    X, y : arrays of equal length to split together.
    test_percentage : fraction of the rows placed in the test part.
    seed : passed to ``train_test_split`` as ``random_state``. ``None`` gives a
        different split on every call. (The argument used to be ignored.)

    Returns
    -------
    ``(X_train, y_train, X_test, y_test)`` — note the order differs from
    ``train_test_split``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_percentage, random_state=seed)
    return X_train, y_train, X_test, y_test

def _parallel_fit_estimator(estimator, X, y, sample_weight=None, **fit_params):
    """Fit one ensemble member on a random split of ``(X, y)``.

    The part held out by ``split_data`` becomes the second entry of the
    ``eval_set`` fit parameter, replacing any ``eval_set`` the caller passed.

    Parameters
    ----------
    estimator : object with a ``fit(X, y, **fit_params)`` method.
    X, y : training data and labels.
    sample_weight : optional per-row weights for ``X``. Only the weights of the
        rows that land in the training part are passed to ``fit``.
    **fit_params : extra keyword arguments forwarded to ``estimator.fit``.

    Returns
    -------
    The fitted ``estimator``.
    """
    if sample_weight is None:
        # randomly split the data so we have a test set for early stopping
        X_train, y_train, X_test, y_test = split_data(X, y, seed=1992)
        fit_params["eval_set"] = [(X_train, y_train), (X_test, y_test)]
        estimator.fit(X_train, y_train, **fit_params)
        return estimator

    # With weights, split row positions instead of rows so the weights can be
    # subset with exactly the same partition. The original passed the
    # full-length weights next to the reduced training part.
    positions = np.arange(len(y))
    pos_train, y_train, pos_test, y_test = split_data(positions, y, seed=1992)
    if hasattr(X, 'iloc'):
        X_train, X_test = X.iloc[pos_train], X.iloc[pos_test]
    else:
        X_train, X_test = X[pos_train], X[pos_test]
    weight_train = np.asarray(sample_weight)[pos_train]

    fit_params["eval_set"] = [(X_train, y_train), (X_test, y_test)]
    estimator.fit(X_train, y_train, sample_weight=weight_train, **fit_params)
    return estimator

class VotingClassifierLGBM(VotingClassifier):
    '''
    This implements the fit method of the VotingClassifier propagating fit_params
    '''
    def fit(self, X, y, sample_weight=None, **fit_params):
        '''
        Validate the ensemble configuration, then fit a clone of every
        non-None estimator through ``_parallel_fit_estimator``.

        X, y: training data and class labels. Labels are encoded with a
            LabelEncoder stored as ``self.le_``.
        sample_weight: optional per-row weights; every estimator must accept
            a ``sample_weight`` fit parameter when this is given.
        fit_params: extra keyword arguments forwarded to each estimator's fit.

        Returns ``self``. Raises NotImplementedError for multi-column ``y``,
        ValueError for an invalid ``voting`` value, mismatched ``weights``,
        unsupported sample weights or all-None estimators, and AttributeError
        for an empty ``estimators`` list.
        '''
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'
                                      ' classification is not supported.')

        if self.voting not in ('soft', 'hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                             % self.voting)

        if self.estimators is None or len(self.estimators) == 0:
            raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                 ' should be a list of (string, estimator)'
                                 ' tuples')

        if (self.weights is not None and
                len(self.weights) != len(self.estimators)):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d estimators'
                             % (len(self.weights), len(self.estimators)))

        if sample_weight is not None:
            # Imported here because the notebook never imports this helper;
            # without it this branch raised NameError.
            from sklearn.utils.validation import has_fit_parameter
            for name, step in self.estimators:
                if not has_fit_parameter(step, 'sample_weight'):
                    raise ValueError('Underlying estimator \'%s\' does not'
                                     ' support sample weights.' % name)
        names, clfs = zip(*self.estimators)
        self._validate_names(names)

        n_isnone = np.sum([clf is None for _, clf in self.estimators])
        if n_isnone == len(self.estimators):
            raise ValueError('All estimators are None. At least one is '
                             'required to be a classifier!')

        self.le_ = LabelEncoder().fit(y)
        self.classes_ = self.le_.classes_
        self.estimators_ = []

        transformed_y = self.le_.transform(y)

        # Each estimator is cloned, so the objects in `self.estimators` stay unfitted.
        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
                                                 sample_weight=sample_weight, **fit_params)
                for clf in clfs if clf is not None)

        return self
    
# Names used below that the notebook never imported.
import pickle
from sklearn.metrics import accuracy_score, f1_score

# Fit the soft-voting ensemble, then round-trip it through a pickle file.
vc = VotingClassifierLGBM(clfs, voting='soft')
vc.fit(X_train, y_train, **fit_params)
filename = 'VotingClassifierLGBM.sav'
# Context managers close the file handles (the original left them open).
with open(filename, 'wb') as model_file:
    pickle.dump(vc, model_file)
# The file was written a few lines above by this same cell; do not point
# pickle.load at files from an untrusted source.
with open(filename, 'rb') as model_file:
    vc = pickle.load(model_file)
vc.voting = 'soft'

predicted_class = vc.predict(X_val)
# Map the class-1 probability from [0, 1] to a signed value in [-1, 1].
predicted_return = vc.predict_proba(X_val)[:,1]*2-1
# NOTE(review): `rescale` is not defined in the visible cells of this
# notebook — confirm where it comes from before running.
predicted_return = rescale(predicted_return, r_train)

global_accuracy_soft = accuracy_score(y_val, predicted_class)
global_f1_soft = f1_score(y_val, predicted_class)
print('Accuracy score clfs: %f' % global_accuracy_soft)
print('F1 score clfs: %f' % global_f1_soft)

# Validation score: sum prediction * return * universe per day, then take the
# mean of the daily sums divided by their standard deviation.
r_val = r_val.clip(-1,1) # get rid of outliers. Where do they come from??
x_t_i = predicted_return * r_val * u_val
data = {'day' : d_val, 'x_t_i' : x_t_i}
df = pd.DataFrame(data)
x_t = df.groupby('day').sum().values.flatten()
mean = np.mean(x_t)
std = np.std(x_t)
score_valid = mean / std
print('Validation score', score_valid)

In [None]:
# `time` is used for the timing counters below and was never imported.
import time

days = env.get_prediction_days()
n_days = 0
# Accumulated wall-clock seconds per stage.
prep_time = 0
prediction_time = 0
packaging_time = 0

# Loop-invariant column pairs: each Mktres column is filled from the raw
# column at the same list position.
column_market = ['returnsClosePrevMktres1','returnsOpenPrevMktres1','returnsClosePrevMktres10', 'returnsOpenPrevMktres10']
column_raw = ['returnsClosePrevRaw1', 'returnsOpenPrevRaw1','returnsClosePrevRaw10', 'returnsOpenPrevRaw10']

for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days +=1
    if n_days % 50 == 0:
        print(n_days,end=' ')

    # --- feature preparation ---
    t = time.time()
    # Keep only assets that must be predicted and that were seen in training.
    # `.copy()` makes the column assignments below write to an independent frame.
    wanted = (market_obs_df.assetCode.isin(predictions_template_df.assetCode)
              & market_obs_df.assetCode.isin(asset_code_dict.keys()))
    market_obs_df = market_obs_df[wanted].copy()
    # Fix: the feature list (`num_columns`) expects 'close_to_open', computed
    # as close / open like in training. The original stored it under
    # 'close_open_ratio', so the column lookup below raised KeyError.
    market_obs_df['close_to_open'] = market_obs_df['close'] / market_obs_df['open']
    for mktres_col, raw_col in zip(column_market, column_raw):
        market_obs_df[mktres_col] = market_obs_df[mktres_col].fillna(market_obs_df[raw_col])

    market_obs = data_prep(market_obs_df, news_obs_df)
    market_obs[num_columns] = data_scaler.transform(market_obs[num_columns])
    X_live = market_obs[feature_columns].values
    prep_time += time.time() - t

    # --- prediction ---
    t = time.time()
    lp = vc.predict_proba(X_live)
    prediction_time += time.time() -t

    # --- packaging ---
    t = time.time()
    # Signed confidence: probability of class 1 minus probability of class 0.
    confidence = lp[:,1] - lp[:,0]
    # NOTE(review): `rescale` is not defined in the visible cells of this
    # notebook — confirm where it comes from before running.
    confidence = rescale(confidence, r_train)
    preds = pd.DataFrame({'assetCode':market_obs['assetCode'],'confidence':confidence})
    # Assets without a prediction get confidence 0 through fillna(0).
    predictions_template_df = predictions_template_df.merge(preds,how='left').drop('confidenceValue',axis=1).fillna(0).rename(columns={'confidence':'confidenceValue'})
    env.predict(predictions_template_df)
    packaging_time += time.time() - t

env.write_submission_file()