# Imports and reading data

##### Import all libraries: pandas, sklearn, matplotlib, seaborn, etc.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))
import matplotlib.pyplot as plt
import seaborn as sns

import datetime 
from datetime import datetime as dt

import sklearn # ML
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from kaggle.competitions import twosigmanews

# Any results you write to the current directory are saved as output.

##### Load data from the environment.

In [None]:
# Retrieve the competition environment.
# make_env() can only be called once, so keep the returned object in `env`:
# it is needed later to fetch the training data, iterate over the prediction
# days and write the submission file.
# NOTE(review): the message below is printed before any data is fetched; the
# training data is actually retrieved in the next cell.
env = twosigmanews.make_env()
print('Data loaded!')

In [None]:
# Retrieve all training data from the environment as a pair of frames:
# the market observations and the news items.
(market_train_df, news_train_df) = env.get_training_data()
print("Fetching training data finished... ")
print('Data obtained!')

# Market data analysis

### Types and example

##### We want to get an idea of what is there inside the market data (types, content, etc.)

In [None]:
# First look at the market data: the dtype of every column, then a preview
# of the first rows (displayed as the cell output).
market_dtypes = market_train_df.dtypes
print(market_dtypes)
market_train_df.head()

### Analysis of main variables of market

In [None]:
# Drop the "universe" column in place: it is not used as a predictor.
market_train_df.drop(columns=["universe"], inplace=True)

##### Correlations (in terms of Pearson coefficient).

In [None]:
# Pearson correlation between the numeric market columns. The first three
# columns are skipped by position, so this relies on the current column order.
# Null values are left out of the computation.
numeric_market = market_train_df.iloc[:, 3:]
numeric_market.corr(method='pearson')

In [None]:
sns.set(style="white")
# Pearson correlation matrix of the numeric market columns (columns 3 onwards
# by position). It is computed exactly once: chaining a second .corr() would
# plot the correlation between the *columns of the correlation matrix*, not
# the correlation between the variables.
corr = market_train_df.iloc[:, 3:].corr(method='pearson')
# Boolean mask hiding the upper triangle (diagonal included), since the matrix
# is symmetric. The builtin `bool` is used because `np.bool` no longer exists
# in current NumPy releases.
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(9, 7))
# Custom diverging colormap centred on zero correlation
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and a square aspect ratio on the axes created above
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0, square=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)

##### Most relevant assets by volume.

In [None]:
# Asset names in order of first appearance once the rows are sorted by
# descending volume, i.e. the assets with the largest single-day volumes first.
by_volume = market_train_df.sort_values(by="volume", ascending=False)
by_volume["assetName"].unique()

##### Evolution of market prices at closing time for some assets.

In [None]:
# Close price over time for a handful of well-known assets.
# Each entry is (assetName as stored in the data, legend label, line colour).
# The "Microsoft" entry selects 'Microsoft Corp'; selecting 'Microsemi Corp'
# would plot a different company under the Microsoft label.
assets_to_plot = [
    ('Apple Inc', 'Apple', 'blue'),
    ('Facebook Inc', 'Facebook', 'red'),
    ('Microsoft Corp', 'Microsoft', 'g'),
    ('Oracle Corp', 'Oracle', 'magenta'),
    ('Bank of America Corp', 'Bank of America', 'yellow'),
]
for asset_name, legend_label, line_color in assets_to_plot:
    asset_rows = market_train_df.loc[market_train_df['assetName'] == asset_name, :]
    # Attaching the label to the line keeps legend and data in sync
    plt.plot(asset_rows['time'], asset_rows['close'], color=line_color, label=legend_label)
plt.legend()
plt.title("Close prices over time")
plt.show()

##### Evolution of returnsOpenNextMktres10 (target variable) for the same assets.

In [None]:
# Target variable (returnsOpenNextMktres10) over time for the same assets.
# Each entry is (assetName as stored in the data, legend label, line colour).
# The "Microsoft" entry selects 'Microsoft Corp'; selecting 'Microsemi Corp'
# would plot a different company under the Microsoft label.
assets_to_plot = [
    ('Apple Inc', 'Apple', 'blue'),
    ('Facebook Inc', 'Facebook', 'red'),
    ('Microsoft Corp', 'Microsoft', 'g'),
    ('Oracle Corp', 'Oracle', 'magenta'),
    ('Bank of America Corp', 'Bank of America', 'yellow'),
]
for asset_name, legend_label, line_color in assets_to_plot:
    asset_rows = market_train_df.loc[market_train_df['assetName'] == asset_name, :]
    # Attaching the label to the line keeps legend and data in sync
    plt.plot(asset_rows['time'], asset_rows['returnsOpenNextMktres10'],
             color=line_color, label=legend_label)
plt.legend()
plt.title("Target variable over time")
plt.show()

##### Distribution of returnsOpenNextMktres10 (target variable).
##### The variable is centered at 0, with only a few outliers larger than 0.25. This makes sense considering that market returns over 10 days are usually very small. From a business perspective, a mean much higher than 0 would imply that everybody gets rich on the stock market, and a very negative mean that the market is a money-losing machine. Our goal should therefore be to detect, by making use of the news, the moments in which the wins or losses are really large.

In [None]:
# Closer look at the target variable: how many extreme values there are and
# what the distribution looks like once they are progressively cut away.
target_col = "returnsOpenNextMktres10"
abs_target = market_train_df[target_col].abs()
thresholds = (1, 0.5, 0.25, 0.1)

fig, axes = plt.subplots(3,2, figsize=(20, 12)) # create figure and axes
panels = axes.flatten()

# Number of rows whose absolute target exceeds each threshold
for limit in thresholds:
    print("# Rows with |value| > {} =".format(limit), market_train_df[abs_target > limit].shape[0])

# Panel 0: boxplot with all values
market_train_df.boxplot(column=target_col, ax=panels[0])
panels[0].set_xlabel('Boxplot with all values', fontsize=18)

# Panels 1-4: boxplots keeping only rows strictly below each threshold
for panel, limit in zip(panels[1:5], thresholds):
    market_train_df[abs_target < limit].boxplot(column=target_col, ax=panel)
    panel.set_xlabel('Boxplot with values such that |val| < {}'.format(limit), fontsize=18)

# Panel 5: histogram of the target restricted to |val| < 0.25
market_train_df[abs_target < 0.25].hist(column=target_col, bins=100, ax=panels[5])
panels[5].set_xlabel('Histogram for values such that |val| < 0.25', fontsize=18)


### Null values (replaced with the mean of the column)

##### We considered several approaches here and ended up replacing the nulls with the mean of their column.

In [None]:
# Number of null values per column of the market data (shown as the cell
# output). The commented-out snippets below are disabled exploration code
# that counts the rows containing nulls per date and per asset.
market_train_df.isna().sum()
# Where are those null values in terms of dates?
#rows_with_null=market_train_df[pd.isnull(market_train_df).any(axis=1)]
#dates_with_null=rows_with_null["time"].unique()
#nulls_per_date=[rows_with_null[rows_with_null["time"]==d].shape[0] for d in dates_with_null]
#res=pd.DataFrame({'date': dates_with_null, 'nulls': nulls_per_date })
#res.head()
# Where are those null values in terms of assets?
#rows_with_null=market_train_df[pd.isnull(market_train_df).any(axis=1)]
#assets_with_null=rows_with_null["assetCode"].unique()
#nulls_per_asset=[rows_with_null[rows_with_null["assetCode"]==a].shape[0] for a in assets_with_null]
#res=pd.DataFrame({'asset': assets_with_null, 'nulls': nulls_per_asset})
#res.sort_values(by=['nulls'], ascending=False, inplace=True)
#res.head()

In [None]:
# Fill the nulls of the market-residual return columns with a robust mean:
# the mean of the values whose magnitude is at most `tol`, so that outliers
# do not distort the replacement value. The per-column means are kept in
# `replace_nans_market` so the same values can be reused on new data.
tol = 0.15
cols = [
    'returnsClosePrevMktres1',
    'returnsOpenPrevMktres1',
    'returnsClosePrevMktres10',
    'returnsOpenPrevMktres10',
]
replace_nans_market = {}
for col_name in cols:
    column = market_train_df[col_name]
    trimmed_mean = column[column.abs() <= tol].mean()
    print (col_name, "Mean--> ", trimmed_mean)
    replace_nans_market[col_name] = trimmed_mean
market_train_df.fillna(value=replace_nans_market, inplace=True)

# News data analysis

### Types and example

##### We wanted to know what was there inside the news dataset (types of the data, data included on each column, etc..).

In [None]:
# First look at the news data: the dtype of every column, then a preview of
# the first rows (displayed as the cell output).
news_dtypes = news_train_df.dtypes
print(news_dtypes)
news_train_df.head()

### Null values

##### There are no nulls in this dataset. Nice!

In [None]:
# Null count per column of the news data (displayed as the cell output)
news_null_counts = news_train_df.isna().sum()
news_null_counts

### Analysis of the main variables 

##### Comparing the relevance of all news: about 2/3 of the news items have relevance 1 (the maximum) and most of the others have a relevance very close to 0. We will keep all of them for the model in any case.

In [None]:
# Histogram of the relevance score of every news item
relevance = news_train_df["relevance"]
plt.hist(relevance, bins=25)
plt.show()

In [None]:
# remove non relevant
# news_train_df = news_train_df[news_train_df["relevance"]>0.9]

##### Checking how the sentiment is distributed (there is slightly less negative than positive sentiment). This could be because the news on this topic is positive in general, or because of a bias in how the data provider calculates the sentiments.

In [None]:
# Sentiment vars
#fig, axes = plt.subplots(1,3, figsize=(20, 12)) # create figure and axes
#news_train_df.boxplot(column="sentimentNegative", ax=axes.flatten()[0])
#axes.flatten()[0].set_xlabel('Boxplot with all values', fontsize=18)
#news_train_df.boxplot(column="sentimentNeutral", ax=axes.flatten()[1])
#axes.flatten()[1].set_xlabel('Boxplot with all values', fontsize=18)
#news_train_df.boxplot(column="sentimentPositive", ax=axes.flatten()[2])
#axes.flatten()[2].set_xlabel('Boxplot with all values', fontsize=18)

In [None]:
# Pie chart of the mean negative / neutral / positive sentiment scores
sentiment_cols = ["sentimentNegative", "sentimentNeutral", "sentimentPositive"]
values = [news_train_df[col].mean() for col in sentiment_cols]
labels = ('Negative', 'Neutral', 'Positive')
colors = ['lightcoral', 'gold' , 'lightskyblue']
plt.pie(values, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True)
plt.show()

# Data Preprocessing

### Removing some vars and creating new ones (only news)

##### From here on we no longer consider the novelty and volume variables, and we create a new variable that represents the sentiment together with the urgency and relevance of the news.

In [None]:
# Keep only the news columns used from here on
cols_to_keep_news = ["time", "urgency", "wordCount", "assetName", "relevance", "sentimentClass", "sentimentNegative", "sentimentNeutral", "sentimentPositive"]
# Take an explicit copy: new columns are assigned to this frame below and in
# later cells, and assigning to a plain column selection triggers pandas'
# SettingWithCopyWarning.
news_train_df = news_train_df[cols_to_keep_news].copy()
# Signed sentiment strength (positive minus negative score) weighted by the
# urgency and relevance of the news item
news_train_df["sent_rel"] = news_train_df["urgency"] * news_train_df["relevance"] * (news_train_df["sentimentPositive"]-news_train_df["sentimentNegative"])
news_train_df.head()

### Removing hour and minutes from the timestamps

##### We are normalizing the times so we do not consider the hours, minutes, etc. anymore, only the date.

In [None]:
# Reduce the timestamps of both frames to plain dates so that news and market
# rows of the same day share the same key: drop the timezone, then truncate
# the time of day to midnight.
for frame in (news_train_df, market_train_df):
    frame['time'] = frame['time'].dt.tz_convert(None).dt.normalize()

### Aggregate news for same day and asset in a single row

##### We will merge all the information from the news of the same day and asset into a single row (in general we take the mean, except for urgency, where we take the maximum).

In [None]:
# Collapse all news of the same day and asset into a single row.
# Aggregation applied to each column: the maximum for urgency, the mean for
# everything else.
d_aggs = {
    'urgency' : 'max',
    'wordCount': 'mean',
    'relevance' : 'mean',
    'sentimentClass' : 'mean',
    'sentimentNegative' : 'mean',
    'sentimentNeutral' : 'mean',
    'sentimentPositive' : 'mean',
    'sent_rel': 'mean',
}
group_keys = ['time', 'assetName']
news_subset = news_train_df[group_keys + list(d_aggs)]
reduced_df = news_subset.groupby(group_keys, as_index=False).agg(d_aggs)
reduced_df.head()

In [None]:
# Row counts before and after collapsing the news per day and asset
rows_before = len(news_train_df)
rows_after = len(reduced_df)
print("Size before transforming: ", rows_before)
print("Size after transforming: ", rows_after)

### Merging datasets of news and market

##### After all the previous work we are now able to merge both datasets, assigning the news to the market cases.

In [None]:
# Build the final dataset used for training and testing: every market row is
# kept (left join) and receives the aggregated news of the same day and asset
# when there is any; the news columns stay NaN otherwise.
merge_keys = ['time', 'assetName']
df_final = pd.merge(market_train_df, reduced_df, how='left', on=merge_keys)
print(len(df_final))
df_final.head()

# Analysis of the merged data

### Sentiment variables with regards to the target variable

##### We would expect the return to be higher when the news is clearly positive, and lower when it is clearly negative. Nevertheless, below we show the mean return for negative, positive and so-so sentiment news, and the picture is not very clear.

In [None]:
# Mean target return per sentiment bucket, ignoring extreme returns (|r| >= 0.5).
target_col = "returnsOpenNextMktres10"
sentiment = df_final["sentimentClass"]
moderate_return = df_final[target_col].abs() < 0.5
print("Returns with negative news:", df_final.loc[(sentiment < -0.4) & moderate_return, target_col].mean())
# "Neutral" means a mean sentiment class between the negative and positive
# thresholds. Rows with a null sentimentClass had no news at all, so they are
# reported separately instead of being labelled as neutral.
print("Returns with neutral news", df_final.loc[(sentiment.abs() <= 0.4) & moderate_return, target_col].mean())
print("Returns with positive news:", df_final.loc[(sentiment > 0.4) & moderate_return, target_col].mean())
print("Returns with no news:", df_final.loc[sentiment.isnull() & moderate_return, target_col].mean())

##### And the mean sentiment for large losses and wins:

In [None]:
# Mean sentiment class for large (but not extreme) losses and wins:
# returns strictly between 0.2 and 0.5 in magnitude.
target_returns = df_final["returnsOpenNextMktres10"]
big_losses = (target_returns < -0.2) & (target_returns > -0.5)
big_wins = (target_returns > 0.2) & (target_returns < 0.5)
print("Mean sentiment for loses:", df_final[big_losses]["sentimentClass"].mean())
print("Mean sentiment for wins", df_final[big_wins]["sentimentClass"].mean())

##### We can also compare the distributions for all cases (very positive, very negative and so-so). Ignoring the fact that there are more rows of one type than of the others, we can observe that the distribution is basically the same in all cases, which means that those sentiments may not be really useful for predicting...

In [None]:
# Overlaid histograms of the target (restricted to |r| < 0.25), one per
# sentiment bucket, to compare the shape of the distributions.
bins = np.linspace(-0.25, 0.25, 200)
target_col = "returnsOpenNextMktres10"
sentiment = df_final["sentimentClass"]
small_return = df_final[target_col].abs() < 0.25
# (row selector, histogram styling) in drawing order
layers = [
    (sentiment > 0.4, dict(alpha=0.8, label='Positive')),
    (sentiment.abs() < 0.4, dict(alpha=0.5, label='Neutral', color='green')),
    (sentiment <= -0.4, dict(alpha=0.8, label='Negative', color='orange')),
]
for selector, styling in layers:
    plt.hist(df_final[selector & small_return][target_col], bins=bins, **styling)
plt.legend(loc='upper right')
plt.title("returnsOpenNextMktres10")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

##### And also adding when there are no news at all...

In [None]:
# Same overlaid histograms of the target (restricted to |r| < 0.25), with an
# extra layer for the rows that have no news at all (null sentimentClass).
bins = np.linspace(-0.25, 0.25, 200)
target_col = "returnsOpenNextMktres10"
sentiment = df_final["sentimentClass"]
small_return = df_final[target_col].abs() < 0.25
# (row selector, histogram styling) in drawing order
layers = [
    (sentiment.isnull(), dict(alpha=0.8, label='No news', color='grey')),
    (sentiment > 0.4, dict(alpha=0.8, label='Positive')),
    (sentiment.abs() < 0.4, dict(alpha=0.5, label='Neutral', color='green')),
    (sentiment <= -0.4, dict(alpha=0.8, label='Negative', color='orange')),
]
for selector, styling in layers:
    plt.hist(df_final[selector & small_return][target_col], bins=bins, **styling)
plt.legend(loc='upper right')
plt.title("returnsOpenNextMktres10")
plt.xlabel("Value")
plt.ylabel("Frequency")
plt.show()

 ### Remove nans for asset/days with no news and encode assets names and codes

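##### As final steps before modeling we replace the NaNs that appear after the merge (currently with the sentinel value -9999; replacing them with the column means was also tried), encode the strings as numbers so sklearn can deal with them, and optionally normalize the numerical data (the normalization cell is currently disabled).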

In [None]:
# After merging there are nulls in the news columns, because some asset/date
# keys have no news.
# Column means of the aggregated news features, computed BEFORE the sentinel
# fill so they are not distorted by it. They are only needed for the disabled
# mean-imputation alternative and are handed to the final preprocessing step.
cols = list(d_aggs.keys())
replace_nans_global = {col_name: df_final[col_name].mean() for col_name in cols}
#df_final.fillna(value=replace_nans_global, inplace=True)
# Active strategy: mark every missing value with the sentinel -9999
df_final.fillna(value=-9999, inplace=True)
# Encode assetCode and assetName as integers for the ML algorithms.
# The encoder class is reached through the fully qualified `sklearn.preprocessing`
# module: the bare name `preprocessing` is rebound to a function further down in
# this file, which would break this cell if it were re-run afterwards.
# Asset codes
ac_encoder = sklearn.preprocessing.LabelEncoder()
assetCodes = df_final["assetCode"].unique()
print(len(assetCodes))
ac_encoder.fit(assetCodes)
df_final["assetCodeNum"] = ac_encoder.transform(df_final["assetCode"])
# Asset names
an_encoder = sklearn.preprocessing.LabelEncoder()
assetNames = df_final["assetName"].unique()
print(len(assetNames))
an_encoder.fit(assetNames)
df_final["assetNameNum"] = an_encoder.transform(df_final["assetName"])
df_final.head()

In [None]:
# Normalize data??
# The normalization experiment below is disabled: it is wrapped in a string
# literal, so none of it runs. It would standardize every column except the
# identifiers, the encoded asset columns and the target.
"""
cols_not_normalize = ['time', 'assetCode', 'assetName','assetCodeNum', 'assetNameNum', 'returnsOpenNextMktres10']
cols_normalize = [x for x in df_final.columns if x not in cols_not_normalize]
tmp = df_final[cols_normalize].copy()
scaler = StandardScaler()
# scaler = MinMaxScaler()
scaler.fit(tmp)
tmp =  pd.DataFrame(scaler.transform(tmp), columns=cols_normalize)
tmp = pd.concat([df_final[cols_not_normalize].copy(), tmp], axis=1, sort=False)
tmp.head()"""

# Modeling

### Getting data to train and test

##### Here we take the periods for training (2014 and 2015) and testing (2016 and a small part of 2017) and split both into features and target variable.

In [None]:
# Temporal train/test split for the ML models.
# Train: 2014-01-01 up to (not including) 2016-01-01. Test: 2016-01-01 onwards.
init_date = np.datetime64('2014-01-01')
init_test = np.datetime64('2016-01-01')
# Boolean filtering already returns a new frame, so no full copy of df_final
# is needed beforehand.
tmp = df_final[df_final["time"] >= init_date]
# Remove target outliers. NOTE: this happens before the split, so the test set
# is filtered as well, not only the training set.
tmp = tmp[tmp["returnsOpenNextMktres10"].abs() < 0.25]
# Split by date and drop the identifier columns the models cannot use.
# drop() returns new frames here; dropping in place on a filtered slice
# triggers pandas' SettingWithCopyWarning.
id_cols = ["time", "assetCode", "assetName"]
train = tmp[tmp["time"] < init_test].drop(columns=id_cols)
test = tmp[tmp["time"] >= init_test].drop(columns=id_cols)
# Rebalance the training set: keep only rows whose target is clearly away
# from zero (|r| > 0.03)
train = train[train["returnsOpenNextMktres10"].abs() > 0.03].copy()
# Split into features and target variable
X_train = train.drop(columns=['returnsOpenNextMktres10'])
X_test = test.drop(columns=['returnsOpenNextMktres10'])
y_train = train['returnsOpenNextMktres10'].values
y_test = test['returnsOpenNextMktres10'].values
# Save the column order so the final prediction data can be laid out identically
cols_order = X_train.columns
print(cols_order)

In [None]:
#plt.hist(y_train, bins = 100)
#plt.show()

### Train and predict (with some finetuning of hyperparameters)

##### We have tried LinearRegression, RandomForest, GradientBoosting, ExtraTrees and Neural Networks here (all of them with different parameters). We then take the one with the best score in terms of MAE as our final model for predicting. Note that due to the kernel constraints we could not carry out a better hyperparameter tuning (with a grid search, for instance) and that we had to avoid temporal cross-validation.

In [None]:
# Fit every candidate estimator, score it on the test period and keep the one
# with the lowest MAE. Alternatives that were tried are left commented out.
#regr1 = LinearRegression(n_jobs=-1)
regr2 = RandomForestRegressor(n_jobs=-1, max_depth=200, n_estimators=30, min_samples_leaf=10, criterion="mse", random_state=15)
#regr3 = GradientBoostingRegressor(max_depth=3, n_estimators=10,  criterion="mse")
#regr4 = ExtraTreesRegressor(n_jobs=-1, max_depth=10, n_estimators=10, criterion="mse")
#regr5 = MLPRegressor(hidden_layer_sizes=(10,5), activation='relu', alpha=0.0001, solver='adam', learning_rate='adaptive', max_iter=1000)
estimators = [regr2]
maes = []
mses = []
test_predictions = []  # one prediction array per estimator, same order as `estimators`
for e in estimators:
    e.fit(X_train, y_train)
    estimator_pred = e.predict(X_test)
    # Compute each metric once and reuse it for both the log and the ranking
    mae = mean_absolute_error(y_test, estimator_pred)
    mse = mean_squared_error(y_test, estimator_pred)
    maes.append(mae)
    mses.append(mse)
    test_predictions.append(estimator_pred)
    print("MAE:", mae)
    print("MSE:", mse)
print("Acabó")
print(maes)
print(mses)
# Select the best model, and make y_predicted the predictions of THAT model:
# taking the predictions left over from the last loop iteration would be wrong
# whenever the best estimator is not the last one in the list.
best_idx = int(np.argmin(maes))
regr = estimators[best_idx]
y_predicted = test_predictions[best_idx]

### Analysis of results

##### For the final leaderboard of the competition we have to provide not the target variable itself but a confidence value. More specifically, the goal is to know whether the returns are positive or negative, so let's see how many times we predicted the sign properly.

In [None]:
# Sign accuracy: turn real and predicted returns into binary classes
# (0 when the value is negative, 1 otherwise) and compare them.
y_test2 = np.where(np.asarray(y_test) < 0, 0, 1)
y_pred2 = np.where(np.asarray(y_predicted) < 0, 0, 1)
print("Accuracy: ", accuracy_score(y_test2, y_pred2))

In [None]:
##### Feature relevance for the model (basically returns backwards are the most relevant ones)

In [None]:
#for i in range(X_train.shape[1]):
#    print("%s (%f)" % (X_train.columns[i], regr.feature_importances_[i]))

##### Brief comparison between real and predicted variable.

In [None]:
# Test features with the real and the predicted target appended as the last
# two columns, for a side-by-side comparison.
df_results = X_test.assign(y_real=y_test, y_pred=y_predicted)
df_results.head()

##### Evolution of real and predicted variable along the test period (for Oracle Corp only).

In [None]:
# Real vs. predicted target along the test period for Oracle Corp only
oracle_id = an_encoder.transform(["Oracle Corp"])[0]
example = df_results[df_results["assetNameNum"] == oracle_id]
x = range(len(example))
# Each line carries a label: calling plt.legend() without any labelled line
# produces an empty legend.
plt.plot(x, example["y_real"], color='blue', label='Real')
plt.plot(x, example["y_pred"], color='red', label='Predicted')
plt.legend()
plt.show()

# Final prediction

### Predicting for the validation time of the Kaggle competition

##### Here we will make use of our resulting model to predict for the final validation set of the competition. Note that we have to perform again all the preprocessing tasks over the new data and that we have to compute a final confidence value instead of the target variable as such.

In [None]:
def preprocessing(market, news, replace_nans_market, cols_to_keep_news, d_aggs, replace_nans_global, ac_encoder, an_encoder, cols_order):
    """Turn one day of raw market and news data into the model's feature frame.

    The steps mirror the ones applied to the training data: fill market nulls,
    build the ``sent_rel`` news feature, reduce timestamps to dates, aggregate
    the news per day and asset, left-join them onto the market rows, fill the
    remaining nulls with the sentinel -9999 and encode the asset identifiers.
    Neither input frame is modified.

    Args:
        market: Market observations with a timezone-aware ``time`` column plus
            ``assetCode`` and ``assetName``.
        news: News items with a timezone-aware ``time`` column and every
            column named in ``cols_to_keep_news``.
        replace_nans_market: Mapping column -> value used to fill market nulls.
        cols_to_keep_news: News columns to keep before aggregating.
        d_aggs: Mapping news column -> aggregation applied per (time, assetName).
        replace_nans_global: Unused; only relevant for the disabled
            mean-imputation alternative.
        ac_encoder: Fitted encoder for ``assetCode`` (``classes_``/``transform``).
        an_encoder: Fitted encoder for ``assetName`` (``classes_``/``transform``).
        cols_order: Feature columns, in the order expected by the model.

    Returns:
        A DataFrame with one row per market row and exactly ``cols_order`` columns.
    """
    # Remove nans on market. fillna returns a new frame, so the assignment to
    # 'time' below does not touch the caller's frame.
    market = market.fillna(value=replace_nans_market)
    market['time'] = market['time'].dt.tz_convert(None).dt.normalize()

    # Explicit copy of the kept news columns: assigning new columns to a plain
    # column selection triggers pandas' SettingWithCopyWarning.
    news = news[cols_to_keep_news].copy()
    news["sent_rel"] = news["urgency"] * news["relevance"] * (news["sentimentPositive"] - news["sentimentNegative"])
    # Reduce timestamps to dates so news and market rows of the same day match
    news['time'] = news['time'].dt.tz_convert(None).dt.normalize()

    # Convert news of same day and asset to one row only
    group_keys = ['time', 'assetName']
    red = news[group_keys + list(d_aggs)].groupby(group_keys, as_index=False).aggregate(d_aggs)

    # Final merge by assetName and time, keeping every market row
    df = market.merge(red, how='left', on=group_keys)
    # Asset/date keys without news yield nulls: mark them with the sentinel
    #df = df.fillna(value=replace_nans_global)
    df = df.fillna(value=-9999)

    # Encode assetCode and assetName with the fitted encoders. Values that the
    # encoders have never seen do not raise: they are mapped to -9999.
    code_ids = dict(zip(ac_encoder.classes_, ac_encoder.transform(ac_encoder.classes_)))
    df["assetCodeNum"] = df["assetCode"].apply(lambda x: code_ids.get(x, -9999))
    name_ids = dict(zip(an_encoder.classes_, an_encoder.transform(an_encoder.classes_)))
    df["assetNameNum"] = df["assetName"].apply(lambda x: name_ids.get(x, -9999))

    # Keep only the model features, in training order
    return df[cols_order]

In [None]:
def predictions(market, news, predictions_template_df):
    """Fill ``predictions_template_df.confidenceValue`` for one prediction day.

    The day's data is preprocessed with the module-level training artefacts
    (fill values, aggregations, encoders, column order), the selected model
    ``regr`` predicts the target return, and the prediction is rescaled
    linearly from [-0.25, 0.25] to a confidence value in [-1, 1].

    Args:
        market: Market observations of the day.
        news: News items of the day.
        predictions_template_df: Frame whose ``confidenceValue`` column is
            overwritten in place, one value per market row.

    Returns:
        None. The result is written into ``predictions_template_df``.
    """
    # Positional access: works whatever the index labels of the frame are
    print(market["time"].iloc[0])
    # Preprocessing
    tmp = preprocessing(market, news, replace_nans_market, cols_to_keep_news, d_aggs, replace_nans_global, ac_encoder, an_encoder, cols_order)
    # Predicting
    y_predicted = regr.predict(tmp)
    # Range of the target used for training (outliers beyond it were removed)
    mn = -0.25
    mx = 0.25
    # Convert into the confidence value, from -1 to 1. The result is clipped
    # because a model that extrapolates can predict outside [mn, mx], which
    # would otherwise give a confidence outside the intended range.
    confidence = (np.asarray(y_predicted, dtype=float) - mn) / (mx - mn) * 2 - 1
    predictions_template_df["confidenceValue"] = np.clip(confidence, -1, 1)

In [None]:
# Retrieve the generator of prediction days from the environment.
# The result of `get_prediction_days()` can only be iterated through once,
# so be careful not to lose it once you start iterating.
days = env.get_prediction_days()

In [None]:
# Generate the predictions: each iteration yields the market data, the news
# data and a prediction template for one day. predictions() fills the
# template in place, and the filled template is then handed back to the
# environment with env.predict().
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    predictions(market_obs_df, news_obs_df, predictions_template_df)
    env.predict(predictions_template_df)
print('Prediction finished!')

In [None]:
#env.predict(predictions_template_df)

In [None]:
# Write the submission file and list the CSV files now present in the working
# directory. Note that to submit the results the kernel has to be committed
# and the resulting csv file uploaded.
env.write_submission_file()
csv_files = [entry for entry in os.listdir('.') if '.csv' in entry]
print(csv_files)