# Versions
- V1: Basic LGBM model with 20 estimators;
- V2: Set the max_depth parameter from 6 to 4 so that 2 ^ max_depth <= max_leaves;
- V3: Use a keyword argument method to set the parameters for the LGBM; Clip the confidence to [-0.99999, 0.99999];
- V4: Remove some outliers with unknown asset names or extreme return;
- V5: Add voting model with 20 LGBM; Rewrite the score method; Add get_confidence method;
- V6: Rewrite the get_confidence method, where I use a standard scaler instead of a min-max scaler;
- V7: GPU on in V6, which is not permitted;
- V8: Increase the n_estimator from 50 to 500;
- V9: Decrease the n_estimator from 500 to 100; Set num_leaves to 1000.

In [None]:
import datetime
import gc
import time
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from kaggle.competitions import twosigmanews
from matplotlib import pyplot as plt
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder


warnings.filterwarnings("ignore")

# 1 Initialize the Environment and Retrieve the Data

In [None]:
# Initialize the environment only if `env` is not already bound, so that
# re-running this cell does not call make_env() a second time.
if "env" not in globals():
    env = twosigmanews.make_env()
# Retrieve the training data as two frames: market data and news data.
mkt_train, news_train = env.get_training_data()

# 2 Data Preparation

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
mkt_train.info()

In [None]:
# Preview the first rows of the market data.
mkt_train.head()

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
news_train.info()

In [None]:
# Preview the first rows of the news data.
news_train.head()

## 2.1 Data Cleaning

### 2.1.1 Fill the Missing Values

In [None]:
# Report, per market column, the percentage of rows that are missing.
n_rows = mkt_train.shape[0]

for col in mkt_train.columns:
    missing_share = mkt_train[col].isnull().sum() / n_rows
    print(f"{col}: {missing_share * 100:.4f}%.")

del n_rows, missing_share

In [None]:
# Report, per news column, the percentage of rows that are missing.
n_rows = news_train.shape[0]

for col in news_train.columns:
    missing_share = news_train[col].isnull().sum() / n_rows
    print(f"{col}: {missing_share * 100:.4f}%.")

del n_rows, missing_share

Fill the missing values of market residual returns with raw returns.

In [None]:
# Fill each missing market-residual return with the raw return of the same
# kind. The result is assigned back to the frame: calling
# `frame[col].fillna(..., inplace=True)` acts on an intermediate Series and is
# not guaranteed to update the frame (it does nothing under Copy-on-Write).
for resid_col, raw_col in (
    ("returnsClosePrevMktres1", "returnsClosePrevRaw1"),
    ("returnsOpenPrevMktres1", "returnsOpenPrevRaw1"),
    ("returnsClosePrevMktres10", "returnsClosePrevRaw10"),
    ("returnsOpenPrevMktres10", "returnsOpenPrevRaw10"),
):
    mkt_train[resid_col] = mkt_train[resid_col].fillna(mkt_train[raw_col])

### 2.1.2 Remove Outliers

In [None]:
# 1) Drop rows whose open-to-close log return exceeds 0.5 in absolute value.
log_ret = np.log(mkt_train["close"].values / mkt_train["open"].values)
outlier_idx = np.abs(log_ret) > 0.5
mkt_train = mkt_train.loc[~outlier_idx, :]
print("{} outliers removed.".format(outlier_idx.sum()))

# 2) Drop rows whose asset name is the literal "Unknown".
unknown_idx = mkt_train["assetName"] == "Unknown"
mkt_train = mkt_train.loc[~unknown_idx, :]
print("{} outliers removed.".format(unknown_idx.sum()))

# 3) Keep only rows whose previous returns stay inside fixed bounds:
#    |1-day return| < 1 and |10-day return| < 2.
short_ret_cols = ['returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1', 'returnsOpenPrevMktres1']
long_ret_cols = ['returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 'returnsClosePrevMktres10', 'returnsOpenPrevMktres10']

ori_len = mkt_train.shape[0]

for bound, bounded_cols in ((1, short_ret_cols), (2, long_ret_cols)):
    for col in bounded_cols:
        mkt_train = mkt_train.loc[mkt_train[col].abs() < bound]

print("{} outliers removed.".format(ori_len - mkt_train.shape[0]))

del log_ret, outlier_idx, unknown_idx
gc.collect()

## 2.2 Feature Engineering

### 2.2.1 Add and Transform Features

In [None]:
# Reduce the timestamp to a calendar date and expose it as "date", the key
# later used (with assetCode) to join the news data.
mkt_train["time"] = mkt_train["time"].dt.date
mkt_train.rename(columns={"time": "date"}, inplace=True)
# Open-to-close log return of the same row.
mkt_train["returnsToday"] = np.log(mkt_train["close"].values / mkt_train["open"].values)
# Volume standardised across all rows sharing a date (z-score per date).
mkt_train["relVol"] = mkt_train.groupby(["date"])["volume"].transform(lambda x: (x - x.mean()) / x.std())

In [None]:
# Reduce the source timestamp to a calendar date and expose it as "date".
news_train["sourceTimestamp"] = news_train["sourceTimestamp"].dt.date
news_train.rename(columns={"sourceTimestamp": "date"}, inplace=True)
# Ratios: first-mention sentence index over sentence count, and sentiment
# word count over total word count.
news_train["rel1stMentionPos"] = news_train["firstMentionSentence"].values / news_train["sentenceCount"].values
news_train["relSentimentWord"] = news_train["sentimentWordCount"].values / news_train["wordCount"].values
# Per-date z-scores of the three size measures.
news_train["relSentCnt"] = news_train.groupby(["date"])["sentenceCount"].transform(lambda x: (x - x.mean()) / x.std())
news_train["relWordCnt"] = news_train.groupby(["date"])["wordCount"].transform(lambda x: (x - x.mean()) / x.std())
news_train["relBodySize"] = news_train.groupby(["date"])["bodySize"].transform(lambda x: (x - x.mean()) / x.std())
# Keep a single code per news item: the string is evaluated, converted to a
# list, and its first element is taken.
# NOTE(review): eval() executes arbitrary code found in the column; prefer
# ast.literal_eval if the data is not fully trusted. Presumably the value is
# the repr of a set, in which case "first element" is arbitrary — confirm.
news_train["assetCodes"] = news_train["assetCodes"].map(lambda x: list(eval(x))[0])
news_train.rename(columns={"assetCodes": "assetCode"}, inplace=True)

### 2.2.2 Drop the Useless Features

Drop the useless features from both the market data and the news data.

In [None]:
# Columns that were only needed to derive the engineered features above, or
# that are not used by the model.
mkt_unused = ["assetName", "volume", "close", "open"]
news_unused = [
    "time", "sourceId", "headline", "provider", "subjects",
    "audiences", "bodySize", "sentenceCount", "wordCount",
    "assetName", "firstMentionSentence", "sentimentWordCount",
    "headlineTag",
]

mkt_train.drop(columns=mkt_unused, inplace=True)
news_train.drop(columns=news_unused, inplace=True)
gc.collect()

In [None]:
def add_lags(data, lag_cols, windows, data_train=False):
    """Append per-asset rolling statistics of selected columns.

    For every column in ``lag_cols`` and every window length in ``windows``,
    four columns named ``<col>_lag_<win>_mean``, ``_std``, ``_min`` and
    ``_max`` are added. The statistics are computed over rows grouped by
    ``assetCode`` and ordered by ``date``. Statistics that are undefined
    (for example because a window is not yet full) are set to 0.

    Only the generated lag columns are zero-filled; missing values in all
    other columns are left untouched so the caller can choose a fill value
    per column.

    Args:
        data: Frame holding at least ``assetCode``, ``date`` and every column
            named in ``lag_cols``.
        lag_cols: Names of the columns to summarise.
        windows: Rolling window lengths, counted in rows.
        data_train: Optional history frame. When a DataFrame is passed, its
            rows are used to fill the rolling windows but only the rows of
            ``data`` are returned. Any non-DataFrame value (the default
            ``False``) disables this.

    Returns:
        A new frame, sorted by ``assetCode`` then ``date``, with the lag
        columns added. Neither ``data`` nor ``data_train`` is modified.
    """
    marker = "_is_new_row"
    with_history = isinstance(data_train, pd.DataFrame)

    if with_history:
        # Tag the rows on copies so the caller's frames never see the marker.
        data = pd.concat(
            [data.assign(**{marker: True}), data_train.assign(**{marker: False})],
            axis=0,
        )

    # Sorting by the group key first keeps the grouped rolling output in the
    # same row order as `data`, which allows positional assignment below.
    data = data.sort_values(["assetCode", "date"])

    new_cols = []
    for col in lag_cols:
        grouped = data.groupby("assetCode")[col]

        for win in windows:
            rolling = grouped.rolling(win)  # built once, reused for all stats
            prefix = "_".join([col, "lag", str(win)])

            for stat in ("mean", "std", "min", "max"):
                stat_col = prefix + "_" + stat
                data[stat_col] = getattr(rolling, stat)().values
                new_cols.append(stat_col)

    if new_cols:
        data[new_cols] = data[new_cols].fillna(0)

    if with_history:
        data = data.loc[data[marker]].drop(columns=marker)

    return data

In [None]:
# Add rolling mean/std/min/max of the two close-to-close residual returns
# over windows of 3 and 7 rows per asset (see add_lags).
lag_cols = ["returnsClosePrevMktres1", "returnsClosePrevMktres10"]
windows = [3, 7]
mkt_train = add_lags(mkt_train, lag_cols, windows)
gc.collect()

### 2.2.3 Take Average on the News Data

Average the news data within each (date, asset code) group, so that there is at most one news row per asset per day.

In [None]:
# Collapse the news rows to one row per (date, assetCode) by averaging the
# remaining columns; the keys stay regular columns (as_index=False).
news_train = news_train.groupby(["date", "assetCode"], as_index=False).mean()

### 2.2.4 Remove All Data Before the Crisis

In [None]:
# Keep only market rows dated strictly after 2009-01-01.
mkt_train = mkt_train.loc[mkt_train["date"].values > datetime.date(2009, 1, 1)]

### 2.2.5 Merge the Two Datasets

In [None]:
# Left-join the aggregated news onto the market rows; market rows without
# news for that (date, assetCode) keep NaN in the news columns.
merge_keys = ["date", "assetCode"]
data_train = mkt_train.merge(news_train, how="left", on=merge_keys)
data_train.head()

# The separate frames are no longer needed once merged.
del mkt_train, news_train
gc.collect()

## 2.3 Clean the Data, Again

### 2.3.1 Fill the Missing Values

In [None]:
# Report, per column of the merged frame, the percentage of missing rows.
n_rows = data_train.shape[0]

for col in data_train.columns:
    missing_share = data_train[col].isnull().sum() / n_rows
    print(f"{col}: {missing_share * 100:.4f}%.")

del n_rows, missing_share

In [None]:
# Fill value per column: 1 for sentimentNeutral, 0 for everything else.
na_cols = data_train.columns
fillna_dict = {
    name: (1 if name == "sentimentNeutral" else 0)
    for name in na_cols
}
data_train.fillna(value=fillna_dict, inplace=True)

### 2.3.2 Normalize the Data

In [None]:
# Every column is a model feature except the join keys, the target return
# and the universe flag. The order follows data_train's column order.
non_feature_cols = {"date", "assetCode", "returnsOpenNextMktres10", "universe"}
feature_cols = [name for name in data_train.columns
                if name not in non_feature_cols]

In [None]:
# Standardise each feature with its own scaler. The scalers are kept, in
# feature_cols order, so the same transform can be applied to new data.
feature_scalers = []

for name in feature_cols:
    scaler = StandardScaler()
    column_2d = data_train[name].values.reshape((-1, 1))
    data_train[name] = scaler.fit_transform(column_2d)
    feature_scalers.append(scaler)
    gc.collect()

## 2.4 Set the Target Variable and Split the Dataset

In [None]:
# Binary target: 1 when returnsOpenNextMktres10 is strictly positive, else 0.
data_train["y"] = (data_train["returnsOpenNextMktres10"] > 0).astype(int)

# Random 80/20 split of the rows.
# NOTE(review): the seed is itself drawn at random, so the split differs
# between runs; fix the seed if reproducibility is needed.
seed = np.random.randint(1, 100)
data_train, data_test = train_test_split(data_train, random_state=seed, test_size=0.2)

# 3 Model

## 3.1 Simple LGBM Model

In [None]:
class LGBModel(lgb.LGBMClassifier):
    """LGBMClassifier extended with evaluation and confidence helpers."""

    def evaluate(self, y_true, y_pred):
        """Print and return accuracy and weighted precision/recall/F1.

        Args:
            y_true: Array of true labels.
            y_pred: Array of predicted labels.

        Returns:
            Tuple ``(accuracy, precision, recall, f1_score)``.
        """
        y_true = y_true.astype(int).reshape((-1, 1))
        y_pred = y_pred.astype(int).reshape((-1, 1))

        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1_score, _ = precision_recall_fscore_support(
            y_true, y_pred, average="weighted")

        print("Accuracy: {:.4f}.".format(accuracy))
        print("Precision: {:.4f}.".format(precision))
        print("Recall: {:.4f}.".format(recall))
        print("F1 score: {:.4f}".format(f1_score))

        return accuracy, precision, recall, f1_score

    def get_confidence(self, x_test, ref_scaler):
        """Turn predicted class probabilities into clipped confidence values.

        The raw signal is ``P(class 1) - P(class 0)``. It is standardised
        with a scaler fitted on this batch only, mapped through
        ``ref_scaler.inverse_transform`` and clipped to
        ``[-0.99999, 0.99999]``.

        Args:
            x_test: Feature matrix passed to ``predict_proba``.
            ref_scaler: Fitted scaler whose ``inverse_transform`` defines the
                location and spread of the output.

        Returns:
            1-D array with one confidence value per row of ``x_test``.
        """
        pred_prob = self.predict_proba(x_test)
        confidence = (pred_prob[:, 1] - pred_prob[:, 0]).reshape((-1, 1))
        # Re-fitted on every call: the output depends on the whole batch.
        conf_scaler = StandardScaler()
        confidence = conf_scaler.fit_transform(confidence)
        confidence = ref_scaler.inverse_transform(confidence)
        confidence = np.clip(confidence, -0.99999, 0.99999)

        return confidence.flatten()

    def score(self, x_test, data_test, ref_scaler):
        """Print and return the validation score with the confidences.

        Each row contributes ``confidence * returnsOpenNextMktres10 *
        universe``; the contributions are summed per date and the score is
        the mean of those daily sums divided by their standard deviation.

        NOTE: this replaces the inherited ``score`` method with a different
        signature.

        Args:
            x_test: Feature matrix, row-aligned with ``data_test``.
            data_test: Frame with ``date``, ``returnsOpenNextMktres10`` and
                ``universe`` columns.
            ref_scaler: Passed through to ``get_confidence``.

        Returns:
            Tuple ``(score, confidence)``.
        """
        confidence = self.get_confidence(x_test, ref_scaler)

        # Vectorised product; same left-to-right evaluation order as the
        # former per-row loop, without re-extracting the columns each time.
        y_ret_pred = (
            confidence
            * data_test["returnsOpenNextMktres10"].values
            * data_test["universe"].values
        )
        pred_data = pd.DataFrame({"date": data_test["date"], "y_ret_pred": y_ret_pred})
        pred_data = pred_data.groupby(["date"])["y_ret_pred"].sum().values.flatten()
        score = np.mean(pred_data) / np.std(pred_data)
        print("Validation score: {:.4f}.".format(score))

        return score, confidence

In [None]:
# Hyperparameters of the single LGBM model; the random state is drawn anew
# on every run.
seed = np.random.randint(1, 100)
lgbm_params = dict(
    max_depth=8,
    num_leaves=1000,
    learning_rate=0.1,
    n_estimators=100,
    boosting_type="dart",
    n_jobs=-1,
    reg_lambda=0.01,
    random_state=seed,
)
model = LGBModel(**lgbm_params)

In [None]:
# Training inputs as plain arrays: features in feature_cols order, and the
# binary target.
x_train = data_train[feature_cols].values
y_train = data_train["y"].values

In [None]:
# Time the fit with a wall-clock timer. time.clock() was removed in
# Python 3.8; time.perf_counter() is the supported replacement for measuring
# elapsed intervals.
# NOTE(review): the evaluation set is the training data itself, so early
# stopping monitors the training metric rather than a held-out one.
start = time.perf_counter()
model.fit(x_train, y_train, eval_set=[(x_train, y_train)], early_stopping_rounds=10)
time_elapsed = int(time.perf_counter() - start)
print("Total traninig time {} seconds.".format(time_elapsed))

In [None]:
# Classification metrics on the held-out split.
x_test = data_test[feature_cols].values
y_test = data_test["y"].values
y_pred = model.predict(x_test)
model.evaluate(y_test, y_pred)

In [None]:
# Reference scaler fitted on the training target returns; get_confidence
# uses its inverse transform to place confidences on the returns' scale.
ref_scaler = StandardScaler()
ref_scaler.fit(data_train["returnsOpenNextMktres10"].values.reshape((-1, 1)))
# Validation score on the held-out split; the confidences are kept for the
# histogram below.
_, confidence = model.score(x_test, data_test, ref_scaler)

In [None]:
# Overlay the distribution of predicted confidences on the true returns.
fig, ax = plt.subplots()
ax.hist(confidence, bins="auto", label="Confidence")
ax.hist(data_test["returnsOpenNextMktres10"], bins="auto", alpha=0.8, label="True return")
ax.set_title("Confidence & True Return")
ax.legend(loc='best')
ax.set_xlim(-1, 1)
plt.show()

In [None]:
# Rank the features by the model's importance score, highest first.
feature_importance = pd.DataFrame()
feature_importance["feature"] = feature_cols
feature_importance["importance"] = model.feature_importances_
feature_importance.sort_values(by=["importance"], ascending=False, inplace=True)
feature_importance.reset_index(inplace=True, drop=True)

# Horizontal bar chart of the ten most important features; negative
# positions put the top-ranked feature at the top of the chart.
top_rows = feature_importance.values[:10]
bar_positions = -np.arange(10)

fig, ax = plt.subplots()
ax.barh(bar_positions, top_rows[:, 1])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_rows[:, 0])
ax.set_xlabel("importance")
fig.tight_layout()
plt.show()

### 3.1.1 Fine-tuning Hyperparameters

In [None]:
# base_model = LGBModel(boosting_type="dart", n_jobs=-1)
# params_dist = {
#     "n_estimators": [200, 500, 1000],
#     "max_depth": sp_randint(4, 10),
#     "colsample_bytree": sp_uniform(loc=0.8, scale=0.15),
#     "min_child_samples": sp_randint(50, 100),
#     "sub_sample": sp_uniform(loc=0.75, scale=0.25),
#     "reg_lambda": [1e-3, 1e-2, 1e-1, 1]
# }
# param_tuner = RandomizedSearchCV(base_model, param_distributions=params_dist,
#                                  n_iter=20, scoring="f1", refit=True, verbose=True)

In [None]:
# param_tuner.fit(x_train, y_train)
# cv_results = pd.DataFrame(param_tuner.cv_results_)
# print(cv_results)
# print(param_tuner.best_params_)
# best_model = param_tuner.best_estimator_

In [None]:
# Unbind the arrays from the single-model experiment.
del x_train, x_test, y_train, y_test, y_pred, confidence

In [None]:
def prepare_data(mkt, news, scalers, lag_data):
    """Apply the training-time feature pipeline to one batch of new data.

    Steps: fill missing residual returns with raw returns, derive the market
    and news features, aggregate the news per (date, assetCode), left-join
    the news onto the market rows, fill missing values (1 for
    ``sentimentNeutral``, 0 otherwise), add the lag features via
    ``add_lags`` and scale every feature.

    Args:
        mkt: Raw market frame for the batch. Not modified.
        news: Raw news frame for the batch. Not modified.
        scalers: Fitted scalers, one per feature. They must be ordered like
            the feature columns of ``lag_data`` (its column order with the
            key, target, universe and ``y`` columns removed).
        lag_data: History frame passed to ``add_lags`` to fill the rolling
            windows. A non-DataFrame value disables the history, in which
            case the feature order is taken from the batch itself.

    Returns:
        Frame with one row per market row of the batch and scaled features.
        When ``lag_data`` is a DataFrame, the columns follow its order so the
        result can be appended to it.

    Raises:
        ValueError: If fewer scalers than feature columns are supplied.
    """
    non_features = ("date", "assetCode", "returnsOpenNextMktres10", "universe", "y")

    def _zscore(values):
        # Standardise within the group handed over by groupby().transform().
        return (values - values.mean()) / values.std()

    def _fill_values(columns):
        # Neutral sentiment defaults to 1, every other feature to 0.
        return {col: (1 if col == "sentimentNeutral" else 0)
                for col in columns if col not in non_features}

    # Work on copies: the caller may pass slices of larger frames, and the
    # in-place edits below must not leak back into them.
    mkt = mkt.copy()
    news = news.copy()

    # Assign the filled columns back instead of using chained inplace
    # fillna, which is not guaranteed to update the frame.
    for resid_col, raw_col in (
        ("returnsClosePrevMktres1", "returnsClosePrevRaw1"),
        ("returnsOpenPrevMktres1", "returnsOpenPrevRaw1"),
        ("returnsClosePrevMktres10", "returnsClosePrevRaw10"),
        ("returnsOpenPrevMktres10", "returnsOpenPrevRaw10"),
    ):
        mkt[resid_col] = mkt[resid_col].fillna(mkt[raw_col])

    mkt["time"] = mkt["time"].dt.date
    mkt.rename(columns={"time": "date"}, inplace=True)
    mkt["returnsToday"] = np.log(mkt["close"].values / mkt["open"].values)
    mkt["relVol"] = mkt.groupby(["date"])["volume"].transform(_zscore)

    news["sourceTimestamp"] = news["sourceTimestamp"].dt.date
    news.rename(columns={"sourceTimestamp": "date"}, inplace=True)
    news["rel1stMentionPos"] = news["firstMentionSentence"].values / news["sentenceCount"].values
    news["relSentimentWord"] = news["sentimentWordCount"].values / news["wordCount"].values
    news["relSentCnt"] = news.groupby(["date"])["sentenceCount"].transform(_zscore)
    news["relWordCnt"] = news.groupby(["date"])["wordCount"].transform(_zscore)
    news["relBodySize"] = news.groupby(["date"])["bodySize"].transform(_zscore)
    # NOTE(review): eval() executes arbitrary code found in the column;
    # prefer ast.literal_eval if the data is not fully trusted.
    news["assetCodes"] = news["assetCodes"].map(lambda x: list(eval(x))[0])
    news.rename(columns={"assetCodes": "assetCode"}, inplace=True)

    mkt.drop(["assetName", "volume", "close", "open"], axis=1, inplace=True)

    news.drop(["time", "sourceId", "headline", "provider", "subjects",
               "audiences", "bodySize", "sentenceCount", "wordCount",
               "assetName", "firstMentionSentence", "sentimentWordCount",
               "headlineTag"], axis=1, inplace=True)

    news = news.groupby(["date", "assetCode"], as_index=False).mean()

    data = pd.merge(mkt, news, how="left", on=["date", "assetCode"])
    del mkt
    del news
    gc.collect()

    # Fill the gaps before add_lags runs, so that sentimentNeutral gets its
    # default of 1 even if add_lags zero-fills whatever is still missing.
    data = data.fillna(value=_fill_values(data.columns))

    # NOTE(review): the rolling windows combine this batch's unscaled values
    # with whatever `lag_data` holds; if `lag_data` contains already-scaled
    # features, the lag statistics mix two different scales — confirm.
    lag_cols = ["returnsClosePrevMktres1", "returnsClosePrevMktres10"]
    windows = [3, 7]
    data = add_lags(data, lag_cols, windows, lag_data)

    # Take the feature order from the history frame: the scalers are applied
    # by position, and after add_lags the batch's own column order differs
    # from the history's (lag columns end up after the news columns).
    has_history = isinstance(lag_data, pd.DataFrame)
    reference_cols = lag_data.columns if has_history else data.columns
    feature_cols = [col for col in reference_cols
                    if col in data.columns and col not in non_features]

    if len(scalers) < len(feature_cols):
        raise ValueError(
            "Expected at least {} scalers, got {}.".format(len(feature_cols), len(scalers))
        )

    data = data.fillna(value=_fill_values(feature_cols))

    for col, scaler in zip(feature_cols, scalers):
        data[col] = scaler.transform(data[col].values.reshape((-1, 1))).ravel()

    if has_history:
        # Align the column order with the history frame for later appends.
        extra_cols = [col for col in data.columns if col not in lag_data.columns]
        ordered = [col for col in lag_data.columns if col in data.columns] + extra_cols
        data = data[ordered]

    gc.collect()

    return data

In [None]:
# Submission with the single LGBM model
if "days" not in globals():
    days = env.get_prediction_days()

max_lag = 7
lag_data = data_train
del data_train
del data_test
day_idx = 1

for (mkt, news, pred) in days:
    
    start = mkt["time"].dt.date.min() - datetime.timedelta(days=max_lag)
    lag_data = lag_data.loc[lag_data["date"] >= start, :]
    
    mkt = mkt[mkt["assetCode"].isin(pred["assetCode"].unique())]
    data = prepare_data(mkt, news, feature_scalers, lag_data)
    x_test = data[feature_cols].values
    confidence = model.get_confidence(x_test, ref_scaler)
    confidence = pd.DataFrame({"assetCode": data["assetCode"].values, "confidenceValue": confidence})
    pred.drop(["confidenceValue"], axis=1, inplace=True)
    pred = pd.merge(pred, confidence, how="left", left_on=["assetCode"], right_on=["assetCode"])
    pred.fillna(0, inplace=True)
    env.predict(pred)
    
    lag_data = pd.concat([lag_data, data], axis=0)
    print("Day {}".format(day_idx))
    day_idx += 1

env.write_submission_file()

## 3.2 Voting Model with LGBM

In [None]:
# class VotingLGBM(VotingClassifier):
    
#     def evaluate(self, y_true, y_pred):
        
#         y_true = y_true.astype(int).reshape((-1, 1))
#         y_pred = y_pred.astype(int).reshape((-1, 1))
        
#         accuracy = accuracy_score(y_true, y_pred)
#         precision, recall, f1_score, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
        
#         print("Accuracy: {:.4f}.".format(accuracy))
#         print("Precision: {:.4f}.".format(precision))
#         print("Recall: {:.4f}.".format(recall))
#         print("F1 score: {:.4f}".format(f1_score))
        
#         return accuracy, precision, recall, f1_score
    
#     def get_confidence(self, x_test, ret_ref):
        
#         pred_prob = self.predict_proba(x_test)
#         confidence = (pred_prob[:, 1] - pred_prob[:, 0]).reshape((-1, 1))
#         conf_scaler = StandardScaler()
#         ref_scaler = StandardScaler()
#         confidence = conf_scaler.fit_transform(confidence)
#         ref_scaler.fit(ret_ref.reshape((-1, 1)))
#         confidence = ref_scaler.inverse_transform(confidence)
#         confidence = np.clip(confidence, -0.99999, 0.99999)
        
#         return confidence.flatten()
    
#     def score(self, x_test, data_test, ret_ref):
        
#         confidence = self.get_confidence(x_test, data_test, ret_ref)
        
#         y_ret_pred = np.zeros(confidence.shape[0])
#         for i in range(y_ret_pred.shape[0]):
#             y_ret_pred[i] = confidence[i] * data_test["returnsOpenNextMktres10"].values[i] * data_test["universe"].values[i]
#         pred_data = pd.DataFrame({"date": data_test["date"], "y_ret_pred": y_ret_pred})
#         pred_data = pred_data.groupby(["date"]).sum().values.flatten()
#         score = np.mean(pred_data) / np.std(pred_data)
#         print("Validation score: {:.4f}.".format(score))
        
#         return score, confidence

In [None]:
# clfs = []
# num_model = 20
# for i in range(num_model):
#     seed = np.random.randint(1, 100)
#     lgb_params = {
#         "max_depth": 4,
#         "learning_rate": 0.5,
#         "n_estimators": 50,
#         "boosting_type": "dart",
#         "random_state": seed
#     }
#     clfs.append((str(i), lgb.LGBMClassifier(**lgb_params)))
    
# voting_model = VotingLGBM(clfs, voting="soft", n_jobs=-1)

In [None]:
# voting_model.fit(x_train, y_train)
# y_pred = voting_model.predict(x_test)
# voting_model.evaluate(y_test, y_pred)

In [None]:
# confidence = voting_model.get_confidence(x_test)
# plt.hist(confidence, bins=100)
# plt.show()

In [None]:
# del data_train
# del data_test
# del x_train
# del x_test
# del y_train
# del y_test
# del y_pred
# del confidence

In [None]:
# Submission with the voting model
# if "days" not in globals():
#     days = env.get_prediction_days()

# for (mkt, news, pred) in days:
    
#     mkt = mkt[mkt["assetCode"].isin(pred["assetCode"].unique())]
#     data = prepare_data(mkt, news, feature_scaler)
#     x_test = data[feature_cols].values
#     confidence = voting_model.get_confidence(x_test)
#     confidence = pd.DataFrame({"assetCode": data["assetCode"].values, "confidenceValue": confidence})
#     pred.drop(["confidenceValue"], axis=1, inplace=True)
#     pred = pd.merge(pred, confidence, how="left", left_on=["assetCode"], right_on=["assetCode"])
#     pred.fillna(0, inplace=True)
#     env.predict(pred)

# env.write_submission_file()