**Imports**

In [None]:
import gc
from datetime import datetime, timedelta
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import log_loss, r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV

%matplotlib inline

**Setup Environment**

In [None]:
from kaggle.competitions import twosigmanews
env = twosigmanews.make_env()

**Get Training Data**

In [None]:
(market_train, news_train) = env.get_training_data()

In [None]:
market_train.head()

In [None]:
market_train.shape

**Preprocess Data**

In [None]:
def preprocess_news(news_train):
    """Reduce a raw news frame to model-ready columns, modifying it in place.

    Steps:
      1. Drop the columns listed in ``unused_columns``.
      2. Replace 'headlineTag', 'provider' and 'sourceId' by integer codes
         from ``pd.factorize``.
      3. Strip the first and last character and all single quotes from each
         'assetCodes' entry, leaving a ", "-separated string of codes.

    The frame passed in is mutated and the same object is returned.
    """
    unused_columns = [
        'audiences', 'subjects', 'assetName',
        'headline', 'firstCreated', 'sourceTimestamp',
    ]
    news_train.drop(labels=unused_columns, axis=1, inplace=True)

    # Integer-encode the categorical columns; the unique values are not kept.
    for categorical in ('headlineTag', 'provider', 'sourceId'):
        news_train[categorical] = pd.factorize(news_train[categorical])[0]

    def _strip_set_syntax(raw):
        # Remove the surrounding braces and every single quote.
        return raw[1:-1].replace("'", "")

    news_train['assetCodes'] = news_train['assetCodes'].apply(_strip_set_syntax)
    return news_train

news_train = preprocess_news(news_train)

**Split Asset Codes in News Data**

In [None]:
def unstack_asset_codes(news_train):
    """Explode the ", "-separated 'assetCodes' strings into one row per code.

    Parameters
    ----------
    news_train : pd.DataFrame
        Frame with an 'assetCodes' column of ", "-separated strings (as
        produced by ``preprocess_news``) and an index convertible to int.

    Returns
    -------
    pd.DataFrame
        Columns 'news_index' (int64, the index label of the originating news
        row) and 'assetCode' (one code per row).
    """
    codes = []
    indexes = []
    # Series.items() is available in old and new pandas; Series.iteritems()
    # was removed in pandas 2.0.
    for i, values in news_train['assetCodes'].items():
        explode = values.split(", ")
        codes.extend(explode)
        indexes.extend([int(i)] * len(explode))
    # Explicit dtypes keep 'news_index' an integer even when there are no
    # news rows, so the later merge on 'news_index' sees matching key types.
    index_df = pd.DataFrame({
        'news_index': pd.Series(indexes, dtype='int64'),
        'assetCode': pd.Series(codes, dtype=object),
    })
    del codes, indexes
    gc.collect()
    return index_df

index_df = unstack_asset_codes(news_train)
index_df.head()

In [None]:
def merge_news_on_index(news_train, index_df):
    """Attach the news columns to every (news row, asset code) pair.

    A 'news_index' column holding a copy of the index is added to
    ``news_train`` (the caller's frame is modified). ``index_df`` is then
    left-joined to it on 'news_index', and the helper columns 'news_index',
    'assetCodes' and 'sourceId' are removed from the result.
    """
    # Expose the row index as a regular column so it can serve as the join key.
    news_train['news_index'] = news_train.index.copy()

    helper_columns = ['news_index', 'assetCodes', 'sourceId']
    joined = index_df.merge(news_train, how='left', on='news_index')
    return joined.drop(helper_columns, axis=1)

news_unstack = merge_news_on_index(news_train, index_df)
del news_train, index_df
gc.collect()
news_unstack.head(3)

**Group Data**

In [None]:
gc.collect()
def group_news(news_frame):
    """Aggregate news rows to one row per (assetCode, date) using the mean.

    A 'date' column derived from ``news_frame.time`` is added to the input
    frame (the caller's frame is modified). Only numeric and boolean columns
    are averaged: the original code averaged every column and depended on
    pandas silently discarding columns such as 'time', which current pandas
    no longer does, so the float32 cast below would fail on a datetime mean.

    Returns
    -------
    pd.DataFrame
        Columns 'assetCode', 'date' and one float32 '<column>_mean' column
        per aggregated input column.
    """
    news_frame['date'] = news_frame.time.dt.date  # Add date column

    keys = ['assetCode', 'date']
    value_cols = [
        c for c in news_frame.select_dtypes(include=['number', 'bool']).columns
        if c not in keys
    ]

    aggregations = ['mean']
    gp = news_frame.groupby(keys)[value_cols].agg(aggregations)
    # Flatten the (column, aggregation) MultiIndex into 'column_aggregation'.
    gp.columns = pd.Index(["{}_{}".format(e[0], e[1]) for e in gp.columns.tolist()])
    gp.reset_index(inplace=True)
    # Set datatype to float32
    float_cols = {c: 'float32' for c in gp.columns if c not in keys}
    return gp.astype(float_cols)

news_agg = group_news(news_unstack)
del news_unstack; gc.collect()
news_agg.head(3)

**Merge News with Market Data**

In [None]:
# Add a calendar-date key to the market rows so they can be joined to the
# per-(assetCode, date) news aggregates.
market_train['date'] = market_train.time.dt.date
# Left join: every market row is kept; rows without matching news get NaN
# in the news feature columns.
df = market_train.merge(news_agg, how='left', on=['assetCode', 'date'])
# The inputs are deleted to free memory, so this cell cannot be re-run
# without re-executing the cells that create market_train and news_agg.
del market_train, news_agg
gc.collect()
df.head(3)

**EDA**

**Market Data**
* time(datetime64[ns, UTC]) - the current time (in marketdata, all rows are taken at 22:00 UTC)
* assetCode(object) - a unique id of an asset
* assetName(category) - the name that corresponds to a group of assetCodes. These may be "Unknown" if the corresponding assetCode does not have any rows in the news data.
* universe(float64) - a boolean indicating whether or not the instrument on that day will be included in scoring. This value is not provided outside of the training data time period. The trading universe on a given date is the set of instruments that are available for trading (the scoring function will not consider instruments that are not in the trading universe). The trading universe changes daily.
* volume(float64) - trading volume in shares for the day
* close(float64) - the close price for the day (not adjusted for splits or dividends)
* open(float64) - the open price for the day (not adjusted for splits or dividends)
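* returnsClosePrevRaw1(float64) - a return column. The name encodes how the return is computed: `Close`/`Open` = close-to-close or open-to-open, `Prev`/`Next` = backward- or forward-looking, `Raw`/`Mktres` = raw or market-residualized, and the trailing number is the horizon in days (1 or 10). The return columns below follow the same convention.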
* returnsOpenPrevRaw1(float64) - see returns explanation above
* returnsClosePrevMktres1(float64) - see returns explanation above
* returnsOpenPrevMktres1(float64) - see returns explanation above
* returnsClosePrevRaw10(float64) - see returns explanation above
* returnsOpenPrevRaw10(float64) - see returns explanation above
* returnsClosePrevMktres10(float64) - see returns explanation above
* returnsOpenPrevMktres10(float64) - see returns explanation above
* returnsOpenNextMktres10(float64) - 10 day, market-residualized return. This is the target variable used in competition scoring. The market data has been filtered such that returnsOpenNextMktres10 is always not null.

**News Data**

* time(datetime64[ns, UTC]) - UTC timestamp showing when the data was available on the feed (second precision)
* sourceTimestamp(datetime64[ns, UTC]) - UTC timestamp of this news item when it was created
* firstCreated(datetime64[ns, UTC]) - UTC timestamp for the first version of the item
* sourceId(object) - an Id for each news item
* headline(object) - the item's headline
* urgency(int8) - differentiates story types (1: alert, 3: article)
* takeSequence(int16) - the take sequence number of the news item, starting at 1. For a given story, alerts and articles have separate sequences.
* provider(category) - identifier for the organization which provided the news item (e.g. RTRS for Reuters News, BSW for Business Wire)
* subjects(category) - topic codes and company identifiers that relate to this news item. Topic codes describe the news item's subject matter. These can cover asset classes, geographies, events, industries/sectors, and other types.
* audiences(category) - identifies which desktop news product(s) the news item belongs to. They are typically tailored to specific audiences. (e.g. "M" for Money International News Service and "FB" for French General News Service)
* bodySize(int32) - the size of the current version of the story body in characters
* companyCount(int8) - the number of companies explicitly listed in the news item in the subjects field
* headlineTag(object) - the Thomson Reuters headline tag for the news item
* marketCommentary(bool) - boolean indicator that the item is discussing general market conditions, such as "After the Bell" summaries
* sentenceCount(int16) - the total number of sentences in the news item. Can be used in conjunction with firstMentionSentence to determine the relative position of the first mention in the item.
* wordCount(int32) - the total number of lexical tokens (words and punctuation) in the news item
* assetCodes(category) - list of assets mentioned in the item
* assetName(category) - name of the asset
* firstMentionSentence(int16) - the first sentence, starting with the headline, in which the scored asset is mentioned.
    * 1: headline
    * 2: first sentence of the story body
    * 3: second sentence of the body, etc.
    * 0: the asset being scored was not found in the news item's headline or body text. As a result, the entire news item's text (headline + body) will be used to determine the sentiment score.
* relevance(float32) - a decimal number indicating the relevance of the news item to the asset. It ranges from 0 to 1. If the asset is mentioned in the headline, the relevance is set to 1. When the item is an alert (urgency == 1), relevance should be gauged by firstMentionSentence instead.
* sentimentClass(int8) - indicates the predominant sentiment class for this news item with respect to the asset. The indicated class is the one with the highest probability.
* sentimentNegative(float32) - probability that the sentiment of the news item was negative for the asset
* sentimentNeutral(float32) - probability that the sentiment of the news item was neutral for the asset
* sentimentPositive(float32) - probability that the sentiment of the news item was positive for the asset
* sentimentWordCount(int32) - the number of lexical tokens in the sections of the item text that are deemed relevant to the asset. This can be used in conjunction with wordCount to determine the proportion of the news item discussing the asset.
* noveltyCount12H(int16) - The 12 hour novelty of the content within a news item on a particular asset. It is calculated by comparing it with the asset-specific text over a cache of previous news items that contain the asset.
* noveltyCount24H(int16) - same as above, but for 24 hours
* noveltyCount3D(int16) - same as above, but for 3 days
* noveltyCount5D(int16) - same as above, but for 5 days
* noveltyCount7D(int16) - same as above, but for 7 days
* volumeCounts12H(int16) - the 12 hour volume of news for each asset. A cache of previous news items is maintained and the number of news items that mention the asset within each of five historical periods is calculated.
* volumeCounts24H(int16) - same as above, but for 24 hours
* volumeCounts3D(int16) - same as above, but for 3 days
* volumeCounts5D(int16) - same as above, but for 5 days
* volumeCounts7D(int16) - same as above, but for 7 days

In [None]:
df.info()

In [None]:
df.isna().sum()

In [None]:
def do_eda_for_col(col_name, data=None):
    """Print the null count and ``describe()`` summary of one column.

    Parameters
    ----------
    col_name : str
        Name of the column to summarise.
    data : pd.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``df`` is used,
        which is what the original single-argument version always did.
    """
    if data is None:
        data = df  # fall back to the notebook-level merged frame
    print("Number of Nulls : {}".format(data[col_name].isna().sum()))
    print(data[col_name].describe())

**Time Column**

In [None]:
do_eda_for_col('time')

**AssetCode**

In [None]:
do_eda_for_col('assetCode')

**Volume**

In [None]:
do_eda_for_col('volume')

**close**

In [None]:
do_eda_for_col("close")

**open**

In [None]:
do_eda_for_col("open")

**returnsClosePrevRaw1**

In [None]:
do_eda_for_col("returnsClosePrevRaw1")

**returnsOpenPrevRaw1**

In [None]:

do_eda_for_col("returnsOpenPrevRaw1")

**returnsClosePrevMktres1**

In [None]:
do_eda_for_col("returnsClosePrevMktres1")


**returnsOpenPrevMktres1**

In [None]:
do_eda_for_col("returnsOpenPrevMktres1")


**returnsClosePrevRaw10**

In [None]:
do_eda_for_col("returnsClosePrevRaw10")


**returnsOpenPrevRaw10**

In [None]:
do_eda_for_col("returnsOpenPrevRaw10")


**returnsClosePrevMktres10**

In [None]:
do_eda_for_col("returnsClosePrevMktres10")


**returnsOpenPrevMktres10**

In [None]:
do_eda_for_col("returnsOpenPrevMktres10")


**returnsOpenNextMktres10**

In [None]:
do_eda_for_col("returnsOpenNextMktres10")


**universe**

In [None]:
do_eda_for_col("universe")


**date**

In [None]:
do_eda_for_col("date")


**urgency_mean**

In [None]:
do_eda_for_col("urgency_mean")


**takeSequence_mean**

In [None]:
do_eda_for_col("takeSequence_mean")


**provider_mean**

In [None]:
do_eda_for_col("provider_mean")


**bodySize_mean**

In [None]:
do_eda_for_col("bodySize_mean")


**companyCount_mean**

In [None]:
do_eda_for_col("companyCount_mean")

**headlineTag_mean**

In [None]:
do_eda_for_col("headlineTag_mean")


**marketCommentary_mean**

In [None]:
do_eda_for_col("marketCommentary_mean")


**sentenceCount_mean**

In [None]:
do_eda_for_col("sentenceCount_mean")


**wordCount_mean**

In [None]:
do_eda_for_col("wordCount_mean")


**firstMentionSentence_mean**

In [None]:
do_eda_for_col("firstMentionSentence_mean")


**relevance_mean**

In [None]:
do_eda_for_col("relevance_mean")


**sentimentClass_mean**

In [None]:
do_eda_for_col("sentimentClass_mean")


**sentimentNegative_mean**

In [None]:
do_eda_for_col("sentimentNegative_mean")


**sentimentNeutral_mean**

In [None]:
do_eda_for_col("sentimentNeutral_mean")


**sentimentPositive_mean**

In [None]:
do_eda_for_col("sentimentPositive_mean")


**sentimentWordCount_mean**

In [None]:
do_eda_for_col("sentimentWordCount_mean")


**noveltyCount12H_mean**

In [None]:
do_eda_for_col("noveltyCount12H_mean")


**noveltyCount24H_mean**

In [None]:
do_eda_for_col("noveltyCount24H_mean")


**noveltyCount3D_mean**

In [None]:
do_eda_for_col("noveltyCount3D_mean")


**noveltyCount5D_mean**

In [None]:
do_eda_for_col("noveltyCount5D_mean")


**noveltyCount7D_mean**

In [None]:
do_eda_for_col("noveltyCount7D_mean")


**volumeCounts12H_mean**

In [None]:
do_eda_for_col("volumeCounts12H_mean")


**volumeCounts24H_mean**

In [None]:
do_eda_for_col("volumeCounts24H_mean")


**volumeCounts3D_mean**

In [None]:
do_eda_for_col("volumeCounts3D_mean")


**volumeCounts5D_mean**

In [None]:
do_eda_for_col("volumeCounts5D_mean")


**volumeCounts7D_mean**

In [None]:
do_eda_for_col("volumeCounts7D_mean")

**Data Cleanup**

In [None]:
# Names of every float32/float64 column of df, as an object array.
floaty_col = np.array(
    [name for name, dtype in df.dtypes.items()
     if dtype == "float64" or dtype == "float32"],
    dtype=object,
)

In [None]:
def missing_value_impute(data):
    """Fill missing values in place and return the same frame.

    * object columns: missing values become the string "other".
    * float32/float64 columns: missing values become the mean of that column
      for the same assetCode; anything still missing (the asset has no
      observed value in that column) becomes 0.
    * every other dtype is left untouched.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing an 'assetCode' column. It is modified in place.
    """
    # Capture the grouping key before the loop, because the 'assetCode' column
    # may itself be rewritten below when it has object dtype.
    asset_codes = data['assetCode']
    for i in data.columns:
        if data[i].dtype == "object":
            data[i] = data[i].fillna("other")
        elif (data[i].dtype == "float32" or data[i].dtype == "float64"):
            # transform('mean') returns a Series aligned with the rows of
            # `data`. The previous groupby(...).mean() result was indexed by
            # assetCode, so fillna() matched it against row labels and the
            # per-asset mean was never actually applied.
            per_asset_mean = data[i].groupby(asset_codes).transform('mean')
            data[i] = data[i].fillna(per_asset_mean)
            data[i] = data[i].fillna(0)
    return data

In [None]:
gc.collect()
missing_value_impute(df)
# scaler = preprocessing.Normalizer()
# df[floaty_col] = scaler.fit_transform(df[floaty_col])

In [None]:
df.isna().sum()

In [None]:
gc.collect()
# feature_rel = sns.PairGrid(df, hue=None, palette=sns.color_palette("cubehelix", 3), 
#                  vars=[
#                      'returnsOpenNextMktres10',
#                    'close', 'open', 
# #                      'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
# #                    'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
#                    'returnsClosePrevRaw10', #'returnsOpenPrevRaw10',
#                    'returnsClosePrevMktres10', #'returnsOpenPrevMktres10',
#                    'urgency_mean', 'provider_mean',
# #                    'companyCount_mean', 'marketCommentary_mean', 
# #                      'firstMentionSentence_mean', 'relevance_mean',
# #                    'sentimentNegative_mean', 'sentimentPositive_mean',
# #                    'noveltyCount12H_mean', 'noveltyCount24H_mean',
# #                    'noveltyCount3D_mean', 
# #                      'volumeCounts12H_mean','volumeCounts24H_mean', 
# #                      'volumeCounts3D_mean'
#                  ])
# feature_rel.map(plt.scatter)
# plt.show()

**Metrics**

In [None]:
def custom_metric(date, pred_proba, num_target, universe):
    """Mean of the daily score sums divided by their standard deviation.

    Each row's score is ``(2 * pred_proba - 1) * clip(num_target, -1, 1) *
    universe``. Scores are summed per value of ``date`` and the function
    returns ``mean(daily sums) / std(daily sums)``.
    """
    confidence = pred_proba * 2 - 1
    clipped_target = num_target.clip(-1, 1)  # get rid of outliers
    scored = pd.DataFrame({'day': date, 'x': confidence * clipped_target * universe})
    x_t = scored.groupby('day').sum().values
    return np.mean(x_t) / np.std(x_t)

In [None]:
# Pull the date and the targets out of df BEFORE the drop below removes
# those columns; the statement order in this cell matters.
date = df.date
# Numeric target: the raw 10-day market-residualized return.
num_target = df.returnsOpenNextMktres10.astype('float32')
# Binary target: 1 when the return is >= 0, else 0.
bin_target = (df.returnsOpenNextMktres10 >= 0).astype('int8')
# universe = df.universe.astype('int8')
# Drop columns that are not features
df.drop([
    'returnsOpenNextMktres10', 
    'date', 
    'universe', 
    'assetCode', 
    'assetName', 
    'time'], 
        axis=1, inplace=True)
# df is mutated and rebound here, so re-running this cell on its own fails
# (the dropped columns no longer exist).
df = df.astype('float32')  # Set all remaining columns to float32 datatype
gc.collect()

In [None]:
train_index, test_index = train_test_split(df.index.values, test_size=0.1, shuffle=False)

**Baseline Model - ExtraTreesClassifier**

In [None]:
def plot_model_performance(y, y_hat):
    """Print R2, MAE, MSE and RMSE of the predictions ``y_hat`` against ``y``.

    Despite the name, nothing is plotted; the four scores are printed.
    """
    labelled_metrics = (
        ("R2 score: ", r2_score),
        ('Mean Absolute Error:', mean_absolute_error),
        ('Mean Squared Error:', mean_squared_error),
    )
    for label, metric in labelled_metrics:
        value = metric(y, y_hat)
        print(label, value)
    # `value` now holds the last metric computed, i.e. the mean squared error.
    print('Root Mean Squared Error:', np.sqrt(value))

In [None]:
def do_baseline_modeling(inp_features, target_feature, train_index, test_index):
    """Fit an ExtraTreesClassifier baseline and report held-out performance.

    The forest is refit with 1, 2, ... up to 9 trees; the loop stops as soon
    as adding a tree improves the out-of-bag score by less than 0.05.

    Parameters
    ----------
    inp_features : pd.DataFrame
        Feature matrix.
    target_feature : pd.Series
        Labels, positionally aligned with ``inp_features``.
    train_index, test_index : array-like of int
        Positional row indices of the training and held-out rows.

    Returns
    -------
    ExtraTreesClassifier
        The model from the last fit performed.
    """
    model = ExtraTreesClassifier(bootstrap=True, oob_score=True,
                                    n_estimators=10, class_weight="balanced_subsample")
    prev_oob_score = 0.0
    for i in range(1, 10):
        gc.collect()
        model.set_params(n_estimators=i)
        model.fit(inp_features.iloc[train_index], target_feature.iloc[train_index])
        print(model.oob_score_)
        # Stop once an extra tree no longer buys a meaningful OOB gain.
        if i > 1 and (model.oob_score_ - prev_oob_score < 0.05):
            gc.collect()
            break
        prev_oob_score = model.oob_score_
        gc.collect()
    y_hat = model.predict(inp_features.iloc[test_index])
    # Score against the target that was passed in; the previous version read
    # the notebook-level `bin_target` here and ignored `target_feature`.
    plot_model_performance(target_feature.iloc[test_index], y_hat)
    return model

baseline_model = do_baseline_modeling(df, bin_target, 
                                    train_index, test_index)
imp_features = baseline_model.feature_importances_
print(imp_features)

In [None]:
def drop_unimp_features(model, data, top_n=20):
    """Keep the ``top_n`` most important features and drop the rest in place.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_`` in the same order
        as ``data.columns``.
    data : pd.DataFrame
        Feature frame; the unimportant columns are removed from it in place.
    top_n : int, default 20
        Number of features to keep. When ``data`` has ``top_n`` columns or
        fewer, nothing is dropped. (The previous negative-slice arithmetic
        dropped columns when there were between 11 and 19 of them.)

    Returns
    -------
    pd.Index
        Names of the dropped columns.
    """
    order = np.argsort(model.feature_importances_)  # ascending importance
    n_keep = min(max(int(top_n), 0), len(order))
    n_drop = len(order) - n_keep

    ft_indices = order[n_drop:]
    print(ft_indices)
    print("Top n features", data.columns[ft_indices])
    cols_to_ignore = data.columns[order[:n_drop]]
    print("Columns to ignore", cols_to_ignore)
    data.drop(cols_to_ignore, axis=1, inplace=True)
    return cols_to_ignore

**LightGBM - Get Best Params**

In [None]:
def evaluate_model(df, target, train_index, test_index, params):
    """Fit an LGBMClassifier on the training rows and return held-out log loss.

    ``params`` is updated in place with ``n_jobs=2`` before being passed to
    the classifier, so the caller's dict carries that key afterwards.
    """
    params['n_jobs'] = 2  # Use 2 cores/threads
    classifier = LGBMClassifier(**params)
    classifier.fit(df.iloc[train_index], target.iloc[train_index])
    held_out_proba = classifier.predict_proba(df.iloc[test_index])
    return log_loss(target.iloc[test_index], held_out_proba)

In [None]:
# Candidate values per hyper-parameter; one value is drawn at random from
# each list on every iteration. With a single value per list, every draw is
# the same configuration.
param_grid = {
    'learning_rate': [0.01],
    'num_leaves': [30],
    'n_estimators': [200],
    'min_child_samples': [20],
    'colsample_bytree': [0.8],
    'subsample': [1.0],
    'reg_alpha': [0.8],
    'reg_lambda': [0.4],
}
N_SEARCH_ITERATIONS = 2  # number of random draws to evaluate
# Start from +inf rather than using 0 as a "no score yet" sentinel, so a
# genuine score of 0 is not mistaken for "unset".
best_eval_score = float('inf')
best_params = None
for i in range(N_SEARCH_ITERATIONS):
    print("Iteration: ", i)
    params = {k: np.random.choice(v) for k, v in param_grid.items()}
    score = evaluate_model(df, bin_target, train_index, test_index, params)
    if score < best_eval_score:  # lower log loss is better
        best_eval_score = score
        best_params = params
        print("Current Best Param", best_params)
print("Best evaluation logloss", best_eval_score)
print("Best Params", best_params)

**Train Predictions**

In [None]:
# Train on the training split and evaluate on the held-out split, for
# comparison against the baseline model.
gc.collect()
clf = LGBMClassifier(**best_params)
clf.fit(df.iloc[train_index], bin_target.iloc[train_index])
# train_index/test_index are positional, so use .iloc for the target as well
# (consistent with how the features are sliced).
plot_model_performance(bin_target.iloc[test_index], clf.predict(df.iloc[test_index]))

In [None]:
gc.collect()
d_train = lgb.Dataset(df.iloc[train_index], label=bin_target.iloc[train_index])
# The validation set references the training set so both share one binning.
d_test = lgb.Dataset(df.iloc[test_index], label=bin_target.iloc[test_index],
                     reference=d_train)
watchlist = [d_test]
# lgb.train does not infer the task from the labels; request binary
# classification explicitly so predict() returns P(label == 1).
booster_params = dict(best_params, objective='binary')
# The third positional argument of lgb.train is num_boost_round, so the
# validation sets must be passed by keyword.
# NOTE(review): verbose_eval is kept from the original call; newer LightGBM
# releases replace it with a logging callback - confirm against the installed version.
model = lgb.train(booster_params, d_train, valid_sets=watchlist, verbose_eval=1)
plot_model_performance(bin_target.iloc[test_index], model.predict(df.iloc[test_index]))

In [None]:
# Train the final model on the full data set.
# The scikit-learn wrapper is used (rather than lgb.train) because the
# submission cell calls clf.predict_proba and the feature-importance cell
# reads clf.booster_; a raw Booster provides neither. The previous
# lgb.train call also passed [bin_target] in the num_boost_round position.
gc.collect()
clf = LGBMClassifier(**best_params)
clf.fit(df, bin_target)

In [None]:
# Sanity check: the submission step calls predict_proba on the final model.
# (Replaces a leftover `clf.predict_pro` fragment that raised AttributeError.)
hasattr(clf, 'predict_proba')

**Write Submission**

In [None]:
days = env.get_prediction_days()

In [None]:
def write_submission(model, env, days, cols_to_ignore=None):
    """Predict a confidence value for every prediction day and write the file.

    For each day the news frame is run through the same preprocessing,
    unstacking and aggregation functions used for training, joined onto the
    market frame, and scored with ``model.predict_proba``. The class-1
    probability p is mapped to a confidence of ``2 * p - 1``.

    Parameters
    ----------
    model : classifier exposing ``predict_proba``
    env : competition environment providing ``predict`` and
        ``write_submission_file``
    days : iterable of (market_obs_df, news_obs_df, predictions_template_df)
    cols_to_ignore : iterable of str, optional
        Feature columns removed before training (for example the return
        value of ``drop_unimp_features``). When omitted, a notebook-level
        ``cols_to_ignore`` is used if one exists, otherwise nothing extra is
        excluded. The previous version read that global unconditionally and
        raised NameError when it had never been defined.
    """
    if cols_to_ignore is None:
        cols_to_ignore = globals().get('cols_to_ignore', [])
    # Drop cols that are not features
    dropped_features = list(cols_to_ignore) + ['universe', 'date',
                                               'assetCode', 'assetName', 'time']

    for (market_obs_df, news_obs_df, predictions_template_df) in days:
        # NOTE(review): preprocess_news factorizes categoricals per call, so
        # the integer codes here need not match the training codes - confirm
        # whether that matters for the features the model uses.
        news_obs_df = preprocess_news(news_obs_df)
        # Unstack news
        index_df = unstack_asset_codes(news_obs_df)
        news_unstack = merge_news_on_index(news_obs_df, index_df)
        # Group and get aggregations (mean)
        news_obs_agg = group_news(news_unstack)

        # Join market and news frames
        market_obs_df['date'] = market_obs_df.time.dt.date
        obs_df = market_obs_df.merge(news_obs_agg, how='left', on=['assetCode', 'date'])
        del market_obs_df, news_obs_agg, news_obs_df, news_unstack, index_df
        gc.collect()
        # Only score assets that appear in the prediction template.
        obs_df = obs_df[obs_df.assetCode.isin(predictions_template_df.assetCode)]
        feats = [c for c in obs_df.columns if c not in dropped_features]

        preds = model.predict_proba(obs_df[feats])[:, 1] * 2 - 1
        sub = pd.DataFrame({'assetCode': obs_df['assetCode'], 'confidence': preds})
        # Template assets without a prediction get a confidence of 0.
        predictions_template_df = predictions_template_df.merge(
            sub, how='left', on='assetCode').drop(
            'confidenceValue', axis=1).fillna(0).rename(columns={'confidence':'confidenceValue'})

        env.predict(predictions_template_df)
        del obs_df, predictions_template_df, preds, sub
        gc.collect()
    env.write_submission_file()

if 'days' not in globals():
    days = env.get_prediction_days()   
write_submission(clf, env, days)

**Feature Importance**

In [None]:
# Works whether clf is the scikit-learn wrapper (which exposes .booster_)
# or a raw Booster.
booster = getattr(clf, 'booster_', clf)
feat_importance = pd.DataFrame({
    "feature": df.columns,
    "gain": booster.feature_importance(importance_type='gain'),
}).sort_values(by='gain', ascending=False)

fig, ax = plt.subplots(figsize=(8, 10))
sns.barplot(y="feature", x="gain", data=feat_importance, ax=ax)
ax.set_title("LightGBM feature importance (gain)");