In [None]:
import pandas as pd
import numpy as np
from sklearn import *
import lightgbm as lgb
from kaggle.competitions import twosigmanews

# Build the competition environment object, then unpack the two training
# frames it returns: market data first, news data second.
env = twosigmanews.make_env()
market_train, news_train = env.get_training_data()

In [None]:
def more_news(df):
    """Expand a news frame so that every row refers to exactly one asset code.

    Each ``assetCodes`` entry is read as the text of a brace-delimited
    collection of codes (presumably the string form of a set -- confirm
    against the data feed). An input row is repeated once per code it lists,
    with all other columns carried over unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        News rows containing an ``assetCodes`` column. The frame passed in is
        not modified.

    Returns
    -------
    pandas.DataFrame
        The non-``assetCodes`` columns in their original order, followed by
        ``assetCodes_count`` (0-based position of the code inside its row's
        collection, in textual order) and ``assetCodes`` (the single code).
        Rows whose collection is empty contribute no output rows.

    Raises
    ------
    ValueError, SyntaxError
        If an ``assetCodes`` entry is not a plain literal collection
        (for example a missing value rendered as ``'nan'``).
    """
    import ast  # function-scope import so the block does not depend on the import cell

    def _parse_codes(text):
        # Braces become brackets so the codes come back as a list in textual
        # order. literal_eval accepts literals only, so expressions embedded
        # in the text are rejected instead of executed (as eval would do).
        return ast.literal_eval(text.replace('{', '[').replace('}', ']'))

    news_col = [c for c in df.columns if c not in ['assetCodes']]

    # Parse into a local list rather than writing back into df, so the
    # caller's frame (possibly a slice of a larger frame) stays untouched.
    code_lists = [_parse_codes(text) for text in df['assetCodes'].astype(str)]
    lengths = np.fromiter((len(codes) for codes in code_lists),
                          dtype=np.int64, count=len(code_lists))

    # Repeat each source row once per code, addressing rows by position so
    # duplicate index labels in df cannot cause trouble.
    row_positions = np.repeat(np.arange(len(df)), lengths)
    out = df[news_col].iloc[row_positions].reset_index(drop=True)

    out['assetCodes_count'] = np.array(
        [pos for codes in code_lists for pos in range(len(codes))], dtype=np.int64)
    out['assetCodes'] = np.array(
        [code for codes in code_lists for code in codes], dtype=object)
    return out

# News frame: keep only the hour of the source timestamp, and reduce the
# creation timestamp to its calendar date rendered as a string.
news_train['sourceTimestamp'] = news_train['sourceTimestamp'].dt.hour
news_train['firstCreated'] = pd.to_datetime(news_train['firstCreated']).dt.date.astype(str)

# Market frame: same date-string reduction, so both frames can later be
# matched on (date, asset code).
market_train['time'] = pd.to_datetime(market_train['time']).dt.date.astype(str)

# Integer id per distinct asset code, numbered in order of first appearance.
lbl = {code: idx for idx, code in enumerate(market_train['assetCode'].unique())}
market_train['assetCodeT'] = market_train['assetCode'].map(lbl)

In [None]:
# Keep only rows dated 2016-07-01 or later. Dates are compared as strings.
market_train = market_train.loc[market_train['time'] >= '2016-07-01'].reset_index(drop=True)
news_train = news_train.loc[news_train['firstCreated'].astype(str) >= '2016-07-01'].reset_index(drop=True)

# News columns that take part in the aggregation: everything except the
# listed text/descriptor columns.
nt_col = [
    c for c in news_train.columns
    if c not in ('assetName', 'audiences', 'headline', 'headlineTag',
                 'marketCommentary', 'provider', 'sourceId', 'subjects')
]

# One row per (news item, asset code), then sum/mean/min/max of the remaining
# columns for each (date, asset code) pair.
news_train = more_news(news_train[nt_col])
news_train = (
    news_train
    .groupby(['firstCreated', 'assetCodes'], as_index=False)
    .agg([np.sum, np.mean, np.min, np.max])
    .reset_index()
)

# Left-join the aggregated news onto the market rows by date and asset code.
market_train = market_train.merge(
    news_train,
    how='left',
    left_on=['time', 'assetCode'],
    right_on=['firstCreated', 'assetCodes'],
)

# Flatten column labels: multi-part labels are concatenated into one string,
# and plain string labels come through unchanged.
market_train.columns = list(map(''.join, market_train.columns))

# Discard every row that still has a missing value.
market_train = market_train.dropna(axis=0)

# Drop the reference to the aggregated news frame.
news_train = []

In [None]:
# Model features: every column of the merged table except identifiers, raw
# text fields, date columns, the target and the universe flag.
fcol = [
    c for c in market_train.columns
    if c not in (
        'assetCode', 'assetCodes', 'assetCodes_count', 'assetCodeT',
        'assetName', 'audiences', 'firstCreated', 'headline', 'headlineTag',
        'marketCommentary', 'provider', 'returnsOpenNextMktres10', 'sourceId',
        'subjects', 'time', 'time_x', 'universe',
    )
]

# Seeded 75% / 25% split into training (x1) and validation (x2) rows.
x1, x2 = model_selection.train_test_split(
    market_train,
    test_size=0.25,
    random_state=99,
)

def lgb_2s(preds, y):
    """Custom evaluation metric: mean over standard deviation of pred * label * weight.

    Parameters
    ----------
    preds : array-like
        Model predictions, one per row.
    y : object
        Evaluation data exposing ``get_label()`` and ``get_weight()``. A
        ``None`` weight is treated as uniform weighting.

    Returns
    -------
    tuple
        ``('2_S', score, True)``: the metric name, its value, and a flag that
        is always ``True`` (presumably LightGBM's "higher is better"
        indicator -- confirm against the feval contract).

    Notes
    -----
    The metric is best-effort. An ordinary error while computing it, or an
    undefined ratio (zero or NaN spread, empty input), yields a score of 0.0
    rather than aborting training. ``KeyboardInterrupt`` and ``SystemExit``
    are not intercepted, so a running fit can still be stopped.
    """
    try:
        labels = np.asarray(y.get_label(), dtype=float)
        raw_weights = y.get_weight()
        if raw_weights is None:
            weights = np.ones_like(labels)
        else:
            weights = np.asarray(raw_weights, dtype=float)
        xt = np.asarray(preds, dtype=float) * labels * weights
        spread = np.std(xt)
        # A zero or NaN spread makes the ratio inf/NaN, which would corrupt
        # the comparisons early stopping relies on, so use the neutral value.
        if spread > 0:
            score = float(np.mean(xt) / spread)
            if not np.isfinite(score):
                score = 0.0
        else:
            score = 0.0
    except Exception:
        score = 0.0
    return '2_S', score, True

# Booster settings for a gradient-boosted regression model.
params = dict(
    learning_rate=0.08,
    boosting='gbdt',
    objective='regression',
    seed=18,
)

# Fit on x1 and evaluate on x2 with the custom lgb_2s metric. Both datasets
# use the next-10-day market-residual return as label and the universe flag
# as row weight. Up to 500 rounds, logging every 10, with early stopping
# after 20 rounds.
lgb_model = lgb.train(
    params,
    lgb.Dataset(x1[fcol], label=x1['returnsOpenNextMktres10'], weight=x1['universe']),
    num_boost_round=500,
    valid_sets=lgb.Dataset(x2[fcol], label=x2['returnsOpenNextMktres10'], weight=x2['universe']),
    feval=lgb_2s,
    verbose_eval=10,
    early_stopping_rounds=20,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Feature importance by 'gain', sorted by ascending importance with ties
# broken by descending column name.
df = (
    pd.DataFrame({'imp': lgb_model.feature_importance(importance_type='gain'), 'col': fcol})
    .sort_values(by=['imp', 'col'], ascending=[True, False])
)

# Same table for 'split' importance. Note that this rebinds `df`, so only the
# split-based table remains available after the cell has run.
df = (
    pd.DataFrame({'imp': lgb_model.feature_importance(importance_type='split'), 'col': fcol})
    .sort_values(by=['imp', 'col'], ascending=[True, False])
)

In [None]:
# Score each batch of prediction data with the trained model and hand the
# predictions back to the environment; write the submission file at the end.
for (market_test, news_test, sub) in env.get_prediction_days():
    # Same timestamp reductions as applied to the training frames.
    market_test['time'] = pd.to_datetime(market_test['time']).dt.date.astype(str)
    news_test['sourceTimestamp'] = news_test['sourceTimestamp'].dt.hour
    news_test['firstCreated'] = pd.to_datetime(news_test['firstCreated']).dt.date.astype(str)

    # Explode and aggregate only the columns that were used when the training
    # table was built (nt_col). The text/descriptor columns never become
    # model features, so carrying them through the explode and the
    # sum/mean/min/max aggregation was wasted work and ran numeric reductions
    # over string data.
    news_test = more_news(news_test[nt_col])
    news_test = news_test.groupby(['firstCreated', 'assetCodes'], as_index=False).agg([np.sum, np.mean, np.min, np.max])

    # Left merge keeps every market row; rows without matching news get
    # missing values in the news-derived columns.
    market_test = pd.merge(market_test, news_test, how='left', left_on=['time', 'assetCode'], right_on=['firstCreated', 'assetCodes'])
    # Flatten multi-part column labels into single strings.
    market_test.columns = [''.join(c) for c in market_test.columns]

    # NOTE(review): asset codes not seen in training are mapped to 0, which is
    # also the id of the first training asset. assetCodeT is excluded from
    # fcol, so predictions are unaffected; revisit if it becomes a feature.
    market_test['assetCodeT'] = market_test['assetCode'].map(lambda x: lbl.get(x, 0))

    # The regression output, clipped to [-0.99, 0.99], is used as the
    # confidence value.
    market_test['confidenceValue'] = lgb_model.predict(market_test[fcol], num_iteration=lgb_model.best_iteration).clip(-0.99, 0.99)

    # The frame yielded by the environment as `sub` is replaced by a fresh
    # two-column frame built from the market rows.
    sub = market_test[['assetCode', 'confidenceValue']]
    env.predict(sub)
env.write_submission_file()