**Load the data from `env`**

In [None]:
from kaggle.competitions import twosigmanews

# Create the competition environment and unpack the two training frames it
# returns (market data first, news data second).
env = twosigmanews.make_env()
market_train_df, news_train_df = env.get_training_data()
print(market_train_df.shape, news_train_df.shape)

## Merge market and news

In [None]:
def normalize_market(market_df,minmax,mcols):
    """Scale selected market columns per asset and add a close/open ratio.

    Every column ``c`` in ``mcols`` is replaced by
    ``(c_max - c) / (c_max - c_min)`` using the per-asset bounds in
    ``minmax``. Note that this is the reversed min-max scale: the per-asset
    maximum maps to 0 and the minimum maps to 1.

    Args:
        market_df: frame with an ``assetCode`` column, the columns listed in
            ``mcols``, and ``open`` / ``close``. It is not modified.
        minmax: frame with an ``assetCode`` column plus ``<col>_min`` and
            ``<col>_max`` for every column in ``mcols``; expected to hold
            one row per asset code so the merge keeps the row count.
        mcols: names of the columns to scale.

    Returns:
        A copy of ``market_df`` (same index, same row order) with the scaled
        columns and an added ``close/open`` column.
    """
    # The left merge keeps the row order of market_df but gives the result a
    # fresh 0..n-1 index.
    bounds = market_df.merge(minmax, how='left', on='assetCode')
    market_norm = market_df.copy()
    for mcol in mcols:
        upper = bounds[mcol + '_max']
        lower = bounds[mcol + '_min']
        scaled = (upper - bounds[mcol]) / (upper - lower)
        # Assign by position: a label-aligned assignment would produce NaN or
        # misplaced values whenever market_df does not carry the default
        # 0..n-1 index (e.g. after filtering rows).
        market_norm[mcol] = scaled.values
    market_norm['close/open'] = market_norm['close'] / market_norm['open']
    return market_norm
def assign_code(source_codes,target_codes):
    """Return the first entry of ``source_codes`` contained in ``target_codes``.

    Entries are tried in the order ``source_codes`` yields them. The string
    ``"NA"`` is returned when none of them is found in ``target_codes``.
    """
    matches = (candidate for candidate in source_codes if candidate in target_codes)
    return next(matches, "NA")
def clean_news(news_df,codes):
    """Tag each news row with a known asset code and drop unused columns.

    Args:
        news_df: news frame holding an ``assetCodes`` column plus the columns
            listed in ``news_drop_columns`` below. It is not modified.
        codes: asset codes to match against, in priority order; the first one
            that appears in a row's ``assetCodes`` becomes that row's code.

    Returns:
        A new frame without the dropped columns and with an added ``code``
        column (``"NA"`` where no entry of ``codes`` matched).

    Raises:
        ValueError / SyntaxError: if an ``assetCodes`` value is not the text
            of a plain Python literal.
    """
    import ast  # local import so the block does not depend on file-level imports

    def first_known_code(raw_codes):
        # Each assetCodes value is the *text* of a Python collection
        # (presumably a set literal such as "{'A.N', 'B.O'}" - confirm
        # against the data). literal_eval parses literals only, whereas the
        # eval() used previously would execute any expression found in it.
        return assign_code(codes, list(ast.literal_eval(raw_codes)))

    code_column = news_df['assetCodes'].apply(first_known_code)
    news_drop_columns=['sourceTimestamp','firstCreated','sourceId','headline','provider','subjects','audiences','headlineTag','assetName']
    # drop() returns a new frame, so adding the column here leaves the
    # caller's news_df untouched.
    cleaned = news_df.drop(news_drop_columns,axis=1)
    cleaned["code"] = code_column
    return cleaned
def clean_merge(market_df,news_df,minmax,mcols):
    """Normalise market rows and left-join per-day averaged news onto them.

    Market rows are scaled with ``normalize_market``; news rows are tagged
    and trimmed with ``clean_news``, averaged per (``code``, ``date``), and
    joined to the market rows on asset code and calendar date.

    Side effect: a ``date`` column is written onto the ``news_df`` passed in.

    Args:
        market_df: market frame with ``time`` and ``assetCode`` columns.
        news_df: news frame with ``time`` and ``assetCodes`` columns.
        minmax: per-asset min/max table forwarded to ``normalize_market``.
        mcols: columns to scale, forwarded to ``normalize_market``.

    Returns:
        The merged frame: one row per market row, with the aggregated news
        columns (and the join key ``code``) appended; unmatched rows hold NaN.
    """
    market_norm = normalize_market(market_df, minmax, mcols)

    # Calendar date is the join key shared by both tables.
    news_df['date'] = news_df['time'].dt.date
    market_norm["date"] = market_norm['time'].dt.date

    known_codes = list(market_norm['assetCode'].unique())
    tagged_news = clean_news(news_df, known_codes)

    # Average every remaining news column per (code, date).
    per_day = tagged_news.drop(['assetCodes'], axis=1).groupby(['code', 'date'])
    news_agg = per_day.agg(['mean'])
    # agg(['mean']) yields two-level column names; keep only the first level.
    flat_names = list(news_agg.columns.droplevel(1))
    news_agg = news_agg.reset_index()
    news_agg.columns = ['code', 'date'] + flat_names

    return market_norm.merge(
        news_agg,
        how='left',
        left_on=['assetCode', 'date'],
        right_on=['code', 'date'],
    )

In [None]:
# Columns to scale: everything except the listed identifier/target/price
# columns and any column whose name contains "return".
mm_cols=[
    clm for clm in market_train_df.columns
    if clm not in ['time','assetCode','assetName','returnsOpenNextMktres10','universe','open','close']
    and "return" not in clm
]

# Per-asset min and max of each scaled column, with the two-level column
# names flattened to "<col>_min" / "<col>_max".
minmax=market_train_df.groupby('assetCode')[mm_cols].agg(['min','max'])
minmax.columns=["_".join(name_parts) for name_parts in minmax.columns]
minmax=minmax.reset_index()

# Integer id per asset code, numbered in order of first appearance.
lbl={code: idx for idx, code in enumerate(market_train_df['assetCode'].unique())}

merge_df=clean_merge(market_train_df,news_train_df,minmax,mm_cols)
merge_df['assetCodeT']=merge_df['assetCode'].map(lbl)

# Binary target: 1 where the 10-day market-residual return is positive.
y=1*(merge_df['returnsOpenNextMktres10']>0)

# Everything that is not an identifier, the target, a raw price or a join
# key is used as a feature.
X=merge_df.drop(
    ['time','assetCode','assetName','returnsOpenNextMktres10','universe','close','open','date','code','marketCommentary'],
    axis=1,
)

In [None]:
# Display (n_rows, n_columns) of the feature matrix as the cell output.
X.shape

In [None]:
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold-out check: fit on a random 75% of the rows and print the accuracy
# obtained on the remaining 25%.
xgb_para=dict(learning_rate=0.1,n_jobs=4,n_estimators=200,max_depth=8)
xgb=XGBClassifier(**xgb_para)
X_train,X_test,y_train,y_test=train_test_split(
    X.values,
    y.values,
    test_size=0.25,
    random_state=1001,
)
xgb.fit(X_train,y_train)
y_test_p=xgb.predict(X_test)
print(accuracy_score(y_test_p,y_test))

In [None]:
import xgboost
from xgboost import XGBClassifier
xgb_para={'learning_rate':0.1,'n_jobs':4,'n_estimators':200,'max_depth':8}
xgb = XGBClassifier(**xgb_para)
X.fillna(0)
xgb.fit(X,y)
days = env.get_prediction_days()

In [None]:
import pandas as pd
n_days = 0
prep_time = 0
prediction_time = 0
packaging_time = 0
for (market_obs_df, news_obs_df, predictions_template_df) in days:
    n_days +=1
    print(n_days,end=' ')
    #market_obs_norm=normalize_market(market_obs_df,minmax,mm_cols)
    
    market_obs_df = clean_merge(market_obs_df, news_obs_df,minmax,mm_cols)
    market_obs_df = market_obs_df[market_obs_df.assetCode.isin(predictions_template_df.assetCode)]
    market_obs_df['assetCodeT'] = market_obs_df['assetCode'].map(lbl)
    
    X_v=market_obs_df[list(X.columns)]
    X_v.fillna(0)
    lp = xgb.predict_proba(X_v)
    
    confidence = 2* lp[:,1] -1
    preds = pd.DataFrame({'assetCode':market_obs_df['assetCode'],'confidence':confidence})
    predictions_template_df = predictions_template_df.merge(preds,how='left').drop('confidenceValue',axis=1).fillna(0).rename(columns={'confidence':'confidenceValue'})
    env.predict(predictions_template_df)

In [None]:
# Ask the competition environment to write the submission file (presumably
# built from the env.predict calls made in the loop - confirm against the
# competition API).
env.write_submission_file()