In [None]:
####################################################################
## Part 0: Init
####################################################################

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Show every file that Kaggle mounted under the input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
####################################################################
## Part 1: DataPrep 1: Creating Train and Test Sets
####################################################################

from sklearn.model_selection import train_test_split

# Load the first 300k rows of the training file; rows with zero weight are
# dropped before the split.
raw_train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv',nrows=300000)
train = raw_train[raw_train['weight'] != 0].copy()

# Binary target: 1 when `resp` is strictly positive, 0 otherwise.
y = np.where(train['resp'] > 0, 1, 0)

# The full frame (including date/weight/resp) is split so those columns stay
# available for the utility score; the model inputs are selected below.
X = train

# shuffle=False keeps the original row order: the last 20% of rows form the test set.
All_TrainValid, All_Test, y_TrainValid, y_Test = train_test_split(X, y, test_size=0.2, random_state=42,shuffle=False)
print('All_TrainValid: {} All_Test: {} y_TrainValid: {} y_Test: {} '.format(All_TrainValid.shape, All_Test.shape, y_TrainValid.shape, y_Test.shape))

# Model inputs: every column whose name contains 'feature'.
features = [c for c in All_TrainValid.columns if 'feature' in c]

# Explicit copies: these frames are written to later (imputation), and writing
# into a column selection of All_TrainValid/All_Test would otherwise raise
# pandas' SettingWithCopyWarning.
X_TrainValid = All_TrainValid[features].copy()
X_Test = All_Test[features].copy()

In [None]:
####################################################################
## Part 2: DataPrep 2: Missing Imputation, Input Scaling
####################################################################

# Missing-value imputation followed by input scaling.
#
# The working frames are rebuilt from the split frames on every run, so this
# cell can be re-executed safely: X_TrainValid / X_Test end up as scaled
# ndarrays, and re-running on those would fail at `.select_dtypes`.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

train_features = All_TrainValid[features].copy()
test_features = All_Test[features].copy()

# Float columns: fill NaN with the training-set column mean.
imputerF = SimpleImputer(missing_values=np.nan,strategy='mean')
float_TrainValid = train_features.select_dtypes(include='float')
train_features.loc[:,float_TrainValid.columns] = imputerF.fit_transform(float_TrainValid)
# Impute the test set with the statistics fitted on the training set
float_Test = test_features.select_dtypes(include='float')
test_features.loc[:,float_Test.columns] = imputerF.transform(float_Test)

# Integer columns: fill with the training-set column median.
imputerI = SimpleImputer(missing_values=np.nan,strategy='median')
int_TrainValid = train_features.select_dtypes(include='int')
train_features.loc[:,int_TrainValid.columns] = imputerI.fit_transform(int_TrainValid)
# Impute the test set with the statistics fitted on the training set
int_Test = test_features.select_dtypes(include='int')
test_features.loc[:,int_Test.columns] = imputerI.transform(int_Test)

# Scaling: the scaler is fitted on the training inputs only and then applied
# to the test inputs. Both results are NumPy arrays.
sc = StandardScaler()
X_TrainValid = sc.fit_transform(train_features)
X_Test = sc.transform(test_features)

In [None]:
####################################################################
## Part 3: Model Training
####################################################################

import tensorflow as tf

import random
import os
def set_all_seeds(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's `random`, NumPy's legacy global RNG and TensorFlow's global
    RNG, and exports PYTHONHASHSEED.

    Args:
        seed: integer seed applied to all generators.
    """
    np.random.seed(seed)
    random.seed(seed)
    # TensorFlow keeps its own global seed (weight initialisation, dropout);
    # without this call model training is not reproducible.
    tf.random.set_seed(seed)
    # NOTE: hash randomisation of the running interpreter is fixed at startup,
    # so this only influences Python subprocesses launched afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)

set_all_seeds(42)

# Feed-forward binary classifier: two hidden Dense(64, relu) layers, each
# preceded by BatchNormalization + Dropout(0.2), and a sigmoid output unit.
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64,activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(64,activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1,activation='sigmoid'),
    ]
)
optimizer = 'adam'
model.compile(
    optimizer,
    loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.001),
    # `metrics` is documented as a list; newer Keras versions reject a bare
    # metric object.
    metrics=[tf.keras.metrics.Precision(name='precision')],
)
# Five batches per epoch; max(1, ...) guards against a zero batch size when
# fewer than five training rows are available.
model.fit(
    X_TrainValid,
    y_TrainValid,
    epochs=40,
    batch_size=max(1, len(X_TrainValid) // 5),
)

def utility_score_bincount(date, weight, resp, action):
    """Compute the utility score of a set of trading decisions.

    For every date i, Pi is the sum of weight * resp * action over that
    date's rows. With t = sum(Pi) / sqrt(sum(Pi**2)) * sqrt(250 / n_dates),
    the score is clip(t, 0, 6) * sum(Pi).

    Args:
        date: 1-D array of non-negative integer dates (used as bincount bins).
        weight: 1-D array of row weights, same length as `date`.
        resp: 1-D array of row responses, same length as `date`.
        action: 1-D array of 0/1 decisions, same length as `date`.

    Returns:
        The utility score. 0.0 is returned for empty input and when every
        daily total is zero (e.g. no trade taken), where the ratio would
        otherwise be 0/0 and yield NaN.
    """
    date = np.asarray(date)
    if date.size == 0:
        return 0.0
    count_i = len(np.unique(date))
    Pi = np.bincount(date, weight * resp * action)
    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # All daily totals are zero: nothing gained, nothing lost.
        return 0.0
    t = total / norm * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * total
    return u

# Print the utility score per row, first for the train/validation split and
# then for the hold-out split. A row is traded when its predicted
# probability exceeds 0.5.
for split_inputs, split_frame in ((X_TrainValid, All_TrainValid), (X_Test, All_Test)):
    y_pred = model.predict(split_inputs).reshape((-1,))
    split_actions = np.where(y_pred>0.5,1,0)
    split_utility = utility_score_bincount(
        split_frame['date'].values,
        split_frame['weight'].values,
        split_frame['resp'].values,
        split_actions,
    )
    print(split_utility/len(split_inputs))




In [None]:
####################################################################
## Part 4: Model Use for Testing
####################################################################

import janestreet
from tqdm import tqdm

env = janestreet.make_env()

# Column groups handled by each fitted imputer; fixed once outside the loop.
float_cols = float_TrainValid.columns
int_cols = int_TrainValid.columns

for (test_df, pred_df) in tqdm(env.iter_test()):
    if test_df['weight'].item() > 0:
        # Explicit copy: imputed values are written back into this frame.
        x_df = test_df.loc[:, features].copy()
        # Same preprocessing as training: impute floats, impute ints, scale.
        x_df.loc[:, float_cols] = imputerF.transform(x_df[float_cols])
        x_df.loc[:, int_cols] = imputerI.transform(x_df[int_cols])
        x_tt = sc.transform(x_df)

        # Safety net for any NaN left after imputation. In standardized space
        # 0 corresponds to the training mean. The previous fill added
        # `np.isnan(x) * f_mean`, where f_mean came from np.mean over raw
        # training data containing NaN; False * nan is nan, so it injected
        # NaN into the inputs and forced every action to 0.
        x_tt = np.nan_to_num(x_tt)

        # training=False keeps Dropout/BatchNormalization in inference mode.
        pred = np.median(model(x_tt, training=False))
        pred_df.action = 1 if pred >= 0.5 else 0
    else:
        # Zero-weight rows are never traded.
        pred_df.action = 0

    env.predict(pred_df)