In [10]:
import numpy as np
import pandas as pd
import gc

In [11]:
import tensorflow as tf
tf.__version__

'2.13.0-rc1'

In [12]:
input_folder = './predict-student-performance-from-game-play/'

In [13]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [14]:
# Separate the input features and target variables
categorical_ft = ["event_name", "name", "page", "fqid", "room_fqid", "text_fqid"]
text_ft = ["text"]
numerical_ft = ["elapsed_time", "level", "room_coor_x", "room_coor_y", "screen_coor_x", "screen_coor_y", "hover_duration", "fullscreen", "hq", "music"]

In [15]:
df = pd.read_csv(f"{input_folder}/train.csv", nrows=1)
categorical_ft_idx = {col: df.columns.get_loc(col) for col in categorical_ft}
text_ft_idx = {col: df.columns.get_loc(col) for col in text_ft}
numerical_ft_idx = {col: df.columns.get_loc(col) for col in numerical_ft}
del df
gc.collect()

859

### Read data in chunks - by group

In [16]:
from multiprocessing import Pool, cpu_count
from itertools import repeat
from functools import partial
import os

def get_chunk_size(path):
    import psutil
    svmem = psutil.virtual_memory()
    df_sample = pd.read_csv(path, nrows=10)
    df_sample_size = df_sample.memory_usage(index=True).sum()
    # we divide by 10 because we have selected 10 lines in our df_sample
    my_chunk = (1000000000 / df_sample_size)/10
    my_chunk = int(my_chunk//1) # we get the integer part
    print(f"Chunk size: {my_chunk}")
    return my_chunk

def read_data(dataset, group):

    # Check if filtered_df exists and load it if available
    file_name = f'filtered_df_{dataset}_{group}.pkl'
    if os.path.exists(file_name):
        filtered_df = pd.read_pickle(file_name)
    else:
        path = f'{input_folder}/{dataset}.csv'
        chunk_size=get_chunk_size(path)
        # Use pandas to read the CSV file and apply the filter function
        df_chunks = pd.read_csv(path, chunksize=chunk_size)
        filtered_chunks = [chunk[chunk['level_group']==group] for chunk in df_chunks]
        # Concatenate the filtered rows into a single DataFrame
        filtered_df = reduce_mem_usage(pd.concat(filtered_chunks))
        # additional operations
        filtered_df['text'] = filtered_df['text'].astype(str)
        filtered_df[numerical_ft] = filtered_df[numerical_ft].astype(np.float32).fillna(0)
        # Store the filtered_df for future use
        filtered_df.to_pickle(file_name)
    return filtered_df

### Create LabelEncoder and Scaler

In [17]:
import os
import joblib
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler

# Check if saved transformers exist
tokenizer_path = "tokenizer.pkl"
label_encoder_path = "label_encoder.pkl"
scaler_path = "scaler.pkl"
max_time_steps_path = "max_time_steps.pkl"

# Check if tokenizer was already trained and saved
if os.path.exists(tokenizer_path):
    tokenizer = joblib.load(tokenizer_path)
else:
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(pd.read_csv(f"{input_folder}/train.csv", usecols=[text_ft_idx['text']]))
    joblib.dump(tokenizer, tokenizer_path)
    print("Tokenizer trained and saved.")

# Check if label encoder was already trained and saved
if os.path.exists(label_encoder_path):
    label_encoder = joblib.load(label_encoder_path)
else:
    label_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=299).fit(pd.read_csv(f"{input_folder}/train.csv", usecols=[idx for idx in categorical_ft_idx.values()]))
    joblib.dump(label_encoder, label_encoder_path)
    print("Label encoder trained and saved.")

# Check if scaler was already trained and saved
if os.path.exists(scaler_path):
    scaler = joblib.load(scaler_path)
else:
    scaler = MinMaxScaler().fit(pd.read_csv(f"{input_folder}/train.csv", usecols=[idx for idx in numerical_ft_idx.values()]))
    joblib.dump(scaler, scaler_path)
    print("Scaler trained and saved.")

# Check if max time steps dictionary was already calculated and saved
# We calculate here the max time steps to use in padding sequence
if os.path.exists(max_time_steps_path):
    max_time_steps = joblib.load(max_time_steps_path)
else:
    df = pd.read_csv(f"{input_folder}/train.csv", nrows=1)
    a, b = df.columns.get_loc('level_group'), df.columns.get_loc('index')
    df = pd.read_csv(f"{input_folder}/train.csv", usecols=[a, b])
    max_time_steps = df.groupby(['level_group'])['index'].describe()['75%'].to_dict()
    max_time_steps = {k: int(min(v, 300)) for k, v in max_time_steps.items()}
    joblib.dump(max_time_steps, max_time_steps_path)
    print("Max time steps calculated and saved.")



Tokenizer trained and saved.
Label encoder trained and saved.
Scaler trained and saved.
Max time steps calculated and saved.


### Format the target

In [18]:
targets = pd.read_csv(f'{input_folder}/train_labels.csv')
targets['session'] = targets.session_id.apply(lambda x: int(x.split('_')[0]) )
targets['q'] = targets.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )
targets['correct'] = targets['correct'].astype(np.float32)
targets["level_group"] = "13-22"
targets.loc[(targets.q>3) & (targets.q<=11), "level_group"] = "5-12"
targets.loc[targets.q<=3, "level_group"] = "0-4"
print( targets.shape )
targets.head()

(424116, 5)


Unnamed: 0,session_id,correct,session,q,level_group
0,20090312431273200_q1,1.0,20090312431273200,1,0-4
1,20090312433251036_q1,0.0,20090312433251036,1,0-4
2,20090312455206810_q1,1.0,20090312455206810,1,0-4
3,20090313091715820_q1,0.0,20090313091715820,1,0-4
4,20090313571836404_q1,1.0,20090313571836404,1,0-4


In [19]:
grouped_targets = targets.groupby(['session', 'level_group'])["correct"].apply(lambda x: x.values.tolist()).reset_index()
grouped_targets.rename({"session":"session_id"}, axis=1, inplace=True)

del targets
gc.collect()

0

In [20]:
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense, Embedding, Concatenate, GlobalAveragePooling1D, Masking, Conv1D, GlobalMaxPooling1D, Lambda, TimeDistributed, BatchNormalization, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences


### Data preparation functions

In [21]:
import tensorflow as tf
try:
    from keras_preprocessing.sequence import pad_sequences
except:
    from tensorflow.keras.utils import pad_sequences

def process_cat_ft(x, training=True):
    # Apply label encoding to categorical variables
    for column in x.columns:
        x[column] = label_encoders[column].transform(np.array(x[column].values).reshape(-1,1))
    return x

def padding_sequences(data, idx, cols, padding_len=None):
    input = [np.stack(seq) for seq in data.iloc[idx][cols].values]
    if padding_len:
        return pad_sequences(input, maxlen=padding_len)
    return input

def prep_data(data, group, grouped_targets=None, training=True):

    # Tokenize text field
    #if training:
    #    tokenizer.fit_on_texts(data['text'])
    X_text_sequences = tokenizer.texts_to_sequences(data['text'])
    max_text_length = max(len(seq) for seq in X_text_sequences)
    data['text_tkn'] = pad_sequences(X_text_sequences, maxlen=max_text_length).tolist()
    print("Done tokenizing =>", end=' ')

    # min/max scaler
    data[numerical_ft] = scaler.transform(data[numerical_ft])
    data[categorical_ft] = label_encoder.transform(data[categorical_ft])
    if training:
        # Get the unique session IDs
        unique_session_ids = data.session_id.unique()

        # Calculate the number of session IDs to sample
        sample_size = int(0.8 * len(unique_session_ids))

        # Sample without replacement
        train_session_ids = data[data.session_id.isin(training_ids)].session_id.unique()

    print("Done scaling numericals =>", end=' ')
    num_group = data.groupby(['session_id', 'level_group'])[numerical_ft].apply(lambda x: x.values.tolist()).reset_index()
    session_ids = num_group.session_id
    numerical_result = num_group.rename({0: "numerical_ft"}, axis=1)
    categorical_result = data.groupby(['session_id', 'level_group'])[categorical_ft].apply(lambda x: x.values.tolist()).reset_index().rename({0: "categorical_ft"}, axis=1)
    text_result = data.groupby(['session_id', 'level_group'])['text_tkn'].apply(lambda x: x.values.tolist()).reset_index().rename({0: "text_ft"}, axis=1).dropna()
    
    print("Done aggregating =>", end=' ')

    if training: 
        idx = categorical_result[categorical_result.session_id.isin(train_session_ids)].index
        val_idx = categorical_result[~categorical_result.session_id.isin(train_session_ids)].index
        
        y_target = numerical_result.merge(grouped_targets, on=['session_id', 'level_group'], how='left')['correct']
        y = np.array([np.stack(_y) for _y in y_target.iloc[idx].values])

        categorical_input_val = padding_sequences(data=categorical_result, idx=val_idx, cols='categorical_ft', padding_len=max_time_steps[group])
        text_input_val = padding_sequences(data=text_result, idx=val_idx, cols='text_tkn', padding_len=max_time_steps[group])
        numerical_input_val = padding_sequences(data=numerical_result, idx=val_idx, cols='numerical_ft', padding_len=max_time_steps[group])
        y_val = np.array([np.stack(_y) for _y in y_target.iloc[val_idx].values])
    else:
        idx = categorical_result.index
        categorical_input_val = None
        text_input_val=None
        numerical_input_val=None
        y_val=None
        y=None

    # Convert input data to numpy arrays
    categorical_input = padding_sequences(data=categorical_result, idx=idx, cols='categorical_ft', padding_len=max_time_steps[group])
    text_input = padding_sequences(data=text_result, idx=idx, cols='text_tkn', padding_len=max_time_steps[group])
    numerical_input = padding_sequences(data=numerical_result, idx=idx, cols='numerical_ft', padding_len=max_time_steps[group])
    
    print("Done padding.")

    return session_ids, ([categorical_input, text_input, numerical_input], y)\
        , ([categorical_input_val, text_input_val, numerical_input_val], y_val)

### TFT Model - Keras

In [22]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.layers import MultiHeadAttention
from tensorflow.keras.models import load_model

def create_tf_model(max_time_steps, num_categories, max_text_length, num_time_series, nb_questions):
    # Define input layers
    categorical_input = Input(shape=(max_time_steps, num_categories), name='categorical_input')
    text_input = Input(shape=(max_time_steps, max_text_length), name='text_input')
    time_series_input = Input(shape=(max_time_steps, num_time_series), name='time_series_input')
    
    # Concatenate all inputs
    input_concat = Concatenate()([categorical_input, text_input, time_series_input])
    
    # Temporal fusion transformer encoding layers
    num_layers = 4  # Number of encoding layers
    num_heads = 4  # Number of attention heads
    hidden_units = num_categories + max_text_length + num_time_series  # Number of units in the hidden layer
    
    x = input_concat
    for _ in range(num_layers):
        # Self-attention layer
        attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=hidden_units)(x, x)
        attn_output = BatchNormalization()(attn_output)
        attn_output = Dropout(0.2)(attn_output)
        x = x + attn_output
    
        # Feed-forward network
        ff_output = Dense(hidden_units, activation='relu')(x)
        ff_output = BatchNormalization()(ff_output)
        ff_output = Dropout(0.2)(ff_output)
        x = x + ff_output

    from tensorflow.keras import initializers
    
    x = Flatten()(x)
    # Fully connected layers
    x = Dense(128, activation='relu', kernel_initializer=initializers.RandomNormal(stddev=0.1))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    
    x = Dense(64, activation='relu', kernel_initializer=initializers.RandomNormal(stddev=0.1))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    
    x = Dense(32, activation='relu', kernel_initializer=initializers.RandomNormal(stddev=0.1))(x)
    x = BatchNormalization()(x)
    x = Dropout(0.3)(x)
    
    # Output layer
    output = Dense(nb_questions, activation='sigmoid')(x)


    # Create the model
    model = Model(inputs=[categorical_input, text_input, time_series_input], outputs=output)

    return model


In [23]:
import tensorflow.keras.backend as K

try:
    from tensorflow.keras.optimizers.legacy import Adam
except:
    from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping
import copy 

def get_f1_score(num_classes):
    try:
        from tensorflow.keras.metrics import F1Score
        f1_score = F1Score(average='macro')
    except:
        from tensorflow_addons.metrics import F1Score
        f1_score = F1Score(num_classes=num_classes, average='macro')
    return f1_score

# Learning rate scheduler
def lr_scheduler(epoch, lr):
    decay_rate = 0.1
    decay_step = 5
    if epoch % decay_step == 0 and epoch > 0:
        return lr * decay_rate
    return lr

def focal_loss(gamma=2.0, alpha=0.25):
    def loss_function(y_true, y_pred):
        # Cast y_true to the same data type as y_pred
        y_true = K.cast(y_true, K.dtype(y_pred))
        # Calculate the focal loss for each element in the output vector
        epsilon = K.epsilon()
        y_pred = K.clip(y_pred, epsilon, 1.0 - epsilon)
        cross_entropy = -y_true * K.log(y_pred)
        loss = alpha * K.pow(1 - y_pred, gamma) * cross_entropy

        # Calculate the average loss across all elements
        loss = K.mean(loss, axis=-1)

        return loss
    
    return loss_function

def jaccard_loss():
    def loss_function(y_true, y_pred):
        intersection = K.sum(y_true * y_pred, axis=-1)
        union = K.sum(y_true + y_pred, axis=-1) - intersection
        jaccard = (intersection + 1e-6) / (union + 1e-6)
        loss = 1 - jaccard
        return loss
    return loss_function

def find_optimal_threshold(y_true, y_pred):
    from sklearn.metrics import f1_score
    best_threshold = 0.0
    best_f1_score = 0.0
    
    for threshold in np.arange(0.0, 1.01, 0.01):
        y_pred_binary = (y_pred > threshold).astype(int)
        f1 = f1_score(y_true, y_pred_binary, average='macro')
        
        if f1 > best_f1_score:
            best_f1_score = f1
            best_threshold = threshold
    
    return best_threshold

In [24]:
def training_round(X, y, X_val, y_val, **kwargs):
    if kwargs['iteration'] == 0:
        # Create the model
        model = create_tf_model(max_time_steps=kwargs['max_time_steps'], 
                                num_time_series=kwargs['num_time_series'], 
                                num_categories=kwargs['num_categories'], 
                                nb_questions=kwargs['nb_questions'], 
                                max_text_length=kwargs['max_text_length'])
        learning_rate = 0.0001  # Initial learning rate
    else:
        model = load_model(kwargs['model_path'], custom_objects={'F1Score': get_f1_score(kwargs['nb_questions'])})
        previous_optimizer = model.optimizer
        learning_rate = K.get_value(previous_optimizer.lr)  # Get the learning rate from the previous optimizer

    total_params = model.count_params()
    print("Total number of model parameters:", total_params)

    # Compile and train the model
    optimizer = Adam(learning_rate=learning_rate)  # Use the same learning rate
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[kwargs['eval_metric']])
    lr_callback = LearningRateScheduler(lr_scheduler)
    model.fit(X, y,
              batch_size=128, 
              epochs=5,
              validation_data=(X_val, y_val),
              callbacks=[lr_callback, EarlyStopping(patience=10)])
    model.save(kwargs['model_path'])
    return model


In [25]:
import random
session_ids = list(grouped_targets.session_id.unique())
sample_size = int(len(session_ids) * 0.85)
training_ids = random.sample(session_ids, sample_size)
val_ids = grouped_targets[~grouped_targets.session_id.isin(training_ids)].session_id.unique()
"Training ids:", len(training_ids), " Validation ids: ", len(val_ids)

('Training ids:', 20027, ' Validation ids: ', 3535)

In [26]:
train = True
    
models = {group: f"model_{group}.h5" for group in ['0-4', '5-12', '13-22']}
thresholds = {group: 0.5 for group in ['0-4', '5-12', '13-22']}
training_session_perc = 0.2
val_size = 1000
for group in ['0-4', '5-12', '13-22']:
    if not train:
        break
    print(f"======= Training {group} ===========")
    print("Reading group:",group, end=' ')
    for i in range(4):
        sub_data = read_data('train', group=group)
        sessions=np.concatenate((\
                                 np.random.choice(training_ids, int(len(training_ids)*training_session_perc)),
                                 random.sample(list(val_ids), val_size)
                                ))
        sub_data = sub_data[sub_data.session_id.isin(sessions)]
        print(sub_data.shape)
        _, ([categorical_input, text_input, numerical_input], y)\
        ,([categorical_input_val, text_input_val, numerical_input_val], y_val) = \
        prep_data(sub_data, group, grouped_targets, training=True)
        # Define the number of time series, categorical fields, and classes
        num_time_series = len(numerical_ft)
        num_categories = len(categorical_ft)
        nb_questions = y.shape[1]  # Assuming there are 18 questions per level group
        vocab_size = len(tokenizer.word_index) + 1
        max_text_length = text_input.shape[-1]
        f1_score = get_f1_score(nb_questions)
        params = {
            "max_time_steps": max_time_steps[group],
            "num_time_series" : num_time_series,
            "nb_questions": nb_questions,
            "max_text_length":max_text_length,
            "num_categories":num_categories,
            "eval_metric": f1_score,
            "iteration": i,
            "model_path": f"model_{group}.h5"
        }
        model = training_round(
            X=[categorical_input, text_input, numerical_input],
            y=y,
            X_val=[categorical_input_val, text_input_val, numerical_input_val],
            y_val=y_val,
            **params
        )
        
    models[group] = f"model_{group}.h5"
    print(f"======= Predicting {group} ===========")
    # Obtain predictions on the validation set
    y_pred_val = model.predict([categorical_input_val, text_input_val, numerical_input_val])

    # Find the optimal threshold
    optimal_threshold = find_optimal_threshold(y_val, y_pred_val)
    thresholds[group] = optimal_threshold
    print("Optimal threshold: ", optimal_threshold)
    data = read_data('test', group=group)
    sessions_id, ([categorical_input, text_input, numerical_input], y)\
    ,([_, _, _], _) = prep_data(data, group, None, training=False)
    y_pred_val = model.predict([categorical_input_val, text_input_val, numerical_input_val])
    y_pred_binary = (y_pred_val > optimal_threshold).astype(int)

    result = pd.DataFrame(zip(sessions_id, y_pred_binary), columns=['session_id', 'correct'])
    result = result.explode('correct', ignore_index=False)
    result['session_id'] = result['session_id'].apply(lambda x: str(x)+ '_q') + (result.groupby('session_id').cumcount()+1).astype(str)
    result.to_csv(f"submission_{group}.csv", index=False)    

Reading group: 0-4 (785855, 20)
Done tokenizing => Done scaling numericals => Done aggregating => 

  trunc = np.asarray(trunc, dtype=dtype)


Done padding.


2023-06-21 15:43:09.735890: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Max
2023-06-21 15:43:09.735910: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 32.00 GB
2023-06-21 15:43:09.735918: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 10.67 GB
2023-06-21 15:43:09.735949: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-06-21 15:43:09.735964: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Total number of model parameters: 294467
Epoch 1/5


2023-06-21 15:43:11.410239: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-06-21 15:43:24.658183: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


(784332, 20)
Done tokenizing => Done scaling numericals => Done aggregating => 

  trunc = np.asarray(trunc, dtype=dtype)


Done padding.
Total number of model parameters: 294467
Epoch 1/5


2023-06-21 15:44:17.275007: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-06-21 15:44:32.935195: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


(773312, 20)
Done tokenizing => Done scaling numericals => Done aggregating => 

  trunc = np.asarray(trunc, dtype=dtype)


Done padding.
Total number of model parameters: 294467
Epoch 1/5


2023-06-21 15:45:27.275277: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-06-21 15:45:43.729934: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


(773097, 20)
Done tokenizing => Done scaling numericals => Done aggregating => 

  trunc = np.asarray(trunc, dtype=dtype)


Done padding.
Total number of model parameters: 294467
Epoch 1/5


2023-06-21 15:46:38.215383: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-06-21 15:46:57.350534: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(
2023-06-21 15:47:43.792597: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Optimal threshold:  0.31
Done tokenizing => Done scaling numericals => Done aggregating => Done padding.
 2/32 [>.............................] - ETA: 1s

  trunc = np.asarray(trunc, dtype=dtype)


Reading group: 5-12 (1742725, 20)
Done tokenizing => Done scaling numericals => Done aggregating => 

  trunc = np.asarray(trunc, dtype=dtype)


Done padding.
Total number of model parameters: 644840
Epoch 1/5


2023-06-21 15:48:05.779071: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-06-21 15:48:49.098393: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


(1750852, 20)
Done tokenizing => Done scaling numericals => Done aggregating => 

  trunc = np.asarray(trunc, dtype=dtype)


Done padding.
Total number of model parameters: 644840
Epoch 1/5


2023-06-21 15:51:44.027932: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-06-21 15:52:30.817569: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


(1741033, 20)
Done tokenizing => Done scaling numericals => Done aggregating => 

  trunc = np.asarray(trunc, dtype=dtype)


Done padding.
Total number of model parameters: 644840
Epoch 1/5


2023-06-21 15:55:23.666169: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-06-21 15:56:10.876817: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


(1734770, 20)
Done tokenizing => Done scaling numericals => Done aggregating => 

  trunc = np.asarray(trunc, dtype=dtype)


Done padding.
Total number of model parameters: 644840
Epoch 1/5


2023-06-21 15:59:05.504959: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-06-21 15:59:53.802131: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(
2023-06-21 16:02:30.902901: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Optimal threshold:  0.19
Done tokenizing => Done scaling numericals => Done aggregating => Done padding.


  trunc = np.asarray(trunc, dtype=dtype)


Reading group: 13-22 (2648801, 20)
Done tokenizing => Done scaling numericals => Done aggregating => 

  trunc = np.asarray(trunc, dtype=dtype)


Done padding.
Total number of model parameters: 644807
Epoch 1/5


2023-06-21 16:03:07.195382: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-06-21 16:03:55.113054: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5

In [None]:
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [None]:
import pandas as pd

counter = 0
# The API will deliver two dataframes in this specific order,
# for every session+level grouping (one group per session for each checkpoint)
for (test, sample_submission) in iter_test:
    test['text'] = test['text'].astype(str)
    test[numerical_ft] = test[numerical_ft].fillna(0)
    if counter == 0:
        print(test.shape)
        print(sample_submission.head())
        
    list_df = []
    for group, model_path in models.items():
        if test[test.level_group==group].shape[0] == 0:
            continue
        print(f"==== Submission {group} - Model {model_path}")
        #print(test[test.level_group==group].head())
        sessions_id, ([categorical_input, text_input, numerical_input], y)\
        ,([_, _, _], _) = prep_data(test[test.level_group==group].copy(), group, None, training=False)
        ## users make predictions here using the test data
        model = load_model(model_path, custom_objects={'F1Score': get_f1_score(3)})
        y_pred_val = model.predict([categorical_input, text_input, numerical_input])
        y_pred_binary = (y_pred_val > thresholds[group]).astype(int)

        result = pd.DataFrame(zip(sessions_id, y_pred_binary), columns=['session_id', 'correct'])
        result = result.explode('correct', ignore_index=False)
        result['session_id'] = result['session_id'].apply(lambda x: str(x)+ '_q') + (result.groupby('session_id').cumcount()+1).astype(str)
        list_df.append(result)
        print(sessions_id)
    #final_result = pd.concat(list_df)
    #submission_merged=final_result.merge(sample_submission, on='session_id', how='left')
    # Replace values in column B of df1 with values from merged_df
    
    sample_submission = pd.concat(list_df)
    env.predict(sample_submission)
    counter += 1

In [None]:
## the end result is a submission file containing all test session predictions
! head submission.csv