In [1]:
"""
Built with TensorFlow 2.x
"""

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Layer, Input
from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import AUC
import os

# The dataset can be downloaded from: https://www.kaggle.com/datasets/mrkmakr/criteo-dataset

# Make IPython display the value of every top-level expression in a cell, not only
# the last one. Cells below rely on this to show several results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
"""
criteo dataset preprocessing

criteo data feature introduce：
- Label - Y label，「Click」value=1，「Non-Click」value=0
- I1-I13 - Totally 13 col integer data feature（most of them are counting features）
- C1-C26 - Totally 26 col catagory features，for secure reason，original data was transformed to 32 bit data
"""

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, KBinsDiscretizer
from sklearn.model_selection import train_test_split

def sparse_feature(feat, feat_num, embed_dim=4):
    """
    Describe one sparse (categorical) feature as a plain dictionary.
    :@param feat: feature name
    :@param feat_num: number of distinct values the feature can take
    :@param embed_dim: dimension of the feature embedding
    :return: dict with the keys 'feat_name', 'feat_num' and 'embed_dim'
    """
    return dict(feat_name=feat, feat_num=feat_num, embed_dim=embed_dim)


def dense_feature(feat):
    """
    Describe one dense (numerical) feature as a plain dictionary.
    :@param feat: feature name
    :return: dict with the single key 'feat_name'
    """
    descriptor = {}
    descriptor['feat_name'] = feat
    return descriptor


def create_criteo_dataset(file, embed_dim=8, read_part=True, sample_num=100000, test_size=0.2,
                          random_state=None):
    """
    Load the Criteo click data, encode every feature as an integer id and split
    the result into a training and a test set.

    Dense columns I1-I13 are bucketed into 100 uniform-width bins and categorical
    columns C1-C26 are label-encoded, so all 39 features end up as sparse ids.

    NOTE: the discretizer and the label encoders are fitted on the whole frame
    *before* the train/test split, so the test rows influence the encoding.

    :@param file: data file path (tab-separated, no header row)
    :@param embed_dim: The embedding dimension of sparse features
    :@param read_part: Read only the first `sample_num` rows (best set to True if the data set is large)
    :@param sample_num: Number of rows to read when `read_part` is True
    :@param test_size: Test set ratio
    :@param random_state: Seed forwarded to `train_test_split`; None (the default)
                          keeps the previous behaviour of a different split on every call
    :return: (feature_columns, (train_X, train_y), (test_X, test_y)) where
             feature_columns is a list of `sparse_feature` dicts, one per column of
             X, and the X / y arrays are int32
    """
    # Specify sparse and dense features
    dense_features = ['I' + str(i) for i in range(1, 14)]
    sparse_features = ['C' + str(i) for i in range(1, 27)]
    # Column order of the raw file: label, I1..I13, C1..C26
    names = ['label'] + dense_features + sparse_features
    # Column order of the model input: categorical columns first, then the integer columns
    features = sparse_features + dense_features

    # Partial read and full read. `nrows` reads just the first rows and, unlike
    # iterator=True + get_chunk(), leaves no open reader behind.
    data_df = pd.read_csv(file, sep='\t', header=None, names=names,
                          nrows=sample_num if read_part else None)

    # Missing value filling
    data_df[sparse_features] = data_df[sparse_features].fillna('nan')
    data_df[dense_features] = data_df[dense_features].fillna(0)

    # Discretization of the dense columns into ordinal bin ids
    est = KBinsDiscretizer(n_bins=100, encode='ordinal', strategy='uniform')
    data_df[dense_features] = est.fit_transform(data_df[dense_features])

    # Label-encode every categorical column independently
    for feat in sparse_features:
        data_df[feat] = LabelEncoder().fit_transform(data_df[feat])

    # Feature engineering: every column is now a discrete id, so each one gets a
    # sparse descriptor whose vocabulary size is (largest id + 1)
    feature_columns = [sparse_feature(feat, int(data_df[feat].max()) + 1, embed_dim=embed_dim)
                       for feat in features]

    # Generate training and test sets
    train, test = train_test_split(data_df, test_size=test_size, random_state=random_state)
    train_X = train[features].values.astype('int32')
    train_y = train['label'].values.astype('int32')
    test_X = test[features].values.astype('int32')
    test_y = test['label'].values.astype('int32')

    return feature_columns, (train_X, train_y), (test_X, test_y)

'\ncriteo dataset preprocessing\n\ncriteo data feature introduce：\n- Label - Y label，「Click」value=1，「Non-Click」value=0\n- I1-I13 - Totally 13 col integer data feature（most of them are counting features）\n- C1-C26 - Totally 26 col catagory features，for secure reason，original data was transformed to 32 bit data\n'

In [3]:
class MyLayer(Layer):
    """
    Factorization Machine scoring layer.

    All feature fields share one first-order weight table `w` and one latent
    table `V`; `index_mapping` stores the row offset at which each field starts.
    """

    def __init__(self, feature_columns, k, w_r=1e-6, v_r=1e-6):
        """
        :@param feature_columns: A list. sparse column feature information.
        :@param k: Implicit vector dimension
        :@param w_r: The regularization coefficient of the parameter w
        :@param v_r: The regularization coefficient of the parameter v
        """
        super().__init__()
        self.sparse_feature_columns = feature_columns

        # Running sum of the vocabulary sizes: field i starts at offsets[i]
        offsets, running_total = [], 0
        for column in feature_columns:
            offsets.append(running_total)
            running_total += column['feat_num']

        self.index_mapping = offsets
        self.feature_length = running_total
        self.k = k
        self.w_r = w_r
        self.v_r = v_r

    def build(self, input_shape):
        """Create the global bias w0, the first-order weights w and the latent table V."""
        self.w0 = self.add_weight(
            name='w0',
            shape=(1,),
            initializer=tf.zeros_initializer(),
            trainable=True,
        )
        self.w = self.add_weight(
            name='w',
            shape=(self.feature_length, 1),
            initializer=tf.random_normal_initializer(),
            regularizer=l2(self.w_r),
            trainable=True,
        )
        self.V = self.add_weight(
            name='V',
            shape=(self.feature_length, self.k),
            initializer=tf.random_normal_initializer(),
            regularizer=l2(self.v_r),
            trainable=True,
        )

    def call(self, inputs, **kwargs):
        """Return the raw FM score (no sigmoid) for per-field ids `inputs`."""
        # Shift the per-field ids into rows of the shared tables
        global_ids = inputs + tf.convert_to_tensor(self.index_mapping)

        # First-order term: bias plus the sum of the selected linear weights
        linear_weights = tf.nn.embedding_lookup(self.w, global_ids)
        first_order = self.w0 + tf.reduce_sum(linear_weights, axis=1)  # (batch_size, 1)

        # Second-order term: 0.5 * sum_over_dims((sum v)^2 - sum(v^2))
        latent = tf.nn.embedding_lookup(self.V, global_ids)  # (batch_size, fields, embed_dim)
        square_of_sum = tf.square(tf.reduce_sum(latent, axis=1, keepdims=True))  # (batch_size, 1, embed_dim)
        sum_of_squares = tf.reduce_sum(tf.square(latent), axis=1, keepdims=True)  # (batch_size, 1, embed_dim)
        second_order = 0.5 * tf.reduce_sum(square_of_sum - sum_of_squares, axis=2)  # (batch_size, 1)

        return first_order + second_order

In [4]:
class FM(Model):
    """Factorization Machine classifier: an FM scoring layer followed by a sigmoid."""

    def __init__(self, feature_columns, k, w_r=1e-6, v_r=1e-6):
        """
        Factorization Machines
        :param feature_columns: A list. sparse column feature information.
        :param k: the latent vector dimension
        :param w_r: the regularization coefficient of parameter w
        :param v_r: the regularization coefficient of parameter v
        """
        super(FM, self).__init__()
        self.sparse_feature_columns = feature_columns
        self.fm = MyLayer(feature_columns, k, w_r, v_r)

    def call(self, inputs, **kwargs):
        """Map per-field feature ids to a click probability in (0, 1)."""
        fm_outputs = self.fm(inputs)
        outputs = tf.nn.sigmoid(fm_outputs)
        return outputs

    def summary(self, **kwargs):
        """
        Print a layer-by-layer summary.

        A subclassed model has no static graph, so a temporary functional model is
        traced through a symbolic int32 input with one column per feature and
        summarised instead.
        :param kwargs: forwarded to the functional model's `summary`
                       (previously these were accepted but silently ignored)
        """
        sparse_inputs = Input(shape=(len(self.sparse_feature_columns),), dtype=tf.int32)
        Model(inputs=sparse_inputs, outputs=self.call(sparse_inputs)).summary(**kwargs)

In [5]:
# Environment setting: restrict CUDA to GPU 0
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Data folder. Set the CRITEO_DATA_DIR environment variable to point at your own
# copy; the original author's local path is kept as the fallback.
dataDir = os.environ.get('CRITEO_DATA_DIR', '/Users/hayden/Documents/data')

# Hyperparameter setting
file = os.path.join(dataDir, 'Criteo_dataset', 'train.txt')
read_part = True
sample_num = 200000
test_size = 0.2

k = 8  # latent vector dimension of the FM model

learning_rate = 0.001
batch_size = 512
epochs = 10

# Build data set
feature_columns, train, test = create_criteo_dataset(file=file,
                                        read_part=read_part,
                                        sample_num=sample_num,
                                        test_size=test_size)
train_X, train_y = train
test_X, test_y = test

In [6]:
 # Model building
model = FM(feature_columns=feature_columns, k=k)
model.summary()

optimizer = Adam(learning_rate=learning_rate)
model.compile(loss=binary_crossentropy, optimizer=optimizer, metrics=[AUC()])

# Model training: stop once val_loss has failed to improve for 2 epochs and
# restore the best weights seen so far.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(
    train_X,
    train_y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.15,
    callbacks=[early_stopping],
)



Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 39)]              0         
                                                                 
 my_layer (MyLayer)          (None, 1)                 3751795   
                                                                 
 tf.math.sigmoid (TFOpLambd  (None, 1)                 0         
 a)                                                              
                                                                 
Total params: 3751795 (14.31 MB)
Trainable params: 3751795 (14.31 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


2023-11-02 09:16:57.897892: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2023-11-02 09:16:57.897915: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2023-11-02 09:16:57.897922: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2023-11-02 09:16:57.898618: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:303] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-11-02 09:16:57.898991: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:269] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/10


2023-11-02 09:16:58.443981: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




2023-11-02 09:17:03.876961: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10


<keras.src.callbacks.History at 0x15a809c30>

In [7]:
# Performance on testing data set; evaluate() returns [loss, AUC] for this model
evaluation = model.evaluate(test_X, test_y, batch_size=batch_size)
print('test AUC: %f' % evaluation[1])

test AUC: 0.760651


In [8]:
# Peek at the encoded feature matrices (int32 ids, one column per feature).
train_X
test_X

array([[   12,   461, 75647, ...,     0,     0,     0],
       [   12,   485, 11336, ...,     0,     0,     0],
       [  203,   120, 48048, ...,     3,     0,     0],
       ...,
       [   12,   186, 76492, ...,     0,     0,     0],
       [   47,   447, 11202, ...,     1,     0,     0],
       [   12,    29, 62315, ...,     0,     0,     0]], dtype=int32)

array([[   12,    62, 60320, ...,     1,     0,     0],
       [   12,    71,   904, ...,     0,     0,     0],
       [   12,    52, 12894, ...,     0,     0,     0],
       ...,
       [  693,    62, 44655, ...,     6,     0,     0],
       [   12,   267, 73484, ...,     2,     0,     0],
       [  254,   396,    88, ...,     0,     0,     0]], dtype=int32)

In [9]:
# (rows, features) of the training and test matrices.
train_X.shape
test_X.shape

(160000, 39)

(40000, 39)

In [10]:
# Trained latent table V of the FM layer: one row per feature value, k columns.
model.fm.V


<tf.Variable 'my_layer/V:0' shape=(416866, 8) dtype=float32, numpy=
array([[-3.8651906e-02,  2.5016459e-02,  1.3805315e-02, ...,
        -2.1974508e-02, -1.4769967e-02,  4.3764729e-02],
       [ 1.2494435e-02, -1.6562700e-02,  8.9537706e-03, ...,
        -2.1433776e-02,  1.2245435e-02, -2.8662352e-02],
       [ 1.1627961e-02,  6.9080521e-03,  5.4086052e-04, ...,
         2.4100258e-03, -1.0117851e-04,  2.7487814e-02],
       ...,
       [ 2.5572982e-02,  3.8686167e-02, -3.7211604e-02, ...,
         1.3050003e-02, -3.3481061e-02,  5.3918116e-05],
       [ 9.0363529e-03, -2.3263791e-03,  4.4197729e-03, ...,
        -1.8327414e-03, -5.9062685e-03, -4.4815750e-03],
       [-3.4889456e-02,  1.0225524e-02, -5.7260077e-03, ...,
         2.5195107e-02,  1.7950292e-03, -1.9268787e-03]], dtype=float32)>

In [16]:
# Trained first-order weights w of the FM layer: one scalar per feature value.
model.fm.w

<tf.Variable 'my_layer/w:0' shape=(416866, 1) dtype=float32, numpy=
array([[ 0.02176597],
       [-0.04108954],
       [ 0.00081691],
       ...,
       [-0.04254679],
       [-0.01306935],
       [ 0.00056044]], dtype=float32)>

In [12]:
# Trained global bias w0 of the FM layer.
model.fm.w0

<tf.Variable 'my_layer/w0:0' shape=(1,) dtype=float32, numpy=array([-0.0593174], dtype=float32)>

In [13]:
# Row offset at which each feature field starts inside the shared w / V tables.
model.fm.index_mapping

ListWrapper([0, 733, 1247, 84307, 127606, 127786, 127799, 136560, 136914, 136917, 152232, 156355, 233179, 236094, 236120, 242537, 305675, 305685, 308688, 310212, 310216, 381397, 381411, 381426, 400703, 400758, 415566, 415666, 415766, 415866, 415966, 416066, 416166, 416266, 416366, 416466, 416566, 416666, 416766])

In [14]:
# Feature descriptors (name, vocabulary size, embedding dim) the FM layer was built from.
model.fm.sparse_feature_columns

ListWrapper([DictWrapper({'feat_name': 'C1', 'feat_num': 733, 'embed_dim': 8}), DictWrapper({'feat_name': 'C2', 'feat_num': 514, 'embed_dim': 8}), DictWrapper({'feat_name': 'C3', 'feat_num': 83060, 'embed_dim': 8}), DictWrapper({'feat_name': 'C4', 'feat_num': 43299, 'embed_dim': 8}), DictWrapper({'feat_name': 'C5', 'feat_num': 180, 'embed_dim': 8}), DictWrapper({'feat_name': 'C6', 'feat_num': 13, 'embed_dim': 8}), DictWrapper({'feat_name': 'C7', 'feat_num': 8761, 'embed_dim': 8}), DictWrapper({'feat_name': 'C8', 'feat_num': 354, 'embed_dim': 8}), DictWrapper({'feat_name': 'C9', 'feat_num': 3, 'embed_dim': 8}), DictWrapper({'feat_name': 'C10', 'feat_num': 15315, 'embed_dim': 8}), DictWrapper({'feat_name': 'C11', 'feat_num': 4123, 'embed_dim': 8}), DictWrapper({'feat_name': 'C12', 'feat_num': 76824, 'embed_dim': 8}), DictWrapper({'feat_name': 'C13', 'feat_num': 2915, 'embed_dim': 8}), DictWrapper({'feat_name': 'C14', 'feat_num': 26, 'embed_dim': 8}), DictWrapper({'feat_name': 'C15', 'fea

# リコール (Recall)

In [22]:
# Columns treated as the "user" side and the "item" side of the recall step.
user_features = ['I%d' % i for i in range(1, 6)] + ['C%d' % i for i in range(1, 11)]
item_features = ['I%d' % i for i in range(6, 14)] + ['C%d' % i for i in range(11, 27)]

# All 39 columns: the 26 categorical columns first, then the 13 integer columns.
feature_names = ['C%d' % i for i in range(1, 27)] + ['I%d' % i for i in range(1, 14)]



In [23]:
# Encoded test features and their click labels (1 = click, 0 = no click).
test_X
test_y

array([[   12,    62, 60320, ...,     1,     0,     0],
       [   12,    71,   904, ...,     0,     0,     0],
       [   12,    52, 12894, ...,     0,     0,     0],
       ...,
       [  693,    62, 44655, ...,     6,     0,     0],
       [   12,   267, 73484, ...,     2,     0,     0],
       [  254,   396,    88, ...,     0,     0,     0]], dtype=int32)

array([0, 0, 0, ..., 0, 0, 0], dtype=int32)

In [50]:
# Put the test set back into a labelled frame; reset_index() adds an explicit
# 'index' column holding the row number.
recall_df = (
    pd.DataFrame(test_X, columns=feature_names)
    .assign(label=test_y)
    .reset_index()
)

# Column subsets feeding the user-side and item-side embeddings.
user_recall_df = recall_df[user_features]
item_recall_df = recall_df[item_features]

In [116]:
# Show the test rows with the clicked ones (label == 1) first.
recall_df.sort_values(["label"], ascending=False)

Unnamed: 0,index,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,I5,I6,I7,I8,I9,I10,I11,I12,I13,label
31339,31339,12,62,8397,14769,72,4,5000,122,2,...,0,0,0,0,0,0,1,0,0,1
8664,8664,397,65,51219,27271,27,12,4477,13,2,...,0,0,0,0,0,14,0,0,0,1
8668,8668,12,65,57200,18993,27,4,8206,13,2,...,0,0,0,0,0,14,0,0,0,1
33369,33369,301,82,44917,34763,27,12,5328,122,2,...,0,0,0,0,0,14,0,0,0,1
8671,8671,301,45,57715,23230,72,11,3547,122,2,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15124,15124,301,389,15848,3298,72,4,7187,106,2,...,0,0,0,0,0,0,1,0,0,0
15123,15123,12,256,60341,22694,27,4,323,13,2,...,0,0,0,0,0,14,3,0,0,0
15122,15122,12,443,67549,32935,27,4,2036,38,2,...,1,0,0,0,0,0,0,0,0,0
15121,15121,375,118,83059,43298,27,12,7508,13,0,...,1,0,0,0,0,0,0,0,0,0


In [219]:
# Map each feature name to the row offset of its field inside the FM tables.
# Built with dict(zip(...)) and comprehensions instead of `for k, v in ...` so that
# no loop variable leaks into the notebook namespace; the old loop overwrote the
# global hyperparameter `k` with a column name.
lookup_dict = dict(zip(feature_names, model.fm.index_mapping))

# Offsets for the user-side and item-side columns, in the order of the column lists.
user_index_mapping = [lookup_dict[name] for name in user_features]
item_index_mapping = [lookup_dict[name] for name in item_features]

def get_user_embedding(user_inputs, user_index_mapping, V):
    """
    Build the user-side recall vector: a constant 1 followed by the sum of the
    latent vectors of the user's feature values.

    :param user_inputs: array of per-field ids, shape (batch_size, fields)
    :param user_index_mapping: row offset of each user field inside V
    :param V: latent table of the trained FM layer
    :return: tensor of shape (batch_size, 1 + embed_dim)
    """
    global_ids = user_inputs + user_index_mapping  # (batch_size, fields)
    latent_sum = tf.reduce_sum(
        tf.nn.embedding_lookup(V, global_ids),  # (batch_size, fields, embed_dim)
        axis=1,
        keepdims=False,
    )  # (batch_size, embed_dim)

    # The leading 1 multiplies the first slot of the item vector in the dot product
    bias_column = np.ones([user_inputs.shape[0], 1])
    return tf.concat((bias_column, latent_sum), axis=1)

def get_item_embedding(item_inputs, item_index_mapping, V, w):
    """
    Build the item-side recall vector: the item's own FM score (first-order plus
    item-internal second-order term) followed by the sum of its latent vectors.

    The leading score slot makes the ranking take item popularity into account;
    replacing it with zeros would rank by user-item correlation only.

    :param item_inputs: array of per-field ids, shape (batch_size, fields)
    :param item_index_mapping: row offset of each item field inside V / w
    :param V: latent table of the trained FM layer
    :param w: first-order weight table of the trained FM layer
    :return: tensor of shape (batch_size, 1 + embed_dim)
    """
    global_ids = item_inputs + item_index_mapping  # (batch_size, fields)

    latent = tf.nn.embedding_lookup(V, global_ids)  # (batch_size, fields, embed_dim)
    latent_sum = tf.reduce_sum(latent, axis=1, keepdims=False)  # (batch_size, embed_dim)

    # First-order term of the item fields
    linear_term = tf.reduce_sum(tf.nn.embedding_lookup(w, global_ids), axis=1)  # (batch_size, 1)

    # Second-order interactions among the item fields only
    square_of_sum = tf.square(tf.reduce_sum(latent, axis=1, keepdims=True))  # (batch_size, 1, embed_dim)
    sum_of_squares = tf.reduce_sum(tf.square(latent), axis=1, keepdims=True)  # (batch_size, 1, embed_dim)
    pairwise_term = 0.5 * tf.reduce_sum(square_of_sum - sum_of_squares, axis=2)  # (batch_size, 1)

    return tf.concat((linear_term + pairwise_term, latent_sum), axis=1)




In [220]:
# User-side recall vectors for every test row: [1, sum of user latent vectors].
user_embedding = get_user_embedding(user_recall_df.values, user_index_mapping, model.fm.V)
user_embedding

<tf.Tensor: shape=(40000, 9), dtype=float32, numpy=
array([[ 1.        , -0.6468935 ,  0.3370846 , ..., -0.68615985,
         0.0979158 ,  0.43949693],
       [ 1.        , -0.13042451,  0.17191194, ..., -0.41336745,
         0.31399703,  0.46311575],
       [ 1.        , -0.28416422,  0.29110178, ..., -0.21459931,
        -0.05070641,  0.15274854],
       ...,
       [ 1.        , -0.37377542,  0.09187578, ..., -0.26510286,
         0.18119244,  0.2644743 ],
       [ 1.        , -0.63497233,  0.57023054, ..., -0.40202576,
        -0.19110855,  0.69388413],
       [ 1.        , -0.14232281,  0.04876885, ..., -0.17596878,
         0.01733314,  0.17160805]], dtype=float32)>

In [221]:
# Item-side recall vectors for every test row: [item FM score, sum of item latent vectors].
item_embedding = get_item_embedding(item_recall_df.values, item_index_mapping, model.fm.V, model.fm.w)
item_embedding

<tf.Tensor: shape=(40000, 9), dtype=float32, numpy=
array([[-0.5728367 ,  0.03312674,  0.6088773 , ..., -0.37178078,
        -0.6312825 ,  0.54181486],
       [-1.3222276 , -0.16337073,  0.40970895, ..., -0.18750069,
        -0.12048618,  0.3187617 ],
       [-1.275589  , -0.32365072,  0.10119347, ..., -0.25255525,
        -0.21104786,  0.28153002],
       ...,
       [-0.58613044, -0.30093923,  0.9842807 , ..., -0.2373664 ,
        -0.26452255,  0.29609683],
       [-1.0798173 , -0.30645075,  0.5520266 , ...,  0.21123523,
         0.15820906,  0.49555358],
       [-1.9605564 ,  0.00583294,  0.09371707, ...,  0.01130188,
        -0.15821932, -0.01737399]], dtype=float32)>

# 検証 (Verification)

In [222]:
# Show the test rows with the clicked ones (label == 1) first, to pick a row to verify.
recall_df.sort_values(["label"], ascending=False)

Unnamed: 0,index,C1,C2,C3,C4,C5,C6,C7,C8,C9,...,I5,I6,I7,I8,I9,I10,I11,I12,I13,label
31339,31339,12,62,8397,14769,72,4,5000,122,2,...,0,0,0,0,0,0,1,0,0,1
8664,8664,397,65,51219,27271,27,12,4477,13,2,...,0,0,0,0,0,14,0,0,0,1
8668,8668,12,65,57200,18993,27,4,8206,13,2,...,0,0,0,0,0,14,0,0,0,1
33369,33369,301,82,44917,34763,27,12,5328,122,2,...,0,0,0,0,0,14,0,0,0,1
8671,8671,301,45,57715,23230,72,11,3547,122,2,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15124,15124,301,389,15848,3298,72,4,7187,106,2,...,0,0,0,0,0,0,1,0,0,0
15123,15123,12,256,60341,22694,27,4,323,13,2,...,0,0,0,0,0,14,3,0,0,0
15122,15122,12,443,67549,32935,27,4,2036,38,2,...,1,0,0,0,0,0,0,0,0,0
15121,15121,375,118,83059,43298,27,12,7508,13,0,...,1,0,0,0,0,0,0,0,0,0


In [228]:
# Row of the test set whose user side is used as the query
index = 4119

# Dot product of the chosen user vector with every item vector
tmp = tf.reduce_sum(user_embedding[index] * item_embedding, axis=1)
result_df = (
    pd.DataFrame(tmp, columns=['y_pred'])
    .reset_index()
    .assign(label=recall_df['label'])
)

# Row labels of the 100 highest-scoring items
recalled_index = result_df.sort_values(by=["y_pred"], ascending=False).iloc[:100].index

recalled_index
result_df


Int64Index([39173, 22768, 31850, 25620, 15442, 15316,  4953, 29991, 38318,
            10951, 25331, 34862, 30054, 24416, 29880, 31344, 20802,  1447,
             9435,  2302, 21333, 31046, 21039, 26675, 18830, 24679,  1264,
            16313, 35542, 11674, 18690, 16743, 29432,   391,  7795,  8112,
            10194, 24063,   186, 36531,  2708,  1956, 11835, 17951, 23719,
            25259, 35346, 26867, 20569,  8276,  4741, 31285, 24825, 20284,
            25734, 39015, 26074, 35021, 17485,  4966, 16621, 21799, 21972,
            13882, 22589, 18754,  4151, 23632, 19448, 15847, 39337, 19942,
            30596, 37181, 21666, 21000, 35279, 37002,  7555, 12985, 24693,
            21428, 36010, 29247, 21337, 34065, 28006, 25328, 39723, 19773,
            12669, 11319, 27960, 13164, 36476, 29103, 12223, 28827, 16385,
            17269],
           dtype='int64')

Unnamed: 0,index,y_pred,label
0,0,0.224364,0
1,1,-0.760147,0
2,2,-0.703180,0
3,3,-0.541756,0
4,4,-1.294139,1
...,...,...,...
39995,39995,-0.989995,0
39996,39996,-1.019589,0
39997,39997,0.401733,0
39998,39998,-0.601465,0


In [229]:
# Full ranking for the chosen user, best score first.
result_df.sort_values(["y_pred"], ascending=False)

# Score of the item that sits in the same test row as the chosen user.
query_row = result_df.query("index==" + str(index))
query_row

# Every item scoring at least as high as that row's own item.
# .iloc[0] extracts the scalar explicitly: calling float() on a one-element array
# is deprecated since NumPy 1.25.
query_score = float(query_row["y_pred"].iloc[0])
result_df.query("y_pred >= " + str(query_score))

Unnamed: 0,index,y_pred,label
39173,39173,3.959716,1
22768,22768,3.914730,1
31850,31850,3.871641,1
25620,25620,3.871641,1
15442,15442,3.804772,1
...,...,...,...
25851,25851,-2.618930,0
15620,15620,-2.645260,0
19012,19012,-2.691553,1
8919,8919,-2.716230,0


Unnamed: 0,index,y_pred,label
4119,4119,2.428402,1


Unnamed: 0,index,y_pred,label
90,90,2.620577,1
186,186,3.323369,1
374,374,2.439479,1
391,391,3.384777,1
1264,1264,3.463467,1
...,...,...,...
39015,39015,3.105930,1
39173,39173,3.959716,1
39337,39337,2.995627,1
39723,39723,2.857268,1
