<div style="padding: 10px">
    <h1 class="h1-title" style="color: black; font-size: 30px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 30px !important;">1 |</b><p style="font-size: 30px !important; display: inline-block;">Encoder-Decoder solution</p>
    </h1>
</div>

<div style="padding: 10px">
    <h2 class="h2-title" style="color: black; font-size: 25px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 25px !important;">1.1 |</b><p style="font-size: 25px !important; display: inline-block;">Imports</p>
    </h2>
</div>

In [1]:
# math and tables
import pandas as pd
import numpy as np

# for model building
import tensorflow as tf

# utils
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")



<div style="padding: 10px">
    <h2 class="h2-title" style="color: black; font-size: 25px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 25px !important;">1.2 |</b><p style="font-size: 25px !important; display: inline-block;">Data Loading, Formatting</p>
    </h2>
</div>

In [2]:
train_df = pd.read_csv("/kaggle/input/alfabank/df_train.csv", sep=";")
test_df = pd.read_csv("/kaggle/input/alfabank/df_test.csv", sep=";")

train_df.dtypes

Id         int64
Data      object
Target    object
dtype: object

In [3]:
# Since the arrays have varying lengths, we opt for Python lists over numpy ndarrays.
# This is because numpy ndarrays require uniform lengths, while lists do not.

# The lambda function takes each string, splits it by commas, and converts each segment into an integer.
train_df['Data'] = train_df.Data.apply(lambda s: list(map(int, s.split(','))))
train_df['Target'] = train_df.Target.apply(lambda s: list(map(int, s.split(','))))
test_df['Data'] = test_df.Data.apply(lambda s: list(map(int, s.split(','))))

# Сonvert all the train data to python lists
x_train = train_df.Data.to_list()
x_target = train_df.Target.to_list()
x_test = test_df.Data.to_list()

print("\tlength", "\n  train:", len(x_train), "\n  test: ", len(x_test))

	length 
  train: 7033 
  test:  7033


In [4]:
# The goal here is to augment the training data by breaking down long transaction sequences
# into smaller, more manageable subsequences. This is based on the assumption that the 'Target'
# column is a direct continuation of the 'Data' column, allowing us to use past transaction data
# to predict future transactions.

# Set the distance between the start of each subsequence.
dist = 28

x_train_aug = []
x_target_aug = []

for seq in x_train:
    # Only consider sequences longer than 100 transactions for augmentation.
    if len(seq) > 100:
        # Create subsequences starting from index 0, stepping by 'dist' each time.
        for j in range(0, len(seq), dist):
            # If the remaining part of the sequence is less than 60 transactions,
            # it's too short to be split further and is skipped.
            if len(seq) - j < 60:
                break
            
            i = j
            train_seq_aug = []
            target_seq_aug = []
            
            # Collect the next 50 transactions for the training subsequence.
            while i < 50 + j:
                train_seq_aug.append(seq[i])
                i += 1
            
            # Collect the next 10 transactions for the target subsequence.
            while i < 60 + j:
                target_seq_aug.append(seq[i])
                i += 1
            
            x_train_aug.append(train_seq_aug)
            x_target_aug.append(target_seq_aug)

# The result is a set of shorter sequences that can be used to train a model
# with a more focused context, potentially improving the prediction accuracy.

In [5]:
x_train += x_train_aug
x_target += x_target_aug

print("augmented data length:", len(x_train))

augmented data length: 113808


In [6]:
# 0 - start output sequnce symbol
# 1 - end output sequnce symbol
# -1 - filler symbol

# Convert the training data sequences into numpy arrays.
encoder_input_seqs = np.asarray([np.array(seq) for seq in x_train])

# Prepare the input sequences for the decoder by adding the start symbol at the beginning.
# The last element of each sequence is removed.
decoder_input_seqs = np.asarray([np.array([0] + seq[:-1]) for seq in x_target])

# The target sequences for the decoder are the original sequences without modification.
decoder_target_seqs = np.asarray([np.array(seq) for seq in x_target])

# Convert the test data sequences into numpy arrays.
test_input_seqs = np.asarray([np.array(seq) for seq in x_test])

# Create a vocabulary set of MCC codes from the sequences.
# This will be used to map each unique MCC code to a unique integer for model processing.
# Starting with a set containing the filler symbol.
vocab = set([-1])

# Update the vocabulary with all unique MCC codes found in the encoder input sequences.
for seq in encoder_input_seqs:
    vocab.update(set(seq))
    
# Update the vocabulary with all unique MCC codes found in the decoder input sequences.
for seq in decoder_input_seqs:
    vocab.update(set(seq))

print(f"vocab: {vocab}\nlength: {len(vocab)}")

vocab: {0, 5122, 9222, 5641, 5131, 4111, 4112, 5137, 5651, 7699, 5655, 4121, 8220, 5661, 4131, 7210, 7216, 5681, 5169, 5172, 7221, 8244, 5691, 7230, 5697, 5699, 6211, 5192, 5193, 5199, 5200, 5712, 5714, 5713, 5719, 5722, 5211, 9311, 5732, 5733, 5734, 5735, 7273, 8299, 7278, 5231, 4722, 4214, 4215, 7298, 7299, 5251, 5261, 7311, 7829, 7832, 6300, 7841, 7338, 1711, 4784, 4411, 5811, 5812, 5813, 5814, 4789, 5300, 5816, 9399, 9402, 2741, 5309, 5310, 5311, 1731, 4812, 7372, 4814, 7375, 4816, 8398, 5331, 4829, 7395, 742, 7399, 7922, 7932, 7933, 1799, 9211, 7991, 5399, 5912, 7993, 3351, 5921, 5411, 4900, 4899, 8999, 5931, 5422, 5940, 5941, 5942, 5943, 5944, 5945, 5946, 5947, 7996, 5949, 7994, 5948, 7995, 5441, 7997, 5950, 7999, 5451, 5964, 8011, 5965, 5967, 5968, 5969, 5970, 5971, 8021, 5462, 7512, 5977, 5976, 5983, 7011, 7523, 5992, 5993, 5994, 5995, 8043, 7531, 5999, 6513, 7538, 7542, 6010, 5499, 6011, 6012, 8062, 5511, 6536, 8071, 5013, 5532, 5533, 4511, 8099, 6051, 5541, 5542, 3501, 5039, 

In [7]:
# Augment sequences shorter than n elements with some "zero" element.
# Don't need to do padding to decoder sequences.
MAX_INPUT_SEQ_LEN = 50

encoder_input_seqs = tf.keras.preprocessing.sequence.pad_sequences(encoder_input_seqs, value=-1, padding='pre', maxlen=MAX_INPUT_SEQ_LEN)

In [8]:
# Create a dictionary that maps each unique character (MCC code) to a unique index.
# This is used to encode MCC codes into indices.
ch2idx_d = {ch:i for i, ch in enumerate(vocab)}
# Create a list of characters from the vocabulary, which will be used to decode indices back to MCC codes.
idx2ch_l = list(vocab)

# Define lambda functions.
ch2idx = lambda ch: ch2idx_d[ch]
idx2ch = lambda idx: idx2ch_l[idx]

In [9]:
# Сonvert inputs and targets to indexes.
encoder_input_seqs = np.asarray([np.asarray([ch2idx(el) for el in seq]) for seq in encoder_input_seqs])
decoder_input_seqs = np.asarray([np.asarray([ch2idx(el) for el in seq]) for seq in decoder_input_seqs])
decoder_target_seqs = np.asarray([np.asarray([ch2idx(el) for el in seq]) for seq in decoder_target_seqs])
test_input_seqs = np.asarray([np.asarray([ch2idx(el) for el in seq]) for seq in test_input_seqs])

print("example:", encoder_input_seqs[0])

example: [ 78  74  78 150 150  97  78  78  78  78  98  98  74  98  98 150  74  74
 150 150 150  78  78  78  78  78  98  74  98 150  74  98  74  98  74  98
  78 150  98  74  98 150  78 150  78  78  78  78  78  78]


In [10]:
encoder_input_seqs_train, encoder_input_seqs_val, decoder_input_seqs_train, decoder_input_seqs_val, decoder_target_seqs_train, decoder_target_seqs_val = \
train_test_split(encoder_input_seqs, decoder_input_seqs, decoder_target_seqs, test_size=0.15, random_state=42)

<div style="padding: 10px">
    <h2 class="h2-title" style="color: black; font-size: 25px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 25px !important;">1.3 |</b><p style="font-size: 25px !important; display: inline-block;">RNN Model</p>
    </h2>
</div>

In [11]:
# LSTM models are chosen for this sequence prediction task due to their strengths in handling
# sequences where the context and order of events are important. I implement generator model 
# that will be generating future transactions. 

h_size = 256
emb_size = 64

class Encoder(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.embed = tf.keras.layers.Embedding(len(vocab), emb_size, name='embed')
        self.lstm3 = tf.keras.layers.LSTM(h_size, return_sequences=False, return_state=True, name='lstm_enc')
        
    def call(self, x):
        out = self.embed(x)
        out, h, c = self.lstm3(out)
        state = (h, c)
        
        return state
    
    
class Decoder(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.embed = tf.keras.layers.Embedding(len(vocab), emb_size, name='embed')
        self.lstm1 = tf.keras.layers.LSTM(h_size, return_sequences=True, return_state=True, name='lstm_dec')
        self.fc = tf.keras.layers.Dense(len(vocab), activation='softmax', name='fc')
        self.dropout = tf.keras.layers.Dropout(rate=0.9)
        
    def call(self, x, init_state):
        out = self.embed(x)
        out, h, c = self.lstm1(out, initial_state=init_state)
        state = (h, c)
        out = self.dropout(out)
        out = self.fc(out)
        
        return out, state
    
    
encoder_model = Encoder()
decoder_model = Decoder()

encoder_inputs = tf.keras.layers.Input(shape=(None, ))
decoder_inputs = tf.keras.layers.Input(shape=(None, ))

enc_state = encoder_model(encoder_inputs)
decoder_outputs, _ = decoder_model(decoder_inputs, enc_state)

seq2seq = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

seq2seq.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 encoder (Encoder)              ((None, 256),        340608      ['input_1[0][0]']                
                                 (None, 256))                                                     
                                                                                                  
 decoder (Decoder)              ((None, None, 186),  388410      ['input_2[0][0]',            

<div style="padding: 10px">
    <h2 class="h2-title" style="color: black; font-size: 25px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 25px !important;">1.4 |</b><p style="font-size: 25px !important; display: inline-block;">Model Training</p>
    </h2>
</div>

In [12]:
BATCH_SIZE = 1024
EPOCHS = 1000
loss = tf.losses.SparseCategoricalCrossentropy()
seq2seq.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

seq2seq.fit([encoder_input_seqs_train, decoder_input_seqs_train], decoder_target_seqs_train,
           validation_data=([encoder_input_seqs_val, decoder_input_seqs_val], decoder_target_seqs_val), 
           batch_size=BATCH_SIZE,
           epochs=EPOCHS)



<keras.callbacks.History at 0x7eb4b61ed120>

<div style="padding: 10px">
    <h2 class="h2-title" style="color: black; font-size: 25px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 25px !important;">1.5 |</b><p style="font-size: 25px !important; display: inline-block;">Inference</p>
    </h2>
</div>

In [13]:
# This inference function takes an input sequence and predicts the sequence of future transactions.
def seq2seq_inference(input_seq):
    # First, we obtain the initial state of the encoder by passing the input sequence through it
    state = encoder_model(input_seq)

    # Initialize the target sequence with the start symbol, which is encoded to its corresponding index.
    target_seq = np.array([[ch2idx(0)]])
    
    pred = []
    i = 0
    # We will predict a fixed number of future transactions.
    while i <= 9:
        # Pass the current target sequence and the state into the decoder model to get the next output token and updated state.
        output_tokens, state = decoder_model(target_seq, state)

        # Select the index with the highest probability from the last output.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        # Decode the index to the corresponding MCC code.
        sampled_char = idx2ch(sampled_token_index)
        
        pred.append(sampled_char)

        # Update the target sequence to contain only the last predicted index.
        target_seq = np.array([[sampled_token_index]])
        
        i += 1
    
    return pred

In [14]:
# predictions
print(len(test_input_seqs))

pred = []
i = 0
for el in test_input_seqs:
    pred.append(seq2seq_inference(np.asarray([el])))
    
    if i % 1000 == 0:
        print(i)
        
    i += 1

7033
0
1000
2000
3000
4000
5000
6000
7000


In [15]:
# convert predictions to DataFrame, taking rows from "1:-1" element to remove "[" and "]"
pred = np.asarray(pred)
pred1 = [np.array2string(np.array(array))[1:-1] for array in pred]
pred_df = pd.DataFrame(pred1, columns=['Predicted'])

In [16]:
# saving
pred_df.index.rename('Id', inplace=True )
pred_df.to_csv('/kaggle/working/submission.csv')

Unfortunately, it was not possible to get a good score on this metric with this solution

<div style="padding: 10px">
    <h1 class="h1-title" style="color: black; font-size: 30px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 30px !important;">2 |</b><p style="font-size: 30px !important; display: inline-block;">Statistic solution</p>
    </h1>
</div>

<div style="padding: 10px">
    <h2 class="h2-title" style="color: black; font-size: 25px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 25px !important;">2.1 |</b><p style="font-size: 25px !important; display: inline-block;">Imports</p>
    </h2>
</div>

in addition to the previous imports add **collections.Counter**

In [17]:
# utils
from collections import Counter

<div style="padding: 10px">
    <h2 class="h2-title" style="color: black; font-size: 25px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 25px !important;">2.2 |</b><p style="font-size: 25px !important; display: inline-block;">Functions</p>
    </h2>
</div>

In [18]:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
# https://www.kaggle.com/code/prampampam/baseline-popular-transactions/
def apk(actual, predicted, k=10):
    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

<div style="padding: 10px">
    <h2 class="h2-title" style="color: black; font-size: 25px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 25px !important;">2.3 |</b><p style="font-size: 25px !important; display: inline-block;">Data Loading, Formatting</p>
    </h2>
</div>

In [19]:
df_train = pd.read_csv("/kaggle/input/alfabank/df_train.csv", sep=";")
df_test = pd.read_csv("/kaggle/input/alfabank/df_test.csv", sep=";")

train_df.dtypes

Id         int64
Data      object
Target    object
dtype: object

In [20]:
df_train['Data'] = df_train.Data.apply(lambda s: list(map(int, s.split(','))))
df_train['Target'] = df_train.Target.apply(lambda s: list(map(int, s.split(','))))
df_test['Data'] = df_test.Data.apply(lambda s: list(map(int, s.split(','))))

<div style="padding: 10px">
    <h2 class="h2-title" style="color: black; font-size: 25px !important;
              font-family: Calibri;">
        <b style="color: #00FF66; display: inline-block; font-size: 25px !important;">2.4 |</b><p style="font-size: 25px !important; display: inline-block;">The most popular user transactions</p>
    </h2>
</div>

In [21]:
# Take the 10 most popular codes from train["Target"] data (we can take codes from test data instead)
# and convert them to a Python List.
top10_codes = df_train['Target'].explode().value_counts().head(10).index.to_list()
print('top 10 codes from target:', sorted(top10_codes))

top 10 codes from target: [4814, 4829, 5331, 5411, 5499, 5541, 5812, 5912, 6010, 6011]


In [22]:
# By metrics, the first elements are more important to guess, 
# and they are more likely to be the most popular user spendings.

# We take the most popular transactions of a particular user for the last n transactions 
# and add to them the most popular ones on the whole "Target" (in case there are less than 10 of them). 
# And, we add the codes in such a way that they do not repeat, it is important.
def get_top_codes(transactions, top_n=10, drop_from=5, last=5):
    if len(transactions) > 200 * last:
        transactions_stats = sorted(
            Counter(transactions[-200 * last:]).items(), 
            key=lambda x: x[1], 
            reverse=True)[:top_n]
    else:
        transactions_stats = sorted(
            Counter(transactions).items(), 
            key=lambda x: x[1], 
            reverse=True)[:top_n]

    # Filter out the top MCC codes that occur at least 'drop_from' times.
    top_codes = [mcc_code for (mcc_code, count) in transactions_stats if count >= drop_from]
    # Identify the most popular MCC codes across all users' "Target" transactions that are not already in the user's top list.
    top10_codes_diff = [code for code in top10_codes if code not in top_codes]

    return (top_codes + top10_codes_diff)[:10]

In [23]:
# pick the best parameters, kind of like a Grid Search
mx = 0
for x in range(0, 11):
    for y in range(0, 20, 2):
        for z in range(0, 25):
            df_train['pred_baseline_2'] = df_train['Data'].apply(get_top_codes, args=(x, y, z))
            if mapk(df_train['Target'], df_train['pred_baseline_2']) > mx:
                mx = mapk(df_train['Target'], df_train['pred_baseline_2'])
                print("new_mx:", mx, "|| x:", x, "|| y:", y, "|| z:", z)

new_mx: 0.28712452519753817 || x: 0 || y: 0 || z: 0
new_mx: 0.29833160113659185 || x: 1 || y: 0 || z: 0
new_mx: 0.29987566551337347 || x: 1 || y: 0 || z: 1
new_mx: 0.30644249671051893 || x: 2 || y: 0 || z: 0
new_mx: 0.30860017062420014 || x: 2 || y: 0 || z: 1
new_mx: 0.30860795704603466 || x: 2 || y: 4 || z: 1
new_mx: 0.31374303453785896 || x: 3 || y: 0 || z: 0
new_mx: 0.31624318123856016 || x: 3 || y: 0 || z: 1
new_mx: 0.31889968831743326 || x: 4 || y: 0 || z: 0
new_mx: 0.32202800742079857 || x: 4 || y: 0 || z: 1
new_mx: 0.32205191399276434 || x: 4 || y: 2 || z: 1
new_mx: 0.3229761848338559 || x: 5 || y: 0 || z: 0
new_mx: 0.3259975760530289 || x: 5 || y: 0 || z: 1
new_mx: 0.3287708625324153 || x: 6 || y: 0 || z: 1
new_mx: 0.32877712552389077 || x: 6 || y: 2 || z: 1
new_mx: 0.3307926351734115 || x: 7 || y: 0 || z: 1
new_mx: 0.33084552641853937 || x: 7 || y: 2 || z: 1
new_mx: 0.3324265198756881 || x: 8 || y: 0 || z: 1
new_mx: 0.3324772388219707 || x: 8 || y: 2 || z: 1
new_mx: 0.33314645

In [26]:
# predictions
df_test['Predicted'] = df_test['Data'].apply(get_top_codes, args=(10, 2, 1))

In [27]:
# saving
submission = df_test[['Id', 'Predicted']]
submission['Predicted'] = [np.array2string(np.array(array))[1:-1] for array in submission['Predicted'].to_list()]
submission.to_csv('submissionnew.csv', index=False)

with this solution I got score on public df - 0.28925