In [11]:
import warnings

warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import itertools
from tqdm import tqdm
from collections import namedtuple

from sklearn.preprocessing import MinMaxScaler, LabelEncoder


# Define feature classes and NCF model components
class SparseFeat:
    def __init__(self, name, vocabulary_size, embedding_dim):
        self.name = name
        self.vocabulary_size = vocabulary_size
        self.embedding_dim = embedding_dim


def build_input_layers(feature_columns):
    dense_input_dict, sparse_input_dict = {}, {}
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            sparse_input_dict[fc.name] = Input(shape=(1,), name=fc.name)
    return dense_input_dict, sparse_input_dict


def build_embedding_layers(feature_columns, input_layers_dict, is_linear, prefix=''):
    embedding_layers_dict = {}
    sparse_feature_columns = [x for x in feature_columns if isinstance(x, SparseFeat)]
    if is_linear:
        for fc in sparse_feature_columns:
            embedding_layers_dict[fc.name] = Embedding(fc.vocabulary_size + 1, 1, name=prefix + '1d_emb_' + fc.name)
    else:
        for fc in sparse_feature_columns:
            embedding_layers_dict[fc.name] = Embedding(fc.vocabulary_size + 1, fc.embedding_dim,
                                                       name=prefix + 'kd_emb_' + fc.name)
    return embedding_layers_dict


def get_dnn_out(dnn_inputs, units=(32, 16)):
    dnn_out = dnn_inputs
    for out_dim in units:
        dnn_out = Dense(out_dim)(dnn_out)
    return dnn_out


def NCF(dnn_feature_columns):
    _, sparse_input_dict = build_input_layers(dnn_feature_columns)
    input_layers = list(sparse_input_dict.values())
    GML_embedding_dict = build_embedding_layers(dnn_feature_columns, sparse_input_dict, is_linear=False, prefix='GML')
    MLP_embedding_dict = build_embedding_layers(dnn_feature_columns, sparse_input_dict, is_linear=False, prefix='MLP')
    GML_user_emb = Flatten()(GML_embedding_dict['user_id'](sparse_input_dict['user_id']))
    GML_item_emb = Flatten()(GML_embedding_dict['item_id'](sparse_input_dict['item_id']))

    # Use Keras Multiply layer instead of tf.multiply
    GML_out = Multiply()([GML_user_emb, GML_item_emb])

    MLP_user_emb = Flatten()(MLP_embedding_dict['user_id'](sparse_input_dict['user_id']))
    MLP_item_emb = Flatten()(MLP_embedding_dict['item_id'](sparse_input_dict['item_id']))
    MLP_dnn_input = Concatenate(axis=1)([MLP_user_emb, MLP_item_emb])
    MLP_dnn_out = get_dnn_out(MLP_dnn_input, (32, 16))
    concat_out = Concatenate(axis=1)([GML_out, MLP_dnn_out])
    output_layer = Dense(1)(concat_out)
    model = Model(input_layers, output_layer)
    return model


def prepare_data(file_path):
    data = pd.read_excel(file_path)
    data['StockCode'] = data['StockCode'].astype(str)
    data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate'])
    data = data.dropna(subset=['CustomerID'])  # Drop rows where CustomerID is NaN
    data['CustomerID'] = data['CustomerID'].astype(int)

    # Create a time index to separate data by months
    data.set_index('InvoiceDate', inplace=True)
    data.sort_index(inplace=True)

    # Extract months
    months = data.index.to_period('M').unique()
    data['InvoiceDate'] = pd.to_datetime(data.index)
    data['Month'] = data['InvoiceDate'].dt.to_period('M')
    weekly_purchase_frequency = data.groupby(['CustomerID', 'StockCode', 'Month']).size().reset_index(
        name='rating')

    # weekly_purchase_frequency['InvoiceDate'] = data['InvoiceDate']
    # weekly_purchase_frequency.set_index('InvoiceDate', inplace=True)
    # weekly_purchase_frequency.sort_index(inplace=True)
    data = weekly_purchase_frequency
    data['monthly_rating'] = data.groupby(['StockCode', 'Month'])['StockCode'].transform('count')
    data['rating'] = data['monthly_rating']
    # Split data into training (first 5 months) and test (6th month)
    # Encoding
    lbe_user = LabelEncoder()
    lbe_item = LabelEncoder()
    data['user_id'] = lbe_user.fit_transform(data['CustomerID'])
    data['item_id'] = lbe_item.fit_transform(data['StockCode'])
    item_id_to_stockcode = dict(zip(lbe_item.transform(lbe_item.classes_), lbe_item.classes_))
    train_weeks = months[:]

    random_values = np.random.choice(data['user_id'].values, size=1000, replace=False)
    random_values[0] = 1880
    random_values[1] = 326
    random_values[2] = 4011
    random_values[3] = 1290
    random_values[4] = 1334
    repeated_values = np.repeat(random_values, 3665)
    test_data = pd.DataFrame()
    test_data['user_id'] = repeated_values
    test_data['item_id'] = np.tile(np.arange(3665), len(random_values))

    train_data = data.loc[data['Month'].isin(train_weeks)]


    return train_data, test_data, item_id_to_stockcode, data

file_path = './handled_data2.xlsx'
train_data, test_data, item_id_to_stockcode, data = prepare_data(file_path)

# Print the mapping between item_id and StockCode before training the model
print("Mapping from item_id to StockCode:")
for item_id, stockcode in item_id_to_stockcode.items():
    print(f"item_id: {item_id} -> StockCode: {stockcode}")

# Define feature columns for the model
dnn_feature_columns = [
    SparseFeat('user_id', data['user_id'].nunique(), 8),
    SparseFeat('item_id', data['item_id'].nunique(), 8)
]

# Build and compile the model
model = NCF(dnn_feature_columns)
model.summary()
model.compile(optimizer="adam", loss="mse", metrics=['mae'])

# Prepare training data
train_model_input = {name: train_data[name] for name in ['user_id', 'item_id']}
train_labels = train_data['rating']

# Train the model
model.fit(train_model_input, train_labels, batch_size=32, epochs=10, validation_split=0.2)

# Predict on test data
test_model_input = {name: test_data[name] for name in ['user_id', 'item_id']}
test_predictions = model.predict(test_model_input)

# Add predictions to test_data
test_data['predicted_rating'] = test_predictions



Mapping from item_id to StockCode:
item_id: 0 -> StockCode: 10002
item_id: 1 -> StockCode: 10080
item_id: 2 -> StockCode: 10120
item_id: 3 -> StockCode: 10123C
item_id: 4 -> StockCode: 10124A
item_id: 5 -> StockCode: 10124G
item_id: 6 -> StockCode: 10125
item_id: 7 -> StockCode: 10133
item_id: 8 -> StockCode: 10135
item_id: 9 -> StockCode: 11001
item_id: 10 -> StockCode: 15030
item_id: 11 -> StockCode: 15034
item_id: 12 -> StockCode: 15036
item_id: 13 -> StockCode: 15039
item_id: 14 -> StockCode: 15044A
item_id: 15 -> StockCode: 15044B
item_id: 16 -> StockCode: 15044C
item_id: 17 -> StockCode: 15044D
item_id: 18 -> StockCode: 15056BL
item_id: 19 -> StockCode: 15056N
item_id: 20 -> StockCode: 15056P
item_id: 21 -> StockCode: 15058A
item_id: 22 -> StockCode: 15058B
item_id: 23 -> StockCode: 15058C
item_id: 24 -> StockCode: 15060B
item_id: 25 -> StockCode: 16008
item_id: 26 -> StockCode: 16010
item_id: 27 -> StockCode: 16011
item_id: 28 -> StockCode: 16012
item_id: 29 -> StockCode: 16014


Epoch 1/10
[1m9085/9085[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step - loss: 541.3232 - mae: 14.1123 - val_loss: 322.9841 - val_mae: 11.2530
Epoch 2/10
[1m9085/9085[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - loss: 284.4068 - mae: 10.6785 - val_loss: 319.0102 - val_mae: 11.0699
Epoch 3/10
[1m9085/9085[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 2ms/step - loss: 249.0823 - mae: 9.7992 - val_loss: 317.9190 - val_mae: 10.9912
Epoch 4/10
[1m9085/9085[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 226.8028 - mae: 9.2472 - val_loss: 317.5574 - val_mae: 10.9917
Epoch 5/10
[1m9085/9085[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - loss: 219.2770 - mae: 8.9772 - val_loss: 316.1871 - val_mae: 10.9234
Epoch 6/10
[1m9085/9085[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 209.5588 - mae: 8.7338 - val_loss: 316.8408 - val_mae: 10.9198
Epoch 7/10
[1m9085/9085[0m [32m━━━━

In [33]:
a = pd.DataFrame(test_predictions[3665 * 4:3665 * 5])
a.rename(columns={0: 'rating'})
a

Unnamed: 0,0
0,14.327215
1,2.705029
2,7.149193
3,-1.757564
4,-0.374655
...,...
3660,3.063838
3661,6.163161
3662,19.367092
3663,2.073791


In [27]:
top_customer_items = a.sort_values(by='rating', ascending=False).head(5)
top_customer_items

KeyError: 'rating'

In [33]:
predicts = predicts.rename(columns={0: 'rating'})

In [35]:
predicts['item_id'] = np.tile(np.arange(3665), 1)

In [37]:
predicts['StockCode'] = predicts['item_id'].map(item_id_to_stockcode)

In [46]:
predicts[(predicts['StockCode'] == '20725')]

Unnamed: 0,rating,item_id,StockCode
170,96.685051,170,20725


In [48]:
predicts.to_excel('./predicts.xlsx')

In [33]:
train_data[filt]

Unnamed: 0,CustomerID,StockCode,Month,rating,monthly_rating,user_id,item_id
67,12347,22423,2011-01,196,98,1,1261
68,12347,22423,2011-04,276,138,1,1261
318,12352,22423,2011-02,212,106,4,1261
471,12356,22423,2011-01,196,98,6,1261
472,12356,22423,2011-04,276,138,6,1261
...,...,...,...,...,...,...,...
360107,18223,22423,2011-02,212,106,2411,1261
361460,18239,22423,2011-01,196,98,2420,1261
361920,18250,22423,2011-02,212,106,2423,1261
362266,18260,22423,2011-01,196,98,2426,1261


In [47]:
test_predictions

array([[42.92569 ],
       [58.77943 ],
       [58.77943 ],
       ...,
       [45.686474],
       [45.686474],
       [45.686474]], dtype=float32)

In [48]:
test_model_input['user_id']

369          6
370          6
371          6
372          6
431          8
          ... 
363377    4338
363378    4338
363379    4338
363381    4338
363383    4338
Name: user_id, Length: 26372, dtype: int64

In [39]:
test_model_input

{'user_id': 369          0
 370          0
 371          0
 372          0
 431          1
           ... 
 363377    1055
 363378    1055
 363379    1055
 363381    1055
 363383    1055
 Name: user_id, Length: 26372, dtype: int64,
 'item_id': 369       1346
 370       1651
 371       1654
 372       1655
 431        983
           ... 
 363377    1878
 363378    1967
 363379    2033
 363381    2034
 363383    2036
 Name: item_id, Length: 26372, dtype: int64}

In [45]:
item_id_to_stockcode

{0: '10002',
 1: '10080',
 2: '10120',
 3: '10123C',
 4: '10124A',
 5: '10124G',
 6: '10125',
 7: '10133',
 8: '10135',
 9: '11001',
 10: '15030',
 11: '15034',
 12: '15036',
 13: '15039',
 14: '15044A',
 15: '15044B',
 16: '15044C',
 17: '15044D',
 18: '15056BL',
 19: '15056N',
 20: '15056P',
 21: '15058A',
 22: '15058B',
 23: '15058C',
 24: '15060B',
 25: '16008',
 26: '16010',
 27: '16011',
 28: '16012',
 29: '16014',
 30: '16015',
 31: '16016',
 32: '16020C',
 33: '16033',
 34: '16043',
 35: '16045',
 36: '16046',
 37: '16048',
 38: '16049',
 39: '16052',
 40: '16054',
 41: '16151A',
 42: '16156L',
 43: '16156S',
 44: '16161G',
 45: '16161M',
 46: '16161P',
 47: '16161U',
 48: '16162L',
 49: '16162M',
 50: '16168M',
 51: '16169E',
 52: '16169K',
 53: '16169M',
 54: '16169N',
 55: '16169P',
 56: '16202A',
 57: '16202B',
 58: '16202E',
 59: '16206B',
 60: '16207A',
 61: '16207B',
 62: '16216',
 63: '16218',
 64: '16219',
 65: '16225',
 66: '16235',
 67: '16236',
 68: '16237',
 69: '1

In [58]:
common_elements = list(set(list1) & set(list2))

TypeError: unique() takes 1 positional argument but 2 were given

TypeError: unique() takes 1 positional argument but 2 were given

In [68]:
common_elements = np.intersect1d(list1, list2)

In [69]:
common_elements

array([  22,   39,   42,   54,   55,   61,   69,   75,   86,   96,   97,
        100,  103,  105,  110,  111,  115,  119,  124,  139,  143,  155,
        160,  165,  176,  177,  182,  189,  200,  203,  204,  205,  206,
        215,  223,  227,  233,  241,  243,  245,  252,  253,  264,  270,
        272,  273,  274,  282,  291,  293,  294,  296,  298,  304,  310,
        314,  325,  326,  344,  346,  386,  389,  390,  391,  393,  404,
        409,  410,  416,  425,  427,  430,  431,  436,  437,  441,  445,
        448,  453,  455,  457,  461,  465,  466,  468,  471,  473,  481,
        485,  492,  495,  499,  501,  504,  509,  510,  511,  512,  515,
        516,  518,  527,  534,  537,  550,  555,  558,  562,  565,  567,
        570,  573,  581,  584,  585,  601,  603,  623,  631,  633,  645,
        649,  658,  660,  662,  667,  668,  679,  691,  693,  694,  695,
        696,  719,  725,  726,  728,  731,  733,  738,  777,  779,  787,
        795,  796,  802,  821,  824,  829,  836,  8

In [70]:
unique_elements = np.unique(common_elements)

In [71]:
unique_elements

array([  22,   39,   42,   54,   55,   61,   69,   75,   86,   96,   97,
        100,  103,  105,  110,  111,  115,  119,  124,  139,  143,  155,
        160,  165,  176,  177,  182,  189,  200,  203,  204,  205,  206,
        215,  223,  227,  233,  241,  243,  245,  252,  253,  264,  270,
        272,  273,  274,  282,  291,  293,  294,  296,  298,  304,  310,
        314,  325,  326,  344,  346,  386,  389,  390,  391,  393,  404,
        409,  410,  416,  425,  427,  430,  431,  436,  437,  441,  445,
        448,  453,  455,  457,  461,  465,  466,  468,  471,  473,  481,
        485,  492,  495,  499,  501,  504,  509,  510,  511,  512,  515,
        516,  518,  527,  534,  537,  550,  555,  558,  562,  565,  567,
        570,  573,  581,  584,  585,  601,  603,  623,  631,  633,  645,
        649,  658,  660,  662,  667,  668,  679,  691,  693,  694,  695,
        696,  719,  725,  726,  728,  731,  733,  738,  777,  779,  787,
        795,  796,  802,  821,  824,  829,  836,  8

In [72]:
diff_elements = [item for item in list2 if item not in list1]

In [75]:
len(diff_elements)

284

In [76]:
list2

array([   6,    8,   13, ..., 4320, 4337, 4338])

In [3]:
test_data

Unnamed: 0,CustomerID,StockCode,Month,rating,monthly_rating,user_id,item_id
369,12353,22890,2011-05,9,9,6,1733
370,12353,37446,2011-05,9,9,6,2500
371,12353,37449,2011-05,11,11,6,2503
372,12353,37450,2011-05,9,9,6,2504
431,12355,22423,2011-05,158,158,8,1288
...,...,...,...,...,...,...,...
363377,18287,84584,2011-05,4,4,4338,2907
363378,18287,84920,2011-05,2,2,4338,3077
363379,18287,85039A,2011-05,2,2,4338,3179
363381,18287,85039B,2011-05,4,4,4338,3180


In [4]:
test_data['predict'] =  test_predictions

In [5]:
test_data

Unnamed: 0,CustomerID,StockCode,Month,rating,monthly_rating,user_id,item_id,predict
369,12353,22890,2011-05,9,9,6,1733,22.198454
370,12353,37446,2011-05,9,9,6,2500,28.968863
371,12353,37449,2011-05,11,11,6,2503,28.968863
372,12353,37450,2011-05,9,9,6,2504,28.968863
431,12355,22423,2011-05,158,158,8,1288,128.541931
...,...,...,...,...,...,...,...,...
363377,18287,84584,2011-05,4,4,4338,2907,21.271963
363378,18287,84920,2011-05,2,2,4338,3077,21.271963
363379,18287,85039A,2011-05,2,2,4338,3179,21.271963
363381,18287,85039B,2011-05,4,4,4338,3180,21.271963


In [6]:
filt = (test_data['StockCode'] == '85123A')

In [7]:
test_data[filt]

Unnamed: 0,CustomerID,StockCode,Month,rating,monthly_rating,user_id,item_id,predict
27857,12747,85123A,2011-05,176,176,325,3233,41.274250
31080,12748,85123A,2011-05,176,176,326,3233,15.705448
35313,12843,85123A,2011-05,176,176,393,3233,39.058033
36792,12868,85123A,2011-05,176,176,410,3233,45.752926
36805,12871,85123A,2011-05,176,176,411,3233,28.057581
...,...,...,...,...,...,...,...,...
359929,18219,85123A,2011-05,176,176,4288,3233,21.271963
361165,18231,85123A,2011-05,176,176,4300,3233,21.271963
361502,18239,85123A,2011-05,176,176,4306,3233,21.271963
361605,18241,85123A,2011-05,176,176,4308,3233,21.271963


In [9]:
filt = (test_data['StockCode'] == '22423')
test_data[filt]

Unnamed: 0,CustomerID,StockCode,Month,rating,monthly_rating,user_id,item_id,predict
431,12355,22423,2011-05,158,158,8,1288,128.541931
2806,12395,22423,2011-05,158,158,39,1288,121.120911
6360,12437,22423,2011-05,158,158,75,1288,128.604233
7917,12463,22423,2011-05,158,158,96,1288,120.981689
8135,12471,22423,2011-05,158,158,100,1288,117.641495
...,...,...,...,...,...,...,...,...
358807,18190,22423,2011-05,158,158,4267,1288,120.347015
359221,18202,22423,2011-05,158,158,4275,1288,120.347015
359855,18219,22423,2011-05,158,158,4288,1288,120.347015
359954,18221,22423,2011-05,158,158,4290,1288,120.347015


In [8]:
filt = (train_data['StockCode'] == '85123A')
train_data[filt]

Unnamed: 0,CustomerID,StockCode,Month,rating,monthly_rating,user_id,item_id
1595,12370,85123A,2010-12,155,155,20,3233
5355,12428,85123A,2011-03,154,154,66,3233
7560,12455,85123A,2011-02,113,113,90,3233
7883,12462,85123A,2011-02,113,113,95,3233
10760,12484,85123A,2011-01,150,150,112,3233
...,...,...,...,...,...,...,...
361500,18239,85123A,2010-12,155,155,4306,3233
361501,18239,85123A,2011-01,150,150,4306,3233
361857,18245,85123A,2010-12,155,155,4310,3233
362341,18260,85123A,2011-01,150,150,4320,3233


In [10]:
filt = (train_data['StockCode'] == '22423')
train_data[filt]

Unnamed: 0,CustomerID,StockCode,Month,rating,monthly_rating,user_id,item_id
67,12347,22423,2011-01,98,98,1,1288
68,12347,22423,2011-04,138,138,1,1288
318,12352,22423,2011-02,106,106,5,1288
471,12356,22423,2011-01,98,98,9,1288
472,12356,22423,2011-04,138,138,9,1288
...,...,...,...,...,...,...,...
360107,18223,22423,2011-02,106,106,4292,1288
361460,18239,22423,2011-01,98,98,4306,1288
361920,18250,22423,2011-02,106,106,4314,1288
362266,18260,22423,2011-01,98,98,4320,1288


In [11]:
random_values = np.random.choice(train_data['user_id'].values, size=100, replace=False)

In [12]:
random_values

array([ 115, 1993, 2391, 1159, 4009, 2077, 3213,  235, 2290, 2064, 2725,
        122, 1501, 1633, 3274,  340, 1874, 2034, 2621, 1555, 1443, 2166,
       1496,  634,  694, 2209,   55, 2508, 1880, 1026, 4262,  891, 1277,
        940, 3262,  626, 1537, 3172, 3582, 1542, 1450, 1548, 1126, 2415,
       3360, 1564, 3122, 3723, 2237, 1938, 3057, 3484,  223, 3967, 1606,
       1896, 3907, 2373, 4110, 4266,  311,  389, 1731,  515,  292,  155,
       1102, 3992, 2211, 3569, 3903, 2584, 2051, 2760,  562, 1074,  325,
       2223, 3259, 1880,  326, 3479, 2621,  391, 3510, 3371, 2420, 2308,
       3089,  154, 3786, 2690,  340, 3205, 3794, 3956,  915, 3179, 1969,
       2145])

In [13]:
repeated_values = np.repeat(random_values, 3665)

In [14]:
repeated_values

array([ 115,  115,  115, ..., 2145, 2145, 2145])

In [16]:
random_values = np.random.choice(train_data['user_id'].values, size=1000, replace=False)
repeated_values = np.repeat(random_values, 3665)
test_data = pd.DataFrame()
test_data['user_id'] = repeated_values
test_data['item_id'] = np.tile(np.arange(3665), len(random_values))

In [17]:
test_data

Unnamed: 0,user_id,item_id
0,2827,0
1,2827,1
2,2827,2
3,2827,3
4,2827,4
...,...,...
3664995,3970,3660
3664996,3970,3661
3664997,3970,3662
3664998,3970,3663
