In [None]:
import pandas as pd
import numpy as np
from tensorflow import keras
import tensorflow as tf
import re
from ipypb import track
from glob import glob
import json
from random import shuffle

In [None]:
# Load the raw export (presumably a BigQuery result, judging by the file name)
# and keep only the columns used by the experiments below, in this order.
df = pd.read_csv('bq-results-20211205-233906-s3fcmqg6kwal.csv')[[
    'anonymousId',
    'isTransaction',
    'totalTransactions_windowed',
    'eventNumBeforeTransaction',
    'totalTransactionPerUser',
]]

In [None]:
# Synthetic toy data: `batch_size` sine waves, each sampled at `n_steps` points.
batch_size = 10000
n_steps = 51
# Four independent uniform(0, 1) draws of shape (batch_size, 1) each;
# freq2 and offsets2 are drawn but not used in this cell.
freq1, freq2, offsets1, offsets2 = np.random.rand(4, batch_size, 1)
time = np.linspace(0, 1, n_steps)
# Broadcasting (n_steps,) against (batch_size, 1) gives (batch_size, n_steps).
series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10))
# Trailing feature axis -> (batch_size, n_steps, 1).
seriesn = np.expand_dims(series, axis=-1)

In [None]:
# Split the generated series by row: the first 7000 for training, rows
# 7000-8999 for validation, the remainder for testing.
# Inputs are the first n_steps-1 time steps; the target is the last time step.
X_train, y_train = seriesn[:7000,:n_steps-1], seriesn[:7000,-1]
X_valid, y_valid = seriesn[7000:9000,:n_steps-1], seriesn[7000:9000,-1]
X_test, y_test = seriesn[9000:,:n_steps-1], seriesn[9000:,-1]

In [None]:
# Naive baseline: predict the last observed input step of every validation series.
y_pred = X_valid[:,-1]

In [None]:
# Mean squared error of the naive baseline on the validation split.
np.mean(keras.losses.mean_squared_error(y_valid, y_pred))

In [None]:
# Linear baseline: flatten the 50-step window and regress the target
# with a single dense unit.
model_lin = keras.models.Sequential()
model_lin.add(keras.layers.Flatten(input_shape=[50, 1]))
model_lin.add(keras.layers.Dense(1))

model_lin.compile(optimizer='adam', loss='mse')
history = model_lin.fit(X_train, y_train, epochs=20, verbose=0)

In [None]:
# Minimal recurrent model: one SimpleRNN unit over sequences of any length
# with a single feature per time step.
model_rnn = keras.models.Sequential([
    keras.layers.SimpleRNN(1, input_shape=[None, 1])
])
model_rnn.compile(loss='mse', optimizer='adam')
# Fit the RNN built in this cell (not the linear baseline), so that `history`
# describes the recurrent model.
history = model_rnn.fit(X_train, y_train, verbose=0, epochs=20)

# Проверка на пробном рабочем датасете

In [None]:
# Inspect the shapes of the training inputs and targets.
X_train.shape, y_train.shape

In [None]:
# Dump every user's events into a separate CSV named after the user id.
# `users` is kept because later cells iterate over it.
users = df['anonymousId'].unique()
# groupby walks the frame once instead of re-filtering it for every user;
# sort=False yields the groups in first-appearance order, the same order as
# `users`, and rows keep their original order inside each group.
for user, user_events in df.groupby('anonymousId', sort=False):
    user_events.to_csv('olduserData\\' + user + '.csv', index=False)

In [None]:
# Single-unit RNN over sequences of any length with 3 features per time step.
model_rnn = keras.models.Sequential(
    keras.layers.SimpleRNN(1, input_shape=[None,3])
)
model_rnn.compile(loss='mse', optimizer='adam')

# Train on one user at a time: each user's rows form one sequence (a batch of 1).
for i,user in enumerate(users):
    t=df[df['anonymousId'] == user]
    # Add a leading batch axis -> (1, n_rows, n_columns).
    t=t.to_numpy()[np.newaxis,...]
    # Features: every column except the first and the last one.
    X = np.float32(t[:,:,1:-1])
    # Target: the maximum of the last column over the user's rows, shape (1, 1).
    y = np.array([[np.float32(np.max(t[:,:,-1]))]])
    model_rnn.fit(X,y,epochs=10,verbose=0)

# X and y still hold the last user's data here, so this extra fit (and the
# returned history) covers that single user only.
history = model_rnn.fit(X,y,epochs=10,verbose=0)

https://keras.io/guides/functional_api/#manipulate-complex-graph-topologies

In [None]:
# Demo of keras.layers.Concatenate: join two (2, 2, 5) arrays along axis 2.
x = np.arange(20).reshape(2, 2, 5)
y = np.arange(20).reshape(2, 2, 5)
keras.layers.Concatenate(axis=2)([x, y])

In [None]:
# Display the first demo array for comparison with the concatenated result.
x

In [None]:
# Display the second demo array for comparison with the concatenated result.
y

# обработка датасета

In [None]:
%%time
# df=pd.read_csv('2000 users CJM.csv')

# Column groups of the raw dataset. The categorical and numerical lists
# drive the preprocessing and the model inputs in the cells below.
categorical_columns = [
    'eventName',
    'eventPageType',
    'eventPageCategory',
    'skuCode',
    'productCategory',
    'source',
    'medium',
    'campaign',
    'isTransaction',
]
timestamp_columns = [
    'sentAt',
    'session_start',
    'previousSessionStart',
]
numerical_columns = [
    'transactionRevenue',
    'totalTransactions_current',
    'totalTransactionsPerUser',
    'session_num',
    'secondsSinceLastSession',
    'secondsSinceLastEvent',
    'eventNumBeforeTransaction',
    'eventNumInSession',
    'sessionsCount',
]
meta_columns = [
    'anonymousId',
    'sessionId',
    'url',
]

target_columns = [
    'everTransacted',
]

In [None]:
def extract_city(row):
    """Return the city subdomain of a petrovich.ru URL, defaulting to 'spb'.

    Parameters
    ----------
    row : str or missing value
        A single URL. Non-string values (e.g. the NaN/None that pandas
        passes for missing URLs) are treated as having no city.

    Returns
    -------
    str
        The subdomain captured before ``petrovich.ru`` in the last matching
        ``https://`` URL found in ``row``; ``'spb'`` when nothing matches or
        the URL points at the bare domain.
    """
    # Missing URLs would make re.findall raise TypeError; fall back instead.
    if not isinstance(row, str):
        return 'spb'
    res = re.findall(r'https:\/\/(\w*)?\.?petrovich\.ru', row)
    # The last match wins; an empty capture means the bare domain.
    if res and res[-1] != '':
        return res[-1]
    return 'spb'
        
# Derive the city from the page URL.
df['city'] = df['url'].apply(extract_city)

# Categorical columns: mark missing values explicitly and store as strings.
for col in categorical_columns:
    df[col] = df[col].fillna('(empty)').astype('str')

# Numerical columns: missing values become 0.0, everything is float32.
for col in numerical_columns:
    df[col] = df[col].fillna(0.0).astype(np.float32)

# Keep the model features plus the user id, the city and the target column(s).
# The target has to survive this selection: the per-user export cell further
# down reads 'everTransacted' to build the class label before dropping it.
cols = categorical_columns + numerical_columns + ['anonymousId', 'city'] + target_columns
df = df[cols]

### кодирование категориальных переменных

In [None]:
%%time
def get_encoder_for_category(column_name):
    """Build, persist and return an integer encoder for one categorical column.

    The distinct values of ``df[column_name]`` (the notebook-level frame) are
    sorted and numbered from 1. The mapping is also written as JSON to
    ``categoricalEncoders\\<column_name>.json``.

    Parameters
    ----------
    column_name : str
        Name of the column in ``df`` to encode.

    Returns
    -------
    dict
        Mapping ``value -> code`` with codes ``1..n`` in sorted value order.
    """
    # Use the argument, not whatever the global loop variable `col` holds.
    values = np.sort(df[column_name].unique())
    encoder = dict(zip(values, range(1, len(values) + 1)))
    # Persist the dictionary to a JSON file.
    with open('categoricalEncoders\\'+column_name+'.json', 'w') as f:
        json.dump(encoder, f)

    return encoder

def get_heavy_encoder(col):
    """Build, persist and return a value -> code mapping for one column.

    Each distinct value of ``df[col]`` (the notebook-level frame), in order of
    first appearance, is mapped to its 1-based position rendered as a string
    ('1', '2', ...). The mapping goes through a JSON round trip, so its keys
    are whatever ``DataFrame.to_json`` emits for the values. The result is
    written to the ``categoricalEncoders`` location as ``<col>.json``.
    """
    # Two columns: 'index' (0-based position) and 0 (the distinct values).
    t=pd.Series(df[col].unique()).reset_index()
    # Shift to 1-based codes and store them as strings.
    t['index'] += 1
    t['index'] = t['index'].astype('str')
    # Transpose so that the distinct values become the column labels ...
    tt = t.T
    tt.columns = tt.loc[0,:]
    # ... and only the row of codes remains.
    tt = tt.drop(0)
    # orient='records' gives a one-element JSON array; strip the surrounding
    # brackets and parse the single object into a dict.
    encoder = json.loads(tt.to_json(orient='records')[1:-1])
    # Persist the dictionary to a JSON file.
    with open('categoricalEncoders\\'+col+'.json', 'w') as f:
        json.dump(encoder, f)
        
    return encoder

# Encode the categorical columns of df in place, one column at a time.
for col in track(categorical_columns):
    print(col)
    # The following columns have too many unique values, so they are handled separately.
    if col in ['skuCode', 'productCategory']:
        # Only normalise the missing-value marker here; no encoder is built.
        df[col] = df[col].replace({'(empty)':'0.0'})
        continue
    cat_encoder = get_encoder_for_category(col)
    df[col] = df[col].replace(cat_encoder)
    

In [None]:
# Build and persist the encoders for the two high-cardinality columns with
# the shared helper instead of repeating its body inline for each column.
skuCodeEncoder = get_heavy_encoder('skuCode')
productCategoryEncoder = get_heavy_encoder('productCategory')

In [None]:
# Write one CSV per user into targetUserData; the first character of the file
# name is the class label, which the loaders below parse back out.
for _id in track(df['anonymousId'].unique()):
    df_to_store = df[df['anonymousId'] == _id]
    everTransacted = df_to_store['everTransacted'].unique()
    # A single distinct target value becomes the label; users whose rows
    # disagree get the extra class '2'. str() is required because the label is
    # concatenated into the file name and the raw value may not be a string.
    classLabel = str(everTransacted[0]) if len(everTransacted) == 1 else '2'
    df_to_store = df_to_store.drop(columns=['anonymousId', 'everTransacted'])
    df_to_store.to_csv('targetUserData\\'+classLabel+_id+'.csv', index=False)

In [None]:
# Second pass over the per-user files: apply the two high-cardinality
# encoders and overwrite each file in place.
for filename in track(glob('targetUserData\\*.csv')):
    dft = pd.read_csv(filename)
    dft = dft.assign(
        skuCode=dft['skuCode'].replace(skuCodeEncoder),
        productCategory=dft['productCategory'].replace(productCategoryEncoder),
    )
    dft.to_csv(filename, index=False)

# выбор архитектуры

In [None]:
# Load the first three per-user files (in glob order) into a single frame.
# Note that this rebinds the notebook-level df.
userPaths = glob('targetUserData\\*.csv')[:3]
df = pd.concat([pd.read_csv(filename) for filename in userPaths])

In [None]:
# Total number of distinct values across all categorical columns of df.
summ=0
for col in categorical_columns:
    summ += len(df[col].unique())
    
summ

# Model

In [None]:
def data_generator(mode, n_train=1500, seed=0):
    """Yield one training sample per user file in ``targetUserData``.

    Parameters
    ----------
    mode : str or bytes
        ``'train'`` selects the first ``n_train`` files of the partition,
        ``'validation'`` the rest. Any other value yields nothing. Bytes are
        accepted because ``tf.data.Dataset.from_generator`` passes string
        arguments to the generator as bytes.
    n_train : int, optional
        Number of files assigned to the training split (default 1500).
    seed : int, optional
        Seed of the partition shuffle. Calls with the same seed see the same
        partition, which keeps the train and validation splits disjoint.

    Yields
    ------
    tuple
        ``((categorical_features, numerical_features), target)`` where the
        feature arrays have one row per CSV row and ``target`` repeats the
        class label (first character of the file name) once per row.
    """
    import random  # local import: the notebook only imports `shuffle` from it

    if isinstance(mode, bytes):
        mode = mode.decode('utf-8')

    # Sort before the seeded shuffle so every call derives the same partition.
    # An independent shuffle per call would let a file land in both splits.
    filenames = sorted(glob('targetUserData\\*.csv'))
    random.Random(seed).shuffle(filenames)

    if mode == 'train':
        selected = filenames[:n_train]
    elif mode == 'validation':
        selected = filenames[n_train:]
    else:
        # Unknown mode: produce an empty generator.
        return

    # The order inside the chosen split is still randomised on every call.
    shuffle(selected)

    for filename in selected:
        # The class label is the first character of the file name.
        classLabel = filename.split('\\')[-1][0]
        features = pd.read_csv(filename)
        n_timestamps = features.shape[0]
        categorical_features = features[categorical_columns].to_numpy()
        numerical_features = features[numerical_columns].to_numpy()
        target = np.array([np.int32(classLabel)]).repeat(n_timestamps)
        yield ((categorical_features, numerical_features), target)

# Instantiate both generators and pull one sample from each as a smoke test.
gen_train = data_generator('train')
gen_valid = data_generator('validation')

tt1 = next(gen_train)
tt2 = next(gen_valid)

In [None]:
# data_generator yields ((categorical, numerical), target), so output_types
# has to mirror that nesting rather than being a single dtype.
# NOTE(review): assumes the encoded categorical columns are integer-valued and
# the numerical columns are floats; confirm against the stored CSV files.
generator_output_types = ((tf.int32, tf.float32), tf.int32)
# The mode is bound in a lambda instead of `args=`: from_generator would hand
# the string over as bytes, which the generator's `mode ==` checks do not match.
dataset_train = tf.data.Dataset.from_generator(
    lambda: data_generator('train'), output_types=generator_output_types)
dataset_valid = tf.data.Dataset.from_generator(
    lambda: data_generator('validation'), output_types=generator_output_types)

In [None]:
def x_generator(filenames, mode):
    """Yield the model inputs for each file of the requested split.

    ``mode == 'train'`` uses the first 1500 entries of ``filenames``,
    ``mode == 'validation'`` the remaining ones; any other mode yields
    nothing. Every item is a dict with the keys ``'categorical_features'``
    and ``'numerical_features'`` holding the matching columns of the CSV as
    NumPy arrays.
    """
    if mode == 'train':
        selected = filenames[:1500]
    elif mode == 'validation':
        selected = filenames[1500:]
    else:
        return

    for path in selected:
        frame = pd.read_csv(path)
        yield {
            'categorical_features': frame[categorical_columns].to_numpy(),
            'numerical_features': frame[numerical_columns].to_numpy(),
        }
            
def y_generator(filenames, mode):
    """Yield the per-row targets for each file of the requested split.

    The class label of a file is the first character of its name (the part
    after the last backslash). ``mode == 'train'`` uses the first 1500
    entries of ``filenames``, ``mode == 'validation'`` the remaining ones;
    any other mode yields nothing. Every item is ``{'output_layer': target}``
    where ``target`` repeats the int32 label once per row of the CSV.
    """
    # Labels are derived for every file up front, before the split is chosen.
    labels = [name.split('\\')[-1][0] for name in filenames]

    if mode == 'train':
        pairs = zip(filenames[:1500], labels[:1500])
    elif mode == 'validation':
        pairs = zip(filenames[1500:], labels[1500:])
    else:
        return

    for path, label in pairs:
        # The file is read only to learn how many rows it has.
        row_count = pd.read_csv(path).shape[0]
        yield {'output_layer': np.array([np.int32(label)]).repeat(row_count)}
            
# Shuffle the file list once and share it between all four generators, so the
# x and y generators of a split iterate over the same files in the same order.
filenames = glob('targetUserData\\*.csv')
shuffle(filenames)
x_generator_train = x_generator(filenames, 'train')
y_generator_train = y_generator(filenames, 'train')
x_generator_valid = x_generator(filenames, 'validation')
y_generator_valid = y_generator(filenames, 'validation')

In [None]:
# Sizes of the embedding stage and of the two input feature groups.
vocab_size = 25000
output_vector_shape = 8
categorical_columns_len = len(categorical_columns)
numerical_columns_len = len(numerical_columns)

# Two sequence inputs with a variable number of time steps: one carrying the
# categorical codes, one carrying the numerical features.
cat_input = keras.Input(
    shape=(None,categorical_columns_len,), name='categorical_features'
)
num_input = keras.Input(
    shape=(None,numerical_columns_len), name='numerical_features'
)

# Embed every categorical code into an `output_vector_shape`-dim vector.
cat_features = keras.layers.Embedding(vocab_size, output_vector_shape, name='embedded_categorical_features')(cat_input)
# Split along the categorical-column axis, reduce each column's embedding to a
# single value with its own Dense(1), then stack the columns back together.
unstacked = keras.layers.Lambda(lambda x: tf.unstack(x, axis=2), name='unstack_layer')(cat_features)
dense_outputs = [keras.layers.Dense(1, name='embedded_feature_denser_'+str(i))(l) for i,l in enumerate(unstacked)]
merged = keras.layers.Lambda(lambda x: tf.stack(x, axis=2), name='stack_layer')(dense_outputs)
# `name` must be passed by keyword: the second positional parameter of Lambda
# is `output_shape`, so a positional string would not name the layer.
squeezed = keras.layers.Lambda(lambda x: tf.squeeze(x, axis=3), name='squeeze_layer')(merged)

# Join numerical features with the reduced categorical features and run the
# sequence through a recurrent layer followed by a single-unit head.
concated = keras.layers.Concatenate(name='concatenation_layer')([num_input, squeezed])
rnn_layer = keras.layers.SimpleRNN(64, input_shape=[None,1], name='RNN_layer')(concated)
output_layer = keras.layers.Dense(1, name='output_layer')(rnn_layer)

model = keras.Model(
    inputs=[cat_input, num_input],
    outputs=[output_layer]
)

# The 'output_layer' Dense has no activation, so the model emits raw logits;
# the loss must be told so, otherwise it treats them as probabilities.
model.compile(
    optimizer='adam',
    loss = {
        'output_layer': keras.losses.BinaryCrossentropy(from_logits=True)
    }
)

# The generator's yield statement must return the following structure:
# yield ({'categorical_features':categorical_features, 'numerical_features':numerical_features},{'output_layer':output_layer})
# NOTE(review): data_generator yields ((categorical, numerical), target)
# tuples rather than the dicts described above.
model.fit(
    data_generator('train'),
    validation_data = data_generator('validation'),
    epochs=1,
#     batch_size=32,
)

In [None]:
# Concatenate all per-user files, replace missing values with 0.0 and cast
# every column to str. Note that this rebinds the notebook-level df.
df = pd.concat([pd.read_csv(filename) for filename in glob('targetUserData\\*.csv')]).fillna(0.0).astype('str')

In [None]:
# One dataset element per row of df, as a dict {column name: scalar tensor}.
t = tf.data.Dataset.from_tensor_slices(dict(df))

def input_solver(sample):
    """Turn one row dict into the ``(inputs, targets)`` pair the model expects.

    Parameters
    ----------
    sample : dict
        Mapping of column name to tensor, as produced by
        ``Dataset.from_tensor_slices(dict(df))``.

    Returns
    -------
    tuple of dict
        ``({'categorical_features': ..., 'numerical_features': ...},
        {'output_layer': ...})`` keyed by the model's input and output layer names.
    """
    # A dict cannot be indexed with a list of keys, so the columns of each
    # group are looked up one by one and stacked along a new last axis.
    s1 = tf.stack([sample[c] for c in categorical_columns], axis=-1)
    s2 = tf.stack([sample[c] for c in numerical_columns], axis=-1)
    # NOTE(review): 'totalTransactionPerUser' is spelled differently from the
    # 'totalTransactionsPerUser' entry in numerical_columns. Confirm which
    # column the frame actually has.
    s3 = sample['totalTransactionPerUser']
    return {'categorical_features':s1,
            'numerical_features':s2}, {'output_layer':s3}

# NOTE(review): df was cast to str before building the dataset, so these
# tensors are strings; a numeric conversion is probably needed before fitting.
# Dataset.map returns a new dataset, so the result has to be kept.
t = t.map(input_solver)
model.fit(t, epochs=1)

# запуск обучения на массивах numpy
1. Массив np, хранящий категориальные признаки: X_train_cat, shape=(N_users, n_timestamps, n_features)
2. Массив np, хранящий количественные признаки: X_train_num, shape=(N_users, n_timestamps, n_features)
3. Массив np, хранящий значения класса: y_train_num, shape=(N_users, 1)
4. Повторить предыдущие шаги для образования тестовой выборки

In [None]:
# Collect the per-user files and shuffle them in place (no seed is set here).
filenames = glob('targetUserData\\*.csv')
shuffle(filenames)

In [None]:
def get_label_from_filename(fname):
    """Return the class label (np.int8) encoded as the first character of the
    file name, i.e. of the part of ``fname`` after the last backslash."""
    basename = fname.split('\\')[-1]
    return np.int8(basename[0])


def get_cat_data_from_dataframe(df):
    """Return the categorical columns of ``df`` as a NumPy array."""
    return df[categorical_columns].to_numpy()


def get_num_data_from_dataframe(df):
    """Return the numerical columns of ``df`` as a NumPy array."""
    return df[numerical_columns].to_numpy()

def read_singleUserData(filename):
    """Load one user file.

    Returns a tuple ``(cat_data, num_data, classLabel)``: the categorical and
    numerical feature arrays read from the CSV and the class label parsed
    from the file name.
    """
    user_frame = pd.read_csv(filename)
    label = get_label_from_filename(filename)
    # NumPy arrays extracted from the frame.
    return (
        get_cat_data_from_dataframe(user_frame),
        get_num_data_from_dataframe(user_frame),
        label,
    )

def read_userData(filenames):
    """Load several user files.

    Returns three parallel lists, one entry per file and in input order:
    categorical arrays, numerical arrays and class labels.
    """
    records = [read_singleUserData(path) for path in filenames]
    df_cat = [record[0] for record in records]
    df_num = [record[1] for record in records]
    df_target = [record[2] for record in records]
    return df_cat, df_num, df_target

In [None]:
# Smoke test: read only the first file of the shuffled list.
cat_data, num_data, target = read_userData(filenames[:1])

In [None]:
# Render the model graph together with the tensor shapes of every layer.
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Fit on the single user loaded above: the user's rows are the samples.
model.fit(
#     x={'categorical_features':cat_data[0], 'numerical_features':num_data[0]},
#     y={'output_layer':target},
    x=(cat_data[0], num_data[0]),
    # One label per row of this user's data, derived from the array itself
    # instead of a hard-coded row count.
    y=np.array(target).repeat(len(cat_data[0])),
    epochs=1
)