In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from utils.preprocess import *
from utils.model import *

In [4]:
# Load the training targets and the train/test data sets.
# NOTE(review): `read_target_train` and `read_data` are not defined in this
# section; presumably they come from the `utils` star imports — confirm.
target = read_target_train()
# The objects returned by `read_data` are indexed by table name further down
# (e.g. train_data['plavki']).
train_data = read_data('train')
test_data = read_data('test')

from category_encoders import TargetEncoder

def first_preprocess_plavki(plavki):
    """Collapse the melt table to one row per ``NPLV`` and add a duration column.

    For every distinct ``NPLV`` value the first row of its group is kept.
    In that row the numeric columns (the int/float dtypes listed in
    ``numerics``) are replaced by the group-wise mean; every other column
    keeps the value from the first row.  Afterwards ``plavka_VR_NACH`` and
    ``plavka_VR_KON`` are replaced by a single ``'plavka time'`` column.

    Args:
        plavki: DataFrame containing at least the columns ``NPLV``,
            ``plavka_VR_NACH`` and ``plavka_VR_KON``.  It is not modified.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index and one row per
        ``NPLV`` value, in order of first appearance.  Rows whose ``NPLV``
        is missing do not produce an output row.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # The dtypes are the same for every group, so resolve the columns once.
    numeric_cols = plavki.select_dtypes(include=numerics).columns

    rows = []
    # sort=False keeps the groups in order of first appearance, which is the
    # order `plavki.NPLV.unique()` would give.
    for _, cur_ts in plavki.groupby('NPLV', sort=False):
        # Explicit copy: the row is modified below and must not be a view
        # of the caller's frame.
        new_ts = cur_ts.iloc[[0]].copy()
        means = cur_ts[numeric_cols].mean().to_frame().T
        # Give the one-row mean frame the same index label as `new_ts` so
        # the column assignment aligns.
        means.index = new_ts.index
        new_ts[numeric_cols] = means
        rows.append(new_ts)

    # Concatenate once instead of growing a frame inside the loop.  With no
    # groups, fall back to an empty frame that still has the input columns.
    new_plavki = pd.concat(rows) if rows else plavki.iloc[0:0].copy()
    new_plavki = new_plavki.reset_index(drop=True)

    # NOTE: `.dt.seconds` is the seconds component of the timedelta
    # (0..86399), not the total duration.  Kept as-is to preserve the
    # existing output; intervals of a day or more, or negative intervals,
    # are therefore not reported as total seconds.
    new_plavki['plavka time'] = (
        pd.to_datetime(new_plavki['plavka_VR_KON'])
        - pd.to_datetime(new_plavki['plavka_VR_NACH'])
    ).dt.seconds
    new_plavki = new_plavki.drop(['plavka_VR_NACH', 'plavka_VR_KON'], axis=1)
    return new_plavki


def preprocess_plavki(train_plavki, test_plavki):
    """Aggregate both melt tables and add target-encoded categorical columns.

    Each input is first reduced with ``first_preprocess_plavki``.  Every
    column in ``plavki_cat`` is then duplicated twice, as ``<col>_C`` and
    ``<col>_TST``.  One ``TargetEncoder`` per suffix is fitted on the
    training frame against ``target['C']`` and ``target['TST']``
    respectively, and applied to both frames.  The original categorical
    columns are left in place.

    NOTE(review): the targets come from ``read_target_train()``; this
    presumably relies on the target rows lining up with the aggregated
    training rows — confirm.

    Args:
        train_plavki: raw training melt table.
        test_plavki: raw test melt table.

    Returns:
        Tuple ``(train_frame, test_frame)`` of the encoded frames.
    """
    train_frame = first_preprocess_plavki(train_plavki)
    test_frame = first_preprocess_plavki(test_plavki)

    plavki_cat = ['plavka_NMZ', 'plavka_NAPR_ZAD', 'plavka_TIPE_FUR', 'plavka_TIPE_GOL']
    # (column of the target table, suffix of the duplicated feature columns)
    encodings = (('C', '_C'), ('TST', '_TST'))

    # Duplicate the raw categorical columns; the loop nesting keeps the
    # resulting column order (<col>_C then <col>_TST, per source column).
    for source_col in plavki_cat:
        for _, suffix in encodings:
            for frame in (train_frame, test_frame):
                frame[source_col + suffix] = frame[source_col]

    target = read_target_train()

    # Fit every encoder on the not-yet-transformed training frame first ...
    fitted_encoders = []
    for target_col, suffix in encodings:
        encoder = TargetEncoder(cols=[name + suffix for name in plavki_cat])
        encoder.fit(train_frame, target[target_col])
        fitted_encoders.append(encoder)

    # ... and only then apply them, in the same order, to both frames.
    for encoder in fitted_encoders:
        train_frame = encoder.transform(train_frame)
        test_frame = encoder.transform(test_frame)

    return train_frame, test_frame
    

# Build the aggregated, target-encoded melt tables for both splits.
new_plavki_train, new_plavki_test = preprocess_plavki(train_data['plavki'], test_data['plavki'])

# Cast the columns listed in config['cat_features_list'] to strings in both frames.
# NOTE(review): `config` is not defined in this section; presumably it comes
# from the `utils` star imports — confirm.
new_plavki_train[config['cat_features_list']] = new_plavki_train[config['cat_features_list']].astype(str)
new_plavki_test[config['cat_features_list']] = new_plavki_test[config['cat_features_list']].astype(str)

# Disabled: drop the config['cat_features_list'] columns from both frames.
# new_plavki_train = new_plavki_train.drop(config['cat_features_list'], axis=1)
# new_plavki_test = new_plavki_test.drop(config['cat_features_list'], axis=1)


# Write both frames as CSV (without the index column) to paths formed by
# prefixing the file names with config['data_path'].
new_plavki_train.to_csv(config['data_path'] + 'preprocessed_plavki_train.csv', index=False)
new_plavki_test.to_csv(config['data_path'] + 'preprocessed_plavki_test.csv', index=False)