In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow.keras as keras
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import kerastuner as kt
import pickle
import joblib
from scipy import stats
from numpy import load
from numpy import save
from tensorflow.keras import regularizers
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

In [None]:
# Use seaborn's plain "white" style for every figure drawn below.
sns.set_style(style="white")

In [None]:
# Fix the global TensorFlow and NumPy seeds so repeated runs draw the same random numbers.
tf.random.set_seed(42)
np.random.seed(42)

In [None]:
# Display every column and up to 300 rows when a DataFrame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = 300

In [None]:
import os

# "-1" matches no CUDA device id, so no GPU is visible to this process.
os.environ.update({"CUDA_VISIBLE_DEVICES": "-1"})

In [None]:
# tf.test.is_gpu_available() is deprecated; list the physical GPU devices
# TensorFlow can see instead. Displays True when at least one GPU is visible.
len(tf.config.list_physical_devices("GPU")) > 0

In [None]:
from tensorflow.compat.v1.keras.backend import set_session

# Register a TF1-compat session whose GPU options have allow_growth enabled,
# i.e. GPU memory is requested on demand rather than reserved up front.
# NOTE(review): CUDA_VISIBLE_DEVICES is set to "-1" in this notebook, so this
# presumably has no practical effect here - confirm whether it is still needed.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(allow_growth=True)
)
set_session(tf.compat.v1.Session(config=config))


### Read data for str1 and str2

In [None]:
# Load the strand-1 frame and strip the "Str1" prefix from its two mean-temperature
# columns. Only unpickle files from a trusted source.
str1_df = pd.read_pickle("../final_data/df_str1_transformed.pkl").rename(
    columns={
        'Str1TempMittelLsInC_mean': 'TempMittelLsInC_mean',
        'Str1TempMittelFsInC_mean': 'TempMittelFsInC_mean',
    }
)

In [None]:
# Load the strand-2 frame and strip the "Str2" prefix from its two mean-temperature
# columns. Only unpickle files from a trusted source.
str2_df = pd.read_pickle("../final_data/df_str2_transformed.pkl").rename(
    columns={
        'Str2TempMittelLsInC_mean': 'TempMittelLsInC_mean',
        'Str2TempMittelFsInC_mean': 'TempMittelFsInC_mean',
    }
)

### Combine Dataframes

In [None]:
# Stack both strand frames row-wise under a fresh 0..n-1 index, then remove
# the 'ChargenNrErsteSchmInSeq' column.
df_cc4 = pd.concat([str1_df, str2_df], axis=0, ignore_index=True)
df_cc4 = df_cc4.drop(columns=['ChargenNrErsteSchmInSeq'])

### Drop some attributes

In [None]:
# Remove the begin/delta/end time columns of every zone plus the casting
# timestamp columns and 'NrSchmelzeInSequenz'.
data = df_cc4.drop(
    columns=[
        *(
            f"{zone}_{stamp}"
            for zone in ('z1', 'z2a', 'z2b', 'z3a', 'z3b', 'z4', 'z5', 'zpy')
            for stamp in ('begin_time', 'delta_time', 'end_time')
        ),
        'GiessBeginnSchmelze',
        'GiessBeginn_DateTime',
        'GiessEndeSchmelze',
        'GiessEnde_DateTime',
        'EndeSchmelze',
        'Ende_DateTime',
        'NrSchmelzeInSequenz',
    ]
)

### One hot encoding

In [None]:
# One-hot encode the 'Format' column into one 0/1 indicator column per category.
formats = np.array(data['Format'])
format_endocer = OneHotEncoder()
format_1hot = format_endocer.fit_transform(formats.reshape(-1, 1))

# Build the 'Format_<category>' names straight from the fitted categories.
# This avoids OneHotEncoder.get_feature_names(), which newer scikit-learn
# releases no longer provide, and also names categories that the previous
# hard-coded rename map (1825, 2230, 2234, 2235) did not cover.
format_columns = [f"Format_{category}" for category in format_endocer.categories_[0]]
df_format = pd.DataFrame(format_1hot.toarray(), columns=format_columns, index=data.index)

# Drop indicator columns left over from an earlier run of this cell so that
# re-running it does not append duplicate 'Format_*' columns.
data = pd.concat((data.drop(columns=format_columns, errors="ignore"), df_format), axis=1)

### Save Data

In [None]:
# data.to_pickle("/home/di40438/bachelorarbeit/data/mlp_data.pkl")

In [None]:
# Summary statistics of the two mean-temperature columns.
data.loc[:, ['TempMittelLsInC_mean', 'TempMittelFsInC_mean']].describe()

In [None]:
# Pairwise correlation of the remaining columns; identifier columns and the
# format columns (raw and one-hot) are left out.
# seaborn is already imported in the notebook's import cell.
corrmat = data.drop(
    columns=[
        'ChargenNr', 'SequenzNr', 'strang_nr', 'ith_section',
        'Format', 'Format_1825', 'Format_2230', 'Format_2234', 'Format_2235',
    ]
).corr()
f, ax = plt.subplots(figsize=(12, 12))
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax)
ax.set_title("Feature correlation matrix");

#### Data cleaning

In [None]:
# Keep only rows with ith_section > 0 (drops the first section of each sequence),
# remove the identifier columns and renumber the rows.
data_prep = (
    data.loc[data['ith_section'] > 0]
    .drop(columns=['ChargenNr', 'ith_section', 'strang_nr'])
    .reset_index(drop=True)
)

# Column index of the cleaned frame; the feature list is derived from it.
columns = data_prep.columns

In [None]:
# Summary statistics of the cleaned frame, displayed as a quick sanity check.
data_prep.describe()

In [None]:
# Target columns.
labels = ['TempMittelLsInC_mean', 'TempMittelFsInC_mean']
# Columns that are neither targets nor model inputs.
unwanted = [
    'Format',
    'UeberhitzungMittelInK',
    'SollGiessGeschwInMproMin',
    'SequenzNr',
    'seq_id',
]
# Every remaining column, in its original order, is a model input.
features = [name for name in columns if name not in labels and name not in unwanted]

In [None]:
# with open('/home/di40438/bachelorarbeit/final_data/label_keys_mlp.pkl', 'wb') as fp:
#     pickle.dump(labels, fp)
# with open('/home/di40438/bachelorarbeit/final_data/feature_keys_mlp.pkl', 'wb') as fp:
#     pickle.dump(features, fp)

### Same Sequences as for LSTM

In [None]:
# Sequence that is kept out of the random split and always put into the
# training keys (previously a magic number repeated three times).
PINNED_TRAIN_SEQUENCE = 475229

sequences = data_prep.groupby('SequenzNr')
seq_keys_mlp = list(sequences.groups.keys())
# list.remove raises ValueError if the pinned sequence is not in the data.
seq_keys_mlp.remove(PINNED_TRAIN_SEQUENCE)

# Split on sequence numbers, not rows, so a sequence never spans two sets:
# 25 % of the keys become test keys, then 25 % of the rest validation keys.
# train_test_split is already imported in the notebook's import cell.
train_full, test_keys = train_test_split(seq_keys_mlp, test_size=0.25, random_state=42)
train_keys, valid_keys = train_test_split(train_full, test_size=0.25, random_state=42)

# Add the pinned sequence only after both splits, so it cannot change which
# keys the seeded splits select.
train_keys.append(PINNED_TRAIN_SEQUENCE)
train_full.append(PINNED_TRAIN_SEQUENCE)

In [None]:
def create_set(key_set, data):
    """Collect the rows of ``data`` that belong to the given sequences.

    Parameters
    ----------
    key_set : iterable
        ``SequenzNr`` values to select. Rows are returned grouped per key, in
        the order the keys are given.
    data : pandas.DataFrame
        Frame containing a ``SequenzNr`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels and dtypes. An
        empty ``DataFrame`` is returned when ``key_set`` is empty.
    """
    sequence_column = data['SequenzNr']
    # Gather one slice per key and concatenate once. Growing a frame inside the
    # loop is quadratic, and seeding it with an empty DataFrame() relies on
    # deprecated concat behaviour that can disturb column dtypes.
    frames = [data[sequence_column == key] for key in key_set]
    if not frames:
        # pd.concat rejects an empty list; keep the previous empty-frame result.
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [None]:
# Materialise one frame per key list, each renumbered from 0.
train_data = create_set(train_keys, data_prep).reset_index(drop=True)
valid_data = create_set(valid_keys, data_prep).reset_index(drop=True)
test_data = create_set(test_keys, data_prep).reset_index(drop=True)

In [None]:
# Test sequences taken from the combined frame (rows with ith_section > 0 only).
# reset_index() without drop=True keeps the previous row labels in an 'index' column.
test_data_evaluation = create_set(
    test_keys, df_cc4.loc[df_cc4['ith_section'] > 0]
).reset_index()

### Triple the training sequences with a target temperature above 800 degrees

In [None]:
# Training rows whose target temperature exceeds 800, and the ids of the
# sequences they belong to (each id once, in order of first appearance).
df_high_temp = train_data[train_data['ZielTempTreiberInC'] > 800]
seq_id = list(df_high_temp['seq_id'].drop_duplicates())

# Two full copies of those sequences; their seq_id values become strings with
# the suffix '_1' (first copy) and '_2' (second copy).
df_hot_seq, df_hot_seq2 = (
    train_data.set_index('seq_id')
    .loc[seq_id]
    .reset_index()
    .assign(seq_id=lambda frame: frame['seq_id'].astype(str) + suffix)
    for suffix in ('_1', '_2')
)

In [None]:
# Append both copies of the hot sequences to the training rows.
# NOTE: this overwrites train_data, so running the cell again appends further copies.
train_data = pd.concat([train_data, df_hot_seq, df_hot_seq2], axis=0, ignore_index=True)

# Augmented training rows followed by the validation rows.
all_train_data = pd.concat([train_data, valid_data], axis=0, ignore_index=True)

#### Scaling the data

#### Train and Validation Data

In [None]:
# Unscaled inputs and targets of the training and validation frames.
train_X_unsc, train_y_unsc = train_data[features], train_data[labels]
valid_X_unsc, valid_y_unsc = valid_data[features], valid_data[labels]

# Both scalers are fitted on the training rows only; the validation rows are
# transformed with the training min/max.
scaler_x_train = MinMaxScaler().fit(train_X_unsc)
train_X = scaler_x_train.transform(train_X_unsc)
valid_X = scaler_x_train.transform(valid_X_unsc)

scaler_y_train = MinMaxScaler().fit(train_y_unsc)
train_y = scaler_y_train.transform(train_y_unsc)
valid_y = scaler_y_train.transform(valid_y_unsc)


#### All known Data and Test Data

In [None]:
# Unscaled inputs and targets of the combined train+validation frame and the test frame.
all_train_X_unsc, all_train_y_unsc = all_train_data[features], all_train_data[labels]
test_X_unsc, test_y_unsc = test_data[features], test_data[labels]

# Both scalers are fitted on train+validation only; the test rows are
# transformed with that min/max.
scaler_x = MinMaxScaler().fit(all_train_X_unsc)
all_train_X = scaler_x.transform(all_train_X_unsc)
test_X = scaler_x.transform(test_X_unsc)

scaler_y = MinMaxScaler().fit(all_train_y_unsc)
all_train_y = scaler_y.transform(all_train_y_unsc)
test_y = scaler_y.transform(test_y_unsc)

### Save data

In [None]:
# train_data.to_pickle("../final_data/train_unsc_mlp.pkl")
# valid_data.to_pickle("../final_data/valid_unsc_mlp.pkl")
# test_data.to_pickle("../final_data/test_unsc_mlp.pkl")
# all_train_data.to_pickle("../final_data/all_train_unsc_mlp.pkl")

In [None]:
# save('../final_data/train_x_mlp.npy', train_X)
# save('../final_data/train_y_mlp.npy', train_y)
# save('../final_data/valid_x_mlp.npy', valid_X)
# save('../final_data/valid_y_mlp.npy', valid_y)
# save('../final_data/test_x_mlp.npy', test_X)
# save('../final_data/test_y_mlp.npy', test_y)
# save('../final_data/all_x_mlp.npy', all_train_X)
# save('../final_data/all_y_mlp.npy', all_train_y)

In [None]:
# test_data_evaluation.to_pickle("../final_data/test_data_evaluation.pkl")

#### Save scalers

In [None]:
# scaler_x_filename = "../final_data/scaler_x_mlp.save"
# joblib.dump(scaler_x, scaler_x_filename)

# scaler_y_filename = "../final_data/scaler_y_mlp.save"
# joblib.dump(scaler_y, scaler_y_filename)

# scaler_y_filename = "../final_data/scaler_y_train_mlp.save"
# joblib.dump(scaler_y_train, scaler_y_filename)