In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.preprocessing import MinMaxScaler
import json
import seaborn as sns
from numpy import asarray
from numpy import save
from numpy import load
import pickle
import joblib
from sklearn.model_selection import train_test_split
import matplotlib

In [None]:
# Apply seaborn's "white" style to all figures created below.
sns.set_style(style="white")

In [None]:
# Relax pandas' display limits: no cap on columns, up to 1000 rows.
for _option, _value in (('display.max_columns', None), ('display.max_rows', 1000)):
    pd.set_option(_option, _value)

## Data preparation for LSTM

In [None]:
# Load the CC4 data set from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only read files from a trusted source.
cc4_pickle_path = "/home/di40438/bachelorarbeit/final_data/cc4_data.pkl"
df_cc4 = pd.read_pickle(cc4_pickle_path)

### Minutes a section needs to pass through the caster when the casting speed is at its lowest

In [None]:
# Window length for the LSTM: caster length divided by the slowest per-sequence
# mean casting speed, rounded up to a whole number of steps.
# (The column name suggests m/min, so the ratio would be minutes -- TODO confirm units.)

# Reuse the frame that is already loaded instead of reading the pickle a second time.
data = df_cc4.drop('seq_id', axis=1).apply(pd.to_numeric)

# Mean casting speed of every sequence (sequences are keyed by 'ChargenNrErsteSchmInSeq').
combined_grouped = data.groupby(data['ChargenNrErsteSchmInSeq'])
grouped = combined_grouped['GiessGeschwInMproMin'].mean().to_frame()

length_cc4 = 15.42
# `grouped` is a one-column frame, so `min_v` is a one-element Series.
min_v = np.round(grouped.min(),2)
# Pull the scalar out explicitly: int() on a one-element Series is deprecated in pandas.
seq_len = int(np.ceil(length_cc4 / min_v).item())
seq_len

In [None]:
# Columns singled out for manual inspection.
interesting = [
    'DATE_TIME',
    'ChargenNr',
    'GiessBeginn_DateTime',
    'GiessEnde_DateTime',
    'TempMittelLsInC',
    'GiessLaengeSequenzInM',
    'TempMittelFsInC',
    'ChargenNrErsteSchmInSeq',
    'GiessLaengeSchmelzeInM',
    'AusfLaengeSchmelzeInM',
    'seq_id',
]

In [None]:
# Columns that are dropped from the CC4 frame before modelling.
unwanted = [
    # time stamps, heat identifiers and heat bookkeeping
    'DATE_TIME',
    'TIME',
    'ChargenNr',
    'GiessBeginnSchmelze',
    'GiessBeginn_DateTime',
    'GiessEndeSchmelze',
    'GiessEnde_DateTime',
    'EndeSchmelze',
    'Ende_DateTime',
    'NrSchmelzeInSequenz',
    # per-zone water columns
    'WasserZ1FsInLproMin',
    'WasserZ1LsInLproMin',
    'WasserZ2bFsInLproMin',
    'WasserZ4FsInLproMin',
    'WasserZ4LsInLproMin',
    'WasserZ3bLsInLproMin',
    'WasserZ3bFsInLproMin',
    'WasserZ1DiefflenInLproMin',
    'WasserZ2bLsInLproMin',
    'WasserZ1DillingenInLproMin',
    'WasserZ2aLsInLproMin',
    'WasserZ2aFsInLproMin',
    'WasserZ5FsInLproMin',
    'WasserZ5LsInLproMin',
    'WasserZ3aFsInLproMin',
    'WasserZ3aLsInLproMin',
    # casting speed and remaining helper columns
    'GiessGeschwInMproMin',
    'GiessGeschwInMproMin_integr',
    'strang_nr',
    'TempMittelLsInC_old',
    'TempMittelFsInC_old',
    'WasserZ1DiefflenInLproMin_integr',
    'WasserZ1DillingenInLproMin_integr',
    'GiessLaengeSchmelzeInM_delta',
    'Format',
]

In [None]:
# Column bookkeeping: derive the label and feature column lists from the CC4 frame.
cc4_keys = df_cc4.columns.tolist()

# Every column that is not listed in `unwanted`.
column_keys = list(filter(lambda name: name not in unwanted, cc4_keys))

# The same columns without the sequence identifier.
numeric_keys = list(column_keys)
numeric_keys.remove('seq_id')

# The two temperature columns are the prediction targets.
label_keys = ['TempMittelLsInC', 'TempMittelFsInC']

# Features: everything numeric that is neither a label nor the sequence grouping key.
feature_keys = [name for name in numeric_keys if name not in label_keys]
feature_keys.remove('ChargenNrErsteSchmInSeq')

# Labels first, then features.
wanted_keys = [*label_keys, *feature_keys]

In [None]:
# with open('/home/di40438/bachelorarbeit/final_data/label_keys_ts.pkl', 'wb') as fp:
#     pickle.dump(label_keys, fp)
# with open('/home/di40438/bachelorarbeit/final_data/feature_keys_ts.pkl', 'wb') as fp:
#     pickle.dump(feature_keys, fp)

### Drop some columns

In [None]:
# Working frame: CC4 data without the columns listed in `unwanted`.
cc4_data = df_cc4.drop(columns=unwanted)

In [None]:
# Summary statistics of the two temperature columns.
cc4_data.loc[:, ['TempMittelLsInC', 'TempMittelFsInC']].describe()

In [None]:
# Side-by-side histograms of the two temperature columns over the full data set.
fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharex=True)
fig.suptitle('Distributions of Temperature in Datasets')

# One panel per temperature column; looping (instead of two copy-pasted calls)
# also keeps the cell from echoing the last Axes repr.
for ax, temp_col in zip(axes, ('TempMittelLsInC', 'TempMittelFsInC')):
    sns.histplot(ax=ax, data=cc4_data, x=temp_col, bins=100)

### Create training, validation, and test sets
#### Don't split in the middle of a sequence

In [None]:
# Split at the level of whole casting sequences so that no sequence is spread
# across the train / validation / test sets.

# Key of the "hot" sequence; it is excluded from the random split and then
# appended to the training keys, so it always ends up in the training data.
HOT_SEQ_KEY = 475229

sequences = cc4_data.groupby('ChargenNrErsteSchmInSeq')
seq_keys = list(sequences.groups.keys())
seq_keys.remove(HOT_SEQ_KEY)  # raises ValueError if the hot sequence is missing from the data

# `train_test_split` comes from the import cell at the top of the notebook.
# 75 % / 25 % for (train + validation) / test, then 75 % / 25 % again for train / validation.
train_full_keys, test_keys = train_test_split(seq_keys, test_size=0.25, random_state=42)
train_keys, valid_keys = train_test_split(train_full_keys, test_size=0.25, random_state=42)

# Put the hot sequence back into both training key lists.
train_keys.append(HOT_SEQ_KEY)
train_full_keys.append(HOT_SEQ_KEY)

In [None]:
# with open('/home/di40438/bachelorarbeit/final_data/train_keys_ts.pkl', 'wb') as fp:
#     pickle.dump(train_keys, fp)
# with open('/home/di40438/bachelorarbeit/final_data/valid_keys_ts.pkl', 'wb') as fp:
#     pickle.dump(valid_keys, fp)
# with open('/home/di40438/bachelorarbeit/final_data/test_keys_ts.pkl', 'wb') as fp:
#     pickle.dump(test_keys, fp)

In [None]:
def create_set(key_set,data):
    """Collect the rows of ``data`` that belong to the given sequence keys.

    Parameters
    ----------
    key_set : iterable
        Values of the ``'ChargenNrErsteSchmInSeq'`` column to select. The output
        keeps the order of ``key_set``; a key listed twice is selected twice.
    data : pandas.DataFrame
        Frame containing a ``'ChargenNrErsteSchmInSeq'`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, grouped key by key, with their original index labels.
        An empty ``key_set`` yields an empty ``DataFrame``.
    """
    # Gather all slices first and concatenate once: growing a frame with
    # pd.concat inside the loop copies the accumulated rows on every iteration.
    parts = [data[data['ChargenNrErsteSchmInSeq'] == key] for key in key_set]
    if not parts:
        # pd.concat refuses an empty list; keep the "empty frame" result instead.
        return pd.DataFrame()
    return pd.concat(parts, axis=0)

In [None]:
# Materialise the three splits from their key lists, each with a fresh 0..n-1 index.
train_data, valid_data, test_data = (
    create_set(split_keys, cc4_data).reset_index(drop=True)
    for split_keys in (train_keys, valid_keys, test_keys)
)

### Oversample hot sequences: add two extra copies of every training sequence whose `ZielTempTreiberInC` exceeds 800

In [None]:
# Rows whose 'ZielTempTreiberInC' exceeds 800, and the ids of the sequences they belong to.
df_high_temp = train_data[train_data['ZielTempTreiberInC'] > 800]
seq_id = list(df_high_temp['seq_id'].drop_duplicates(keep='first'))

# Fetch every row of those sequences once and reuse the result for both copies
# (the same indexed lookup used to be executed twice).
hot_rows = train_data.set_index('seq_id', drop=True).loc[seq_id].reset_index(drop=False)

# Two copies whose ids get a '_1' / '_2' suffix, so each copy forms its own
# group when the data is later grouped by 'seq_id'.
df_hot_seq = hot_rows.assign(seq_id=hot_rows['seq_id'].astype(str) + '_1')
df_hot_seq2 = hot_rows.assign(seq_id=hot_rows['seq_id'].astype(str) + '_2')

In [None]:
# Append both sets of hot-sequence copies to the training data.
# NOTE: `train_data` is rebuilt from itself, so running this cell again appends the copies again.
train_data = pd.concat([train_data, df_hot_seq, df_hot_seq2], axis=0, ignore_index=True)

# One data set holding all the known data (training + validation).
all_train_data = pd.concat([train_data, valid_data], axis=0, ignore_index=True)

In [None]:
# 3 x 2 grid of temperature histograms: one row per split (train / validation / test),
# one column per temperature column.
fig, axes = plt.subplots(3, 2, figsize=(10, 12), sharex=True, sharey=True)

matplotlib.rc('xtick', labelsize=13)
matplotlib.rc('ytick', labelsize=13)

temp_cols = ('TempMittelLsInC', 'TempMittelFsInC')
bar_color = sns.color_palette()[0]

# (frame, panel title, x-labels for the left / right panel) for each grid row.
panel_rows = [
    (train_data, 'Training Set', ('Temperature [°C]', 'Temperature [°C]')),
    (valid_data, 'Validation Set', ('Temperature [°C]', 'Temperature [°C]')),
    (test_data, 'Test Set', ('Temperature Loose Side [°C]', 'Temperature Fixed Side [°C]')),
]

# Draw all histograms first ...
for row_axes, (frame, _, _) in zip(axes, panel_rows):
    for ax, temp_col in zip(row_axes, temp_cols):
        sns.histplot(ax=ax, data=frame, x=temp_col, color=bar_color)

# ... then decorate every panel the same way (this overrides seaborn's default x-label).
for row_axes, (_, panel_title, x_labels) in zip(axes, panel_rows):
    for ax, x_label in zip(row_axes, x_labels):
        ax.set_title(panel_title, fontdict={'fontsize': 13})
        ax.set_xlabel(x_label, fontsize=13)
        ax.set_ylabel('Count', fontsize=13)
        ax.yaxis.grid(True)
        ax.set(xticks=[])  # hide the x ticks





In [None]:
# train_data.to_pickle("/home/di40438/bachelorarbeit/final_data/train_unsc.pkl")
# valid_data.to_pickle("/home/di40438/bachelorarbeit/final_data/valid_unsc.pkl")
# test_data.to_pickle("/home/di40438/bachelorarbeit/final_data/test_unsc.pkl")
# all_train_data.to_pickle("/home/di40438/bachelorarbeit/final_data/all_train_unsc.pkl")

### Normalize data

In [None]:
#save ids since they are needed later
id_train = train_data['seq_id'].to_frame()
id_valid = valid_data['seq_id'].to_frame()
id_test = test_data['seq_id'].to_frame()
id_all_train = all_train_data['seq_id'].to_frame()

#train set and validation set
scaler_x_train = MinMaxScaler(feature_range=(0,1))
train_x = scaler_x_train.fit_transform(train_data[feature_keys])
df_train_x_norm = pd.DataFrame(train_x, columns=feature_keys)
valid_x = scaler_x_train.transform(valid_data[feature_keys])
df_valid_x_norm = pd.DataFrame(valid_x, columns=feature_keys)

scaler_y_train = MinMaxScaler(feature_range=(0,1))
train_y = scaler_y_train.fit_transform(train_data[label_keys])
df_train_y_norm = pd.DataFrame(train_y, columns=label_keys)
valid_y = scaler_y_train.transform(valid_data[label_keys])
df_valid_y_norm = pd.DataFrame(valid_y, columns=label_keys)

#all the training data
scaler_x = MinMaxScaler(feature_range=(0,1))
all_x = scaler_x.fit_transform(all_train_data[feature_keys])
df_all_x_norm = pd.DataFrame(all_x, columns=feature_keys)
test_x = scaler_x.transform(test_data[feature_keys])
df_test_x_norm = pd.DataFrame(test_x, columns=feature_keys)

scaler_y = MinMaxScaler(feature_range=(0,1))
all_y = scaler_y.fit_transform(all_train_data[label_keys])
df_all_y_norm = pd.DataFrame(all_y, columns=label_keys)
test_y = scaler_y.transform(test_data[label_keys])
df_test_y_norm = pd.DataFrame(test_y, columns=label_keys)

#create one dataframe
train_norm = pd.concat([df_train_y_norm, df_train_x_norm], axis=1)
valid_norm = pd.concat([df_valid_y_norm, df_valid_x_norm], axis=1)
test_norm = pd.concat([df_test_y_norm, df_test_x_norm], axis=1)
all_train_norm = pd.concat([df_all_y_norm, df_all_x_norm], axis=1)

#add the ids again
train_norm = pd.concat([train_norm,id_train],axis=1)
valid_norm = pd.concat([valid_norm,id_valid],axis=1)
test_norm = pd.concat([test_norm,id_test],axis=1)
all_train_norm = pd.concat([all_train_norm,id_all_train],axis=1)

train_norm = train_norm.reset_index(drop=True)
valid_norm = valid_norm.reset_index(drop=True)
test_norm = test_norm.reset_index(drop=True)
all_train_norm = all_train_norm.reset_index(drop=True)

### Save Scaler

In [None]:
# scaler_filename = "/home/di40438/bachelorarbeit/final_data/scaler_y.save"
# joblib.dump(scaler_y, scaler_filename)

# scaler_filename = "/home/di40438/bachelorarbeit/final_data/scaler_x.save"
# joblib.dump(scaler_x, scaler_filename)

# scaler_filename = "/home/di40438/bachelorarbeit/final_data/scaler_train_y.save"
# joblib.dump(scaler_y_train, scaler_filename)

# scaler_filename = "/home/di40438/bachelorarbeit/final_data/scaler_train_x.save"
# joblib.dump(scaler_x_train, scaler_filename)

### Save Normalized Data

In [None]:
# train_norm.to_pickle("/home/di40438/bachelorarbeit/final_data/train_norm.pkl")
# valid_norm.to_pickle("/home/di40438/bachelorarbeit/final_data/valid_norm.pkl")
# test_norm.to_pickle("/home/di40438/bachelorarbeit/final_data/test_norm.pkl")
# all_train_norm.to_pickle("/home/di40438/bachelorarbeit/final_data/all_train_norm.pkl")

### Bring the data into the right format

In [None]:
def lstm_format(data, seq_len=None, feature_cols=None, label_cols=None):
    """Cut every sequence in ``data`` into sliding windows for the LSTM.

    For each ``'seq_id'`` group, every run of ``seq_len`` consecutive rows becomes
    one sample: the feature columns of the whole window form the input, and the
    label columns of the window's last row form the target. Groups shorter than
    ``seq_len`` contribute no samples.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'seq_id'`` column plus the feature and label columns.
    seq_len : int, optional
        Window length in rows. When omitted it is derived as before from the
        notebook-level ``grouped`` frame: ``ceil(15.42 / min(grouped))`` with the
        minimum rounded to two decimals.
    feature_cols : list of str, optional
        Input columns; defaults to the notebook-level ``feature_keys``.
    label_cols : list of str, optional
        Target columns; defaults to the notebook-level ``label_keys``.

    Returns
    -------
    dict
        ``{'x': array of shape (n_windows, seq_len, n_features),
        'y': array of shape (n_windows, n_labels)}``.

    Raises
    ------
    ValueError
        If ``seq_len`` is smaller than 1, or if no group is long enough to
        produce a single window.
    """
    if seq_len is None:
        length_cc4 = 15.42
        # `grouped.min()` is a one-element Series (or a scalar); take the value
        # explicitly, because int() on a one-element Series is deprecated.
        min_v = float(np.ravel(np.round(grouped.min(), 2))[0])
        seq_len = int(np.ceil(length_cc4 / min_v))
    if seq_len < 1:
        raise ValueError(f"seq_len must be at least 1, got {seq_len}")

    feature_cols = list(feature_keys if feature_cols is None else feature_cols)
    label_cols = list(label_keys if label_cols is None else label_cols)

    features = []
    labels = []
    for _, group in data.groupby('seq_id'):
        # Convert once per group; the windows below are plain array slices.
        feature_values = group[feature_cols].to_numpy()
        label_values = group[label_cols].to_numpy()
        for start in range(len(group) - (seq_len - 1)):
            stop = start + seq_len
            features.append(feature_values[start:stop])
            labels.append(label_values[stop - 1])  # target = last row of the window

    if not features:
        # np.stack([]) would fail with an unhelpful message.
        raise ValueError(
            f"no sequence in data has at least seq_len={seq_len} rows; cannot build any window"
        )

    return {'x': np.stack(features), 'y': np.stack(labels)}

In [None]:
# Windowed training data: independent copies of the input and target arrays.
train_n = lstm_format(train_norm)
train_x_norm, train_y_norm = (train_n[part].copy() for part in ('x', 'y'))

In [None]:
# Windowed validation data: independent copies of the input and target arrays.
valid_n = lstm_format(valid_norm)
valid_x_norm, valid_y_norm = (valid_n[part].copy() for part in ('x', 'y'))

In [None]:
# Windowed test data: independent copies of the input and target arrays.
test_n = lstm_format(test_norm)
test_x_norm, test_y_norm = (test_n[part].copy() for part in ('x', 'y'))

In [None]:
# Windowed training + validation data: independent copies of the input and target arrays.
all_train_n = lstm_format(all_train_norm)
all_train_x_norm, all_train_y_norm = (all_train_n[part].copy() for part in ('x', 'y'))

### Save data

In [None]:
# NOTE(review): these lines used to pass train_x / train_y / ... (the 2-D scaler outputs)
# and all_train_x / all_train_y (never defined in this notebook). The target file names
# suggest the windowed lstm_format arrays (*_x_norm / *_y_norm) were meant -- confirm before enabling.
# save('/home/di40438/bachelorarbeit/final_data/train_x_norm.npy', train_x_norm)
# save('/home/di40438/bachelorarbeit/final_data/train_y_norm.npy', train_y_norm)
# save('/home/di40438/bachelorarbeit/final_data/valid_x_norm.npy', valid_x_norm)
# save('/home/di40438/bachelorarbeit/final_data/valid_y_norm.npy', valid_y_norm)
# save('/home/di40438/bachelorarbeit/final_data/test_x_norm.npy', test_x_norm)
# save('/home/di40438/bachelorarbeit/final_data/test_y_norm.npy', test_y_norm)
# save('/home/di40438/bachelorarbeit/final_data/all_train_x_norm.npy', all_train_x_norm)
# save('/home/di40438/bachelorarbeit/final_data/all_train_y_norm.npy', all_train_y_norm)