In [None]:
%%javascript
// Replace the output area's scroll check with a function that answers
// `false` for every line count, whatever `lines` is.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from scipy.interpolate import interp1d
import seaborn as sns
import pickle
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import norm
from scipy import stats

In [None]:
# pandas display settings: no limit on shown columns, at most 1000 rows.
for _option, _value in (('display.max_columns', None), ('display.max_rows', 1000)):
    pd.set_option(_option, _value)

### Read Time Series Data

In [None]:
from pathlib import Path

data_dir = Path("../final_data/csv")

# Stems of CSV files in data_dir that are not loaded into `data`.
skipped_stems = ("schmelzen", "FormateCC4")

# One DataFrame per remaining CSV file, keyed by the file stem.
data = {}
for csv_file in data_dir.glob("*.csv"):
    signal_name = csv_file.stem
    if signal_name not in skipped_stems:
        frame = pd.read_csv(csv_file, delimiter=";")
        # TIME is read as seconds since the epoch and exposed as a UTC timestamp.
        frame['DATE_TIME'] = pd.to_datetime(frame['TIME'], unit='s').dt.tz_localize("UTC")
        frame = frame.set_index('DATE_TIME', drop=False).sort_index()
        data[signal_name] = frame.sort_values("TIME")

In [None]:
# Keep only tundish readings strictly between 1400 and 1600.
tundish = data['TundishTemperaturInC']
tundish_values = tundish['TundishTemperaturInC']
data['TundishTemperaturInC'] = tundish[(tundish_values < 1600) & (tundish_values > 1400)]

## Preprocess Time Series Data

In [None]:
frequency = '1min'

# Regular time grid: one row per `frequency` step between the two fixed dates.
rng = pd.date_range(start='2019-03-01 00:10:00', end='2019-08-30 00:40:00', freq=frequency)
df_times = pd.DataFrame({'DateTime': rng})

# Whole seconds since 1970-01-01 for every grid point. Dividing the time
# difference by a one-second Timedelta gives the same result whatever internal
# resolution the DatetimeIndex uses; casting the raw values to int64 and
# dividing by 10**9 is only correct for nanosecond-resolution data.
df_times['ts'] = ((rng - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)).to_numpy()

### Define Keys

In [None]:
discr = list(data.keys())

# Loaded signals that are left out of integrate_keys_all.
_not_integrated_all = {
    'TundishTemperaturInC',
    'Str1TempMittelLsInC',
    'Str1TempMittelFsInC',
    'Str2TempMittelLsInC',
    'Str2TempMittelFsInC',
    'Str1GiessLaengeSchmelzeInM',
    'Str2GiessLaengeSchmelzeInM',
    'Str1GiessLaengeSequenzInM',
    'Str2GiessLaengeSequenzInM',
    'Str2AusfLaengeSchmelzeInM',
    'Str1AusfLaengeSchmelzeInM',
}
integrate_keys_all = [name for name in discr if name not in _not_integrated_all]

delta_keys_all = [
    'Str1GiessLaengeSequenzInM',
    'Str2GiessLaengeSequenzInM',
    'Str2AusfLaengeSchmelzeInM',
    'Str1AusfLaengeSchmelzeInM',
]

#### Strang 1 keys

In [None]:
# Signal names of strand 1, in the order they are processed.
str1_keys = [
    'TundishTemperaturInC',
    'Str1TempMittelLsInC',
    'Str1TempMittelFsInC',
    'Str1WasserZ4FsInLproMin',
    'Str1WasserZ4LsInLproMin',
    'Str1WasserZ2bFsInLproMin',
    'Str1WasserZ2bLsInLproMin',
    'Str1WasserZ3bFsInLproMin',
    'Str1WasserZ1FsInLproMin',
    'Str1WasserZ3bLsInLproMin',
    'Str1WasserZ1LsInLproMin',
    'Str1WasserZ3aFsInLproMin',
    'Str1WasserZ3aLsInLproMin',
    'Str1WasserZ2aLsInLproMin',
    'Str1WasserZ2aFsInLproMin',
    'Str1WasserZ5LsInLproMin',
    'Str1WasserZ5FsInLproMin',
    'Str1WasserZ1DiefflenInLproMin',
    'Str1WasserZ1DillingenInLproMin',
    'Str1GiessLaengeSchmelzeInM',
    'Str1GiessLaengeSequenzInM',
    'Str1AusfLaengeSchmelzeInM',
    'Str1GiessGeschwInMproMin',
]

delta_keys_1 = ['Str1GiessLaengeSequenzInM', 'Str1AusfLaengeSchmelzeInM']

temperature_keys_1 = ['Str1TempMittelLsInC', 'Str1TempMittelFsInC']

# integrate_keys_1 is str1_keys without the tundish temperature, the two
# strand temperatures and the three length signals.
_str1_not_integrated = {
    'TundishTemperaturInC',
    'Str1GiessLaengeSchmelzeInM',
    *temperature_keys_1,
    *delta_keys_1,
}
integrate_keys_1 = [name for name in str1_keys if name not in _str1_not_integrated]

# Columns of the schmelzen table that are kept for strand 1.
cols_schmelzen_str1 = [
    'ChargenNr',
    'ChargenNrErsteSchmInSeq',
    'GiessBeginnSchmelze',
    'GiessBeginn_DateTime',
    'GiessEndeSchmelze',
    'GiessEnde_DateTime',
    'EndeSchmelze',
    'Ende_DateTime',
    'NrSchmelzeInSequenz',
    'LiquidusTempInC',
    'SolidusTempInC',
    'UeberhitzungMittelInK',
    'ZielTempTreiberInC',
    'Str1SollGiessGeschwInMproMin',
    'Str1Format',
    'C-Aequiv01',
    'C-Aequiv02',
    'C-AequivP',
]

#### Strang 2 keys

In [None]:
# Signal names of strand 2, in the order they are processed.
str2_keys = [
    'TundishTemperaturInC',
    'Str2TempMittelLsInC',
    'Str2TempMittelFsInC',
    'Str2WasserZ1FsInLproMin',
    'Str2WasserZ1LsInLproMin',
    'Str2WasserZ2bFsInLproMin',
    'Str2WasserZ4FsInLproMin',
    'Str2WasserZ4LsInLproMin',
    'Str2WasserZ3bLsInLproMin',
    'Str2WasserZ3bFsInLproMin',
    'Str2WasserZ1DiefflenInLproMin',
    'Str2WasserZ2bLsInLproMin',
    'Str2WasserZ1DillingenInLproMin',
    'Str2WasserZ2aLsInLproMin',
    'Str2WasserZ2aFsInLproMin',
    'Str2WasserZ5FsInLproMin',
    'Str2GiessLaengeSchmelzeInM',
    'Str2WasserZ5LsInLproMin',
    'Str2GiessLaengeSequenzInM',
    'Str2WasserZ3aFsInLproMin',
    'Str2WasserZ3aLsInLproMin',
    'Str2AusfLaengeSchmelzeInM',
    'Str2GiessGeschwInMproMin',
]

delta_keys_2 = ['Str2GiessLaengeSequenzInM', 'Str2AusfLaengeSchmelzeInM']

temperature_keys_2 = ['Str2TempMittelLsInC', 'Str2TempMittelFsInC']

# integrate_keys_2 is str2_keys without the tundish temperature, the two
# strand temperatures and the three length signals.
_str2_not_integrated = {
    'TundishTemperaturInC',
    'Str2GiessLaengeSchmelzeInM',
    *temperature_keys_2,
    *delta_keys_2,
}
integrate_keys_2 = [name for name in str2_keys if name not in _str2_not_integrated]

# Columns of the schmelzen table that are kept for strand 2.
cols_schmelzen_str2 = [
    'ChargenNr',
    'ChargenNrErsteSchmInSeq',
    'GiessBeginnSchmelze',
    'GiessBeginn_DateTime',
    'GiessEndeSchmelze',
    'GiessEnde_DateTime',
    'EndeSchmelze',
    'Ende_DateTime',
    'NrSchmelzeInSequenz',
    'LiquidusTempInC',
    'SolidusTempInC',
    'UeberhitzungMittelInK',
    'ZielTempTreiberInC',
    'Str2SollGiessGeschwInMproMin',
    'Str2Format',
    'C-Aequiv01',
    'C-Aequiv02',
    'C-AequivP',
]

### Set keys 

In [None]:
# Generic names used by the cells below; here they all point at the strand-1
# definitions.
# NOTE(review): presumably these are switched to the *_2 / 'Str2...' values to
# process strand 2 — confirm.
keys = str1_keys
delta_keys = delta_keys_1
integrate_keys = integrate_keys_1
GiessLaengeSchmelzeInM = "Str1GiessLaengeSchmelzeInM"
laengeSequenz = 'Str1GiessLaengeSequenzInM'
cols_schmelzen = cols_schmelzen_str1
temperature = 'Str1TempMittelLsInC'
temp_keys = temperature_keys_1
ausförderlänge = 'Str1AusfLaengeSchmelzeInM'

### Interpolation

In [None]:
# Target grid in epoch seconds, shared by every signal.
new_datetime = df_times['ts'].to_numpy()

# One resampled frame per signal in `keys`, with columns DATE_TIME, <signal>, TIME.
data_interval = {}
for i in keys:
    raw = data[i]
    # Linear interpolation of the raw samples over their TIME values.
    interpolate = interp1d(raw['TIME'].to_numpy(), raw[i].to_numpy(), kind='linear')
    df_interp = pd.DataFrame({'DATE_TIME': new_datetime, i: interpolate(new_datetime), 'TIME': new_datetime})
    df_interp['DATE_TIME'] = pd.to_datetime(df_interp['DATE_TIME'], unit='s').dt.tz_localize("UTC")
    data_interval[i] = df_interp

### Integration

In [None]:
import scipy.integrate


def integrate(X, interpolator=None):
    """Integrate an interpolated signal over the points X with the trapezoidal rule.

    Parameters
    ----------
    X : sequence of float
        Sample positions, in the same unit as the interpolator's x axis.
    interpolator : callable, optional
        Maps one x value to the signal value. When omitted, the module-level
        name ``interpolate`` is used, so that name must have been bound
        (e.g. by the cell that builds an ``interp1d``) before calling.

    Returns
    -------
    float
        Trapezoidal integral of the interpolated values over X.

    Raises
    ------
    Whatever the interpolator raises for an x value it cannot evaluate.
    """
    if interpolator is None:
        interpolator = interpolate
    Y = [interpolator(x) for x in X]
    # `trapz` is the legacy name; recent SciPy only ships `trapezoid`.
    # No `dx` is passed: it is ignored whenever explicit x positions are given.
    trapezoid = getattr(scipy.integrate, 'trapezoid', None) or scipy.integrate.trapz
    return trapezoid(Y, X)


def trapz_integration(end_s):
    """Integrate the current signal over the one-minute window ending at end_s.

    end_s is given in seconds and converted to minutes; the window is
    [end - 1, end] in minutes. Returns NaN when `integrate` raises ValueError.
    """
    stop_min = end_s / 60
    window = [stop_min - 1, stop_min]
    try:
        area = integrate(window)
    except ValueError:
        area = np.nan
    return area

In [None]:
for i in integrate_keys:
    source = data[i]
    # `integrate` reads the module-level name `interpolate`, so it has to be
    # rebound for each signal before the apply below. The x axis is in minutes.
    interpolate = interp1d(source['TIME'].to_numpy() / 60, source[i].to_numpy(), kind='linear')
    resampled = data_interval[i]
    resampled[i + '_integr'] = resampled["TIME"].apply(trapz_integration)

### Mean for the temperatures

In [None]:
def temp_mean(end_s):
    """Mean of the current signal over the one-minute window ending at end_s.

    end_s is given in seconds. The mean is the integral over the window
    divided by the window width; NaN is returned when `integrate` raises
    ValueError.
    """
    end_m = end_s / 60
    begin_m = end_m - 1
    scale = 1 / (end_m - begin_m)
    try:
        area = integrate([begin_m, end_m])
    except ValueError:
        return np.nan
    return scale * area

In [None]:
for i in temp_keys:
    minutes = data[i]['TIME'].to_numpy() / 60
    value = data[i][i].to_numpy()
    # `temp_mean` -> `integrate` reads the module-level name `interpolate`.
    interpolate = interp1d(minutes, value, kind='linear')
    frame = data_interval[i]
    # Back up the point-interpolated values only once: on a second run of this
    # cell the column already holds the averages, and copying it again would
    # overwrite the backup with them.
    if i + '_old' not in frame.columns:
        frame[i + '_old'] = frame[i]
    frame[i] = frame["TIME"].apply(temp_mean)

## Merging TS-Datasets

In [None]:
from functools import reduce


def _outer_join_on_time(left, right):
    """Outer-join two resampled frames on their shared time columns."""
    return pd.merge(left, right, on=['DATE_TIME', 'TIME'], how='outer')


# Fold all per-signal frames into one wide frame, indexed and sorted by DATE_TIME.
df_merged = (
    reduce(_outer_join_on_time, data_interval.values())
    .set_index('DATE_TIME', drop=False)
    .sort_index()
)
    

# Import and further Preprocessing of schmelzen Dataframe

In [None]:
# Load df_schmelzen from a pickle file.
# NOTE(review): the path is absolute and user-specific, unlike the relative
# data_dir used for the CSV files; unpickling also runs code from the file, so
# only load files you trust.
df_schmelzen = pd.read_pickle("/home/di40438/bachelorarbeit/final_data/schmelzen.pkl")

# Add 90 minutes to the end of the last melting process of a sequence

In [None]:
# EndeSchmelze starts as a copy of GiessEndeSchmelze; the shifted column holds
# the next row's ChargenNrErsteSchmInSeq. All missing values are then set to 0.
df_schmelzen = df_schmelzen.assign(
    EndeSchmelze=df_schmelzen['GiessEndeSchmelze'],
    ChargenNrErsteSchmInSeq_shifted=df_schmelzen['ChargenNrErsteSchmInSeq'].shift(-1),
).fillna(0)

def add_time_to_end(row):
    """Extend EndeSchmelze by 90 minutes when the next row starts another sequence.

    The row is modified in place and returned. Rows whose
    ChargenNrErsteSchmInSeq equals the shifted value are returned unchanged.
    """
    if row['ChargenNrErsteSchmInSeq'] == row['ChargenNrErsteSchmInSeq_shifted']:
        return row
    row['EndeSchmelze'] += 90*60   # 90 minutes, expressed in seconds
    return row

# Apply the end-time extension row by row, then derive the UTC timestamp of
# the (possibly extended) end and keep only the configured columns.
df_schmelzen = df_schmelzen.apply(add_time_to_end, axis=1)
ende_utc = pd.to_datetime(df_schmelzen['EndeSchmelze'], unit='s').dt.tz_localize("UTC")
df_schmelzen = (
    df_schmelzen
    .assign(Ende_DateTime=ende_utc)
    .drop(columns="ChargenNrErsteSchmInSeq_shifted")
    [cols_schmelzen]
)

## Load schmelzen data

In [None]:
# df_schmelzen = pd.read_pickle("/home/di40438/bachelorarbeit/data/df_schmelzen_str1.pkl")

# Combine Datasets

In [None]:
# For every time-series row, attach the schmelzen row whose interval
# (GiessBeginn_DateTime, Ende_DateTime] contains the row's timestamp.
# Rows matching no interval, or more than one, are skipped.
merged_rows = []

for _, row in df_merged.iterrows():
    stamp = row["DATE_TIME"]
    started = df_schmelzen["GiessBeginn_DateTime"] < stamp
    not_ended = df_schmelzen["Ende_DateTime"] >= stamp
    schmelze = df_schmelzen[started & not_ended]
    if len(schmelze) == 1:
        row_frame = row.to_frame().transpose().reset_index(drop=True)
        merged_rows.append(pd.concat([row_frame, schmelze.reset_index(drop=True)], axis=1))

In [None]:
# Stack the one-row frames and renumber the rows from 0.
combined_df = pd.concat(merged_rows).reset_index(drop=True)

### Calculate delta of lengths

#### delta for each schmelze

In [None]:
grouped = combined_df[GiessLaengeSchmelzeInM].groupby(combined_df['ChargenNr'])
delta_col = GiessLaengeSchmelzeInM + '_delta'

# Per ChargenNr: the length signal plus its row-to-row difference.
schmelzen_length = {}
for name, group in grouped:
    frame = pd.DataFrame(group)
    frame[delta_col] = frame[GiessLaengeSchmelzeInM].diff()
    # The first difference of a group is undefined. It becomes 0 when the group
    # starts at length 0, otherwise it takes the group's second difference.
    # A single-row group has no second difference (indexing it raised
    # IndexError), so it falls back to 0 as well.
    if frame[GiessLaengeSchmelzeInM].iloc[0] == 0 or len(frame) < 2:
        fill_value = 0
    else:
        fill_value = frame[delta_col].iloc[1]
    schmelzen_length[name] = frame.fillna(fill_value)

# Concatenate once instead of growing a DataFrame inside a loop.
schmelzen_merged = (
    pd.concat(list(schmelzen_length.values()), axis=0) if schmelzen_length else pd.DataFrame()
)

#### delta for delta_keys for each sequence

In [None]:
# Per delta key: the signal and its row-to-row difference, computed separately
# inside every sequence (grouped by ChargenNrErsteSchmInSeq).
sequence_merged = {}
for key in delta_keys:
    delta_col = key + '_delta'
    grouped = combined_df[key].groupby(combined_df['ChargenNrErsteSchmInSeq'])
    sequence_length = {}
    for name, group in grouped:
        frame = pd.DataFrame(group)
        frame[delta_col] = frame[key].diff()
        # The first difference of a group is undefined. It becomes 0 when the
        # group starts at 0, otherwise it takes the group's second difference.
        # A single-row group has no second difference (indexing it raised
        # IndexError), so it falls back to 0 as well.
        if frame[key].iloc[0] == 0 or len(frame) < 2:
            fill_value = 0
        else:
            fill_value = frame[delta_col].iloc[1]
        sequence_length[name] = frame.fillna(fill_value)
    # One concatenation per key; repeating it for every group produced the
    # same frame each time. With no groups the key stays unset, as before.
    if sequence_length:
        sequence_merged[key] = pd.concat(list(sequence_length.values()), axis=0)

### Combine deltas with combined_df

In [None]:
# Append the three delta columns to combined_df, aligned on the row index.
first_key, second_key = delta_keys[0], delta_keys[1]
conc1 = pd.concat(
    [sequence_merged[first_key][first_key + '_delta'],
     sequence_merged[second_key][second_key + '_delta']],
    axis=1,
)
conc2 = pd.concat([conc1, schmelzen_merged[GiessLaengeSchmelzeInM + '_delta']], axis=1)
combined_df = pd.concat([combined_df, conc2], axis=1)

### Outlier detection: Get Outliers with Schmelzen data

In [None]:
# Largest value of the melt-length signal per ChargenNr, as a two-column frame.
laengeSchmelze_max = GiessLaengeSchmelzeInM + '_max'
combined_df_grouped = combined_df.groupby(combined_df['ChargenNr'])
combined_df_max = (
    combined_df_grouped[GiessLaengeSchmelzeInM]
    .max()
    .to_frame()
    .rename(columns={GiessLaengeSchmelzeInM: laengeSchmelze_max})
    .reset_index(drop=False)
)

In [None]:
# Column-wise minimum of combined_df_max, shown as the cell output.
combined_df_max.min()

In [None]:
# Line plot of the per-ChargenNr maximum length against ChargenNr.
fig = plt.figure(figsize=(15,5))
ax1 = fig.add_subplot(1,1,1)
ax1.plot('ChargenNr', laengeSchmelze_max, data = combined_df_max)

In [None]:
# (rows, columns) of combined_df_max, shown as the cell output.
combined_df_max.shape

In [None]:
# ChargenNr values whose maximum length stays below 10.
too_short = combined_df_max[laengeSchmelze_max] < 10
schmelzen_drop = combined_df_max.loc[too_short, 'ChargenNr'].to_numpy()
schmelzen_drop

#### look at min_length

In [None]:
# Smallest value of the melt-length signal per ChargenNr, as a two-column frame.
laengeSchmelze_min = GiessLaengeSchmelzeInM + '_min'
combined_df_min = (
    combined_df_grouped[GiessLaengeSchmelzeInM]
    .min()
    .to_frame()
    .rename(columns={GiessLaengeSchmelzeInM: laengeSchmelze_min})
    .reset_index(drop=False)
)

In [None]:
# Column-wise maximum of combined_df_min, shown as the cell output.
combined_df_min.max()

In [None]:
# Line plot of the per-ChargenNr minimum length against ChargenNr.
fig = plt.figure(figsize=(15,5))
ax1 = fig.add_subplot(1,1,1)
ax1.plot('ChargenNr', laengeSchmelze_min, data = combined_df_min)

In [None]:
# ChargenNr values whose minimum length is already above 10; they are added to
# the drop list built from the maxima.
starts_late = combined_df_min[laengeSchmelze_min] > 10
s = combined_df_min.loc[starts_late, 'ChargenNr'].to_numpy()
schmelzen_drop = np.append(schmelzen_drop, s)
s

In [None]:
# All ChargenNr values collected for removal, shown as the cell output.
schmelzen_drop

### identify the sequences of schmelzen_drop

In [None]:
# Look up the rows of the flagged ChargenNr values and collect the distinct
# sequences (ChargenNrErsteSchmInSeq) they belong to.
schmelzen = combined_df.copy().set_index('ChargenNr', drop=True)

seq_drop = (
    schmelzen.loc[schmelzen_drop]['ChargenNrErsteSchmInSeq']
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)
seq_drop

#### Drop the sequences listed in seq_drop

In [None]:
# Index by sequence number, remove every flagged sequence, then renumber rows.
combined_df = (
    combined_df
    .set_index(combined_df['ChargenNrErsteSchmInSeq'], drop=True)
    .drop(seq_drop, axis=0)
    .reset_index(drop=True)
)

## Combine DataFrames

### Read str1_combined 

In [None]:
# Load str1_combined from a pickle file.
# NOTE(review): absolute, user-specific path; presumably written by an earlier
# run of the cells above with the strand-1 keys — confirm.
str1_combined = pd.read_pickle("/home/di40438/bachelorarbeit/final_data/str1_combined.pkl") 

In [None]:
# Tag every row with its strand number, order by TIME and renumber the rows.
str1_combined = (
    str1_combined
    .assign(strang_nr=1)
    .sort_values("TIME")
    .reset_index(drop=True)
)

### Read str2_combined 

In [None]:
# Load str2_combined from a pickle file.
# NOTE(review): absolute, user-specific path; presumably written by an earlier
# run of the cells above with the strand-2 keys — confirm.
str2_combined = pd.read_pickle("/home/di40438/bachelorarbeit/final_data/str2_combined.pkl") 

In [None]:
# Tag every row with its strand number, order by TIME and renumber the rows.
str2_combined = (
    str2_combined
    .assign(strang_nr=2)
    .sort_values("TIME")
    .reset_index(drop=True)
)

### rename columns

In [None]:
# Give the delta columns descriptive names, then strip the 'Str1' marker from
# every column name so both strands share one schema.
_str1_delta_names = {
    'delta_x': 'GiessLaengeSchmelzeInM_delta',
    'delta_y': 'GiessLaengeSequenzInM_delta',
    'delta': 'AusfLaengeSchmelzeInM_delta',
}
str1_combined = str1_combined.rename(columns=_str1_delta_names)
str1_combined.columns = str1_combined.columns.str.replace('Str1', '')

In [None]:
# Give the delta columns descriptive names, then strip the 'Str2' marker from
# every column name so both strands share one schema.
_str2_delta_names = {
    'delta_x': 'GiessLaengeSchmelzeInM_delta',
    'delta_y': 'GiessLaengeSequenzInM_delta',
    'delta': 'AusfLaengeSchmelzeInM_delta',
}
str2_combined = str2_combined.rename(columns=_str2_delta_names)
str2_combined.columns = str2_combined.columns.str.replace('Str2', '')
str2_combined = str2_combined.reset_index(drop=True)

### Concat DataFrames

In [None]:
# Stack both strands and renumber the rows from 0.
cc4_data = pd.concat([str1_combined, str2_combined], axis=0).reset_index(drop=True)

### For each seq_nr, put the data for strang1 and strang2 together

In [None]:
# Reorder cc4_data so that rows are grouped by sequence, then by strand, and
# sorted by TIME inside every (sequence, strand) block.
grouped_seq = cc4_data.groupby('ChargenNrErsteSchmInSeq')
strand_blocks = []
for seq_name, seq_group in grouped_seq:
    for str_name, str_group in seq_group.groupby('strang_nr'):
        strand_blocks.append(str_group.sort_values("TIME"))

# Concatenate once: growing a DataFrame with concat inside the loop copies all
# previously collected rows on every iteration. Not starting from an empty
# frame also means the blocks' dtypes are carried over unchanged.
strang_data = pd.concat(strand_blocks, axis=0) if strand_blocks else pd.DataFrame()

#### new column with seq_nr and str_nr

In [None]:
def seq_and_str_nr(row):
    """Add 'seq_id' = '<ChargenNrErsteSchmInSeq>_str_<strang_nr>' to the row.

    The row is modified in place and returned.
    """
    id_parts = [str(row['ChargenNrErsteSchmInSeq']), str(row['strang_nr'])]
    row['seq_id'] = '_str_'.join(id_parts)
    return row

# Build seq_id row by row, then renumber the rows from 0.
strang_data = strang_data.apply(seq_and_str_nr, axis=1).reset_index(drop=True)

### Drop the last sequence, which is incomplete after interpolation
#### Some features are not measured up to the end of the last sequence, so the interpolated data does not cover it completely

In [None]:
# Sequence number (ChargenNrErsteSchmInSeq) of the sequence that is removed.
incomplete_last_sequence = 475513
strang_data = strang_data[strang_data['ChargenNrErsteSchmInSeq'] != incomplete_last_sequence]

### Set all negative amounts of water and negative melt lengths to zero

In [None]:
# Integrated cooling-water columns, built from their zone/side tags.
_water_zone_tags = (
    'Z4Fs', 'Z4Ls', 'Z2bFs', 'Z2bLs', 'Z3bFs', 'Z1Fs', 'Z3bLs', 'Z1Ls',
    'Z3aFs', 'Z3aLs', 'Z2aLs', 'Z2aFs', 'Z5Ls', 'Z5Fs',
    'Z1Diefflen', 'Z1Dillingen',
)
water_keys = ['Wasser' + tag + 'InLproMin_integr' for tag in _water_zone_tags]

In [None]:
# Replace negative values with 0 in every water column and in the two
# length-delta columns; all other values are kept as they are.
clipped_columns = list(water_keys) + ['GiessLaengeSchmelzeInM_delta', 'AusfLaengeSchmelzeInM_delta']
for key in clipped_columns:
    column = strang_data[key]
    strang_data[key] = np.where(column < 0, 0, column)

## End the data when the last melt has passed the pyrometer

In [None]:
grouped_end = strang_data.groupby('seq_id')
str_data = {}
length_cc4 = 15.42
for name, group in grouped_end:
    sequ_max = group['GiessLaengeSequenzInM'].max()
    ausför_max = group['AusfLaengeSchmelzeInM'].max()
    # Keep the rows of this seq_id up to the sequence length
    # sequ_max - ausför_max + length_cc4 (last melt has reached the pyrometer).
    str_data[name] = group[group['GiessLaengeSequenzInM'] <= sequ_max - ausför_max + length_cc4]

# Concatenate once, in group order, instead of growing a DataFrame with concat
# inside the loop (which re-copies every collected row on each iteration).
strang_data = pd.concat(list(str_data.values()), axis=0) if str_data else pd.DataFrame()

### Begin when first meter has reached the pyrometer

In [None]:
length_cc4 = 15.42
# Keep rows whose sequence length exceeds length_cc4 by more than one
# (first meter has reached the pyrometer), then renumber the rows.
past_pyrometer = strang_data['GiessLaengeSequenzInM'] > length_cc4 + 1
strang_data = strang_data[past_pyrometer].reset_index(drop=True)

### One hot encoding

In [None]:
# Replace the in-memory strang_data with the contents of cc4_data.pkl.
# NOTE(review): this discards the frame built by the cells above, and a later
# cell writes its result back to this same file, so re-running the one-hot
# encoding on a reloaded file presumably adds the Format_* columns again —
# confirm the intended order of load and save.
strang_data = pd.read_pickle("/home/di40438/bachelorarbeit/final_data/cc4_data.pkl") 

In [None]:
# One-hot encode the 'Format' column and append the indicator columns.
formats = np.array(strang_data['Format'])
format_endocer = OneHotEncoder()
format_1hot = format_endocer.fit_transform(formats.reshape(-1, 1))

# `get_feature_names` was removed from recent scikit-learn releases in favour
# of `get_feature_names_out`; both yield the 'x0_<category>' names that the
# rename below expects.
if hasattr(format_endocer, 'get_feature_names_out'):
    format_feature_names = format_endocer.get_feature_names_out()
else:
    format_feature_names = format_endocer.get_feature_names()

df_format = pd.DataFrame(format_1hot.toarray(), columns=format_feature_names)
df_format = df_format.rename(columns={'x0_1825':'Format_1825', 'x0_2230':'Format_2230', 'x0_2234':'Format_2234', 'x0_2235':'Format_2235' })

# Column-wise concat aligns on the row index, so strang_data is expected to
# carry the default 0..n-1 index here.
strang_data = pd.concat((strang_data, df_format), axis=1)

## Save cc4_data

In [None]:
# Write strang_data to cc4_data.pkl (absolute, user-specific path). This is the
# same file that is read by the cells before and after this one.
strang_data.to_pickle("/home/di40438/bachelorarbeit/final_data/cc4_data.pkl")

### Data Exploration

In [None]:
# Use seaborn's "white" style for the figures that follow.
sns.set_style("white")

In [None]:
# Load the saved data set for exploration (absolute, user-specific path).
strang_data = pd.read_pickle("/home/di40438/bachelorarbeit/final_data/cc4_data.pkl") 

### Cross Section Data

In [None]:
# Reload the unprocessed schmelzen table; this rebinds df_schmelzen, replacing
# the preprocessed frame built earlier in the notebook.
df_schmelzen = pd.read_pickle("/home/di40438/bachelorarbeit/final_data/schmelzen.pkl")

In [None]:
# Columns kept for the cross-section view.
cross_section_attr = [
    'ChargenNr',
    'ChargenNrErsteSchmInSeq',
    'GiessBeginnSchmelze',
    'GiessBeginn_DateTime',
    'GiessEndeSchmelze',
    'GiessEnde_DateTime',
    'NrSchmelzeInSequenz',
    'LiquidusTempInC',
    'SolidusTempInC',
    'UeberhitzungMittelInK',
    'ZielTempTreiberInC',
    'SollGiessGeschwInMproMin',
    'Format_1825',
    'Format_2230',
    'Format_2234',
    'Format_2235',
    'C-Aequiv01',
    'C-Aequiv02',
    'C-AequivP',
    'seq_id',
    'TempMittelLsInC',
    'TempMittelFsInC',
]

# One row per seq_id: the first row of every sequence/strand combination.
cross_section_data = strang_data[cross_section_attr].drop_duplicates(subset=['seq_id'], keep='first')

In [None]:
# Histogram of ZielTempTreiberInC over the cross-section rows, in bins of width 5.
fig, ax1 = plt.subplots(1, 1, figsize=(8, 5),  sharex=True)
sns.histplot(ax=ax1, data=cross_section_data, x='ZielTempTreiberInC',binwidth=5,   )
ax1.set_xlabel('Temperature [°C]', fontsize=15)
ax1.set_ylabel('Count', fontsize=15)
# Remove the x tick marks, so no temperature values are shown on the axis.
ax1.set(xticks=[]) 
ax1.xaxis.grid(False)
ax1.yaxis.grid(True)

In [None]:
# Box plot of ZielTempTreiberInC over the cross-section rows.
fig, ax1 = plt.subplots(1,  figsize=(4, 5),  sharex=True)
sns.boxplot(data=cross_section_data['ZielTempTreiberInC'], ax=ax1)
# Remove the tick marks on both axes, so no values are shown.
ax1.set(yticks=[]) 
ax1.set(xticks=[]) 
ax1.xaxis.grid(False)
ax1.yaxis.grid(True)
ax1.set_ylabel('Casting Target Temperature [°C]', fontsize=15)

In [None]:
# Column selection kept for ad-hoc inspection; one name per line.
keys_test = [
    'TundishTemperaturInC',
    'TIME',
    'TempMittelLsInC',
    'TempMittelFsInC',
    'WasserZ4FsInLproMin_integr',
    'WasserZ4LsInLproMin_integr',
    'WasserZ2bFsInLproMin_integr',
    'WasserZ2bLsInLproMin_integr',
    'WasserZ3bFsInLproMin_integr',
    'WasserZ1FsInLproMin_integr',
    'WasserZ3bLsInLproMin_integr',
    'WasserZ1LsInLproMin_integr',
    'WasserZ3aFsInLproMin_integr',
    'WasserZ3aLsInLproMin_integr',
    'WasserZ2aLsInLproMin_integr',
    'WasserZ2aFsInLproMin_integr',
    'WasserZ5LsInLproMin_integr',
    'WasserZ5FsInLproMin_integr',
    'GiessLaengeSchmelzeInM',
    'GiessLaengeSequenzInM',
    'AusfLaengeSchmelzeInM',
    'GiessGeschwInMproMin',
    'ChargenNr',
    'ChargenNrErsteSchmInSeq',
    'GiessBeginnSchmelze',
    'GiessEndeSchmelze',
    'NrSchmelzeInSequenz',
    'LiquidusTempInC',
    'SolidusTempInC',
    'UeberhitzungMittelInK',
    'ZielTempTreiberInC',
    'SollGiessGeschwInMproMin',
    'Format',
    'C-Aequiv01',
    'C-Aequiv02',
    'C-AequivP',
    'GiessLaengeSequenzInM_delta',
    'AusfLaengeSchmelzeInM_delta',
    'GiessLaengeSchmelzeInM_delta',
]

In [None]:
# Columns used for the summary statistics and the correlation matrix.
numerical_attributes = [
    'TempMittelLsInC',
    'TempMittelFsInC',
    'ZielTempTreiberInC',
    'TundishTemperaturInC',
    'WasserZ1LsInLproMin_integr',
    'WasserZ1FsInLproMin_integr',
    'WasserZ2aLsInLproMin_integr',
    'WasserZ2aFsInLproMin_integr',
    'WasserZ2bFsInLproMin_integr',
    'WasserZ2bLsInLproMin_integr',
    'WasserZ3aFsInLproMin_integr',
    'WasserZ3aLsInLproMin_integr',
    'WasserZ3bFsInLproMin_integr',
    'WasserZ3bLsInLproMin_integr',
    'WasserZ4FsInLproMin_integr',
    'WasserZ4LsInLproMin_integr',
    'WasserZ5LsInLproMin_integr',
    'WasserZ5FsInLproMin_integr',
    'GiessLaengeSchmelzeInM',
    'GiessLaengeSequenzInM',
    'AusfLaengeSchmelzeInM',
    'GiessGeschwInMproMin',
    'LiquidusTempInC',
    'SolidusTempInC',
    'UeberhitzungMittelInK',
    'SollGiessGeschwInMproMin',
    'C-Aequiv01',
    'C-Aequiv02',
    'C-AequivP',
]

In [None]:
# Summary statistics of the numerical columns, one row per column.
strang_data[numerical_attributes].describe().transpose()

In [None]:
# Summary statistics of the two temperature columns with seq_id '475229_str_2' left out.
strang_data[strang_data['seq_id'] != '475229_str_2'][['TempMittelLsInC','TempMittelFsInC']].describe()

In [None]:
# Summary statistics of the two temperature columns over all rows.
strang_data[['TempMittelLsInC','TempMittelFsInC']].describe()

In [None]:
# Histogram with density curve of TempMittelLsInC over all rows.
sns.set_style("white")
fig, ax1 = plt.subplots(1, 1, figsize=(6, 5),  sharex=True)
#fig.suptitle('Distributions of Model Target Temperatures')
sns.histplot(ax=ax1, data=strang_data, x='TempMittelLsInC', kde=True,color=sns.color_palette()[0] )
# Remove the x tick marks, so no temperature values are shown on the axis.
ax1.set(xticks=[]) 
ax1.set_xlabel('Temperature [°C]', fontsize=15)
ax1.set_ylabel('Count', fontsize=15)
ax1.xaxis.grid(False)
ax1.yaxis.grid(True)
plt.yticks(fontsize=13)


In [None]:
# Histogram with density curve of TempMittelFsInC over all rows.
fig, ax1 = plt.subplots(1, 1, figsize=(6, 5),  sharex=True)
#fig.suptitle('Distributions of Model Target Temperatures')

sns.histplot(ax=ax1, data=strang_data, x='TempMittelFsInC', kde=True, color=sns.color_palette()[0] )
# Remove the x tick marks, so no temperature values are shown on the axis.
ax1.set(xticks=[]) 
ax1.set_xlabel('Temperature [°C]', fontsize=15)
ax1.set_ylabel('Count', fontsize=15)
ax1.xaxis.grid(False)
ax1.yaxis.grid(True)
plt.yticks(fontsize=13)


In [None]:
from scipy.stats import norm
from scipy import stats

In [None]:
# Distribution of TempMittelLsInC with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm
# the installed version still provides it.
fig, ax1 = plt.subplots(1, 1, figsize=(6, 5),  sharex=True)
sns.distplot(strang_data['TempMittelLsInC'],ax=ax1,fit=norm, color=sns.color_palette()[0])
# Remove the x tick marks, so no temperature values are shown on the axis.
ax1.set(xticks=[]) 
ax1.set_xlabel('Temperature [°C]', fontsize=15)
ax1.set_ylabel('Density', fontsize=15)
ax1.xaxis.grid(False)
ax1.yaxis.grid(True)
plt.yticks(fontsize=13)


In [None]:
# Distribution of TempMittelLsInC with a fitted normal curve overlaid.
# NOTE(review): this cell plots the same column as the previous cell; the
# neighbouring plots come in Ls/Fs pairs, so presumably 'TempMittelFsInC' was
# intended here — confirm.
fig, ax1 = plt.subplots(1, 1, figsize=(6, 5),  sharex=True)
sns.distplot(strang_data['TempMittelLsInC'],ax=ax1, fit=norm, color=sns.color_palette()[0])
# Remove the x tick marks, so no temperature values are shown on the axis.
ax1.set(xticks=[]) 
ax1.set_xlabel('Temperature [°C]', fontsize=15)
ax1.set_ylabel('Density', fontsize=15)
ax1.xaxis.grid(False)
ax1.yaxis.grid(True)
plt.yticks(fontsize=13)

In [None]:
# Probability plot of TempMittelFsInC.
fig, ax1 = plt.subplots(1,  figsize=(6, 5),  sharex=True)
# probplot draws through pyplot, i.e. onto the current axes (ax1, created above).
res = stats.probplot(strang_data['TempMittelFsInC'], plot=plt)
ax1.xaxis.grid(True)
ax1.yaxis.grid(False)
ax1.set(yticks=[]) 
# Clear the title on the axes.
ax1.set_title('')
# Recolour the first line drawn on the axes with the first palette colour.
ax1.get_lines()[0].set_color(color=sns.color_palette()[0])
ax1.set_xlabel('Theoretical Quantiles', fontsize=15)
ax1.set_ylabel('Ordered Values', fontsize=15)
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)


In [None]:
# Probability plot of TempMittelLsInC.
fig, ax1 = plt.subplots(1,  figsize=(6, 5),  sharex=True)
# probplot draws through pyplot, i.e. onto the current axes (ax1, created above).
res = stats.probplot(strang_data['TempMittelLsInC'], plot=plt)
ax1.xaxis.grid(True)
ax1.yaxis.grid(False)
ax1.set(yticks=[]) 
# Clear the title on the axes.
ax1.set_title('')
# Recolour the first line drawn on the axes with the first palette colour.
ax1.get_lines()[0].set_color(color=sns.color_palette()[0])
ax1.set_xlabel('Theoretical Quantiles', fontsize=15)
ax1.set_ylabel('Ordered Values', fontsize=15)
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)


In [None]:
# Box plots of the two strand temperatures side by side.
fig, ax1 = plt.subplots(1,  figsize=(8, 5),  sharex=True)

# Display labels for the two temperature columns, in plotting order.
temperature_labels = {'TempMittelLsInC': 'Temperature of Loose Side',
                      'TempMittelFsInC': 'Temperature of Fixed Side'}

# `rename` returns a new frame, so strang_data stays untouched. The result gets
# its own name: binding it to `data` would overwrite the dict of raw signals
# that the cells at the top of the notebook read from.
side_temperatures = strang_data.rename(columns=temperature_labels)

sns.boxplot(data=side_temperatures,
            order=list(temperature_labels.values()), ax=ax1, color=sns.color_palette()[0])
ax1.set_ylabel('Temperature [°C]', fontsize=15)
# Remove the y tick marks, so no temperature values are shown on the axis.
ax1.set(yticks=[])
plt.xticks([0, 1], list(temperature_labels.values()), fontsize=15)


In [None]:
grouped_end = strang_data.groupby('seq_id')
str_data = {}
length_cc4 = 15.42
for name, group in grouped_end:
    # `assign` returns a copy with the new column, which avoids writing into
    # the slice handed out by groupby.
    str_data[name] = group.assign(max_seq_length=group['GiessLaengeSequenzInM'].max())

# Concatenate once, in group order, instead of growing a DataFrame with concat
# inside the loop.
strang_data_test = pd.concat(list(str_data.values()), axis=0) if str_data else pd.DataFrame()

# Position inside the sequence as a fraction of the sequence's maximum length.
strang_data_test['rel_seq_len'] =  strang_data_test['GiessLaengeSequenzInM'] / strang_data_test['max_seq_length']

In [None]:
# Scatter plot of TempMittelLsInC against the relative sequence length, with
# two dashed horizontal reference lines at 650 and 765.
var = 'rel_seq_len'
fig, ax1 = plt.subplots(1, 1, figsize=(6, 5),  sharex=True)
sns.scatterplot(data=strang_data_test,
                y='TempMittelLsInC', x=var, ax=ax1, s=10, color=sns.color_palette()[0])
ax1.axhline(650, ls='--', color='black', alpha=0.7)
ax1.axhline(765, ls='--', color='black',alpha=0.7)
ax1.set_xlabel('Relative Sequence  Length', fontsize=15)
ax1.set_ylabel('Temperature [°C]', fontsize=15)
# Remove the y tick marks, so no temperature values are shown on the axis.
ax1.set(yticks=[]) 
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)



In [None]:
# Scatter plot of TempMittelFsInC against the relative sequence length, with
# two dashed horizontal reference lines at 652 and 757.
var = 'rel_seq_len'
fig, ax1 = plt.subplots(1, 1, figsize=(6, 5),  sharex=True)
sns.scatterplot(data=strang_data_test,
                y='TempMittelFsInC', x=var, ax=ax1,s=10, color=sns.color_palette()[0])
ax1.axhline(652, ls='--', color='black', alpha=0.7)
ax1.axhline(757, ls='--', color='black',alpha=0.7)
ax1.set_xlabel('Relative Sequence  Length', fontsize=15)
ax1.set_ylabel('Temperature [°C]', fontsize=15)
# Remove the y tick marks, so no temperature values are shown on the axis.
ax1.set(yticks=[]) 
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)

In [None]:
#correlation matrix
data = strang_data[numerical_attributes]
data.rename(columns={'WasserZ1FsInLproMin_integr':'WasserZ1Fs',
                           'WasserZ1LsInLproMin_integr':'WasserZ1Ls',
                           'WasserZ2aFsInLproMin_integr':'WasserZ2aFs',
                           'WasserZ2aLsInLproMin_integr':'WasserZ2aLs',
                           'WasserZ2bFsInLproMin_integr':'WasserZ2bFs',
                           'WasserZ2bLsInLproMin_integr':'WasserZ2bLs',
                           'WasserZ3aFsInLproMin_integr':'WasserZ3aFs',
                           'WasserZ3aLsInLproMin_integr':'WasserZ3aLs',
                           'WasserZ3bFsInLproMin_integr':'WasserZ3bFs',
                           'WasserZ3bLsInLproMin_integr':'WasserZ3bLs',
                           'WasserZ4FsInLproMin_integr':'WasserZ4Fs',
                           'WasserZ4LsInLproMin_integr':'WasserZ4Ls',
                           'WasserZ5FsInLproMin_integr':'WasserZ5Fs',
                           'WasserZ5LsInLproMin_integr':'WasserZ5Ls',
                          },
                  inplace=True)



corrmat = data.corr()

f, ax = plt.subplots(figsize=(16, 15))
sns.heatmap(corrmat, vmax=.8, square=True);
plt.tight_layout

In [None]:
# Save the correlation heatmap figure as EPS (absolute, user-specific path).
f.savefig('/home/di40438/bachelorarbeit/data/correlation_matrix.eps', format='eps',)

In [None]:
# Correlations of every numerical column with TempMittelLsInC, ascending.
corrmat['TempMittelLsInC'].sort_values()

In [None]:
# Correlations of every numerical column with ZielTempTreiberInC, ascending.
corrmat['ZielTempTreiberInC'].sort_values()

In [None]:
# Integrated cooling-water columns of the numbered zones, built from their
# zone/side tags.
_zone_side_tags = (
    'Z4Fs', 'Z4Ls', 'Z2bFs', 'Z2bLs', 'Z3bFs', 'Z1Fs', 'Z3bLs', 'Z1Ls',
    'Z3aFs', 'Z3aLs', 'Z2aLs', 'Z2aFs', 'Z5Ls', 'Z5Fs',
)
water_keys = ['Wasser' + tag + 'InLproMin_integr' for tag in _zone_side_tags]

In [None]:
# Sort the column names in place (alphabetical order of the strings).
water_keys.sort()

In [None]:
# Water columns with short zone labels for plotting. `rename` is applied to
# the fresh selection and returns a new frame, so no in-place change is made
# on a slice of strang_data.
water_data = strang_data[water_keys].rename(columns={
    'WasserZ1FsInLproMin_integr':'Z1_FS',
    'WasserZ1LsInLproMin_integr':'Z1_LS',
    'WasserZ2aFsInLproMin_integr':'Z2a_FS',
    'WasserZ2aLsInLproMin_integr':'Z2a_LS',
    'WasserZ2bFsInLproMin_integr':'Z2b_FS',
    'WasserZ2bLsInLproMin_integr':'Z2b_LS',
    'WasserZ3aFsInLproMin_integr':'Z3a_FS',
    'WasserZ3aLsInLproMin_integr':'Z3a_LS',
    'WasserZ3bFsInLproMin_integr':'Z3b_FS',
    'WasserZ3bLsInLproMin_integr':'Z3b_LS',
    'WasserZ4FsInLproMin_integr':'Z4_FS',
    'WasserZ4LsInLproMin_integr':'Z4_LS',
    'WasserZ5FsInLproMin_integr':'Z5_FS',
    'WasserZ5LsInLproMin_integr':'Z5_LS',
})

In [None]:
# One box per water column of water_data.
fig, ax1 = plt.subplots(1, 1, figsize=(12, 5),  sharex=True)
sns.boxplot(data=water_data, color=sns.color_palette()[0], ax=ax1)
ax1.set_xlabel('Cooling Zones', fontsize=15)
ax1.set_ylabel('Cooling Water [l]', fontsize=15)
# Remove the y tick marks, so no water amounts are shown on the axis.
ax1.set(yticks=[]) 
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)

### Look at single sequences

In [None]:
# TempMittelLsInC over the rows of one sequence, with a dashed reference line at 700.
seq_number = '474947_str_1'
# Rows of the selected sequence, renumbered from 0; the row number is the x axis.
seq_frame = strang_data[strang_data['seq_id']==seq_number].reset_index(drop=True)

fig1, (ax1) = plt.subplots(1, figsize=(6, 5))
ax1.axhline(700, ls='--', color='red', label='casting target temperature' )
sns.lineplot(data=seq_frame, y='TempMittelLsInC', x=seq_frame.index,
             color='black', ax=ax1, label='model target temperature')
ax1.set_xlabel('Minutes', fontsize=15)
ax1.set_ylabel('Temperature [°C]',fontsize=15)
ax1.set(yticks=[])
ax1.xaxis.grid(True)
ax1.yaxis.grid(True)
ax1.legend(prop={'size': 15})
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)


In [None]:
# TempMittelFsInC over the rows of one sequence, with a dashed reference line at 700.
seq_number = '474947_str_1'
# Rows of the selected sequence, renumbered from 0; the row number is the x axis.
seq_frame = strang_data[strang_data['seq_id']==seq_number].reset_index(drop=True)

fig2, (ax1) = plt.subplots(1, figsize=(6, 5))
ax1.axhline(700, ls='--', color='red', label='casting target temperature' )
sns.lineplot(data=seq_frame, y='TempMittelFsInC', x=seq_frame.index,
             color='black', ax=ax1, label='model target temperature')
ax1.set_xlabel('Minutes', fontsize=15)
ax1.set_ylabel('Temperature [°C]',fontsize=15)
ax1.set(yticks=[])
ax1.xaxis.grid(True)
ax1.yaxis.grid(True)
ax1.legend(prop={'size': 15})
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)


In [None]:
# GiessGeschwInMproMin over the rows of one sequence.
seq_number = '474947_str_1'
# Rows of the selected sequence, renumbered from 0; the row number is the x axis.
seq_frame = strang_data[strang_data['seq_id']==seq_number].reset_index(drop=True)

fig5, (ax1) = plt.subplots(1, figsize=(6, 5))
sns.lineplot(data=seq_frame, y='GiessGeschwInMproMin', x=seq_frame.index,
             ax=ax1, color='black')
ax1.set_xlabel('Minutes', fontsize=15)
ax1.set_ylabel('Casting Speed [m/min]', fontsize=15)
ax1.set(yticks=[])
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)


In [None]:
# Integrated cooling water of the 'Ls' columns over the rows of one sequence,
# one line per zone.
seq_number = '474947_str_1'
# Filter the sequence once; the row number (0..n-1) is used as the x axis.
seq_frame = strang_data[strang_data['seq_id']==seq_number].reset_index(drop=True)

fig6, (ax1) = plt.subplots(1,1, figsize=(6, 5))
# Zones in drawing order, which is also the legend order.
for zone in ('Z5', 'Z4', 'Z3b', 'Z3a', 'Z2b', 'Z2a', 'Z1'):
    sns.lineplot(data=seq_frame,
                 y='Wasser' + zone + 'LsInLproMin_integr', x=seq_frame.index,
                 ax=ax1, label='cooling water ' + zone)

ax1.set_xlabel('Minutes', fontsize=15)
ax1.set_ylabel('Cooling Water [l]', fontsize=15)
ax1.set(yticks=[])
# One legend call carrying both settings: a second `legend(...)` call rebuilds
# the legend from scratch and dropped the font size set by the first call.
ax1.legend(loc='upper left', prop={'size': 10})
plt.yticks(fontsize=13)
plt.xticks(fontsize=13)

