# Using library functions:

In [3]:
from library import normalization_utils as nu
from library import features_extraction as fe
import pandas as pd
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone `joblib` package and fall back to the vendored
# copy only on old installations where it is still the one available.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



# Example

In [2]:
# Toy frame for the normalization example.
# It is built in a single constructor call: filling an initially empty frame
# column-by-column through `.loc[:, col] = Series` relies on aligning the Series
# against an empty index, which is fragile across pandas versions.
df = pd.DataFrame(
    {
        'timestamp': [1, 2, 3],
        'val1': [1, 10, 10],
        'val2': [-10, 0, 10],
    },
    columns=['timestamp', 'val1', 'val2'],
)
df

Unnamed: 0,timestamp,val1,val2
0,1,1,-10
1,2,10,0
2,3,10,10


In [3]:
# Normalize the toy frame with the project helper `nu.normalize_data` and display it.
# NOTE(review): the output below suggests per-column min-max scaling to [0, 1]
# with `timestamp` left untouched — confirm against the library source.
df_norm = nu.normalize_data(df)
df_norm

Unnamed: 0,timestamp,val1,val2
0,1,0.0,0.0
1,2,1.0,0.5
2,3,1.0,1.0


# Real data:

In [1]:
# Node identifier; the following cells use it to build the input and output file paths.
node="r183c12s04"

In [9]:
# Load this node's raw dataset and discard fully duplicated rows.
raw_dataset_path = "raw_data/{0}/{0}_dataset.csv".format(node)
raw_dataset = pd.read_csv(raw_dataset_path).drop_duplicates()

In [10]:
# Reproducible 80/20 split: sample 80% of the rows with a fixed seed for training
# and keep the remaining rows (selected by index label) for testing.
# NOTE(review): in the cells visible here `train_dataset` is only used for summary
# statistics and `test_dataset` is never used; the scaler is fit on the full
# `raw_dataset` instead — confirm this is intended.
train_dataset = raw_dataset.sample(frac=0.8,random_state=0)
test_dataset = raw_dataset.drop(train_dataset.index)

In [12]:
# Summary statistics of the training split, transposed so each feature is one row.
train_stats = train_dataset.describe().T
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
avg:boottime,7497.0,1.563683e+09,1.007661e+08,33.0000,1.570182e+09,1.570182e+09,1.570182e+09,1.570182e+09
var:boottime,7497.0,1.814059e-04,6.056402e-03,0.0000,0.000000e+00,0.000000e+00,0.000000e+00,2.400000e-01
avg:bytes_in,7497.0,3.068948e+03,1.485845e+04,0.0000,3.021620e+02,7.470825e+02,1.728850e+03,4.223809e+05
var:bytes_in,7497.0,1.243632e+08,2.241005e+09,0.0000,2.286032e+03,8.756138e+04,1.092787e+06,1.597048e+11
avg:bytes_out,7497.0,3.191125e+03,6.013466e+04,-34.0000,2.392933e+02,6.170993e+02,1.915317e+03,4.472678e+06
var:bytes_out,7497.0,1.882228e+09,1.143585e+11,0.0000,1.225137e+03,6.513592e+04,1.233113e+06,8.437923e+12
avg:core_freq_avg,7497.0,1.888358e+03,2.499849e+02,61.2000,1.674550e+03,2.002333e+03,2.084200e+03,2.100000e+03
var:core_freq_avg,7497.0,4.939357e+03,1.272416e+04,0.0000,4.858083e+01,1.710497e+02,1.674616e+03,8.656289e+04
avg:core_freq_max,7497.0,2.045215e+03,1.581868e+02,-36.6400,2.039200e+03,2.099600e+03,2.100000e+03,2.100000e+03
var:core_freq_max,7497.0,5.046032e+03,1.407647e+04,0.0000,0.000000e+00,5.988889e-01,1.289928e+03,8.990000e+04


In [6]:
from sklearn.preprocessing import MinMaxScaler
def normalize_and_return_scaler(df):
    """Fit a [0, 1] MinMaxScaler on ``df`` and return the scaled frame with the scaler.

    Every column except 'timestamp' is passed to the scaler. The returned frame
    has a fresh 0..n-1 index, 'timestamp' as its first column (values unchanged)
    and the scaled columns after it in their original order.

    Returns a ``(scaled_frame, fitted_scaler)`` tuple.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    features = df.set_index('timestamp')
    scaled_values = scaler.fit_transform(features)
    df_norm = pd.DataFrame(data=scaled_values, columns=features.columns)
    # Put the timestamps back as the leading column; both sides carry a
    # 0..n-1 index here, so the rows line up one-to-one.
    timestamps = features.reset_index()['timestamp']
    df_norm.insert(0, 'timestamp', timestamps)
    return df_norm, scaler

In [57]:
# Column labels of the raw dataset as a plain Python list.
# NOTE(review): `list_col1` is not referenced by any other cell visible here —
# possibly a leftover; confirm before removing.
list_col1 = list(raw_dataset.columns.values)

In [7]:
# Fit the min-max scaler and keep both the scaled frame and the fitted scaler.
# NOTE(review): the scaler is fit on the full `raw_dataset`, not on `train_dataset`,
# so held-out rows influence the scaling ranges — confirm this is intended.
normalized_dataset,scaler = normalize_and_return_scaler(raw_dataset)

In [8]:
# Persist the fitted scaler to disk so it can be reloaded later with joblib.load.
# The './models' directory is assumed to exist already — TODO confirm.
joblib.dump(scaler, './models/scaler_model.pkl') 

['./models/scaler_model.pkl']

In [9]:
# Display the scaled frame for a visual sanity check.
normalized_dataset

Unnamed: 0,timestamp,avg:boottime,var:boottime,avg:bytes_in,var:bytes_in,avg:bytes_out,var:bytes_out,avg:core_freq_avg,var:core_freq_avg,avg:core_freq_max,...,avg:SysBrd_3_3V,var:SysBrd_3_3V,avg:SysBrd_5V,var:SysBrd_5V,avg:Sys_Power,var:Sys_Power,avg:Sys_Utilization,var:Sys_Utilization,avg:System_Air_Flow,var:System_Air_Flow
0,2019-10-03 00:05:00+02:00,0.999915,0.0,0.000508,1.330224e-08,0.000026,5.784396e-11,0.726192,0.001228,0.953688,...,0.0,1.0,0.0,1.0,0.161850,0.000000,0.000000,0.000000,0.716981,0.032922
1,2019-10-03 00:10:00+02:00,0.999915,0.0,0.000593,9.793164e-11,0.000029,5.710245e-13,0.711897,0.001672,0.857322,...,0.0,1.0,0.0,1.0,0.161850,0.000000,0.000000,0.000000,0.641509,0.085391
2,2019-10-03 00:15:00+02:00,0.999915,0.0,0.001211,1.814466e-07,0.000034,3.535379e-11,0.722391,0.003429,0.974645,...,0.0,1.0,0.0,1.0,0.161850,0.000000,0.000000,0.000000,0.654088,0.058642
3,2019-10-03 00:20:00+02:00,0.999915,0.0,0.001776,4.452299e-08,0.000027,1.023543e-10,0.715501,0.001087,0.884969,...,0.0,1.0,0.0,1.0,0.161850,0.000000,0.000000,0.000000,0.660377,0.051440
4,2019-10-03 00:25:00+02:00,0.999915,0.0,0.000877,5.135610e-07,0.000027,2.266847e-11,0.718370,0.000094,0.943841,...,0.0,1.0,0.0,1.0,0.161850,0.000000,0.000000,0.000000,0.647799,0.003086
5,2019-10-03 00:30:00+02:00,0.999915,0.0,0.000419,1.333400e-10,0.000024,3.110016e-11,0.712755,0.001260,0.867380,...,0.0,1.0,0.0,1.0,0.161850,0.000000,0.000000,0.000000,0.666667,0.094650
6,2019-10-03 00:35:00+02:00,0.999915,0.0,0.000435,2.417803e-10,0.000027,2.928006e-11,0.717569,0.002217,0.932247,...,0.0,1.0,0.0,1.0,0.161850,0.000000,0.000000,0.000000,0.716981,0.017490
7,2019-10-03 00:40:00+02:00,0.999915,0.0,0.000571,7.735568e-09,0.000030,1.381326e-12,0.718615,0.000041,0.952027,...,0.0,1.0,0.0,1.0,0.161850,0.000000,0.000000,0.000000,0.672956,0.049383
8,2019-10-03 00:45:00+02:00,0.999915,0.0,0.000483,9.761416e-09,0.000039,6.956686e-11,0.723249,0.000360,0.961383,...,0.0,1.0,0.0,1.0,0.161850,0.000000,0.000000,0.000000,0.672956,0.064815
9,2019-10-03 00:50:00+02:00,0.999915,0.0,0.013039,8.354911e-05,0.001120,1.275920e-06,0.713572,0.000554,0.872814,...,0.0,1.0,0.0,1.0,0.161850,0.000000,0.000000,0.000000,0.635220,0.002058


In [40]:
# Load the label file stored alongside this node's raw data.
labels_path = "raw_data/{}/labels_all.csv".format(node)
labels = pd.read_csv(labels_path)

In [41]:
# Combine the scaled features with the labels through the project helper, then
# rename any 'index' column to 'timestamp' (presumably the helper returns the
# timestamp under that name — confirm against the library source).
# NOTE(review): the output displayed below contains repeated timestamps with
# identical rows — check whether the labels file or the merge introduces duplicates.
final_data = fe.merge_data_on_timestamp([normalized_dataset, labels]).rename(columns={'index':'timestamp'})

In [42]:
# Display the merged features-plus-label frame.
final_data

Unnamed: 0,timestamp,avg:boottime,var:boottime,avg:bytes_in,var:bytes_in,avg:bytes_out,var:bytes_out,avg:core_freq_avg,var:core_freq_avg,avg:core_freq_max,...,var:SysBrd_3_3V,avg:SysBrd_5V,var:SysBrd_5V,avg:Sys_Power,var:Sys_Power,avg:Sys_Utilization,var:Sys_Utilization,avg:System_Air_Flow,var:System_Air_Flow,label
0,2019-10-07 00:00:00+02:00,1.0,0.0,0.073629,1.240056e-03,0.000100,2.779974e-09,0.714520,0.000980,0.941931,...,0.0,1.0,0.0,0.161850,0.000000,0.000000,0.000000,0.402516,0.032922,1.0
1,2019-10-07 00:05:00+02:00,1.0,0.0,0.057460,1.943532e-03,0.000076,7.203066e-10,0.726012,0.000359,0.945689,...,0.0,1.0,0.0,0.161850,0.000000,0.000000,0.000000,0.389937,0.049383,1.0
2,2019-10-07 00:05:00+02:00,1.0,0.0,0.057460,1.943532e-03,0.000076,7.203066e-10,0.726012,0.000359,0.945689,...,0.0,1.0,0.0,0.161850,0.000000,0.000000,0.000000,0.389937,0.049383,1.0
3,2019-10-07 00:10:00+02:00,1.0,0.0,0.007906,8.976361e-05,0.000049,1.017684e-09,0.722416,0.000204,0.935966,...,0.0,1.0,0.0,0.161850,0.000000,0.000000,0.000000,0.320755,0.048354,1.0
4,2019-10-07 00:10:00+02:00,1.0,0.0,0.007906,8.976361e-05,0.000049,1.017684e-09,0.722416,0.000204,0.935966,...,0.0,1.0,0.0,0.161850,0.000000,0.000000,0.000000,0.320755,0.048354,1.0
5,2019-10-07 00:15:00+02:00,1.0,0.0,0.001798,3.644319e-06,0.000059,3.467605e-10,0.718043,0.000655,0.923889,...,0.0,1.0,0.0,0.161850,0.000000,0.000000,0.000000,0.333333,0.131687,1.0
6,2019-10-07 00:15:00+02:00,1.0,0.0,0.001798,3.644319e-06,0.000059,3.467605e-10,0.718043,0.000655,0.923889,...,0.0,1.0,0.0,0.161850,0.000000,0.000000,0.000000,0.333333,0.131687,1.0
7,2019-10-07 00:20:00+02:00,1.0,0.0,0.001381,1.447338e-07,0.000067,1.572277e-13,0.715281,0.000220,0.951427,...,0.0,1.0,0.0,0.161850,0.000000,0.000000,0.000000,0.352201,0.048354,1.0
8,2019-10-07 00:20:00+02:00,1.0,0.0,0.001381,1.447338e-07,0.000067,1.572277e-13,0.715281,0.000220,0.951427,...,0.0,1.0,0.0,0.161850,0.000000,0.000000,0.000000,0.352201,0.048354,1.0
9,2019-10-07 00:25:00+02:00,1.0,0.0,0.009470,1.519672e-04,0.000056,9.016576e-11,0.725865,0.002217,0.966202,...,0.0,1.0,0.0,0.161850,0.000000,0.000000,0.000000,0.452830,0.131687,1.0


In [43]:
# Write the merged, normalized dataset for this node without the row index.
normalised_path = "final_data/{}_normalised.csv".format(node)
final_data.to_csv(normalised_path, index=False)

In [44]:
# Show only the rows whose label equals 1.
final_data.loc[final_data['label'].eq(1)]

Unnamed: 0,timestamp,avg:boottime,var:boottime,avg:bytes_in,var:bytes_in,avg:bytes_out,var:bytes_out,avg:core_freq_avg,var:core_freq_avg,avg:core_freq_max,...,var:SysBrd_3_3V,avg:SysBrd_5V,var:SysBrd_5V,avg:Sys_Power,var:Sys_Power,avg:Sys_Utilization,var:Sys_Utilization,avg:System_Air_Flow,var:System_Air_Flow,label
0,2019-10-07 00:00:00+02:00,1.0,0.0,0.073629,1.240056e-03,0.000100,2.779974e-09,0.714520,0.000980,0.941931,...,0.0,1.0,0.0,0.16185,0.0,0.0,0.0,0.402516,0.032922,1.0
1,2019-10-07 00:05:00+02:00,1.0,0.0,0.057460,1.943532e-03,0.000076,7.203066e-10,0.726012,0.000359,0.945689,...,0.0,1.0,0.0,0.16185,0.0,0.0,0.0,0.389937,0.049383,1.0
2,2019-10-07 00:05:00+02:00,1.0,0.0,0.057460,1.943532e-03,0.000076,7.203066e-10,0.726012,0.000359,0.945689,...,0.0,1.0,0.0,0.16185,0.0,0.0,0.0,0.389937,0.049383,1.0
3,2019-10-07 00:10:00+02:00,1.0,0.0,0.007906,8.976361e-05,0.000049,1.017684e-09,0.722416,0.000204,0.935966,...,0.0,1.0,0.0,0.16185,0.0,0.0,0.0,0.320755,0.048354,1.0
4,2019-10-07 00:10:00+02:00,1.0,0.0,0.007906,8.976361e-05,0.000049,1.017684e-09,0.722416,0.000204,0.935966,...,0.0,1.0,0.0,0.16185,0.0,0.0,0.0,0.320755,0.048354,1.0
5,2019-10-07 00:15:00+02:00,1.0,0.0,0.001798,3.644319e-06,0.000059,3.467605e-10,0.718043,0.000655,0.923889,...,0.0,1.0,0.0,0.16185,0.0,0.0,0.0,0.333333,0.131687,1.0
6,2019-10-07 00:15:00+02:00,1.0,0.0,0.001798,3.644319e-06,0.000059,3.467605e-10,0.718043,0.000655,0.923889,...,0.0,1.0,0.0,0.16185,0.0,0.0,0.0,0.333333,0.131687,1.0
7,2019-10-07 00:20:00+02:00,1.0,0.0,0.001381,1.447338e-07,0.000067,1.572277e-13,0.715281,0.000220,0.951427,...,0.0,1.0,0.0,0.16185,0.0,0.0,0.0,0.352201,0.048354,1.0
8,2019-10-07 00:20:00+02:00,1.0,0.0,0.001381,1.447338e-07,0.000067,1.572277e-13,0.715281,0.000220,0.951427,...,0.0,1.0,0.0,0.16185,0.0,0.0,0.0,0.352201,0.048354,1.0
9,2019-10-07 00:25:00+02:00,1.0,0.0,0.009470,1.519672e-04,0.000056,9.016576e-11,0.725865,0.002217,0.966202,...,0.0,1.0,0.0,0.16185,0.0,0.0,0.0,0.452830,0.131687,1.0


In [None]:
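# NODE r183c09s01
# NOTE(review): the next cell still sets node="r183c12s04"; update it if this
# section is meant to process node r183c09s01.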

In [1]:
# Node identifier used to build the input and output paths in the cells below.
node="r183c12s04"

In [4]:
# Read the raw dataset of the selected node and remove fully duplicated rows.
dataset_csv = "raw_data/{0}/{0}_dataset.csv".format(node)
raw_dataset = pd.read_csv(dataset_csv).drop_duplicates()

In [5]:
# Preview the first 10 rows of the raw (not yet scaled) data.
raw_dataset.head(10)

Unnamed: 0,timestamp,avg:boottime,var:boottime,avg:bytes_in,var:bytes_in,avg:bytes_out,var:bytes_out,avg:core_freq_avg,var:core_freq_avg,avg:core_freq_max,...,avg:SysBrd_3_3V,var:SysBrd_3_3V,avg:SysBrd_5V,var:SysBrd_5V,avg:Sys_Power,var:Sys_Power,avg:Sys_Utilization,var:Sys_Utilization,avg:System_Air_Flow,var:System_Air_Flow
0,2019-10-03 00:05:00+02:00,1570048000.0,0.0,216.116,2124.431,82.707,488.0829,1541.65,106.3275,2001.0,...,3.3015,1.9721520000000001e-31,5.06,7.888609e-31,120.0,0.0,0.0,0.0,33.8,2.56
1,2019-10-03 00:10:00+02:00,1570048000.0,0.0,252.331,15.64015,93.817,4.818261,1512.5,144.75,1795.0,...,3.3015,1.9721520000000001e-31,5.06,7.888609e-31,120.0,0.0,0.0,0.0,31.4,6.64
2,2019-10-03 00:15:00+02:00,1570048000.0,0.0,514.949,28977.89,118.763,298.3126,1533.9,296.79,2045.8,...,3.3015,1.9721520000000001e-31,5.06,7.888609e-31,120.0,0.0,0.0,0.0,31.8,4.56
3,2019-10-03 00:20:00+02:00,1570048000.0,0.0,755.197,7110.533,85.179,863.6579,1519.85,94.1275,1854.1,...,3.3015,1.9721520000000001e-31,5.06,7.888609e-31,120.0,0.0,0.0,0.0,32.0,4.0
4,2019-10-03 00:25:00+02:00,1570048000.0,0.0,372.935,82018.13,87.066,191.2748,1525.7,8.11,1979.95,...,3.3015,1.9721520000000001e-31,5.06,7.888609e-31,120.0,0.0,0.0,0.0,31.6,0.24
5,2019-10-03 00:30:00+02:00,1570048000.0,0.0,178.401,21.29503,71.375,262.4207,1514.25,109.0875,1816.5,...,3.3015,1.9721520000000001e-31,5.06,7.888609e-31,120.0,0.0,0.0,0.0,32.2,7.36
6,2019-10-03 00:35:00+02:00,1570048000.0,0.0,184.872,38.61346,84.78,247.0629,1524.066667,191.928889,1955.166667,...,3.3015,1.9721520000000001e-31,5.06,7.888609e-31,120.0,0.0,0.0,0.0,33.8,1.36
7,2019-10-03 00:40:00+02:00,1570048000.0,0.0,242.63,1235.407,100.285,11.65553,1526.2,3.56,1997.45,...,3.3015,1.9721520000000001e-31,5.06,7.888609e-31,120.0,0.0,0.0,0.0,32.4,3.84
8,2019-10-03 00:45:00+02:00,1570048000.0,0.0,205.328,1558.945,139.529,586.9997,1535.65,31.1275,2017.45,...,3.3015,1.9721520000000001e-31,5.06,7.888609e-31,120.0,0.0,0.0,0.0,32.4,5.04
9,2019-10-03 00:50:00+02:00,1570048000.0,0.0,5545.409667,13343190.0,4975.363667,10766120.0,1515.916667,47.943056,1828.116667,...,3.3015,1.9721520000000001e-31,5.06,7.888609e-31,120.0,0.0,0.0,0.0,31.2,0.16


In [6]:
# Reload the scaler stored at './models/scaler_model.pkl'.
# NOTE(review): joblib files are pickle-based — only load files from a trusted source.
scaler = joblib.load('./models/scaler_model.pkl') 

In [7]:
def normalize_data(df, fitted_scaler=None):
    """Scale the non-timestamp columns of ``df`` with an already-fitted scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'timestamp' column. Every other column is handed to
        the scaler's ``transform`` (presumably they must match the columns the
        scaler was fitted on — verify against the fitting cell).
    fitted_scaler : object exposing ``transform``, optional
        Scaler to apply. When omitted, the module-level ``scaler`` is used, so
        existing ``normalize_data(df)`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, 'timestamp' as the first column
        (values unchanged) followed by the transformed columns in their
        original order. ``df`` itself is left unmodified.
    """
    if fitted_scaler is None:
        # Backward-compatible fallback to the notebook-level scaler; reading a
        # module-level name needs no `global` statement.
        fitted_scaler = scaler
    features = df.set_index('timestamp')
    df_norm = pd.DataFrame(
        data=fitted_scaler.transform(features),
        columns=features.columns,
    )
    # Re-attach the timestamps positionally as the leading column.
    df_norm.insert(0, 'timestamp', features.index)
    return df_norm

In [8]:
# Scale the raw data with the previously fitted scaler reloaded from disk.
norm_data1 = normalize_data(raw_dataset)

In [10]:
# Read the labels that belong to the selected node.
labels_csv = "raw_data/{}/labels_all.csv".format(node)
labels = pd.read_csv(labels_csv)

In [11]:
# Combine the scaled features with the labels through the project helper, then
# rename any 'index' column to 'timestamp' (presumably the helper returns the
# timestamp under that name — confirm against the library source).
final_data = fe.merge_data_on_timestamp([norm_data1, labels]).rename(columns={'index':'timestamp'})

In [12]:
# Store the merged, normalized dataset for this node without the row index.
output_csv = "my_code/norm_dataset/{}_normalised.csv".format(node)
final_data.to_csv(output_csv, index=False)