## Scale Features and Build Model
### Scale Raw Features

Import CSV of Aggregated Darshan Logs <br>
Apply Log10 and Percent Scaling

In [1]:
import os
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import random

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load the aggregated Darshan log table. Malformed rows are skipped with a
# warning instead of aborting the load.
try:
    # pandas >= 1.3: `on_bad_lines='warn'` is the replacement for the removed
    # `error_bad_lines=False` (whose companion `warn_bad_lines` defaulted to
    # True, i.e. skip the row and warn).
    df = pd.read_csv("./raws.csv", lineterminator='\n', sep=',', on_bad_lines='warn')
except TypeError:
    # Older pandas rejects the unknown `on_bad_lines` keyword; use the legacy flag.
    df = pd.read_csv("./raws.csv", lineterminator='\n', sep=',', error_bad_lines=False)

In [None]:
# Discard the two leading columns of the loaded table, then create the empty
# frame `f` that the following cells fill with model features.
df = df.drop(columns=df.columns[:2])
f = pd.DataFrame()

In [None]:
# Bare expression: shows the current contents of `df` as the cell output.
df

In [None]:

# Remove every row that contains at least one missing value (the defaults of
# `dropna` are axis=0, how='any'), then list the remaining column names.
df = df.dropna()
df.columns

In [None]:
# File-count columns, copied unchanged into the feature frame.
# NOTE(review): the feature names carry a `log10_` prefix, but no log10 is
# applied in this cell -- confirm whether raw values are intended.
file_count_features = {
    'log10_p_files': 'posix_number_of_files',
    'log10_l_files': 'lustre_number_of_files',
}
for feature_name, source_column in file_count_features.items():
    f[feature_name] = df[source_column]

In [None]:
# Total POSIX accesses = reads + writes. The sum is stored on `df` as
# 'p_accesses' and copied into the feature frame.
df['p_accesses'] = df['posix_reads'].add(df['posix_writes'])
f['log10_p_accesses'] = df.loc[:, 'p_accesses']

# Bare expression: shows the new feature column as the cell output.
f.loc[:, 'log10_p_accesses']

In [None]:
# Byte-volume feature.
# NOTE(review): only 'posix_bytes_read' is used here; written bytes are not
# included in this feature -- confirm that is intended.
f['log10_p_bytes'] = df.loc[:, 'posix_bytes_read']

In [None]:
# POSIX operation counters and the renamed-mode column, copied unchanged
# into the feature frame (feature name -> source column).
posix_op_features = {
    'log10_p_opens': 'posix_opens',
    'log10_p_seeks': 'posix_seeks',
    'log10_p_stats': 'posix_stats',
    'log10_p_mode': 'posix_renamed_mode',
}
for feature_name, source_column in posix_op_features.items():
    f[feature_name] = df[source_column]

In [None]:
# Lustre layout columns, copied unchanged into the feature frame
# (feature name -> source column).
lustre_features = {
    'log10_l_n_osts': 'lustre_number_of_osts',
    'log10_l_stripe_w': 'lustre_stripe_width',
    'log10_l_mdts': 'lustre_mdts',
}
for feature_name, source_column in lustre_features.items():
    f[feature_name] = df[source_column]

In [None]:
# Process count and the two POSIX alignment columns, copied unchanged into
# the feature frame (feature name -> source column).
process_and_alignment_features = {
    'log10_p_nprocs': 'nprocs',
    'log10_p_falign': 'posix_f_align',
    'log10_p_malign': 'posix_m_align',
}
for feature_name, source_column in process_and_alignment_features.items():
    f[feature_name] = df[source_column]

In [None]:
# Read and write counters, copied unchanged into the feature frame.
# NOTE(review): the names carry a `perc_` prefix, but no percentage is
# computed in this cell -- confirm whether raw counts are intended.
for feature_name, source_column in (
    ('perc_p_reads', 'posix_reads'),
    ('perc_p_writes', 'posix_writes'),
):
    f[feature_name] = df[source_column]

In [None]:
# Read-size histogram columns, copied unchanged into the feature frame
# (feature name -> source column), in the original insertion order.
# NOTE(review): 'posix_bytes_read_1lM' breaks the naming pattern of its
# neighbours ('..._1M' would be expected) -- confirm against the CSV header
# before changing it; it is reproduced here exactly as found.
read_histogram_features = {
    'perc_p_bytes_read_100': 'posix_bytes_read_100',
    'perc_p_bytes_read_1K': 'posix_bytes_read_1K',
    'perc_p_bytes_read_10K': 'posix_bytes_read_10K',
    'perc_p_bytes_read_100K': 'posix_bytes_read_100K',
    'perc_p_bytes_read_1M': 'posix_bytes_read_1lM',
    'perc_p_bytes_read_4M': 'posix_bytes_read_4M',
    'perc_p_bytes_read_10M': 'posix_bytes_read_10M',
    'perc_p_bytes_read_100M': 'posix_bytes_read_100M',
    'perc_p_bytes_read_1G': 'posix_bytes_read_1G',
    'perc_p_bytes_read_PLUS': 'posix_bytes_read_PLUS',
}
for feature_name, source_column in read_histogram_features.items():
    f[feature_name] = df[source_column]

In [None]:
# Write-size histogram columns, copied unchanged into the feature frame
# (feature name -> source column), in the original insertion order.
write_histogram_features = {
    'perc_p_bytes_write_100': 'posix_bytes_write_100',
    'perc_p_bytes_write_1K': 'posix_bytes_write_1K',
    'perc_p_bytes_write_10K': 'posix_bytes_write_10K',
    'perc_p_bytes_write_100K': 'posix_bytes_write_100K',
    'perc_p_bytes_write_1M': 'posix_bytes_write_1M',
    'perc_p_bytes_write_4M': 'posix_bytes_write_4M',
    'perc_p_bytes_write_10M': 'posix_bytes_write_10M',
    'perc_p_bytes_write_100M': 'posix_bytes_write_100M',
    'perc_p_bytes_write_1G': 'posix_bytes_write_1G',
    'perc_p_bytes_write_PLUS': 'posix_bytes_write_PLUS',
}
for feature_name, source_column in write_histogram_features.items():
    f[feature_name] = df[source_column]

# Sanitise the whole feature frame: -inf becomes -1, then NaN becomes 0.
f = f.replace(-np.inf, -1).replace(np.nan, 0)

In [None]:
# Total POSIX time per row = write time + read time + metadata time,
# each cast to float before adding (same left-to-right order as before).
df['time'] = (
    df['posix_write_time'].astype('float')
    .add(df['posix_read_time'].astype('float'))
    .add(df['posix_meta_time'].astype('float'))
)

In [None]:
# Total POSIX bytes moved per row = bytes read + bytes written, as floats.
df['bytes'] = (
    df['posix_bytes_read'].astype('float')
    .add(df['posix_bytes_write'].astype('float'))
)

In [None]:
# Keep only rows whose total byte count is strictly greater than 99,999,999.
df = df.loc[df['bytes'].gt(99999999)]


In [None]:

# Throughput = total bytes / total time. Column assignment aligns on the
# index, so rows of `f` that are no longer present in `df` receive NaN here.
f['throughput'] = df['bytes'].astype('float').div(df['time'])

# Keep strictly positive throughput only; NaN rows fail the comparison and
# are dropped as well.
f = f.loc[f['throughput'].gt(0)]

In [None]:
# Keep only the columns that hold at least one non-zero value.
f = f.loc[:, f.ne(0).any()]

# Turn +/-inf into NaN, then drop every row that contains a NaN.
f = f.replace([np.inf, -np.inf], np.nan)
f = f.dropna(axis=0)

# Bare expression: shows the per-column maximum as the cell output.
f.max()

In [None]:
# Move the target into its own one-column frame `t` (same index as `f`) and
# remove it from the feature frame.
t = f[['throughput']].copy()
f = f.drop(columns='throughput')

# Bare expression: shows the remaining feature columns as the cell output.
f

In [None]:
# Restrict `df` to the rows that survived into the target frame, using the
# index labels both frames still share at this point.
df = df.loc[df.index.isin(t.index)]

# Replace the index of `t` and `f` with a fresh 0..n-1 range and discard the
# old labels (equivalent to reset_index() followed by dropping the new
# leading column). `df` keeps its original index.
t = t.reset_index(drop=True)
f = f.reset_index(drop=True)

In [None]:
# Standardise the feature frame with a default-configured StandardScaler;
# `f` is rebound to the transformed result.
# NOTE(review): the scaler is fitted on all rows before the train/test
# splits performed in the model cells, so test rows contribute to the
# scaling statistics -- confirm this is acceptable.
scaler = StandardScaler()
f = scaler.fit_transform(f)

In [None]:
# Bare expression: shows the target frame `t` as the cell output.
t

In [None]:
# Print the column-wise minimum and then the maximum of the target frame.
print(t.min(), t.max(), sep='\n')

In [None]:
# Split settings: initial random seed and the test-set fraction passed to
# train_test_split as `test_size`.
rseed, t_size = 0, 0.1

In [None]:

# Scatter plot of bytes read (x) against bytes written (y) on log-log axes,
# with each marker coloured by log10 of its throughput.
fig, ax = plt.subplots()
fig.suptitle('Bytes Read vs. Bytes Written Shaded by Throughput', fontsize=14, fontweight='bold')

sp = ax.scatter(
    df['posix_bytes_read'],
    df['posix_bytes_write'],
    c=np.log10(t['throughput']),
    cmap='viridis',
    marker='x',
)

# NOTE(review): both labels end in a stray-looking " s" -- confirm wording.
ax.set_xlabel('Bytes Read s')
ax.set_ylabel('Bytes Written s')

# Log scale on both axes, with fixed limits of 10**0 .. 10**15.
ax.loglog()
ax.set_xlim(10**0, 10**15)
ax.set_ylim(10**0, 10**15)

fig.colorbar(sp)
plt.show()

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_percentage_error


In [None]:
from sklearn.linear_model import LinearRegression

# Fit and score a LinearRegression model on three random train/test splits
# (test fraction `t_size`).
for trial in range(3):
    # Draw and print a fresh split seed so an individual run can be repeated.
    rseed = random.randint(1, 10000)
    print(rseed)
    train_data, test_data, train_labels, test_labels = train_test_split(
        f, t, test_size=t_size, random_state=rseed)

    reg = LinearRegression().fit(train_data, train_labels)
    predicted_labels = reg.predict(test_data)

    # Compute MSE once and derive RMSE with np.sqrt: the `squared=False`
    # keyword is deprecated/removed in recent scikit-learn releases.
    mse = mean_squared_error(test_labels, predicted_labels)

    # `t` is a one-column frame, so select the column to get a scalar mean;
    # int() on a one-element Series is deprecated in recent pandas.
    print("Mean True Value: \t", int(test_labels['throughput'].mean()))
    print("Mean Absolute Error: \t", int(mean_absolute_error(test_labels, predicted_labels)))
    print("Mean Squared Error: ", mse)
    print("Root Mean Squared Error: ", np.sqrt(mse))
    print("MAPE :" + str(mean_absolute_percentage_error(test_labels, predicted_labels)))
    print("R2: " + str(r2_score(test_labels, predicted_labels)) + "\n")
    

In [None]:
import xgboost as xg

In [None]:
# Extreme gradient boosting: fit and score an XGBRegressor (1000 estimators)
# on three random train/test splits (test fraction `t_size`).
for trial in range(3):
    # Draw and print a fresh split seed so an individual run can be repeated.
    rseed = random.randint(1, 10000)
    print(rseed)

    train_data, test_data, train_labels, test_labels = train_test_split(
        f, t, test_size=t_size, random_state=rseed)
    xgb_r = xg.XGBRegressor(n_estimators=1000, seed=123)
    xgb_r.fit(train_data, train_labels)
    predicted_labels = xgb_r.predict(test_data)

    # Compute MSE once and derive RMSE with np.sqrt: the `squared=False`
    # keyword is deprecated/removed in recent scikit-learn releases.
    mse = mean_squared_error(test_labels, predicted_labels)

    # `t` is a one-column frame, so select the column to get a scalar mean;
    # int() on a one-element Series is deprecated in recent pandas.
    print("Mean True Value: \t", int(test_labels['throughput'].mean()))
    print("Mean Absolute Error: \t", int(mean_absolute_error(test_labels, predicted_labels)))
    print("Mean Squared Error: ", mse)
    print("Root Mean Squared Error: ", np.sqrt(mse))
    print("MAPE :" + str(mean_absolute_percentage_error(test_labels, predicted_labels)))
    print("R2: " + str(r2_score(test_labels, predicted_labels)) + "\n")
    

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit and score a depth-4 DecisionTreeRegressor on three random train/test
# splits (test fraction `t_size`).
for trial in range(3):
    # Draw and print a fresh split seed so an individual run can be repeated.
    rseed = random.randint(1, 10000)
    print(rseed)

    train_data, test_data, train_labels, test_labels = train_test_split(
        f, t, test_size=t_size, random_state=rseed)

    reg = DecisionTreeRegressor(max_depth=4)
    reg.fit(train_data, train_labels)
    predicted_labels = reg.predict(test_data)

    # Compute MSE once and derive RMSE with np.sqrt: the `squared=False`
    # keyword is deprecated/removed in recent scikit-learn releases.
    mse = mean_squared_error(test_labels, predicted_labels)

    # `t` is a one-column frame, so select the column to get a scalar mean;
    # int() on a one-element Series is deprecated in recent pandas.
    print("Mean True Value: \t", int(test_labels['throughput'].mean()))
    print("Mean Absolute Error: \t", int(mean_absolute_error(test_labels, predicted_labels)))
    print("Mean Squared Error: ", mse)
    print("Root Mean Squared Error: ", np.sqrt(mse))
    print("MAPE :" + str(mean_absolute_percentage_error(test_labels, predicted_labels)))
    print("R2: " + str(r2_score(test_labels, predicted_labels)) + "\n")
    
    