In [None]:
import fnmatch
import os
import sys
from datetime import datetime,timedelta
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import pearsonr,spearmanr
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
%matplotlib notebook
# import plotly.express as px
from IPython.core.display import display, HTML
from IPython.core.debugger import set_trace
display(HTML('<style>.container { width:90% !important; }</style>')) 
pd.set_option('display.max_columns', None)
!pwd

In [None]:
# Root folder of the experiments; the loaders below join experiment folder names onto it.
# NOTE(review): absolute, machine-specific path - adjust when running on another machine.
data_path='/home/gkoren2/share/Data/MLA/DTT/scarlet/experiments'
# data_path='D:\\MLA\\Data\\DTT\\Scarlet\\experiments'
# list the entries found under data_path
sorted(os.listdir(data_path))

In [None]:
def find(pattern, path):
    """Recursively collect the files below `path` whose name matches `pattern`.

    `pattern` is an fnmatch-style glob applied to the bare file name (not the
    full path). Returns a list of joined paths in os.walk order; the list is
    empty when nothing matches.
    """
    return [
        os.path.join(folder, fname)
        for folder, _subdirs, fnames in os.walk(path)
        for fname in fnames
        if fnmatch.fnmatch(fname, pattern)
    ]

# Explore specific esif

In [None]:
# Locate the ESIF log and the score workbook of a single experiment folder.
folder_name='rl_greedy_1'
folder_name=os.path.join(data_path,folder_name)
# folder_name already contains data_path; joining it onto data_path a second time
# only worked because data_path is absolute (os.path.join then drops the first part).
esif_file=find('*_esif.csv',folder_name)[0]
score_file=find('*.xlsx',folder_name)[0]
print(esif_file)

In [None]:
# Read the score workbook and keep the rows of the first run.
# `pd.datetime` was deprecated in pandas 1.0 and later removed; use the `datetime`
# class imported at the top of the notebook (as the "testing" cell further down does).
tslog_parser = lambda x: datetime.strptime(x, '%B%d %H:%M:%S')
xldf = pd.read_excel(score_file,  parse_dates = ['start_time', 'end_time'], date_parser = tslog_parser)
xlf=xldf[(xldf.Run_number==1)]
xlf

In [None]:
# Load the ESIF log, order it by time and rescale the power-limit columns.
esif_df=pd.read_csv(esif_file)
esif_df['timestamp']=pd.to_datetime(esif_df['timestamp'])
# sort_values() returns a new frame; the previous
# `esif_df.sort_values(...).reset_index(inplace=True)` reset the index of that
# temporary and left esif_df unsorted. Re-bind the sorted result instead.
esif_df=esif_df.sort_values(by='timestamp').reset_index(drop=True)
# scale the PL1/PL2 columns by 1/1000 (presumably mW -> W; confirm against the logger)
esif_df[['MMIO_PL1','MMIO_PL2']] = esif_df[['MMIO_PL1','MMIO_PL2']]/1000
print(esif_df.shape)
esif_df.columns

In [None]:
# overview plot of power, temperatures and power limits over the row index
esif_df.loc[:,['POWER','tj','tskin','MMIO_PL1','MMIO_PL2']].plot(figsize=(16,4),grid=True)

In [None]:
# peek at the source file name recorded for one row (index label 15000)
esif_df.loc[15000,'File_name']

# get_data_frame

In [None]:
def get_data_frame_esif(data_filters):
    """Collect consecutive-sample pairs (n, n-1) from the ESIF logs.

    Parameters
    ----------
    data_filters : list of dict
        Each dict has the keys 'folders' (experiment folder names under
        ``data_path``) and 'traces' (``(trace_name, num_of_sec_between)`` tuples),
        e.g. {'folders':['rl_rnd_64_3','rl_rnd_8'],'traces':[('cinebench',120),('cinebench',30)]}

    Returns
    -------
    pandas.DataFrame
        One row per pair of consecutive samples with the columns
        p_n, tj_n, tskn_n, ips_n (sample n) and p_nm1, tj_nm1, tskn_nm1, ips_nm1
        (sample n-1). Both ips columns are clipped at their 0.999 quantile.
        The frame is empty (with these columns) when nothing matched. The
        placeholder columns 'p', 'tj', 'tskn', 'ips' of the earlier version were
        never filled and are no longer emitted.

    Raises
    ------
    IndexError
        If a folder holds no '*_esif.csv' or no '*.xlsx' file.
    """
    # pd.datetime was removed from pandas; use the datetime class directly
    tslog_parser = lambda x: datetime.strptime(x, '%B%d %H:%M:%S')
    ips_cols=['cpu{}_inst_delta'.format(i) for i in range(8)]
    out_cols=['p_n','tj_n','tskn_n','ips_n','p_nm1','tj_nm1','tskn_nm1','ips_nm1']
    # per-file frames are gathered here and concatenated once at the end
    # (DataFrame.append is quadratic in a loop and no longer exists in pandas 2)
    frames=[]

    for filt in data_filters:
        for folder in filt['folders']:   # folder = 'rl_rnd_64_3'
            fpath=os.path.join(data_path,folder)
            esif_file=find('*_esif.csv',fpath)[0]
            score_file=find('*.xlsx',fpath)[0]
            xldf = pd.read_excel(score_file,  parse_dates = ['start_time', 'end_time'], date_parser = tslog_parser)
            print('='*30,'analyzing esif',esif_file,'='*30)
            esif_df=pd.read_csv(esif_file)
            # mean of the per-cpu instruction deltas
            esif_df['ips']=esif_df.loc[:,ips_cols].mean(axis=1)
            # extract the data from the esif according to the 'traces'
            for trace in filt['traces']:   # e.g. trace = ('cinebench',120)
                # find the rows of the 1st run of this filter. there might be more than one
                print('-'*20,'analyzing trace',trace,'-'*20)
                xlf=xldf[((xldf.trace_name==trace[0]) & (xldf.num_of_sec_between==trace[1]) & (xldf.Run_number==1))]
                # file prefix = trace name + start hour, e.g. ['cinebench_23', 'cinebench_08'];
                # de-duplicated (order kept) so that two runs sharing a prefix are not collected twice
                fn_prefix = list(dict.fromkeys(trace[0]+'_'+x.strftime('%H') for x in xlf['start_time']))
                for fnp in fn_prefix:
                    # compute the row selection once instead of once per column
                    sel=esif_df.loc[esif_df['File_name'].str.contains(fnp,regex=False,na=False)]
                    tj=sel['tj'].values
                    tskn=sel['tskin'].values
                    p=sel['POWER'].values
                    ips=sel['ips'].values
                    n_pairs=max(len(p)-1,0)
                    print('folder',folder,' trace',trace,' file prefix',fnp,'found {} samples'.format(n_pairs))
                    if n_pairs:
                        frames.append(pd.DataFrame({'p_n':p[1:],'tj_n':tj[1:],'tskn_n':tskn[1:],'ips_n':ips[1:],
                                                    'p_nm1':p[:-1],'tj_nm1':tj[:-1],'tskn_nm1':tskn[:-1],
                                                    'ips_nm1':ips[:-1]}))
            print('='*30,'done with esif',folder,'='*30)

    # Post-processing runs once, after ALL filters. It used to be indented inside
    # the `for filt` loop, so the function returned after the first filter.
    if frames:
        data_df=pd.concat(frames,ignore_index=True)
        # clean the ips: cap outliers at the 0.999 quantile
        for col in ('ips_n','ips_nm1'):
            data_df[col]=data_df[col].clip(upper=data_df[col].quantile(0.999))
    else:
        data_df=pd.DataFrame(columns=out_cols)
    print('total samples:',len(data_df))
    return data_df

In [None]:
def get_data_frame_tat(data_filters):
    """Collect consecutive-sample pairs (n, n-1) from the TAT logs.

    Parameters
    ----------
    data_filters : list of dict
        Each dict has the keys 'folders' (experiment folder names under
        ``data_path``) and 'traces' (``(trace_name, num_of_sec_between)`` tuples),
        e.g. {'folders':['rl_rnd_64_3','rl_rnd_8'],'traces':[('cinebench',120),('cinebench',30)]}

    Returns
    -------
    pandas.DataFrame
        One row per pair of consecutive samples with the columns
        p_n, tj_n, tskn_n, tm_n (sample n) and tm_nm1, p_nm1, tj_nm1, tskn_nm1
        (sample n-1). The frame is empty (with these columns) when nothing
        matched. The placeholder columns 'p', 'tj', 'tskn', 'tmem' of the earlier
        version were never filled and are no longer emitted.

    Raises
    ------
    IndexError
        If a folder holds no '*_TAT.csv' or no '*.xlsx' file.
    """
    # pd.datetime was removed from pandas; use the datetime class directly
    tslog_parser = lambda x: datetime.strptime(x, '%B%d %H:%M:%S')
    out_cols=['p_n','tj_n','tskn_n','tm_n','tm_nm1','p_nm1','tj_nm1','tskn_nm1']
    # per-file frames are gathered here and concatenated once at the end
    # (DataFrame.append is quadratic in a loop and no longer exists in pandas 2)
    frames=[]

    for filt in data_filters:
        for folder in filt['folders']:   # folder = 'rl_rnd_64_3'
            fpath=os.path.join(data_path,folder)
            tat_file=find('*_TAT.csv',fpath)[0]
            score_file=find('*.xlsx',fpath)[0]
            xldf = pd.read_excel(score_file,  parse_dates = ['start_time', 'end_time'], date_parser = tslog_parser)
            print('='*30,'analyzing tat',tat_file,'='*30)
            tat_df=pd.read_csv(tat_file)
            # extract the data from the tat log according to the 'traces'
            for trace in filt['traces']:   # e.g. trace = ('cinebench',120)
                # find the rows of the 1st run of this filter. there might be more than one
                print('-'*20,'analyzing trace',trace,'-'*20)
                xlf=xldf[((xldf.trace_name==trace[0]) & (xldf.num_of_sec_between==trace[1]) & (xldf.Run_number==1))]
                # file prefix = trace name + start hour, e.g. ['cinebench_23', 'cinebench_08'];
                # de-duplicated (order kept) so that two runs sharing a prefix are not collected twice
                fn_prefix = list(dict.fromkeys(trace[0]+'_'+x.strftime('%H') for x in xlf['start_time']))
                for fnp in fn_prefix:
                    # compute the row selection once instead of once per column
                    sel=tat_df.loc[tat_df['File_name'].str.contains(fnp,regex=False,na=False)]
                    tj=sel['tj'].values
                    tskn=sel['TSKN-temp(Degree C)'].values
                    tmem=sel['TMEM-temp(Degree C)'].values
                    p=sel['POWER'].values
                    n_pairs=max(len(p)-1,0)
                    print('folder',folder,' trace',trace,' file prefix',fnp,'found {} samples'.format(n_pairs))
                    if n_pairs:
                        frames.append(pd.DataFrame({'p_n':p[1:],'tj_n':tj[1:],'tskn_n':tskn[1:],'tm_n':tmem[1:],'tm_nm1':tmem[:-1],
                                                    'p_nm1':p[:-1],'tj_nm1':tj[:-1],'tskn_nm1':tskn[:-1]}))
            print('='*30,'done with tat',folder,'='*30)

    # Assemble once, after ALL filters. The summary/return used to be indented
    # inside the `for filt` loop, so the function returned after the first filter.
    data_df=pd.concat(frames,ignore_index=True) if frames else pd.DataFrame(columns=out_cols)
    print('total samples:',len(data_df))
    return data_df

# Train and test models
in this section we train models to predict:
- Tj given power
- ips mean given power
- Tskin given Tj
- Tmem given Tj

In [None]:
# Train/test split for the Tj model. The folders are the same for cb15 and cb20;
# only the trace name differs: 'cb20' here, 'cinebench' for the cb15 runs.
_tj_trace_name='cb20'
_tj_gaps=(300,240,180,120,90,60,30,1)
train_data=[dict(folders=['rl_rnd_64','rl_rnd_64_3','rl_rnd_64_4','rl_rnd_64_5','rl_rnd_64_6',
                          'psvt_at-9_25_45_64-fixed_1','psvt_at-9_25_45_64-greedy_1'],
                 traces=[(_tj_trace_name,gap) for gap in _tj_gaps])]
test_data =[dict(folders=['rl_rnd_7','rl_rnd_8','psvt_at-9_25_45_64-base_1'],
                 traces=[(_tj_trace_name,gap) for gap in _tj_gaps])]

In [None]:
# build the lagged-sample frames for the train and test filters defined above
trndf=get_data_frame_esif(train_data)
tstdf=get_data_frame_esif(test_data)
# (the model cells below pull the columns they need out of trndf / tstdf)
# p_n=trndf['p_n'].values
# tj_n=trndf['tj_n'].values
# tskn_n=trndf['tskn_n'].values
# p_nm1=trndf['p_nm1'].values
# tj_nm1=trndf['tj_nm1'].values
# tskn_nm1=trndf['tskn_nm1'].values
# ips_n=trndf['ips_n'].values
# ips_nm1=trndf['ips_nm1'].values

## model tj (power)

In [None]:
# train: linear model of tj[n] on (p[n], tj[n-1])
p_n,tj_n,tj_nm1=(trndf[col].to_numpy() for col in ('p_n','tj_n','tj_nm1'))
tjfit2=LinearRegression().fit(np.vstack((p_n,tj_nm1)).T,tj_n)
print(tjfit2.coef_,tjfit2.intercept_,sep='\n')

In [None]:
# test: predict tj on the held-out frame, cap the estimate at 102 and plot it against the measurement
p_n,tj_n,tj_nm1=(tstdf[col].to_numpy() for col in ('p_n','tj_n','tj_nm1'))
tjest2=np.clip(tjfit2.predict(np.vstack((p_n,tj_nm1)).T),None,102)
tstdf['tjest2']=tjest2
tstdf[['tj_n','tjest2']].plot(figsize=(8,4),grid=True)

## model tskn

In [None]:
# Train/test data for the Tskin model: cinebench and cb20 traces at six gaps.
# NOTE(review): 'psvt_at-9_25_45_64-base_1' is in both the train and the test
# folders, so the test below is not on unseen data - confirm this is intended.
_tskn_gaps=(300,180,120,60,30,1)
_tskn_traces=[(name,gap) for name in ('cinebench','cb20') for gap in _tskn_gaps]
train_data=[dict(folders=['psvt_at-9_25_45_64-fixed_1','psvt_at-9_25_45_64-greedy_1','psvt_at-9_25_45_64-base_1'],
                 traces=list(_tskn_traces))]
test_data =[dict(folders=['psvt_at-9_25_45_64-base_1'],
                 traces=list(_tskn_traces))]

trndf=get_data_frame_esif(train_data)
tstdf=get_data_frame_esif(test_data)

In [None]:
# train: linear model of tskn[n] on (tj[n], tj[n-1], tskn[n-1])
tj_n,tskn_n,tj_nm1,tskn_nm1=(trndf[col].to_numpy() for col in ('tj_n','tskn_n','tj_nm1','tskn_nm1'))
tsfit2=LinearRegression().fit(np.vstack((tj_n,tj_nm1,tskn_nm1)).T,tskn_n)
print(tsfit2.coef_,tsfit2.intercept_,sep='\n')

In [None]:
# test: predict tskn on the test frame and plot it against the measurement
tj_n,tskn_n,tj_nm1,tskn_nm1=(tstdf[col].to_numpy() for col in ('tj_n','tskn_n','tj_nm1','tskn_nm1'))
tskest2=tsfit2.predict(np.vstack((tj_n,tj_nm1,tskn_nm1)).T)
tstdf['tskest2']=tskest2
tstdf[['tskn_n','tskest2']].plot(figsize=(16,4),grid=True)

## model ips_mean

In [None]:
# Train/test data for the ips model: cinebench and cb20 traces at six gaps.
# NOTE(review): 'psvt_at-9_25_45_64-base_1' is in both the train and the test
# folders, so the test below is not on unseen data - confirm this is intended.
_ips_gaps=(300,180,120,60,30,1)
_ips_traces=[(name,gap) for name in ('cinebench','cb20') for gap in _ips_gaps]
train_data=[dict(folders=['psvt_at-9_25_45_64-fixed_1','psvt_at-9_25_45_64-greedy_1','psvt_at-9_25_45_64-base_1'],
                 traces=list(_ips_traces))]
test_data =[dict(folders=['psvt_at-9_25_45_64-base_1'],
                 traces=list(_ips_traces))]

trndf=get_data_frame_esif(train_data)
tstdf=get_data_frame_esif(test_data)

### At Idle time

In [None]:
# find ips in idle periods
# ips_n was not defined anywhere above this cell (it only existed as leftover
# kernel state); take it from the training frame loaded in the previous cell.
ips_n=trndf['ips_n'].values
plt.plot(ips_n)

In [None]:
# Idle samples = low instruction rate. Both arrays are taken from the current
# training frame: the p_n left over from the earlier Tj test cell belongs to a
# different frame, so the boolean mask would not line up with it.
ips_n=trndf['ips_n'].values
p_n=trndf['p_n'].values
idle=(ips_n<1e8)
plt.plot(p_n[idle])

In [None]:
# train on the idle samples only
# Define every input from the training frame so the cell runs on a fresh kernel
# (p_nm1 / ips_nm1 were otherwise first assigned in a later cell).
p_n=trndf['p_n'].values
p_nm1=trndf['p_nm1'].values
ips_n=trndf['ips_n'].values
ips_nm1=trndf['ips_nm1'].values
idle=(ips_n<1e8)
# note: the "over all period" cell below re-binds ipsfit2
ipsfit2=LinearRegression().fit(np.array([p_n[idle],p_nm1[idle],ips_nm1[idle]]).T,ips_n[idle])
print(ipsfit2.coef_)
print(ipsfit2.intercept_)

### over all period

In [None]:
# train: linear model of ips[n] on (p[n], p[n-1], ips[n-1]) over all samples
p_n,p_nm1,ips_n,ips_nm1=(trndf[col].to_numpy() for col in ('p_n','p_nm1','ips_n','ips_nm1'))
ipsfit2=LinearRegression().fit(np.vstack((p_n,p_nm1,ips_nm1)).T,ips_n)
print(ipsfit2.coef_,ipsfit2.intercept_,sep='\n')

In [None]:
# test
p_n=tstdf['p_n'].values
p_nm1=tstdf['p_nm1'].values
ips_n=tstdf['ips_n'].values
ips_nm1=tstdf['ips_nm1'].values

# Predict with the ips model. The original called tsfit2 here, i.e. the Tskin
# model fitted on (tj_n, tj_nm1, tskn_nm1), so the plotted "estimate" was not an
# ips estimate at all.
ipsest2=ipsfit2.predict(np.array([p_n,p_nm1,ips_nm1]).T)
tstdf['ipsest2']=ipsest2
tstdf.loc[:,['ips_n','ipsest2']].plot(figsize=(16,4),grid=True)

## model tmem 
currently from tat

In [None]:
# Train/test data for the Tmem model (read from the TAT logs): cinebench and cb20
# traces at six gaps. To experiment with a subset, shrink the trace list below.
_tmem_gaps=(300,180,120,60,30,1)
_tmem_traces=[(name,gap) for name in ('cinebench','cb20') for gap in _tmem_gaps]
train_data=[dict(folders=['psvt_at-9_25_45_64-fixed_1','psvt_at-9_25_45_64-greedy_1'],
                 traces=list(_tmem_traces))]
test_data =[dict(folders=['psvt_at-9_25_45_64-base_1'],
                 traces=list(_tmem_traces))]

trndf=get_data_frame_tat(train_data)
tstdf=get_data_frame_tat(test_data)

In [None]:
# train: linear model of tmem[n] on (tj[n], tj[n-1], tmem[n-1])
tj_n,tj_nm1,p_n,p_nm1=(trndf[col].to_numpy() for col in ('tj_n','tj_nm1','p_n','p_nm1'))
tmem_n,tmem_nm1=(trndf[col].to_numpy() for col in ('tm_n','tm_nm1'))
tmfit2=LinearRegression().fit(np.vstack((tj_n,tj_nm1,tmem_nm1)).T,tmem_n)
# alternative feature set, driven by power instead of tj: (p_n, p_nm1, tmem_nm1)
print(tmfit2.coef_,tmfit2.intercept_,sep='\n')

In [None]:
# test: predict tmem on the test frame and plot it against the measurement
tj_n,tj_nm1,p_n,p_nm1=(tstdf[col].to_numpy() for col in ('tj_n','tj_nm1','p_n','p_nm1'))
tmem_n,tmem_nm1=(tstdf[col].to_numpy() for col in ('tm_n','tm_nm1'))
tmest2=tmfit2.predict(np.vstack((tj_n,tj_nm1,tmem_nm1)).T)
# alternative feature set, driven by power instead of tj: (p_n, p_nm1, tmem_nm1)
tstdf['tmest2']=tmest2
tstdf[['tm_n','tmest2']].plot(figsize=(16,4),grid=True)

# Feature Extraction
In this section we run the simulator to get statistics over the features 

In [None]:
# show the current module search path
sys.path

In [None]:
# project root = two directory levels above the current working directory
proj_root=os.path.dirname(os.path.dirname(os.getcwd()))

In [None]:
# Put the project root first on the module search path (the `train` imports in
# the next cell follow this). Guarded so that re-running the cell does not stack
# duplicate entries.
if proj_root not in sys.path:
    sys.path.insert(0,proj_root)

In [None]:
from train.custom_envs import DTTEnvSim,PLATFORMS
from train.dttsim_wrappers import DTTStateRewardWrapper



In [None]:
# pick the 'Scarlet' entry of the PLATFORMS mapping and show its parameters
platform = PLATFORMS['Scarlet']
print(platform.params)

In [None]:
def _interleaved_workload(gaps=(60, 45, 30, 15), benchmarks=('cb15', 'cb20'), repeats=10, rest=150):
    """Build the workload list consumed by the simulator.

    For every gap (in order) and every benchmark (in order) a segment of
    `repeats` x [benchmark, 'cooldown' * gap] is emitted; consecutive segments
    are separated by `rest` 'cooldown' entries (none after the last segment).
    """
    schedule = []
    for gap in gaps:
        for bench in benchmarks:
            if schedule:
                schedule.extend(['cooldown'] * rest)
            schedule.extend(repeats * ([bench] + ['cooldown'] * gap))
    return schedule

workload_params = _interleaved_workload()

In [None]:
# simulator environment for the chosen platform and workload; observations are
# not normalised and the log output path is the current working directory
env = DTTEnvSim(platform, workload_params=workload_params, norm_obs=False,log_output=os.getcwd())

In [None]:
# wrap the simulator with the state/reward wrapper (5 frames, 17 features)
wenv=DTTStateRewardWrapper(env,n_frames=5,n_features=17)

# Reward
In this section we'll try to find a model to predict the score.
We'll develop a simple model per benchmark that is based on aggregated features (statistics gathered throughout the benchmark execution).   

Relevant features:
 - IPS mean
 - IPS stdev
 - Clip Reason events histogram (how many occurrences of each reason will be a distinct feature)
 - some information about the turbo budget
     - % time below threshold - this will probably be more informative
     - average turbo budget level? This will probably not tell us much.

Let's start with that and develop a model for cb15 and cb20.  

first, start with common tools
 

In [None]:
def get_score_tid(xldf,tt):
    """Look up the benchmark run that was active at time `tt`.

    A run is active when start_time < tt < end_time (both bounds exclusive).
    Returns (score, trace_id) of the first matching row of `xldf`, where
    trace_id is '<trace_name>_<num_of_sec_between>_<start_time>'. When no run
    covers `tt` the result is (0, '_idle').
    """
    running = (xldf['start_time']<tt) & (xldf['end_time']>tt)
    hits = xldf.loc[running,['score','trace_name','num_of_sec_between','start_time']].values
    if len(hits)==0:
        return 0,'_idle'
    run_score,run_name,run_gap,run_start = hits[0]
    return run_score,run_name+'_'+str(run_gap)+'_'+str(run_start)

In [None]:
def get_data_for_reward(data_filters):
    """Collect per-sample data for the score (reward) model, selected by file name.

    Parameters
    ----------
    data_filters : list of dict
        Each dict has the keys 'folders' (experiment folder names under
        ``data_path``) and 'traces' (``(trace_name, num_of_sec_between)`` tuples),
        e.g. {'folders':['rl_rnd_64_3','rl_rnd_8'],'traces':[('cinebench',120),('cinebench',30)]}

    Returns
    -------
    pandas.DataFrame
        Columns: ts, tid (trace id from get_score_tid), power, pl1, pl2,
        clip (label-encoded low 16 bits of 'IA Clip', multiplied by 10),
        ips (clipped at its 0.999 quantile), score, and clip_15 ... clip_0
        (one column per bit of the low 16 bits of 'IA Clip').
        The frame is empty (with these columns) when nothing matched.

    Raises
    ------
    IndexError
        If a folder holds no '*_esif.csv' or no '*.xlsx' file.
    """
    # pd.datetime was removed from pandas; use the datetime class directly
    tslog_parser = lambda x: datetime.strptime(x, '%B%d %H:%M:%S')
    ips_cols=['cpu{}_inst_delta'.format(i) for i in range(8)]
    base_cols=['ts','tid','power','pl1','pl2','clip','ips','score']
    ccn=['clip_{}'.format(i) for i in reversed(range(16))]
    # per-file frames are gathered here and concatenated once at the end
    # (DataFrame.append is quadratic in a loop and no longer exists in pandas 2)
    frames=[]
    for filt in data_filters:
        for folder in filt['folders']:   # folder = 'rl_rnd_64_3'
            fpath=os.path.join(data_path,folder)
            # extract file names
            esif_file=find('*_esif.csv',fpath)[0]
            score_file=find('*.xlsx',fpath)[0]
            # read excel (score)
            xldf = pd.read_excel(score_file,  parse_dates = ['start_time', 'end_time'], date_parser = tslog_parser)
            print('='*30,'analyzing esif',esif_file,'='*30)
            # read esif file
            esif_df=pd.read_csv(esif_file)
            esif_df['timestamp']=pd.to_datetime(esif_df['timestamp'])
            # re-bind the sorted frame: sort_values(...).reset_index(inplace=True)
            # only touched a temporary and left esif_df unsorted
            esif_df=esif_df.sort_values(by='timestamp').reset_index(drop=True)
            # scale the PL1/PL2 columns by 1/1000 (presumably mW -> W; confirm against the logger)
            esif_df[['MMIO_PL1','MMIO_PL2']] = esif_df[['MMIO_PL1','MMIO_PL2']]/1000
            # turbo budget calc (kept on the per-file frame; not part of the returned columns)
            esif_df['ewma']=(esif_df['MMIO_PL1'] - esif_df['POWER']).ewm(com=27.5, adjust=False).mean()
            # Score and trace id of the run active at each timestamp. Assigned
            # column by column: packing (number, string) pairs into one np.array
            # turned the scores into strings.
            labels=[get_score_tid(xldf,tt) for tt in esif_df['timestamp']]
            esif_df['score']=[sc for sc,_ in labels]
            esif_df['trace_name']=[tn for _,tn in labels]
            # calc ips stats
            esif_df['ips']=esif_df[ips_cols].mean(axis=1)
            # extract the data from the esif according to the 'traces'
            for trace in filt['traces']:   # e.g. trace = ('cinebench',120)
                # find the rows of the 1st run of this filter. there might be more than one
                print('-'*20,'analyzing trace',trace,'-'*20)
                xlf=xldf[((xldf.trace_name==trace[0]) & (xldf.num_of_sec_between==trace[1]) & (xldf.Run_number==1))]
                # Note : the file name includes the time it was opened whereas the start time in the xlsx is where the benchmark actually started to run
                # we open the file and then wait the 5 minutes between runs so to get the prefix of filename we have to reduce 5 min from the benchmark start time
                # e.g. ['cinebench_23', 'cinebench_08']; de-duplicated (order kept) so that
                # two runs sharing a prefix are not collected twice
                fn_prefix = list(dict.fromkeys(trace[0]+'_'+(x-timedelta(minutes=5)).strftime('%H') for x in xlf['start_time']))
                for fnp in fn_prefix:
                    # compute the row selection once instead of once per column
                    sel=esif_df.loc[esif_df['File_name'].str.contains(fnp,regex=False,na=False)]
                    print('folder',folder,' trace',trace,' file prefix',fnp,'found {} samples'.format(len(sel)))
                    if len(sel):
                        frames.append(pd.DataFrame({'ts':sel['timestamp'].values,'tid':sel['trace_name'].values,
                                                    'power':sel['POWER'].values,'pl1':sel['MMIO_PL1'].values,
                                                    'pl2':sel['MMIO_PL2'].values,'clip':sel['IA Clip'].values,
                                                    'ips':sel['ips'].values,'score':sel['score'].values.astype(float)}))
            print('='*30,'done with esif',folder,'='*30)

    ############## post processing for the whole data frame ##############
    # Runs once, after ALL filters. It used to be indented inside the `for filt`
    # loop, so the function returned after the first filter.
    if not frames:
        print('total samples:',0)
        print('clip code', {})
        return pd.DataFrame(columns=base_cols+ccn)
    data_df=pd.concat(frames,ignore_index=True)
    # clean the ips: cap outliers at the 0.999 quantile
    data_df['ips']=data_df['ips'].clip(upper=data_df['ips'].quantile(0.999))
    # one column per bit of the low 16 bits of IA Clip: clip_<i> holds bit i
    clip16=data_df['clip'].astype('int64') & 0xffff
    for bit in reversed(range(16)):
        data_df['clip_{}'.format(bit)]=(clip16 >> bit) & 1
    # for easy drawing, label encode the clip:
    le=preprocessing.LabelEncoder()
    data_df['clip'] = 10* le.fit_transform(clip16)
    ccd={10*code:hex(int(value)) for code,value in enumerate(le.classes_)}
    print('total samples:',len(data_df))
    print('clip code', ccd)
    return data_df

In [None]:
### testing ###
folder_name='psvt_at-9_25_45_64-fixed_1'
folder_name=os.path.join(data_path,folder_name)
# folder_name already contains data_path; joining it onto data_path a second time
# only worked because data_path is absolute (os.path.join then drops the first part).
esif_file=find('*_esif.csv',folder_name)[0]
tat_file=find('*_TAT.csv',folder_name)[0]
score_file=find('*.xlsx',folder_name)[0]

esif_df=pd.read_csv(esif_file)
esif_df['timestamp']=pd.to_datetime(esif_df['timestamp'])
esif_df=esif_df.sort_values(by= 'timestamp')
# keeps the pre-sort row labels in a new 'index' column
esif_df.reset_index(inplace=True)

tslog_parser = lambda x: datetime.strptime(x, '%B%d %H:%M:%S')
xldf = pd.read_excel(score_file,  parse_dates = ['start_time', 'end_time'], date_parser = tslog_parser)
esif_df.columns

# tt=esif_df.loc[29800,'timestamp']
# get_trace_score_start(tt)

In [None]:
# label every esif row with the score and trace id of the run active at its timestamp
tid_score_extract=partial(get_score_tid,xldf)
esif_df['score']=0
esif_df['trace_name']=''
# NOTE(review): np.array over (score, trace) pairs builds a single-dtype array,
# so the scores are presumably stored as strings here - confirm before using them numerically
esif_df.loc[:,['score','trace_name']]=np.array([[a,b] for a,b in esif_df['timestamp'].apply(tid_score_extract)])


In [None]:
# esif_df['trace_name'].value_counts()

In [None]:
# esif_df['trace_name'].apply(lambda x: x.startswith('cb20_1')).sum()

In [None]:
# tt=esif_df.loc[795,'timestamp']
# sc,_=get_score_tid(xldf,tt)

In [None]:
# load the TAT log of the same folder and list its columns
tat_df=pd.read_csv(tat_file)
tat_df.columns

## train the model

In [None]:
# cb20 split for the score model.
# Note: the cb15 cell below assigns the same names; whichever cell ran last wins.
sec_between=[300,240,180,120,90,60,30,1]
traces=[('cb20',gap) for gap in sec_between]
train_data=[dict(folders=['rl_rnd_64_3','fixed_25_64','psvt_at-9_25_45_64-base_1','psvt_at-9_25_45_64-fixed_1'],
                 traces=list(traces))]
test_data =[dict(folders=['rl_rnd_64','rl_rnd_7','rl_rnd_8','psvt_at-9_25_45_64-greedy_1'],
                 traces=list(traces))]

In [None]:
# cb15 split for the score model (the trace name of cb15 runs is 'cinebench').
# Note: the cb20 cell above assigns the same names; whichever cell ran last wins.
sec_between=[300,240,180,120,90,60,30,1]
traces=[('cinebench',gap) for gap in sec_between]
train_data=[dict(folders=['rl_rnd_64_3','rl_rnd_64_5','rl_rnd_64_6','fixed_25_64','psvt_at-9_25_45_64-base_1','psvt_at-9_25_45_64-fixed_1'],
                 traces=list(traces))]
test_data =[dict(folders=['rl_rnd_64','rl_rnd_7','rl_rnd_8','psvt_at-9_25_45_64-greedy_1'],
                 traces=list(traces))]

In [None]:
# get the train data
trndf=get_data_for_reward(train_data)
# group the samples by trace id and list the ids that were found
gbtr=trndf.groupby('tid')
tid_groups=list(gbtr.groups.keys())
tid_groups
# tstdf=get_data_for_reward(test_data)

In [None]:
# to explore specific signal
# (takes the second trace id of the list; change the position to look at another one)
g0=gbtr.get_group(tid_groups[1])
g0[['power','pl1','clip']].plot(figsize=(16,4),grid=True)

In [None]:
# per-trace-id mean of the columns; rows whose mean score is 0 are left out of the scatter
fdf=gbtr.mean()
fdf.loc[fdf['score']!=0,:].plot.scatter('ips','score')

In [None]:
# train: linear model of the mean score on the mean ips, per trace id (zero-score rows excluded)
sc_vs_ips = fdf.loc[fdf['score'].ne(0),['ips','score']]
scfit=LinearRegression().fit(sc_vs_ips[['ips']].to_numpy(),sc_vs_ips['score'].to_numpy())
print(scfit.coef_,scfit.intercept_,sep='\n')

## test the model

In [None]:
# same as for the train data: load the test frame, group by trace id, list the ids
tstdf=get_data_for_reward(test_data)
gbtst=tstdf.groupby('tid')
tidg=list(gbtst.groups.keys())
tidg


In [None]:
# draw some signal
# (takes the second trace id of the test list)
g0=gbtst.get_group(tidg[1])
g0[['power','pl1','clip']].plot(figsize=(16,4),grid=True)

In [None]:
# per-trace-id means of the test frame, keeping only rows with a non-zero mean score
fdf_tst=gbtst.mean()
sc_vs_ips_tst = fdf_tst.loc[fdf_tst['score']!=0,['ips','score']]

In [None]:
# predict the score from the mean ips and overlay measured (red) and estimated (green) scores
scest=scfit.predict(sc_vs_ips_tst[['ips']].to_numpy())
sc_vs_ips_tst['scest']=scest
ax1 = sc_vs_ips_tst.plot.scatter(x='ips', y='score', color='r')
ax2 = sc_vs_ips_tst.plot.scatter(x='ips', y='scest', color='g', ax=ax1)


## Reward Draft

In [None]:
# Variant of get_data_for_reward that selects rows by trace id instead of file name.
# It used to misbehave because the prefix test `startswith('cb20_1')` also matched
# 'cb20_120_...' and 'cb20_180_...'; the prefix now includes the trailing '_'.
def get_data_for_reward2(data_filters):
    """Collect per-sample data for the score (reward) model, selected by trace id.

    Parameters
    ----------
    data_filters : list of dict
        Each dict has the keys 'folders' (experiment folder names under
        ``data_path``) and 'traces' (``(trace_name, num_of_sec_between)`` tuples),
        e.g. {'folders':['rl_rnd_64_3','rl_rnd_8'],'traces':[('cinebench',120),('cinebench',30)]}

    Returns
    -------
    pandas.DataFrame
        Columns: ts, tid (trace id from get_score_tid), power, pl1, pl2,
        clip (label-encoded low 16 bits of 'IA Clip', multiplied by 10),
        ips (clipped at its 0.999 quantile), score, and clip_15 ... clip_0
        (one column per bit of the low 16 bits of 'IA Clip').
        The frame is empty (with these columns) when nothing matched.

    Raises
    ------
    IndexError
        If a folder holds no '*_esif.csv' or no '*.xlsx' file.
    """
    # pd.datetime was removed from pandas; use the datetime class directly
    tslog_parser = lambda x: datetime.strptime(x, '%B%d %H:%M:%S')
    ips_cols=['cpu{}_inst_delta'.format(i) for i in range(8)]
    base_cols=['ts','tid','power','pl1','pl2','clip','ips','score']
    ccn=['clip_{}'.format(i) for i in reversed(range(16))]
    # per-trace frames are gathered here and concatenated once at the end
    # (DataFrame.append is quadratic in a loop and no longer exists in pandas 2)
    frames=[]
    for filt in data_filters:
        for folder in filt['folders']:   # folder = 'rl_rnd_64_3'
            fpath=os.path.join(data_path,folder)
            # extract file names
            esif_file=find('*_esif.csv',fpath)[0]
            score_file=find('*.xlsx',fpath)[0]
            # read excel (score)
            xldf = pd.read_excel(score_file,  parse_dates = ['start_time', 'end_time'], date_parser = tslog_parser)
            print('='*30,'analyzing esif',esif_file,'='*30)
            # read esif file
            esif_df=pd.read_csv(esif_file)
            esif_df['timestamp']=pd.to_datetime(esif_df['timestamp'])
            # re-bind the sorted frame: sort_values(...).reset_index(inplace=True)
            # only touched a temporary and left esif_df unsorted
            esif_df=esif_df.sort_values(by='timestamp').reset_index(drop=True)
            # scale the PL1/PL2 columns by 1/1000 (presumably mW -> W; confirm against the logger)
            esif_df[['MMIO_PL1','MMIO_PL2']] = esif_df[['MMIO_PL1','MMIO_PL2']]/1000
            # turbo budget calc (kept on the per-file frame; not part of the returned columns)
            esif_df['ewma']=(esif_df['MMIO_PL1'] - esif_df['POWER']).ewm(com=27.5, adjust=False).mean()
            # Score and trace id of the run active at each timestamp. Assigned
            # column by column: packing (number, string) pairs into one np.array
            # turned the scores into strings.
            labels=[get_score_tid(xldf,tt) for tt in esif_df['timestamp']]
            esif_df['score']=[sc for sc,_ in labels]
            esif_df['trace_name']=[tn for _,tn in labels]
            # calc ips stats
            esif_df['ips']=esif_df[ips_cols].mean(axis=1)
            # extract the data from the esif according to the 'traces'
            for trace in filt['traces']:   # e.g. trace = ('cinebench',120)
                print('-'*20,'analyzing trace',trace,'-'*20)
                tn = trace[0]+'_'+str(trace[1])    # e.g. 'cinebench_120'
                # trace ids look like '<name>_<gap>_<start time>'; match '<name>_<gap>_' so
                # that gap 1 does not also pick up gaps 120, 180, ...
                sel=esif_df.loc[esif_df['trace_name'].str.startswith(tn+'_')]
                print('folder',folder,' trace',trace,' file prefix',tn,'found {} samples'.format(len(sel)))
                if len(sel):
                    frames.append(pd.DataFrame({'ts':sel['timestamp'].values,'tid':sel['trace_name'].values,
                                                'power':sel['POWER'].values,'pl1':sel['MMIO_PL1'].values,
                                                'pl2':sel['MMIO_PL2'].values,'clip':sel['IA Clip'].values,
                                                'ips':sel['ips'].values,'score':sel['score'].values.astype(float)}))
            print('='*30,'done with esif',folder,'='*30)

    ############## post processing for the whole data frame ##############
    # Runs once, after ALL filters. It used to be indented inside the `for filt`
    # loop, so the function returned after the first filter.
    if not frames:
        print('total samples:',0)
        print('clip code', {})
        return pd.DataFrame(columns=base_cols+ccn)
    data_df=pd.concat(frames,ignore_index=True)
    # clean the ips: cap outliers at the 0.999 quantile
    data_df['ips']=data_df['ips'].clip(upper=data_df['ips'].quantile(0.999))
    # one column per bit of the low 16 bits of IA Clip: clip_<i> holds bit i
    clip16=data_df['clip'].astype('int64') & 0xffff
    for bit in reversed(range(16)):
        data_df['clip_{}'.format(bit)]=(clip16 >> bit) & 1
    # for easy drawing, label encode the clip:
    le=preprocessing.LabelEncoder()
    data_df['clip'] = 10* le.fit_transform(clip16)
    ccd={10*code:hex(int(value)) for code,value in enumerate(le.classes_)}
    print('total samples:',len(data_df))
    print('clip code', ccd)
    return data_df

In [None]:
# load the test data with the trace-id based variant
tstdf2=get_data_for_reward2(test_data)

In [None]:
# number of samples collected per trace id
tstdf2['tid'].value_counts()

In [None]:
# group the variant's samples by trace id and list the ids
gbtst2=tstdf2.groupby('tid')
tidg2=list(gbtst2.groups.keys())
# show the ids of THIS grouping; the original displayed `tidg`, which is the
# list built earlier from get_data_for_reward
tidg2

In [None]:
# plot the signals of the second trace id of the variant's grouping
g2=gbtst2.get_group(tidg2[1])
g2[['power','pl1','clip']].plot(figsize=(16,4),grid=True)

# Evaluate Fidelity

# Parser
Avishai's code

In [None]:
import glob
import seaborn as sns
from datetime import datetime

# ---------------------------------------------------------------------------
# Timestamp parsers.
# `pd.datetime` was only an alias of `datetime.datetime`; it was deprecated in
# pandas 1.0 and later removed, so the standard-library class is used directly.
# ---------------------------------------------------------------------------
df_parser = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
# NOTE(review): this format carries no year, so strptime() yields year 1900 for
# string inputs. Confirm the start/end times are comparable with the ESIF
# 'timestamp' column used for the run matching below.
tslog_parser = lambda x: datetime.strptime(x, '%B%d %H:%M:%S')


def _read_esif(experiment):
    """Load `<data_path>/<experiment>/etl_output_esif.csv` with 'timestamp' parsed as datetime."""
    return pd.read_csv(os.path.join(data_path, experiment, 'etl_output_esif.csv'),
                       parse_dates=['timestamp'], date_parser=df_parser)


# ---------------------------------------------------------------------------
# ESIF samples of the four experiments, merged and ordered by time.
# ---------------------------------------------------------------------------
df1, df2, df3, df4 = (_read_esif(exp) for exp in ('exp1', 'exp2', 'exp3', 'exp4'))
df = pd.concat([df1, df2, df3, df4]).sort_values(by='timestamp')

# Per-sample change of every cpuN column (first row has no predecessor -> 0).
cpu_counters = ['cpu{}'.format(n) for n in range(8)]
list_of_cpus = [counter + '_delta' for counter in cpu_counters]
for counter, delta in zip(cpu_counters, list_of_cpus):
    df[delta] = df[counter].diff(1).fillna(0)

# Keep only samples where *all* eight deltas are strictly positive.
# `.copy()` detaches the filtered frame so the column assignments below do not
# raise SettingWithCopyWarning.
df = df[(df[list_of_cpus] > 0).all(axis=1)].copy()

# Row-wise aggregates of the deltas (vectorised instead of a Python-level apply).
df['cpu_avg'] = df[list_of_cpus].mean(axis=1)
df['cpu_max'] = df[list_of_cpus].max(axis=1)

# ---------------------------------------------------------------------------
# Score workbooks: one DTT1*.xlsx per experiment folder.
# Sorted so the row order of df_tslog does not depend on directory listing order.
# ---------------------------------------------------------------------------
score_data_path = data_path+'/*/DTT1*.xlsx'
all_ts_logs_path = sorted(glob.iglob(score_data_path, recursive=True))
df_tslog = pd.concat([
    pd.read_excel(path, parse_dates=['start_time', 'end_time'], date_parser=tslog_parser)
    for path in all_ts_logs_path
])

# ---------------------------------------------------------------------------
# For every logged run: tag the ESIF samples inside [start_time, end_time] and
# collect one summary record (score, mean cpu_avg, max tskin, ...).
# ---------------------------------------------------------------------------
summary_columns = ['score', 'ips_avg', 'trace_name', 'num_of_sec_between', 'pl1', 'tskin']
run_records = []
for run in df_tslog.itertuples(index=True):
    find_run_index = (df.timestamp >= run.start_time) & (df.timestamp <= run.end_time)
    run_label = run.trace_name + '_' + str(run.Index) + '_' + str(run.bursty_pl2)
    df.loc[find_run_index, 'File_name'] = run_label
    # Select the run's samples with the time mask itself. df_tslog is a concat of
    # several workbooks without ignore_index, so its index values repeat and the
    # label above is not guaranteed to be unique; filtering by File_name could
    # therefore pull in samples that belong to another run.
    data_temp = df.loc[find_run_index].reset_index(drop=True)
    run_records.append({
        'score': run.score,
        'ips_avg': data_temp['cpu_avg'].mean(),
        'trace_name': run.trace_name,
        'num_of_sec_between': run.num_of_sec_between,
        'pl1': run.bursty_pl2,  # column is named 'pl1' but holds the workbook's 'bursty_pl2' value
        'tskin': data_temp['tskin'].max(),
    })

score_and_ips = pd.DataFrame(run_records, columns=summary_columns)

# Small integer code of the power-limit value, used as the `hue` of the scatter plots.
# A single replace() avoids chained assignment (`frame[col][mask] = v`), which emits
# SettingWithCopyWarning and does not write through under copy-on-write.
score_and_ips['pl_sns'] = score_and_ips['pl1'].replace({64000: 4, 60000: 3, 44000: 2, 24000: 1})


def _select_runs(trace_name, num_of_sec_between=300):
    """Rows of `score_and_ips` for one trace name and one `num_of_sec_between` value."""
    return score_and_ips[(score_and_ips.trace_name == trace_name)
                         & (score_and_ips.num_of_sec_between == num_of_sec_between)]


score_cinebench = _select_runs('cinebench')
score_cinebench1 = _select_runs('cinebench', num_of_sec_between=1)
score_DCC = _select_runs('pcmark10_DCC')
score_pcmark10_essentials = _select_runs('pcmark10_essentials')
score_pcmark10_gaming = _select_runs('pcmark10_gaming')
score_pcmark10_productivity = _select_runs('pcmark10_productivity')

In [None]:
# Score vs. mean cpu_avg for the cinebench runs, coloured by the power-limit code.
# A fresh figure is created explicitly: under the `%matplotlib notebook` backend the
# previously drawn figure stays active, so a bare sns.scatterplot() would draw onto it.
fig, ax = plt.subplots()
sns.scatterplot(data=score_cinebench, x='score', y='ips_avg', hue='pl_sns', ax=ax)
ax.set_title('cinebench (num_of_sec_between = 300)');

In [None]:
# Score vs. mean cpu_avg for the cinebench runs with a 1-second gap, coloured by the power-limit code.
# A fresh figure is created explicitly: under the `%matplotlib notebook` backend the
# previously drawn figure stays active, so a bare sns.scatterplot() would draw onto it.
fig, ax = plt.subplots()
sns.scatterplot(data=score_cinebench1, x='score', y='ips_avg', hue='pl_sns', ax=ax)
ax.set_title('cinebench (num_of_sec_between = 1)');

In [None]:
# Score vs. mean cpu_avg for the pcmark10_DCC runs, coloured by the power-limit code.
# A fresh figure is created explicitly: under the `%matplotlib notebook` backend the
# previously drawn figure stays active, so a bare sns.scatterplot() would draw onto it.
fig, ax = plt.subplots()
sns.scatterplot(data=score_DCC, x='score', y='ips_avg', hue='pl_sns', ax=ax)
ax.set_title('pcmark10_DCC (num_of_sec_between = 300)');

In [None]:
# Score vs. mean cpu_avg for the pcmark10_essentials runs, coloured by the power-limit code.
# A fresh figure is created explicitly: under the `%matplotlib notebook` backend the
# previously drawn figure stays active, so a bare sns.scatterplot() would draw onto it.
fig, ax = plt.subplots()
sns.scatterplot(data=score_pcmark10_essentials, x='score', y='ips_avg', hue='pl_sns', ax=ax)
ax.set_title('pcmark10_essentials (num_of_sec_between = 300)');

In [None]:
# Score vs. mean cpu_avg for the pcmark10_gaming runs, coloured by the power-limit code.
# A fresh figure is created explicitly: under the `%matplotlib notebook` backend the
# previously drawn figure stays active, so a bare sns.scatterplot() would draw onto it.
fig, ax = plt.subplots()
sns.scatterplot(data=score_pcmark10_gaming, x='score', y='ips_avg', hue='pl_sns', ax=ax)
ax.set_title('pcmark10_gaming (num_of_sec_between = 300)');

In [None]:
# Score vs. mean cpu_avg for the pcmark10_productivity runs, coloured by the power-limit code.
# A fresh figure is created explicitly: under the `%matplotlib notebook` backend the
# previously drawn figure stays active, so a bare sns.scatterplot() would draw onto it.
fig, ax = plt.subplots()
sns.scatterplot(data=score_pcmark10_productivity, x='score', y='ips_avg', hue='pl_sns', ax=ax)
ax.set_title('pcmark10_productivity (num_of_sec_between = 300)');