In [1]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier

import seaborn as sns
import matplotlib.pyplot as plt

## Process data

In [2]:
# Sanity check via the shell: list the feature/runtime CSVs in both data
# directories (`ls` prints an error for any path that does not exist).
# Only the tiled_polybench_lin_alg pair is actually loaded further down.
!ls ../tiled_polybench/features.csv
!ls ../tiled_polybench/runtimes.csv
!ls ../tiled_polybench_lin_alg/features.csv
!ls ../tiled_polybench_lin_alg/runtimes.csv

../tiled_polybench/features.csv
../tiled_polybench/runtimes.csv
../tiled_polybench_lin_alg/features.csv
../tiled_polybench_lin_alg/runtimes.csv


In [3]:
# Resolve the two input CSV locations (expanduser only matters if a path
# starts with "~"; these relative paths pass through unchanged).
features_path, runtimes_path = (
    os.path.expanduser(csv_path)
    for csv_path in (
        "../tiled_polybench_lin_alg/features.csv",
        "../tiled_polybench_lin_alg/runtimes.csv",
    )
)

# Load both tables, then print a schema summary: runtimes first, features second.
features_df, runtimes_df = pd.read_csv(features_path), pd.read_csv(runtimes_path)
for frame in (runtimes_df, features_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1512 entries, 0 to 1511
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   uniqueFilename  1512 non-null   object 
 1   run1            1512 non-null   float64
 2   run2            1512 non-null   float64
 3   run3            1512 non-null   float64
 4   run4            1512 non-null   float64
dtypes: float64(4), object(1)
memory usage: 59.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1680 entries, 0 to 1679
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   uniqueFilename        1680 non-null   object
 1   rootFilename          1680 non-null   object
 2   tileSize              1680 non-null   int64 
 3   readInvariant         1680 non-null   int64 
 4   readPrefetched        1680 non-null   int64 
 5   readNonPrefetched     1680 non-null   int64 
 6   writeInvariant

In [4]:
# Clean up some generation artifacts and get average runtimes.
#
# Strip the literal ".out" suffix from the runtime file names.
# str.rstrip('.out') is NOT a suffix strip: it removes any trailing run of the
# characters '.', 'o', 'u', 't' and can therefore eat into the name itself,
# which would break the join on uniqueFilename below.
runtime_names = runtimes_df.uniqueFilename.str.replace(r'\.out$', '', regex=True)
runtimes_df = runtimes_df.assign(uniqueFilename=runtime_names)

# Drop every entry whose name ends in "_0" from both tables.
# na=True keeps the earlier `== False` semantics: rows with a missing name are
# dropped too. The .copy() gives an independent frame so that adding a column
# below does not trigger pandas' SettingWithCopyWarning.
runtimes_df = runtimes_df[~runtimes_df.uniqueFilename.str.endswith("_0", na=True)].copy()
runtimes_df['avgRuntime'] = runtimes_df[['run1', 'run2', 'run3', 'run4']].mean(axis=1)
features_df = features_df[~features_df.uniqueFilename.str.endswith("_0", na=True)]

# Merge the features and average runtimes together (inner join, so programs
# present in only one of the two tables are dropped).
merged_df = features_df.merge(
    runtimes_df[['uniqueFilename', 'avgRuntime']],
    on='uniqueFilename',
)

# Get the fastest tile size for each unique loop: the loop id is the first
# three '_'-separated fields of the file name; after sorting by runtime the
# first row of each group is the one with the lowest average runtime.
merged_df['uniqueLoopId'] = merged_df.uniqueFilename.str.split(pat='_').str[:3].str.join('_')
merged_df = (
    merged_df
    .sort_values(['uniqueLoopId', 'avgRuntime'], ascending=True)
    .groupby('uniqueLoopId')
    .head(1)
)

# Optional (disabled): drop loops whose fastest tileSize is 1.
# merged_df = merged_df[merged_df.tileSize != 1]

merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168 entries, 586 to 856
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   uniqueFilename        168 non-null    object 
 1   rootFilename          168 non-null    object 
 2   tileSize              168 non-null    int64  
 3   readInvariant         168 non-null    int64  
 4   readPrefetched        168 non-null    int64  
 5   readNonPrefetched     168 non-null    int64  
 6   writeInvariant        168 non-null    int64  
 7   writePrefetched       168 non-null    int64  
 8   writeNonPrefetched    168 non-null    int64  
 9   distToDominatingLoop  168 non-null    int64  
 10  avgRuntime            168 non-null    float64
 11  uniqueLoopId          168 non-null    object 
dtypes: float64(1), int64(8), object(3)
memory usage: 17.1+ KB


## Train models

In [5]:
# prep data for training
# Select the model inputs by name instead of by position (previously
# iloc[:, 3:10]) so a change in the CSV column order raises a KeyError rather
# than silently feeding the wrong columns to the models.
FEATURE_COLUMNS = [
    'readInvariant',
    'readPrefetched',
    'readNonPrefetched',
    'writeInvariant',
    'writePrefetched',
    'writeNonPrefetched',
    'distToDominatingLoop',
]
X = merged_df[FEATURE_COLUMNS]
# Features are used unscaled; standardisation was tried and left disabled:
#X = (X-X.mean())/X.std()
y = merged_df.tileSize

# Stratify on the label so each tile size is represented in both splits.
# NOTE(review): test_size=0.9 holds out 90% of the rows for testing, so the
# models are fitted on only ~10% of the data - confirm this is intended and
# not a typo for 0.1.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.9, random_state=1)

In [6]:
# linear regression

# Fit a linear model whose coefficients are constrained to be non-negative
# (positive=True), treating tileSize as a continuous target.
lin_reg = LinearRegression(positive=True)
lin_reg.fit(X_train, y_train)
predictions = lin_reg.predict(X_test)

r2 = r2_score(y_test, predictions)
# RMSE is computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and new releases with the same value.
rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)

The r2 is:  0.0045406819277856725
The rmse is:  87.3723935316799
[65.27272727 45.66666667 64.5        64.5        65.27272727 84.87878788
 65.27272727 65.27272727 45.66666667 65.27272727 65.27272727 65.27272727
 45.66666667 65.27272727 65.27272727 45.66666667 84.87878788 45.66666667
 65.27272727 64.5        64.5        65.27272727 65.27272727 65.27272727
 65.27272727 65.27272727 65.27272727 65.27272727 45.66666667 64.5
 84.87878788 65.27272727 45.66666667 65.27272727 65.27272727 64.5
 65.27272727 65.27272727 65.27272727 65.27272727 65.27272727 45.66666667
 45.66666667 84.87878788 84.87878788 65.27272727 65.27272727 64.5
 45.66666667 45.66666667 64.5        65.27272727 65.27272727 65.27272727
 65.27272727 45.66666667 65.27272727 45.66666667 84.87878788 65.27272727
 45.66666667 45.66666667 65.27272727 65.27272727 45.66666667 65.27272727
 64.5        84.87878788 65.27272727 83.33333333 45.66666667 64.5
 65.27272727 65.27272727 45.66666667 65.27272727 45.66666667 65.27272727
 65.27272727 6

In [7]:
# random forest

# Build a 500-tree forest with a fixed seed and train it in one expression
# (`fit` returns the estimator itself), treating tileSize as a class label.
forest = RandomForestClassifier(random_state=888, n_estimators=500).fit(X_train, y_train)
predictions = forest.predict(X_test)

# Fraction of held-out loops whose predicted tile size matches exactly,
# followed by the raw predicted labels.
forest_accuracy = metrics.accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", forest_accuracy)
print(predictions)

Accuracy: 0.21710526315789475
[ 64   8 128 128 256  64  64   4   1  64   4 256   8  16  64   8  64   1
   4   1   1  16   4 256   1  64 256   4 256 128 256  64   1  16  64 128
  64 256 256  64  64   1   1 256 256  64 256 128   1   1   1  64 256  16
 256   1   4 128 256 256 256   1  16  64   1 256 128  64  16 128   1   1
   4  16   8   4   1 256   4 256 128  64 256   1  64   1  64  64  64   1
 256   8   1  16   1 128 256 256   1  64  16  64 256  16 128 256 256 128
  16  64 256   1  64 128   1 256 256  64   1 128   4  64  16 128   1   1
   4 256   1 128 128   1   8 256 128  64 128  64 128   1   1 256 256 256
  64 256   1   1 256   8   8  16]


In [8]:
# support vector machine regression

# Fit an SVR with scikit-learn's default settings on the (unscaled) features,
# treating tileSize as a continuous target.
support = svm.SVR()
support.fit(X_train, y_train)
predictions = support.predict(X_test)

r2 = r2_score(y_test, predictions)
# RMSE is computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and new releases with the same value.
rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)

The r2 is:  -0.2907023987878641
The rmse is:  99.48910722437253
[16.94437759 15.20942326 15.87397931 15.87397931 16.34135078 16.53863211
 16.94437759 14.97219919 15.96270413 16.94437759 14.97219919 16.34135078
 15.22145134 15.99999998 16.94437759 15.13044379 16.53863211 15.96270413
 14.97219919 14.8413059  14.8413059  15.69579947 14.97219919 16.34135078
 15.00296343 16.94437759 16.34135078 14.97219919 15.06562611 15.56221343
 16.1287189  16.94437759 15.83462052 15.99999998 16.94437759 15.71203664
 16.94437759 16.34135078 16.34135078 16.94437759 16.94437759 15.00296343
 15.83462052 16.1287189  16.1287189  16.94437759 16.34135078 15.56221343
 14.64453462 15.83462052 14.8413059  16.94437759 16.34135078 15.99999998
 16.34135078 14.89740049 14.97219919 15.86727969 16.1287189  16.34135078
 15.58284157 15.30798389 15.99999998 16.94437759 15.00296343 16.34135078
 15.87397931 16.53863211 15.99999998 15.40842411 15.83462052 14.88291833
 14.97219919 15.99999998 15.19362066 14.97219919 14.64453462

In [9]:
# support vector machine classifier

# Fit an SVC with scikit-learn's default settings on the (unscaled) features,
# treating tileSize as a class label.
support = svm.SVC()
support.fit(X_train, y_train)
predictions = support.predict(X_test)

# Only accuracy is reported for the classifier. The r2/rmse computations that
# used to sit here were copy-paste leftovers from the regression cell: their
# results were never used, and the `squared=False` call fails on
# scikit-learn >= 1.6.
print("Accuracy:", metrics.accuracy_score(y_test, predictions))
print(predictions)

Accuracy: 0.2565789473684211
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 8 1 1]


In [14]:
# multi-layer perceptron

# One hidden layer of 20 units, a fixed seed, and a generous iteration cap;
# construction and training are chained because `fit` returns the estimator.
mlp = MLPClassifier(
    hidden_layer_sizes=(20,),
    max_iter=10000,
    random_state=1,
).fit(X_train, y_train)
predictions = mlp.predict(X_test)

# Mean accuracy on the held-out split, followed by the raw predicted labels.
mlp_accuracy = mlp.score(X_test, y_test)
print("Accuracy:", mlp_accuracy)
print(predictions)

Accuracy: 0.2236842105263158
[ 64   8 128 128 256 256  64   8   1  64   8 256   8  16  64   8 256   1
   8   1   1  16   8 256   8  64 256   8   8 128 256  64 128  16  64 128
  64 256 256  64  64   1 128 256 256  64 256 128   1 128   1  64 256  16
 256   1   8 128 256 256   1   1  16  64   1 256 128 256  16   1 128   1
   8  16   8   8   1 256   8 256   1  64 256   1 256   1  64 256  64   1
 256   8   1  16   1 128   8 256   1  64  16  64 256   1 128 256 256 128
  16  64 256   1 256   1   1 256 256  64   1 128   8 256  16 128   1   1
   8 256   1 128 128   1   8 256   1  64   1  64   1   8   1 256   1   8
  64 256   1   1 256 128   1  16]
