In [1]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

import pickle

import seaborn as sns
import matplotlib.pyplot as plt

## Process data

In [2]:
def process_data(features_path, runtimes_path):
    """Build the training table: the fastest tile size for each unique loop.

    Reads the feature table and the runtime table, averages the four timed
    runs, joins both tables on ``uniqueFilename`` and keeps, for every loop,
    only the row with the lowest average runtime.

    Parameters
    ----------
    features_path : str or path-like
        CSV with at least the columns ``uniqueFilename``, ``tileSize`` and
        ``distToDominatingLoop``.
    runtimes_path : str or path-like
        CSV with the columns ``uniqueFilename``, ``run1``, ``run2``, ``run3``
        and ``run4``. A trailing ``.out`` on ``uniqueFilename`` is removed
        before joining.

    Returns
    -------
    pandas.DataFrame
        One row per ``uniqueLoopId`` (the first three ``_``-separated fields of
        ``uniqueFilename``), restricted to ``distToDominatingLoop == 1``, with
        the added columns ``avgRuntime`` and ``uniqueLoopId``.

    Notes
    -----
    Prints ``DataFrame.info()`` and the number of rows per ``tileSize`` for the
    returned frame.
    """
    features_df = pd.read_csv(features_path)
    runtimes_df = pd.read_csv(runtimes_path)

    # clean up some generation artifacts and get average runtimes
    # Remove only a literal trailing ".out": str.rstrip('.out') strips any run of
    # the characters '.', 'o', 'u', 't' and can eat the end of the real name.
    runtimes_df['uniqueFilename'] = runtimes_df.uniqueFilename.str.replace(r'\.out$', '', regex=True)
    # na=True keeps the original behaviour of dropping rows with a missing name;
    # .copy() makes the column assignment below write to an independent frame.
    runtimes_df = runtimes_df[~runtimes_df.uniqueFilename.str.endswith("_0", na=True)].copy()
    runtimes_df['avgRuntime'] = runtimes_df[['run1','run2','run3','run4']].mean(axis=1)
    features_df = features_df[~features_df.uniqueFilename.str.endswith("_0", na=True)]

    # merge the features and average runtimes together
    merged_df = features_df.merge(runtimes_df[['uniqueFilename', 'avgRuntime']], on='uniqueFilename')

    # get the fastest tile size for each unique loop
    merged_df['uniqueLoopId'] = merged_df.uniqueFilename.str.split(pat='_').str[:3].str.join('_')
    merged_df = merged_df.sort_values(['uniqueLoopId','avgRuntime'],ascending=True).groupby('uniqueLoopId').head(1)

    # drop programs that predict tile size 0
    # merged_df = merged_df[merged_df.tileSize != 1]

    # keep only programs that tile the innermost loop
    # merged_df = merged_df[(merged_df.distToDominatingLoop == 1) | (merged_df.distToDominatingLoop == 2)]
    merged_df = merged_df[(merged_df.distToDominatingLoop == 1)]

    merged_df.info()
    # A bare expression inside a function is never displayed, so print the
    # per-tile-size row counts explicitly.
    print(merged_df.groupby('tileSize').size())
    # print(plot_data.isnull().sum())
    # sns.pairplot(merged_df.iloc[:, 3:10])
    # plt.show()

    return merged_df

In [3]:
# List the candidate input files with the shell's `ls`.
# NOTE(review): `runtimes_trimmed.csv`, which is the runtimes file actually loaded
# below, is not among the files listed here - confirm it exists.
!ls ../tiled_polybench/features.csv
!ls ../tiled_polybench/runtimes.csv
!ls ../tiled_polybench_lin_alg/features.csv
!ls ../tiled_polybench_lin_alg/runtimes.csv

# Alternative data set, currently disabled:
# features_path = os.path.expanduser("../tiled_polybench_lin_alg/features.csv")
# runtimes_path = os.path.expanduser("../tiled_polybench_lin_alg/runtimes.csv")
# os.path.expanduser only expands a leading "~", so these relative paths are
# passed through unchanged.
features_path = os.path.expanduser("../tiled_polybench/features.csv")
runtimes_path = os.path.expanduser("../tiled_polybench/runtimes_trimmed.csv")

# Build the training table; process_data also prints a summary of the result.
merged_df = process_data(features_path, runtimes_path)

../tiled_polybench/features.csv
../tiled_polybench/runtimes.csv
../tiled_polybench_lin_alg/features.csv
../tiled_polybench_lin_alg/runtimes.csv
<class 'pandas.core.frame.DataFrame'>
Int64Index: 117 entries, 598 to 856
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   uniqueFilename        117 non-null    object 
 1   rootFilename          117 non-null    object 
 2   tileSize              117 non-null    int64  
 3   readInvariant         117 non-null    int64  
 4   readPrefetched        117 non-null    int64  
 5   readNonPrefetched     117 non-null    int64  
 6   writeInvariant        117 non-null    int64  
 7   writePrefetched       117 non-null    int64  
 8   writeNonPrefetched    117 non-null    int64  
 9   distToDominatingLoop  117 non-null    int64  
 10  avgRuntime            117 non-null    float64
 11  uniqueLoopId          117 non-null    object 
dtypes: float64(1), int64(8), object(3)

## Train models

In [4]:
def save_model_to_file(model, filename):
    """Pickle ``model`` into the file at ``filename``.

    The file is opened in binary write mode, so an existing file at that path
    is overwritten.
    """
    with open(filename, 'wb') as sink:
        pickle.Pickler(sink).dump(model)
        
def load_model_from_file(filename):
    """Read the file at ``filename`` and return the object unpickled from it."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(filename, 'rb') as source:
        model = pickle.Unpickler(source).load()
    return model

In [5]:
# prep data for training
X = merged_df.iloc[:, 3:10]
#X = (X-X.mean())/X.std()
y = merged_df.tileSize

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=1)

X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36 entries, 120 to 1609
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   readInvariant         36 non-null     int64
 1   readPrefetched        36 non-null     int64
 2   readNonPrefetched     36 non-null     int64
 3   writeInvariant        36 non-null     int64
 4   writePrefetched       36 non-null     int64
 5   writeNonPrefetched    36 non-null     int64
 6   distToDominatingLoop  36 non-null     int64
dtypes: int64(7)
memory usage: 2.2 KB


In [6]:
# linear regression

# Fit a linear model whose coefficients are constrained to be non-negative.
lin_reg = LinearRegression(positive=True)
lin_reg.fit(X_train,y_train)
predictions = lin_reg.predict(X_test)
r2 = r2_score(y_test, predictions)
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)
print(y_test.to_numpy())

The r2 is:  0.0637199153030793
The rmse is:  85.40026777069802
[ 40.80493451  57.98982863  40.80493451  63.63251934  63.63251934
  40.80493451  40.92869782  63.63251934  63.63251934  63.63251934
  63.63251934  63.63251934  63.63251934  63.63251934  80.75553181
  40.80493451 109.28768899  63.63251934  58.05171029  63.63251934
  40.80493451  40.92869782  57.92794698  86.46010417 109.28768899
  63.63251934  57.98982863  63.63251934  63.63251934 109.28768899
  40.86681617  63.63251934  63.63251934  63.63251934  63.63251934
  63.63251934]
[  1   1   1   1 128  32  64   8   4  16   1   4   8  16 128   1 128 256
 256  64   1 256   1   1  64 256   1  16   1 256   1 128   8  32   4   4]


In [7]:
# random forest

# Fit a 500-tree forest on the training split (fit() returns the estimator).
forest = RandomForestClassifier(n_estimators=500, random_state=888).fit(X_train, y_train)
predictions = forest.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predictions)

# Report the accuracy, then predicted vs. actual tile sizes for the test split.
print("Accuracy:", accuracy)
print(predictions)
print(y_test.to_numpy())

# Persist the fitted forest.
save_model_to_file(forest, "../models/rand_forest.pkl")

Accuracy: 0.25
[  1   1   1   1 256 128  16 256   8 256 256 256 256   8   1 128  64 256
 128 256 128   1   1 256   4   8   1   8   1   4   1 256 256 256   8 256]
[  1   1   1   1 128  32  64   8   4  16   1   4   8  16 128   1 128 256
 256  64   1 256   1   1  64 256   1  16   1 256   1 128   8  32   4   4]


In [8]:
# gradient boosted tree

# 1000 boosting stages of depth-1 trees, fitted on the training split
# (fit() returns the estimator).
boost = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.5,
    max_depth=1,
    random_state=8,
).fit(X_train, y_train)
predictions = boost.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predictions)

# Report the accuracy, then predicted vs. actual tile sizes for the test split.
print("Accuracy:", accuracy)
print(predictions)
print(y_test.to_numpy())

Accuracy: 0.3055555555555556
[  1   1   1   1 256   1  16 256   8 256 256 256 256   8  16   1   1 256
  64 256   1   1   1 256   4   8   1   8   1   4   1 256 256 256   8 256]
[  1   1   1   1 128  32  64   8   4  16   1   4   8  16 128   1 128 256
 256  64   1 256   1   1  64 256   1  16   1 256   1 128   8  32   4   4]


In [9]:
# support vector machine regression

# Fit an SVR with scikit-learn's default settings on the training split.
support = svm.SVR()
support.fit(X_train,y_train)
predictions = support.predict(X_test)
r2 = r2_score(y_test, predictions)
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)
print(y_test.to_numpy())

The r2 is:  -0.3533088697442057
The rmse is:  102.67265762947108
[4.36115225 3.91183252 7.79337925 2.81595809 9.33662737 6.41872565
 6.52987927 9.33662737 8.00000004 9.33662737 9.33662737 9.33662737
 9.33662737 8.00000004 4.69990322 6.41872565 5.01630568 9.33662737
 5.55768283 9.33662737 6.41872565 5.77203881 2.60114212 5.2174131
 6.86801603 8.00000004 3.91183252 8.00000004 3.53901509 6.86801603
 7.24530487 9.33662737 9.33662737 9.33662737 8.00000004 9.33662737]
[  1   1   1   1 128  32  64   8   4  16   1   4   8  16 128   1 128 256
 256  64   1 256   1   1  64 256   1  16   1 256   1 128   8  32   4   4]


In [10]:
# support vector machine classifier

# Fit an SVC with balanced class weights on the training split.
support = svm.SVC(random_state=8, class_weight='balanced')
support.fit(X_train,y_train)
predictions = support.predict(X_test)

# Only accuracy is reported for this classifier; the r2/RMSE values that used to
# be computed here were never printed, and their `squared=False` call fails on
# scikit-learn >= 1.6.
print("Accuracy:", metrics.accuracy_score(y_test, predictions))
print(predictions)
print(y_test.to_numpy())

Accuracy: 0.16666666666666666
[  1 128 128   1 256  32 128 256   8 256 256 256 256   8   1  32  64 256
 128 256  32 128   1 256 256   8 128   8  64 256 128 256 256 256   8 256]
[  1   1   1   1 128  32  64   8   4  16   1   4   8  16 128   1 128 256
 256  64   1 256   1   1  64 256   1  16   1 256   1 128   8  32   4   4]


In [11]:
# multi-layer perceptron

# One hidden layer of 30 units, fitted on the training split
# (fit() returns the estimator).
mlp = MLPClassifier(
    random_state=8,
    max_iter=10000,
    hidden_layer_sizes=(30,),
).fit(X_train, y_train)
predictions = mlp.predict(X_test)
# Accuracy of the predictions above; this is the same value mlp.score(X_test, y_test) returns.
accuracy = metrics.accuracy_score(y_test, predictions)

print("Accuracy:", accuracy)
print(predictions)
print(y_test.to_numpy())

Accuracy: 0.3055555555555556
[  1   1   1   1 256   1  16 256   8 256 256 256 256   8   1   1  64 256
 128 256   1   1   1 256   4   8   1   8   1   4   1 256 256 256   8 256]
[  1   1   1   1 128  32  64   8   4  16   1   4   8  16 128   1 128 256
 256  64   1 256   1   1  64 256   1  16   1 256   1 128   8  32   4   4]


In [12]:
# save the best performing models
# Refit each model on the full data set (X, y) with the same hyper-parameters
# used in the evaluation cells.
forest = RandomForestClassifier(n_estimators=500, random_state=888)
forest.fit(X,y)
boost = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.5, max_depth=1, random_state=8)
boost.fit(X,y)
support = svm.SVC(random_state=8, class_weight='balanced')
support.fit(X,y)
mlp = MLPClassifier(random_state=8, max_iter=10000, hidden_layer_sizes=(30,))
mlp.fit(X,y)

# Write each estimator to its own file. Every call used to pass `forest`, so all
# four pickle files contained the random forest.
save_model_to_file(boost, "../models/boosted_tree.pkl")
save_model_to_file(forest, "../models/rand_forest.pkl")
save_model_to_file(support, "../models/svm.pkl")
save_model_to_file(mlp, "../models/mlp.pkl")