In [1]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

import pickle

import seaborn as sns
import matplotlib.pyplot as plt

## Process data

In [14]:
def process_data(features_path, runtimes_path):
    """Build one row per loop describing its fastest measured tiling.

    Reads the per-variant feature table and the per-variant runtime table,
    averages the four timed runs, joins the two tables on ``uniqueFilename``,
    keeps the fastest variant of every loop and finally keeps only rows whose
    ``distToDominatingLoop`` equals 1.

    Parameters
    ----------
    features_path : str or path-like
        CSV with at least the columns ``uniqueFilename`` and
        ``distToDominatingLoop``.
    runtimes_path : str or path-like
        CSV with at least the columns ``uniqueFilename`` (optionally carrying
        a trailing ``.out``) and ``run1`` .. ``run4``.

    Returns
    -------
    pandas.DataFrame
        The feature columns plus ``avgRuntime`` and ``uniqueLoopId``; at most
        one row per ``uniqueLoopId``.

    Side effects
    ------------
    Prints ``DataFrame.info()`` of the result to stdout.
    """
    features_df = pd.read_csv(features_path)
    runtimes_df = pd.read_csv(runtimes_path)

    # Remove the literal ".out" suffix. ``str.rstrip('.out')`` would instead
    # strip any trailing run of the characters '.', 'o', 'u', 't', corrupting
    # names that happen to end in one of them.
    runtimes_df['uniqueFilename'] = runtimes_df['uniqueFilename'].str.replace(
        r'\.out$', '', regex=True)

    # Drop the "_0" generation artifacts from both tables. ``na=True`` makes
    # rows with a missing filename count as artifacts so they are dropped too,
    # matching the previous ``== False`` comparison.
    runtimes_df = runtimes_df[
        ~runtimes_df['uniqueFilename'].str.endswith('_0', na=True)
    ].copy()  # copy so the column assignment below targets a real frame
    features_df = features_df[
        ~features_df['uniqueFilename'].str.endswith('_0', na=True)
    ]

    # average the four timed runs
    runtimes_df['avgRuntime'] = runtimes_df[['run1', 'run2', 'run3', 'run4']].mean(axis=1)

    # merge the features and average runtimes together (inner join)
    merged_df = features_df.merge(
        runtimes_df[['uniqueFilename', 'avgRuntime']], on='uniqueFilename')

    # A loop is identified by the first three '_'-separated fields of the
    # filename; keep the variant with the lowest average runtime per loop.
    merged_df['uniqueLoopId'] = (
        merged_df['uniqueFilename'].str.split(pat='_').str[:3].str.join('_'))
    merged_df = (
        merged_df
        .sort_values(['uniqueLoopId', 'avgRuntime'], ascending=True)
        .groupby('uniqueLoopId')
        .head(1)
    )

    # keep only rows whose distance to the dominating loop is exactly 1
    merged_df = merged_df[merged_df['distToDominatingLoop'] == 1]

    merged_df.info()

    return merged_df

In [15]:
# Sanity check: list the candidate input CSVs via the shell.
# NOTE(review): these checks cover runtimes.csv, but the code below actually
# loads runtimes_trimmed.csv, which is not listed here -- confirm it exists.
!ls ../tiled_polybench/features.csv
!ls ../tiled_polybench/runtimes.csv
!ls ../tiled_polybench_lin_alg/features.csv
!ls ../tiled_polybench_lin_alg/runtimes.csv

# Alternative input set (tiled_polybench_lin_alg), currently disabled:
# features_path = os.path.expanduser("../tiled_polybench_lin_alg/features.csv")
# runtimes_path = os.path.expanduser("../tiled_polybench_lin_alg/runtimes.csv")
# expanduser only rewrites a leading "~", so these relative paths pass through unchanged.
features_path = os.path.expanduser("../tiled_polybench/features.csv")
runtimes_path = os.path.expanduser("../tiled_polybench/runtimes_trimmed.csv")

# Build the per-loop training table; process_data also prints merged_df.info().
merged_df = process_data(features_path, runtimes_path)

../tiled_polybench/features.csv
../tiled_polybench/runtimes.csv
../tiled_polybench_lin_alg/features.csv
../tiled_polybench_lin_alg/runtimes.csv
<class 'pandas.core.frame.DataFrame'>
Int64Index: 117 entries, 598 to 856
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   uniqueFilename        117 non-null    object 
 1   rootFilename          117 non-null    object 
 2   tileSize              117 non-null    int64  
 3   readInvariant         117 non-null    int64  
 4   readPrefetched        117 non-null    int64  
 5   readNonPrefetched     117 non-null    int64  
 6   writeInvariant        117 non-null    int64  
 7   writePrefetched       117 non-null    int64  
 8   writeNonPrefetched    117 non-null    int64  
 9   distToDominatingLoop  117 non-null    int64  
 10  avgRuntime            117 non-null    float64
 11  uniqueLoopId          117 non-null    object 
dtypes: float64(1), int64(8), objec

## Train models

In [16]:
def save_model_to_file(model, filename):
    """Serialize ``model`` with pickle and write it to ``filename``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Returns ``None``.
    """
    with open(filename, 'wb') as handle:
        handle.write(pickle.dumps(model))
        
def load_model_from_file(filename):
    """Read ``filename`` and return the object unpickled from its contents.

    Unpickling can execute arbitrary code, so only use this on files from a
    trusted source.
    """
    with open(filename, 'rb') as handle:
        raw_bytes = handle.read()
    return pickle.loads(raw_bytes)

In [17]:
# Hold out the benchmark loops reserved for final testing.
# Three ids are active; 'syr2k_81_7' and '2mm_94_2' are currently not held out.
held_out_loop_ids = ['syrk_77_7', '3mm_99_2', 'symm_81_2']
is_held_out = merged_df['uniqueLoopId'].isin(held_out_loop_ids)

# show the rows that are about to be removed, then remove them
print(merged_df[is_held_out])
merged_df = merged_df[~is_held_out]

# prep data for training: columns 3..9 are the features, tileSize is the label
X = merged_df.iloc[:, 3:10]
y = merged_df['tileSize']

# stratified 70/30 split so every tile-size class appears in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=1)

X_test.info()

    uniqueFilename rootFilename  tileSize  readInvariant  readPrefetched  \
448     3mm_99_2_1          3mm         1              1               1   
70   symm_81_2_128         symm       128              1               0   
576    syrk_77_7_1         syrk         1              1               2   

     readNonPrefetched  writeInvariant  writePrefetched  writeNonPrefetched  \
448                  1               1                0                   0   
70                   4               0                0                   1   
576                  0               1                0                   0   

     distToDominatingLoop  avgRuntime uniqueLoopId  
448                     1   32.539447     3mm_99_2  
70                      1   28.433557    symm_81_2  
576                     1    4.751532    syrk_77_7  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 35 entries, 1669 to 1785
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype
---  -----

In [6]:
# linear regression
# Fits a non-negative-coefficient linear model that treats tile size as a
# continuous target, then reports R^2 and RMSE on the held-out split.

lin_reg = LinearRegression(positive=True)
lin_reg.fit(X_train,y_train)
predictions = lin_reg.predict(X_test)
r2 = r2_score(y_test, predictions)
# RMSE as an explicit square root of the MSE: the ``squared=False`` keyword
# is deprecated in scikit-learn and absent from newer releases.
rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)
print(y_test.to_numpy())

The r2 is:  0.00039281402156210454
The rmse is:  88.72314483148281
[68.4653673  59.30451587 59.30451587 68.4653673  59.30451587 59.30451587
 72.59961015 54.72409016 59.30451587 64.0803404  59.30451587 77.37543468
 59.30451587 59.30451587 59.30451587 54.72409016 68.01918444 73.43659064
 54.72409016 59.30451587 59.30451587 63.88494159 77.37543468 59.30451587
 59.30451587 68.4653673  91.95680441 59.30451587 59.30451587 63.88494159
 59.30451587 54.72409016 59.30451587 59.30451587 59.30451587]
[ 64   1   8 256  64 256 128   1   4   1   4   1  16   8   1  32   1  64
   1   1  16 256   1  16   1 128   1   8  32 256   4   1 256 128   4]


In [7]:
# random forest
# 500-tree classifier over the discrete tile sizes, scored on the held-out split.

forest = RandomForestClassifier(random_state=888, n_estimators=500).fit(X_train, y_train)
predictions = forest.predict(X_test)
forest_accuracy = metrics.accuracy_score(y_test, predictions)

print("Accuracy:", forest_accuracy)
print(predictions)
print(y_test.to_numpy())

# persist the model fitted on the training split
save_model_to_file(forest, "../models/rand_forest.pkl")

Accuracy: 0.34285714285714286
[  4   1 256   4 256 256   1   1   8   1   8   1 256 256 256   1   1  16
   1   1   8   1   1   8   1  64 256 256 256   1   8   1 256 256 256]
[ 64   1   8 256  64 256 128   1   4   1   4   1  16   8   1  32   1  64
   1   1  16 256   1  16   1 128   1   8  32 256   4   1 256 128   4]


In [8]:
# gradient boosted tree
# 1000 depth-1 stumps with learning rate 0.5, scored on the held-out split.

boost = GradientBoostingClassifier(
    n_estimators=1000,
    learning_rate=0.5,
    max_depth=1,
    random_state=8,
).fit(X_train, y_train)
predictions = boost.predict(X_test)
boost_accuracy = metrics.accuracy_score(y_test, predictions)

print("Accuracy:", boost_accuracy)
print(predictions)
print(y_test.to_numpy())

Accuracy: 0.2857142857142857
[  4  64 256   4 256 256  16   1   8   1   8   1 256 256 256   1   1  16
   1   1   8   1   1   8  64   4 256 256 256   1   8   1 256 256 256]
[ 64   1   8 256  64 256 128   1   4   1   4   1  16   8   1  32   1  64
   1   1  16 256   1  16   1 128   1   8  32 256   4   1 256 128   4]


In [9]:
# support vector machine regression
# Default-parameter SVR that treats tile size as a continuous target;
# reports R^2 and RMSE on the held-out split.

support = svm.SVR()
support.fit(X_train,y_train)
predictions = support.predict(X_test)
r2 = r2_score(y_test, predictions)
# RMSE as an explicit square root of the MSE: the ``squared=False`` keyword
# is deprecated in scikit-learn and absent from newer releases.
rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)
print(y_test.to_numpy())

The r2 is:  -0.3143179554103184
The rmse is:  101.73548572476079
[6.83845314 5.65662943 8.71676033 6.83845314 8.71676033 8.71676033
 6.444041   7.36721058 8.10000004 8.84920183 8.10000004 7.05607473
 8.71676033 8.71676033 8.71676033 7.36721058 5.33214708 9.16903857
 7.36721058 4.87732452 8.10000004 8.1        7.05607473 8.10000004
 5.65662943 6.38534076 8.74008333 8.71676033 8.71676033 8.1
 8.10000004 6.22477878 8.71676033 8.71676033 8.71676033]
[ 64   1   8 256  64 256 128   1   4   1   4   1  16   8   1  32   1  64
   1   1  16 256   1  16   1 128   1   8  32 256   4   1 256 128   4]


In [10]:
# support vector machine classifier
# Class-weight-balanced SVC over the discrete tile sizes, scored by accuracy
# on the held-out split. (Regression metrics are not computed here: they were
# never reported for this classifier and relied on a deprecated keyword.)

support = svm.SVC(random_state=8, class_weight='balanced')
support.fit(X_train,y_train)
predictions = support.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, predictions))
print(predictions)
print(y_test.to_numpy())

Accuracy: 0.14285714285714285
[  4  64   4   4   4   4  64  32   8 128   8 128   4   4   4  32   1 128
  32   1   8   4 128   8  64  64 128   4   4   4   8   1   4   4   4]
[ 64   1   8 256  64 256 128   1   4   1   4   1  16   8   1  32   1  64
   1   1  16 256   1  16   1 128   1   8  32 256   4   1 256 128   4]


In [11]:
# multi-layer perceptron
# Single hidden layer of 30 units, scored by accuracy on the held-out split.

mlp = MLPClassifier(
    hidden_layer_sizes=(30,),
    max_iter=10000,
    random_state=8,
).fit(X_train, y_train)
predictions = mlp.predict(X_test)
mlp_accuracy = metrics.accuracy_score(y_test, predictions)

print("Accuracy:", mlp_accuracy)
print(predictions)
print(y_test.to_numpy())

Accuracy: 0.34285714285714286
[  4   1 256   4 256 256   1   1   8   1   8   1 256 256 256   1   1  16
   1   1   8   1   1   8   1  64 256 256 256   1   8   1 256 256 256]
[ 64   1   8 256  64 256 128   1   4   1   4   1  16   8   1  32   1  64
   1   1  16 256   1  16   1 128   1   8  32 256   4   1 256 128   4]


In [12]:
# save the best performing models
# Each classifier is refit on the full data set (X, y) rather than only the
# training split, then pickled under its own file name.
forest = RandomForestClassifier(n_estimators=500, random_state=888)
forest.fit(X,y)
boost = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.5, max_depth=1, random_state=8)
boost.fit(X,y)
support = svm.SVC(random_state=8, class_weight='balanced')
support.fit(X,y)
mlp = MLPClassifier(random_state=8, max_iter=10000, hidden_layer_sizes=(30,))
mlp.fit(X,y)

# Every file must receive its own model; previously all four files were
# written from `forest`, so three of them held the wrong estimator.
save_model_to_file(boost, "../models/boosted_tree.pkl")
save_model_to_file(forest, "../models/rand_forest.pkl")
save_model_to_file(support, "../models/svm.pkl")
save_model_to_file(mlp, "../models/mlp.pkl")