In [1]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

import pickle

import seaborn as sns
import matplotlib.pyplot as plt

## Process data

In [2]:
def process_data(features_path, runtimes_path):
    """Load loop features and runtimes and keep the fastest variant per loop.

    Parameters
    ----------
    features_path : str or path-like
        CSV of per-variant features; must contain a ``uniqueFilename`` column.
    runtimes_path : str or path-like
        CSV with a ``uniqueFilename`` column (a trailing ``.out`` is removed
        if present) and the timing columns ``run1`` .. ``run4``.

    Returns
    -------
    pandas.DataFrame
        The feature columns plus ``avgRuntime`` (row-wise mean of the four
        runs) and ``uniqueLoopId`` (first three ``_``-separated parts of
        ``uniqueFilename``), reduced to the single row with the lowest
        ``avgRuntime`` for every ``uniqueLoopId``.

    Notes
    -----
    Prints ``DataFrame.info()`` of the result as a quick sanity check.
    """
    features_df = pd.read_csv(features_path)
    runtimes_df = pd.read_csv(runtimes_path)

    # clean up some generation artifacts and get average runtimes
    # Remove only a literal trailing ".out". str.rstrip('.out') would strip any
    # run of the characters '.', 'o', 'u', 't' and mangle names like "x_out.out".
    cleaned_names = runtimes_df['uniqueFilename'].str.replace(r'\.out$', '', regex=True)
    runtimes_df = runtimes_df.assign(uniqueFilename=cleaned_names)

    # Drop variants whose name ends in "_0". na=True counts a missing name as a
    # match, so such rows are dropped as well (same outcome as `== False`).
    keep_runtime = ~runtimes_df['uniqueFilename'].str.endswith('_0', na=True)
    # .copy() so that adding avgRuntime below does not write into a view.
    runtimes_df = runtimes_df[keep_runtime].copy()
    runtimes_df['avgRuntime'] = runtimes_df[['run1', 'run2', 'run3', 'run4']].mean(axis=1)

    keep_feature = ~features_df['uniqueFilename'].str.endswith('_0', na=True)
    features_df = features_df[keep_feature]

    # merge the features and average runtimes together
    merged_df = features_df.merge(
        runtimes_df[['uniqueFilename', 'avgRuntime']], on='uniqueFilename')

    # get the fastest tile size for each unique loop
    merged_df['uniqueLoopId'] = (
        merged_df['uniqueFilename'].str.split(pat='_').str[:3].str.join('_'))
    merged_df = (
        merged_df
        .sort_values(['uniqueLoopId', 'avgRuntime'], ascending=True)
        .groupby('uniqueLoopId')
        .head(1)
    )

    # drop programs that predict tile size 1
    # merged_df = merged_df[merged_df.tileSize != 1]

    # keep only programs that tile their own dominating loop
    # merged_df = merged_df[(merged_df.distToDominatingLoop == 1)]

    merged_df.info()

    return merged_df

In [3]:
# Input CSVs for process_data. Exactly one pair must be active: with both
# pairs commented out the call below raises NameError on a fresh kernel.
# NOTE(review): which pair produced the recorded output is not visible in the
# notebook -- confirm, and swap in the commented pair if needed.
# features_path = os.path.expanduser("../tiled_polybench_lin_alg/features.csv")
# runtimes_path = os.path.expanduser("../tiled_polybench_lin_alg/runtimes.csv")
features_path = os.path.expanduser("../tiled_polybench/features.csv")
runtimes_path = os.path.expanduser("../tiled_polybench/runtimes_trimmed.csv")

merged_df = process_data(features_path, runtimes_path)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168 entries, 586 to 856
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   uniqueFilename        168 non-null    object 
 1   rootFilename          168 non-null    object 
 2   tileSize              168 non-null    int64  
 3   readInvariant         168 non-null    int64  
 4   readPrefetched        168 non-null    int64  
 5   readNonPrefetched     168 non-null    int64  
 6   writeInvariant        168 non-null    int64  
 7   writePrefetched       168 non-null    int64  
 8   writeNonPrefetched    168 non-null    int64  
 9   distToDominatingLoop  168 non-null    int64  
 10  avgRuntime            168 non-null    float64
 11  uniqueLoopId          168 non-null    object 
dtypes: float64(1), int64(8), object(3)
memory usage: 17.1+ KB


## Train models

In [4]:
def save_model_to_file(model, filename):
    """Pickle ``model`` into ``filename``, overwriting any existing file."""
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(model)
        
def load_model_from_file(filename):
    """Return the object unpickled from ``filename``."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(filename, mode='rb') as source:
        return pickle.Unpickler(source).load()

In [5]:
# prep data for training
# Select the predictors by name instead of by position (iloc[:, 3:10]) so a
# change in CSV column order cannot silently change the feature set.
FEATURE_COLUMNS = [
    'readInvariant', 'readPrefetched', 'readNonPrefetched',
    'writeInvariant', 'writePrefetched', 'writeNonPrefetched',
    'distToDominatingLoop',
]
X = merged_df[FEATURE_COLUMNS]
y = merged_df.tileSize

# train and test set for final evaluation (80/20, stratified on tile size)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=8)

# train and validation for tuning (70/30 split of the training portion);
# the *_test_valid variables hold the validation set
X_train_valid, X_test_valid, y_train_valid, y_test_valid = train_test_split(
        X_train, y_train, stratify=y_train, test_size=0.3, random_state=8)

In [6]:
# linear regression

# Baseline that treats tile size as a continuous target; positive=True
# constrains the fitted coefficients to be non-negative.
lin_reg = LinearRegression(positive=True)
lin_reg.fit(X_train_valid,y_train_valid)
predictions = lin_reg.predict(X_test_valid)
r2 = r2_score(y_test_valid, predictions)
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test_valid, predictions))

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)
print(y_test_valid.to_numpy())

The r2 is:  0.015438306894459908
The rmse is:  88.10093546295434
[ 52.53729187  69.93290521  62.0589901   62.0589901   62.0589901
  69.93290521  84.43065367  69.93290521  62.0589901   52.53729187
  52.53729187  60.41120698  69.93290521  88.16750099  52.53729187
  52.53729187  62.0589901   76.55673856  62.0589901   85.26963611
  43.82439432  69.93290521  69.93290521  69.93290521  69.93290521
  88.16750099  35.95047921  35.95047921  69.93290521  43.82439432
  44.66337676  62.0589901  102.69543122  62.0589901   62.0589901
  69.93290521  62.0589901   69.93290521  69.93290521  62.0589901
  69.93290521]
[  4   1 256  32   8   1   4   8 256  64 128   8  16   1   1 256  16   1
  64   4   1   8  32 256 128 256   1   1 128  32   1 256 128   1  16  64
  16   8  64 128   8]


In [7]:
# random forest

# Fit a 500-tree forest on the tuning split; fit() returns the estimator itself.
forest = RandomForestClassifier(random_state=888, n_estimators=500).fit(
    X_train_valid, y_train_valid)
predictions = forest.predict(X_test_valid)

validation_accuracy = metrics.accuracy_score(y_test_valid, predictions)
print("Accuracy:", validation_accuracy)
# Predicted vs. actual tile sizes, for eyeballing the errors.
print(predictions)
print(y_test_valid.to_numpy())

# Persist the fitted forest.
save_model_to_file(forest, "../models/rand_forest.pkl")

Accuracy: 0.2926829268292683
[ 32   8   8 256 256   8   1   8 256  64  64 128  16   1   1   1   8 128
 256 128 128   8  16   8  16   1   1   1  16   1   1 256  64 256 256   8
 256   8  16 256  16]
[  4   1 256  32   8   1   4   8 256  64 128   8  16   1   1 256  16   1
  64   4   1   8  32 256 128 256   1   1 128  32   1 256 128   1  16  64
  16   8  64 128   8]


In [8]:
# gradient boosted tree

# 500 boosting rounds of depth-1 trees with a learning rate of 0.5;
# fit() returns the estimator itself.
boost = GradientBoostingClassifier(
    random_state=8,
    max_depth=1,
    learning_rate=0.5,
    n_estimators=500,
).fit(X_train_valid, y_train_valid)
predictions = boost.predict(X_test_valid)

validation_accuracy = metrics.accuracy_score(y_test_valid, predictions)
print("Accuracy:", validation_accuracy)
# Predicted vs. actual tile sizes, for eyeballing the errors.
print(predictions)
print(y_test_valid.to_numpy())

Accuracy: 0.21951219512195122
[ 32 256   8 256 256 256   1 256 256  64  64   4  16   1  64  64   8 128
 256   8  64 256  16 256  16   1   1   1  16  64   1 256  64 256 256 256
 256 256  16 256  16]
[  4   1 256  32   8   1   4   8 256  64 128   8  16   1   1 256  16   1
  64   4   1   8  32 256 128 256   1   1 128  32   1 256 128   1  16  64
  16   8  64 128   8]


In [9]:
# support vector machine regression

# SVR with default hyper-parameters, scored as a regression on tile size.
support = svm.SVR()
support.fit(X_train_valid,y_train_valid)
predictions = support.predict(X_test_valid)
r2 = r2_score(y_test_valid, predictions)
# Take the square root explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test_valid, predictions))

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)
print(y_test_valid.to_numpy())

The r2 is:  -0.3175395254321467
The rmse is:  101.91565005994183
[15.9130806  15.61684406 13.84084392 14.21340515 14.21340515 15.61684406
 15.6064222  15.61684406 14.21340515 14.14114691 14.14114691 15.48228832
 15.89985553 14.88992696 15.54889261 15.54889261 13.84084392 14.78596267
 14.21340515 15.46237186 16.00566987 15.61684406 15.89985553 15.61684406
 15.89985553 14.88992696 14.58830228 15.38292932 15.89985553 15.56691474
 12.65777396 14.21340515 16.85506357 14.21340515 14.21340515 15.61684406
 14.21340515 15.61684406 15.89985553 14.21340515 15.89985553]
[  4   1 256  32   8   1   4   8 256  64 128   8  16   1   1 256  16   1
  64   4   1   8  32 256 128 256   1   1 128  32   1 256 128   1  16  64
  16   8  64 128   8]


In [10]:
# support vector machine classifier

# class_weight='balanced' weights classes inversely to their frequency in
# the training labels.
support = svm.SVC(random_state=8, class_weight='balanced')
support.fit(X_train_valid,y_train_valid)
predictions = support.predict(X_test_valid)

# Only accuracy is reported for the classifier. The r2/rmse lines copied from
# the regression cell were never printed and relied on the removed
# `squared=False` keyword, so they are not computed here.
print("Accuracy:", metrics.accuracy_score(y_test_valid, predictions))
print(predictions)
print(y_test_valid.to_numpy())

Accuracy: 0.14634146341463414
[ 32 256  32   4   4 256  64 256   4  64  64  64  16 256  64  64  32 128
   4  64  64 256  16 256  16 256  64   1  16  64   1   4  64   4   4 256
   4 256  16   4  16]
[  4   1 256  32   8   1   4   8 256  64 128   8  16   1   1 256  16   1
  64   4   1   8  32 256 128 256   1   1 128  32   1 256 128   1  16  64
  16   8  64 128   8]


In [11]:
# multi-layer perceptron

# One hidden layer of 30 units; fit() returns the estimator itself.
mlp = MLPClassifier(
    hidden_layer_sizes=(30,),
    max_iter=10000,
    random_state=8,
).fit(X_train_valid, y_train_valid)
predictions = mlp.predict(X_test_valid)

# Accuracy from the predictions already computed; this is the same value
# that mlp.score(X_test_valid, y_test_valid) returns.
validation_accuracy = metrics.accuracy_score(y_test_valid, predictions)
print("Accuracy:", validation_accuracy)
print(predictions)
print(y_test_valid.to_numpy())

Accuracy: 0.3170731707317073
[ 32   8   8 256 256   8   1   8 256  64  64   8  16   1   1   1   8 128
 256 128 128   8  16   8  16   1   1   1  16   1   1 256  64 256 256   8
 256   8  16 256  16]
[  4   1 256  32   8   1   4   8 256  64 128   8  16   1   1 256  16   1
  64   4   1   8  32 256 128 256   1   1 128  32   1 256 128   1  16  64
  16   8  64 128   8]


## Print test accuracies

In [12]:
# Score every fitted classifier on the held-out test set. The labels are
# padded so the printed values line up in one column.
test_reports = [
    ("Random Forest Accuracy    :", forest),
    ("Grad-boosted Tree Accuracy:", boost),
    ("SVM Accuracy              :", support),
    ("MLP Accuracy              :", mlp),
]

for test_label, test_model in test_reports:
    predictions = test_model.predict(X_test)
    print(test_label, metrics.accuracy_score(y_test, predictions))

Random Forest Accuracy    : 0.14705882352941177
Grad-boosted Tree Accuracy: 0.14705882352941177
SVM Accuracy              : 0.08823529411764706
MLP Accuracy              : 0.11764705882352941


## Save trained models to directory

In [13]:
# # save the best performing models
# forest = RandomForestClassifier(n_estimators=500, random_state=888)
# forest.fit(X,y)
# boost = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.5, max_depth=1, random_state=8)
# boost.fit(X,y)
# support = svm.SVC(random_state=8, class_weight='balanced')
# support.fit(X,y)
# mlp = MLPClassifier(random_state=8, max_iter=10000, hidden_layer_sizes=(30,))
# mlp.fit(X,y)

# save_model_to_file(boost, "../models/boosted_tree.pkl")
# save_model_to_file(forest, "../models/rand_forest.pkl")
# save_model_to_file(support, "../models/svm.pkl")
# save_model_to_file(mlp, "../models/mlp.pkl")