In [1]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier

import pickle

import matplotlib.pyplot as plt

## Process data

In [2]:
def process_data(features_path, runtimes_path):
    """Build one row per loop holding the features of its fastest tile size.

    Reads the feature and runtime CSVs, averages the four timing runs of
    every program variant, joins the two tables on ``uniqueFilename`` and
    keeps, for each loop, the variant with the lowest average runtime.

    Parameters
    ----------
    features_path : str or path-like
        CSV with a ``uniqueFilename`` column plus the feature columns.
    runtimes_path : str or path-like
        CSV with ``uniqueFilename`` (carrying a trailing ``.out``) and the
        timing columns ``run1`` .. ``run4``.

    Returns
    -------
    pandas.DataFrame
        The feature columns plus ``avgRuntime`` and ``uniqueLoopId``, one
        row per ``uniqueLoopId``.

    Side effects
    ------------
    Prints ``DataFrame.info()`` of the result.
    """
    features_df = pd.read_csv(features_path)
    runtimes_df = pd.read_csv(runtimes_path)

    # Remove the literal ".out" suffix. str.rstrip('.out') would instead strip
    # any trailing run of the characters '.', 'o', 'u', 't', corrupting names
    # whose stem ends in one of them and making the merge below drop them.
    runtimes_df['uniqueFilename'] = runtimes_df['uniqueFilename'].str.replace(
        r'\.out$', '', regex=True)

    # Drop the "_0" variants (generation artifacts). na=True keeps the old
    # behaviour of discarding rows whose filename is missing. The .copy()
    # makes the filtered frame independent, so adding a column is safe.
    runtime_is_artifact = runtimes_df['uniqueFilename'].str.endswith('_0', na=True)
    runtimes_df = runtimes_df[~runtime_is_artifact].copy()
    runtimes_df['avgRuntime'] = runtimes_df[['run1', 'run2', 'run3', 'run4']].mean(axis=1)

    feature_is_artifact = features_df['uniqueFilename'].str.endswith('_0', na=True)
    features_df = features_df[~feature_is_artifact]

    # merge the features and average runtimes together
    merged_df = features_df.merge(
        runtimes_df[['uniqueFilename', 'avgRuntime']], on='uniqueFilename')

    # get the fastest tile size for each unique loop: the loop id is the first
    # three '_'-separated fields of the filename; after sorting by runtime the
    # first row of each group is the fastest variant
    merged_df['uniqueLoopId'] = (
        merged_df['uniqueFilename'].str.split(pat='_').str[:3].str.join('_'))
    merged_df = (
        merged_df
        .sort_values(['uniqueLoopId', 'avgRuntime'], ascending=True)
        .groupby('uniqueLoopId')
        .head(1)
    )

    # drop programs that predict tile size 1
    # merged_df = merged_df[merged_df.tileSize != 1]

    # keep only programs that tile their own dominating loop
    # merged_df = merged_df[(merged_df.distToDominatingLoop == 1)]

    merged_df.info()

    return merged_df

In [3]:
# Input CSVs for the full tiled PolyBench data set. To use the other data
# directory instead, swap in the two commented paths:
#   "../tiled_polybench_lin_alg/features.csv"
#   "../tiled_polybench_lin_alg/runtimes.csv"
features_path, runtimes_path = (
    os.path.expanduser("../tiled_polybench/features.csv"),
    os.path.expanduser("../tiled_polybench/runtimes_trimmed.csv"),
)

# One row per loop, carrying the features of its fastest tile size.
merged_df = process_data(features_path, runtimes_path)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 246 entries, 586 to 856
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   uniqueFilename        246 non-null    object 
 1   rootFilename          246 non-null    object 
 2   tileSize              246 non-null    int64  
 3   readInvariant         246 non-null    int64  
 4   readPrefetched        246 non-null    int64  
 5   readNonPrefetched     246 non-null    int64  
 6   writeInvariant        246 non-null    int64  
 7   writePrefetched       246 non-null    int64  
 8   writeNonPrefetched    246 non-null    int64  
 9   distToDominatingLoop  246 non-null    int64  
 10  avgRuntime            246 non-null    float64
 11  uniqueLoopId          246 non-null    object 
dtypes: float64(1), int64(8), object(3)
memory usage: 25.0+ KB


## Train models

In [4]:
def save_model_to_file(model, filename):
    """Pickle ``model`` into ``filename``, overwriting any existing file.

    Parameters
    ----------
    model : object
        Any picklable object (here: a fitted estimator).
    filename : str or path-like
        Destination path; opened in binary write mode.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(model)
        
def load_model_from_file(filename):
    """Return the object unpickled from ``filename``.

    WARNING: unpickling can execute arbitrary code, so only load files that
    come from a trusted source (e.g. written by ``save_model_to_file``).

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file; opened in binary read mode.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    with open(filename, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

In [5]:
# prep data for training
# Select the model inputs by name instead of by position (iloc[:, 3:10]):
# a positional slice silently picks the wrong columns if the CSV column
# order ever changes or a column is added.
FEATURE_COLUMNS = [
    'readInvariant',
    'readPrefetched',
    'readNonPrefetched',
    'writeInvariant',
    'writePrefetched',
    'writeNonPrefetched',
    'distToDominatingLoop',
]
X = merged_df[FEATURE_COLUMNS]
y = merged_df.tileSize

# train and test set for final evaluation (20% held out, stratified by tile size)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=8)

# train and validation for tuning (30% of the training part used for validation)
X_train_valid, X_test_valid, y_train_valid, y_test_valid = train_test_split(
        X_train, y_train, stratify=y_train, test_size=0.3, random_state=8)

In [6]:
# linear regression

# Non-negative least squares baseline that treats tile size as a number.
lin_reg = LinearRegression(positive=True)
lin_reg.fit(X_train_valid, y_train_valid)
predictions = lin_reg.predict(X_test_valid)
r2 = r2_score(y_test_valid, predictions)
# RMSE computed as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it fails on current releases.
rmse = np.sqrt(mean_squared_error(y_test_valid, predictions))

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)
print(y_test_valid.to_numpy())

The r2 is:  -0.0343478407925073
The rmse is:  91.4363784910055
[ 54.69301694  74.72957281  74.72957281  54.69301694  54.69301694
  73.26799804  63.28065458  39.46176733 119.50227084  63.28065458
  76.19114757  81.37318478  54.69301694  66.14193516  54.69301694
  28.0128491   54.69301694  54.69301694 127.53263397  54.69301694
  74.72957281  61.81907981  66.14193516  79.66390723  54.69301694
  54.69301694  66.14193516  73.26799804  81.37318478 116.08371575
  40.92334209  81.37318478  66.14193516  92.822103    77.59085339
  66.14193516  54.69301694  92.822103    54.69301694  66.14193516
  81.37318478  86.17849103  66.14193516  63.28065458  54.69301694
  54.69301694  54.69301694  81.37318478  92.822103    66.14193516
  54.69301694  40.92334209  28.0128491   66.14193516  66.14193516
 119.50227084 108.05335261  86.17849103  92.822103  ]
[256  64   1  16 256   1   1 128   4   1 128  64   8  64 128 128  16   4
 256 256   4   1   8  16   4   4   1 256  64  32  32   1  16   1  16  32
 256 256  3

In [7]:
# random forest

# 500-tree forest fitted on the tuning-train split; fit() returns the
# estimator itself, so construction and fitting are chained.
forest = RandomForestClassifier(random_state=888, n_estimators=500).fit(
    X_train_valid, y_train_valid)
predictions = forest.predict(X_test_valid)

# accuracy on the validation split, then predicted vs. actual tile sizes
print("Accuracy:", metrics.accuracy_score(y_true=y_test_valid, y_pred=predictions))
print(predictions)
print(y_test_valid.to_numpy())

Accuracy: 0.1864406779661017
[  8  64  64   4   4  16   1   1   4   1   1 256   4   1   4   1   8   4
  32   4  64   1  64 128   8   4  64  16 256  16   4 256   1  16  32   1
   8  16   8  64 256 128  64   1   1   4   4 256  16   1   4   1   1  64
   1  16 256 128  16]
[256  64   1  16 256   1   1 128   4   1 128  64   8  64 128 128  16   4
 256 256   4   1   8  16   4   4   1 256  64  32  32   1  16   1  16  32
 256 256  32   8   1   8   1   1   1 128  16   8   1 128  64  16   1 256
   8  64 256   1   1]


In [8]:
# gradient boosted tree

# 500 depth-1 trees (stumps) with learning rate 0.5, fitted on the
# tuning-train split.
boost = GradientBoostingClassifier(
    random_state=8,
    max_depth=1,
    learning_rate=0.5,
    n_estimators=500,
).fit(X_train_valid, y_train_valid)
predictions = boost.predict(X_test_valid)

# accuracy on the validation split, then predicted vs. actual tile sizes
print("Accuracy:", metrics.accuracy_score(y_true=y_test_valid, y_pred=predictions))
print(predictions)
print(y_test_valid.to_numpy())

Accuracy: 0.22033898305084745
[  1   1   1   1   1   1   1   1   4   1  64  16   1  16   1   1   1   1
  32   1   1   1   1   8   1   1   1   1 256   1   1 256  16  16  16  16
   1  16   1   1 256 128   1   1   1   1   1 256  16  16   1   1   1   1
  16  16   4 128  16]
[256  64   1  16 256   1   1 128   4   1 128  64   8  64 128 128  16   4
 256 256   4   1   8  16   4   4   1 256  64  32  32   1  16   1  16  32
 256 256  32   8   1   8   1   1   1 128  16   8   1 128  64  16   1 256
   8  64 256   1   1]


In [9]:
# support vector machine regression

# SVR with default hyper-parameters, treating tile size as a number.
# NOTE: `support` is rebound to the SVC classifier in the next cell.
support = svm.SVR()
support.fit(X_train_valid, y_train_valid)
predictions = support.predict(X_test_valid)
r2 = r2_score(y_test_valid, predictions)
# RMSE computed as sqrt(MSE): the `squared=False` argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so it fails on current releases.
rmse = np.sqrt(mean_squared_error(y_test_valid, predictions))

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)
print(y_test_valid.to_numpy())

The r2 is:  -0.3011430246068767
The rmse is:  102.5529669686721
[13.3983059  15.90032103 15.90032103 13.69099603 13.69099603 15.21364272
 14.50122708 14.71946951 16.13863795 14.50122708 17.313872   14.52427437
 13.69099603 15.35844292 13.69099603 12.83311413 13.3983059  13.69099603
 16.7452067  13.69099603 15.90032103 13.54747522 15.25857385 13.93373205
 13.3983059  13.69099603 15.25857385 15.21364272 14.67966712 15.64580112
 15.30610971 14.67966712 15.35844292 15.90032085 17.51924266 15.35844292
 13.3983059  16.10001525 13.3983059  15.25857385 14.67966712 17.23331077
 15.25857385 14.50122708 14.12208336 13.69099603 13.69099603 14.67966712
 15.90032085 15.35844292 13.69099603 15.21368496 12.83311413 15.25857385
 15.35844292 16.27733744 15.44558473 17.23331077 16.11136022]
[256  64   1  16 256   1   1 128   4   1 128  64   8  64 128 128  16   4
 256 256   4   1   8  16   4   4   1 256  64  32  32   1  16   1  16  32
 256 256  32   8   1   8   1   1   1 128  16   8   1 128  64  16   1 25

In [10]:
# support vector machine classifier

# class_weight='balanced' reweights classes inversely to their frequency.
support = svm.SVC(random_state=8, class_weight='balanced')
support.fit(X_train_valid, y_train_valid)
predictions = support.predict(X_test_valid)
# The regression metrics (r2 / rmse) copied from the SVR cell were removed:
# they were never printed, are not meaningful for class labels, and the
# `squared=False` call raises on scikit-learn >= 1.6.

print("Accuracy:", metrics.accuracy_score(y_test_valid, predictions))
print(predictions)
print(y_test_valid.to_numpy())

Accuracy: 0.23728813559322035
[ 32  64  64   4   4  16   1  32   4   1  64  16   4   8   4  32  32   4
  16   4  64   1   8  32  32   4   8  16   4  16  64   4   8   4  16   8
  32 256  32   8   4  64   8   1  16   4   4   4   4   8   4  64  32   8
   8  16   4  64  16]
[256  64   1  16 256   1   1 128   4   1 128  64   8  64 128 128  16   4
 256 256   4   1   8  16   4   4   1 256  64  32  32   1  16   1  16  32
 256 256  32   8   1   8   1   1   1 128  16   8   1 128  64  16   1 256
   8  64 256   1   1]


In [11]:
# multi-layer perceptron

# Single hidden layer of 30 units; the high max_iter gives the optimizer
# room to converge.
mlp = MLPClassifier(
    hidden_layer_sizes=(30,),
    max_iter=10000,
    random_state=8,
).fit(X_train_valid, y_train_valid)
predictions = mlp.predict(X_test_valid)

# For a classifier, score() is accuracy_score on predict(); reuse the
# predictions already computed instead of predicting a second time.
print("Accuracy:", metrics.accuracy_score(y_test_valid, predictions))
print(predictions)
print(y_test_valid.to_numpy())

Accuracy: 0.2542372881355932
[  8  16  16   4   4  16   1   1   4   1   1  16   4  16   4   1   8   4
 128   4  16   1   8 128   8   4   8  16 256 128   4 256  16  16  16  16
   8  16   8   8 256   8   8   1   1   4   4 256  16  16   4   1   1   8
  16  16 256   8  16]
[256  64   1  16 256   1   1 128   4   1 128  64   8  64 128 128  16   4
 256 256   4   1   8  16   4   4   1 256  64  32  32   1  16   1  16  32
 256 256  32   8   1   8   1   1   1 128  16   8   1 128  64  16   1 256
   8  64 256   1   1]


## Print test accuracies

In [12]:
# Score every fitted classifier on the held-out test split. `support` is the
# SVC classifier here (the SVR bound to the same name earlier is overwritten).
for label, model in (
    ("Random Forest Accuracy    :", forest),
    ("Grad-boosted Tree Accuracy:", boost),
    ("SVM Accuracy              :", support),
    ("MLP Accuracy              :", mlp),
):
    predictions = model.predict(X_test)
    print(label, metrics.accuracy_score(y_test, predictions))

Random Forest Accuracy    : 0.22
Grad-boosted Tree Accuracy: 0.32
SVM Accuracy              : 0.18
MLP Accuracy              : 0.3


## Save trained models to directory

In [13]:
# # save the best performing models
# # (cell intentionally disabled; uncomment to refit on ALL data and write the pickles)
# forest = RandomForestClassifier(n_estimators=500, random_state=888)
# forest.fit(X,y)
# # NOTE(review): n_estimators=1000 here, but the validated boosted model above used 500 -- confirm which is intended
# boost = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.5, max_depth=1, random_state=8)
# boost.fit(X,y)
# support = svm.SVC(random_state=8, class_weight='balanced')
# support.fit(X,y)
# mlp = MLPClassifier(random_state=8, max_iter=10000, hidden_layer_sizes=(30,))
# mlp.fit(X,y)

# # each file gets its own model (previously `forest` was saved under all four names)
# save_model_to_file(boost, "../models/boosted_tree.pkl")
# save_model_to_file(forest, "../models/rand_forest.pkl")
# save_model_to_file(support, "../models/svm.pkl")
# save_model_to_file(mlp, "../models/mlp.pkl")