In [1]:
import os
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import svm
from sklearn.neural_network import MLPClassifier

import pickle

import seaborn as sns
import matplotlib.pyplot as plt

## Process data

In [2]:
def process_data(features_path, runtimes_path):

    features_df = pd.read_csv(features_path)
    runtimes_df = pd.read_csv(runtimes_path)

    # clean up some generation artifacts and get average runtimes
    runtimes_df['uniqueFilename'] = runtimes_df.uniqueFilename.str.rstrip('.out')
    runtimes_df = runtimes_df[runtimes_df.uniqueFilename.str.endswith("_0")==False]
    runtimes_df['avgRuntime'] = runtimes_df[['run1','run2','run3','run4']].mean(axis=1)
    features_df = features_df[features_df.uniqueFilename.str.endswith("_0")==False]
    features_df = features_df[features_df.uniqueFilename.str.endswith("_0")==False]

    # merge the features and average runtimes together
    merged_df = features_df.merge(runtimes_df[['uniqueFilename', 'avgRuntime']])

    # get the fastest tile size for each unique loop
    merged_df['uniqueLoopId'] = merged_df.uniqueFilename.str.split(pat='_').str[:3].str.join('_')
    merged_df = merged_df.sort_values(['uniqueLoopId','avgRuntime'],ascending=True).groupby('uniqueLoopId').head(1)

    # drop programs that predict tile size 0
    # merged_df = merged_df[merged_df.tileSize != 1]

    # keep only programs that tile the innermost loop
    # merged_df = merged_df[(merged_df.distToDominatingLoop == 1) | (merged_df.distToDominatingLoop == 2)]
    merged_df = merged_df[(merged_df.distToDominatingLoop == 1)]

    merged_df.info()
    merged_df.groupby('tileSize').count()
    # print(plot_data.isnull().sum())
    # sns.pairplot(merged_df.iloc[:, 3:10])
    # plt.show()
    
    return merged_df

In [3]:
!ls ../tiled_polybench/features.csv
!ls ../tiled_polybench/runtimes.csv
!ls ../tiled_polybench_lin_alg/features.csv
!ls ../tiled_polybench_lin_alg/runtimes.csv

# features_path = os.path.expanduser("../tiled_polybench_lin_alg/features.csv")
# runtimes_path = os.path.expanduser("../tiled_polybench_lin_alg/runtimes.csv")
features_path = os.path.expanduser("../tiled_polybench/features.csv")
runtimes_path = os.path.expanduser("../tiled_polybench/runtimes_trimmed.csv")

merged_df = process_data(features_path, runtimes_path)

../tiled_polybench/features.csv
../tiled_polybench/runtimes.csv
../tiled_polybench_lin_alg/features.csv
../tiled_polybench_lin_alg/runtimes.csv
<class 'pandas.core.frame.DataFrame'>
Int64Index: 117 entries, 598 to 856
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   uniqueFilename        117 non-null    object 
 1   rootFilename          117 non-null    object 
 2   tileSize              117 non-null    int64  
 3   readInvariant         117 non-null    int64  
 4   readPrefetched        117 non-null    int64  
 5   readNonPrefetched     117 non-null    int64  
 6   writeInvariant        117 non-null    int64  
 7   writePrefetched       117 non-null    int64  
 8   writeNonPrefetched    117 non-null    int64  
 9   distToDominatingLoop  117 non-null    int64  
 10  avgRuntime            117 non-null    float64
 11  uniqueLoopId          117 non-null    object 
dtypes: float64(1), int64(8), objec

## Train models

In [4]:
def save_model_to_file(model, filename):
    with open(filename, 'wb') as file:
        pickle.dump(model, file)
        
def load_model_from_file(filename):
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [5]:
# prep data for training
X = merged_df.iloc[:, 3:10]
#X = (X-X.mean())/X.std()
y = merged_df.tileSize

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=1)

In [6]:
# linear regression

lin_reg = LinearRegression(positive=True)
lin_reg.fit(X_train,y_train)
predictions = lin_reg.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)
print(y_test.to_numpy())

The r2 is:  0.11846255593325383
The rmse is:  87.87142011246108
[ 59.73764468  59.73764468  37.95173214  41.80225478  59.73764468
  59.73764468 103.30946975  59.73764468  59.73764468  59.73764468
  59.73764468  59.73764468  59.73764468  37.95173214 103.30946975
  37.95173214  56.28078819 103.30946975  52.43026554  59.73764468
  59.73764468  59.73764468  70.36565543  59.73764468]
[  8  16   1   1 256   1  64  32   1  64 256 128   4   1 128   1 256 256
   1  16   4   8 128   1]


In [8]:
# random forest

forest = RandomForestClassifier(n_estimators=500, random_state=888)
forest.fit(X_train,y_train)
predictions = forest.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, predictions))
print(predictions)
print(y_test.to_numpy())

runtimes_path = os.path.expanduser("../tiled_polybench/runtimes_trimmed.csv")
save_model_to_file(forest, "./models/rand_forest.pkl")

Accuracy: 0.3333333333333333
[256   8   1   1 256   1   4 256 256 256   8 256 256   1  64   1 128   4
   1   8   8 256   1   1]
[  8  16   1   1 256   1  64  32   1  64 256 128   4   1 128   1 256 256
   1  16   4   8 128   1]


FileNotFoundError: [Errno 2] No such file or directory: './models/rand_forest.pkl'

In [None]:
# support vector machine regression

support = svm.SVR()
support.fit(X_train,y_train)
predictions = support.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)

print('The r2 is: ', r2)
print('The rmse is: ', rmse)
print(predictions)
print(y_test.to_numpy())

In [None]:
# support vector machine classifier

support = svm.SVC(random_state=8, class_weight='balanced')
support.fit(X_train,y_train)
predictions = support.predict(X_test)
r2 = r2_score(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)

print("Accuracy:", metrics.accuracy_score(y_test, predictions))
print(predictions)
print(y_test.to_numpy())

save_model_to_file(forest, "models/svm.pkl")

In [None]:
# multi-layer perceptron

mlp = MLPClassifier(random_state=8, max_iter=10000, hidden_layer_sizes=(30,))
mlp.fit(X_train,y_train)
predictions = mlp.predict(X_test)

print("Accuracy:", mlp.score(X_test, y_test))
print(predictions)
print(y_test.to_numpy())

save_model_to_file(forest, "models/mlp.pkl")