In [1]:
%run ml_header.py

# Get the data

In [None]:
%run get_data.py
# Benchmark names. Later cells use them to index gd.data_v4 and to drive the
# per-benchmark loops.
benchmarks = ('adpcm_encoder', 'average', 'fir')
# GetData presumably comes from get_data.py (run just above). 'data/ES' is a
# relative path, so it resolves against the kernel's working directory.
gd = GetData('data/ES', benchmarks, load_fpga_v5=False)
gd.main()
# Last expression of the cell: show the keys of gd.data_v4, which later cells
# index by benchmark name.
gd.data_v4.keys()

# Data analysis

In [None]:
%run methods.py

In [None]:
# Run DirectMapping on every benchmark and keep each result keyed by benchmark
# name; the held-out evaluation cell later prints these for comparison.
results_direct_mapping = {}
for name in benchmarks:
    print(name)
    mapping_result = DirectMapping.main(gd.data_v4[name], plot_figure=True, display_table=True)
    results_direct_mapping[name] = mapping_result
    print_results(mapping_result)
    print('=' * 40)

## Machine-learning method

In [None]:
%run machine_learning.py

In [None]:
# Every column pulled from the benchmark frames, including the target.
# ('CP_delay' is deliberately left out.)
features = [
    'AREA', 'state', 'FU', 'REG', 'MUX', 'DEC', 'pin_pair',
    'net', 'max', 'min', 'ave', 'MISC', 'MEM', 'sim', 'Pmax',
    'Pmin', 'Pave', 'Latency', 'BlockMemoryBit', 'DSP', 'Slices',
]

# Target column the models are trained to predict.
label = 'Slices'

# Columns that must never be used as model inputs.
invalid_features = ['Slices', 'Latency']

# Model inputs: everything in `features` that is not excluded, original order kept.
valid_features = list(filter(lambda name: name not in invalid_features, features))

In [None]:
# Stack the training benchmarks into a single frame with a fresh 0..n-1 index,
# keep only the columns listed in `features`, then look at how 'Latency' is
# distributed.
benchmark_train = ('fir', 'average')
frames_train = [gd.data_v4[name] for name in benchmark_train]
combined_train = pd.concat(frames_train, axis=0, ignore_index=True)
data_train = combined_train[features]
latency = data_train['Latency']
latency.hist(bins=50)
plt.show()

In [None]:
# fix missing data
# NOTE(review): rebinds data_train to whatever ML.fix_missing_data returns; how
# the helper treats missing values is not visible here — see machine_learning.py.
data_train = ML.fix_missing_data(data_train)
# Peek at the first two rows of the result.
display(data_train.head(2))
# Column dtypes and non-null counts of the result.
data_train.info()

In [None]:
# X, y
# Split the combined training frame into model inputs X and target y.
# NOTE(review): presumably X holds every column except `invalid_features` and y
# is the `label` column — confirm against ML.separate_feature_label.
X, y = ML.separate_feature_label(data_train, invalid_features=invalid_features, label=label)

In [None]:
# feature scaling
# NOTE(review): X is rebound to the helper's output, so re-running only this
# cell feeds already-scaled data back into ML.feature_scaling; re-run the
# "X, y" cell first. The later X[train_indices] indexing suggests the result is
# array-like — confirm in machine_learning.py.
X = ML.feature_scaling(X)
print(X.shape)
# Last expression of the cell: display the scaled feature matrix.
X

In [None]:
# feature importance
# Rank the input features by the importances of a random forest fitted on the
# full training data. random_state is pinned (42, like the other estimators in
# this notebook) so the ranking is reproducible across kernel restarts.
estimator = RandomForestRegressor(random_state=42)
estimator.fit(X, y)
importances = estimator.feature_importances_
# NOTE(review): assumes the columns of X are in the same order as
# valid_features — confirm against ML.separate_feature_label.
for idx in np.argsort(importances)[::-1]:  # most important first
    print('{}: {:7.3}'.format(valid_features[idx], importances[idx]))

In [None]:
%%time
# Learning curve for gradient boosting on the full training data.
# The title names the estimator that is actually fitted below, so the figure is
# not mislabelled.
title = 'Learning Curve (Gradient Boosting)'
# 100 random 80/20 splits; the fixed random_state keeps the splits the same
# from run to run.
cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0)
# NOTE: this rebinds the notebook-global `estimator`; the later
# `estimator.fit(X, y)` cell trains this gradient-boosting model.
estimator = ensemble.GradientBoostingRegressor(random_state=42)
# ylim zooms the score axis to 0.9-1.01.
scores = ML.plot_learning_curve(estimator, title, X, y, ylim=(0.9, 1.01), cv=cv, n_jobs=4)
plt.show()
print(scores)

In [None]:
# NOTE(review): presumably splits data_train so that both parts follow the
# 'Latency' distribution, with 1100 rows in train_set — confirm in
# ML.split_data. test_set is not referenced again in the cells visible here.
train_set, test_set = ML.split_data(data_train, distribution=data_train['Latency'], train_size=1100)

In [None]:
# Rebuild X / y from the training split only. The excluded columns and the
# target come from the shared `invalid_features` / `label` settings rather than
# repeated string literals, so this cell cannot drift from the rest of the
# notebook if those settings are edited.
X, y = ML.separate_feature_label(train_set, invalid_features=invalid_features, label=label)
print(X.shape)
# Last expression of the cell: show the first two feature rows.
X.head(2)

In [None]:
# Scale the training-split features.
# NOTE(review): X is rebound to the helper's output, so re-running only this
# cell passes already-scaled data back into ML.feature_scaling; re-run the
# preceding separate_feature_label cell first.
X = ML.feature_scaling(X)

In [None]:
# Train on the scaled training split. `estimator` is whatever the most recently
# executed cell bound to that name — in top-to-bottom order, the
# GradientBoostingRegressor created in the learning-curve cell.
estimator.fit(X, y)

In [None]:
# Evaluate the fitted estimator on every benchmark that was not used for training.
benchmark_test = [name for name in benchmarks if name not in benchmark_train]
for name in benchmark_test:
    data_test = ML.fix_missing_data(gd.data_v4[name][features])
    # Same keyword arguments as every other separate_feature_label call in this
    # notebook. Test data gets its own names (X_test / y_test) so the training
    # X / y held in the notebook namespace are not silently overwritten.
    X_test, y_test = ML.separate_feature_label(data_test, invalid_features=invalid_features, label=label)
    # NOTE(review): if ML.feature_scaling fits its scaler on the data it is
    # given, the test set is scaled with different statistics than the training
    # set — confirm in machine_learning.py.
    X_test = ML.feature_scaling(X_test)
    y_pred = estimator.predict(X_test)
    # Replace 'AREA' with the model's prediction before re-running DirectMapping.
    # assign() returns a new frame instead of writing into one that may be
    # derived from gd.data_v4.
    data_test = data_test.assign(AREA=y_pred)
    print(name)
    display(data_test.head(2))
    # DirectMapping on the predicted 'AREA' first ...
    print_results(DirectMapping.main(data_test, plot_figure=True))
    print()
    # ... then the result stored earlier for the original data, for comparison.
    print_results(results_direct_mapping[name])

## Use regularization

In [None]:
%%time
# Sweep the regularization strength and record the mean train / validation
# score at each alpha.
alphas = np.arange(0.1, 10.1, 0.1)
scores_train = []
scores_test = []
# Excluded columns and target come from the shared notebook settings.
X, y = ML.separate_feature_label(data_train, invalid_features=invalid_features, label=label)
X = ML.feature_scaling(X)

# ShuffleSplit yields positional row indices. Plain arrays guarantee that X and
# y are both indexed by position; a pandas Series indexed with an integer array
# would be looked up by label instead.
X_values = np.asarray(X)
y_values = np.asarray(y)

# With a fixed random_state the splitter produces the same 5 splits every time,
# so build them once instead of once per alpha.
cv_splits = list(ShuffleSplit(n_splits=5, random_state=42).split(X_values))

for alpha in alphas:
    # Alternative regularized models (swap in to compare):
    #     estimator = linear_model.Lasso(alpha=alpha)
    #     estimator = linear_model.ElasticNet(alpha=alpha)
    estimator = linear_model.Ridge(alpha=alpha)
    scores_cv_train, scores_cv_test = [], []
    # cross-validation using shuffle split
    for train_indices, test_indices in cv_splits:
        X_fit, y_fit = X_values[train_indices], y_values[train_indices]
        estimator.fit(X_fit, y_fit)
        scores_cv_train.append(estimator.score(X_fit, y_fit))
        scores_cv_test.append(estimator.score(X_values[test_indices], y_values[test_indices]))
    scores_train.append(np.mean(scores_cv_train))
    scores_test.append(np.mean(scores_cv_test))

In [None]:
# Mean train vs. validation score for each alpha from the sweep cell.
plt.plot(alphas, scores_train, 'o-', color='r', label='Training score')
plt.plot(alphas, scores_test, 'o-', color='g', label='Cross-validation score')
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel('alpha')
plt.ylabel('Mean score over CV splits')
plt.title('Score vs. regularization strength')
plt.legend(loc='best')
plt.show()

In [None]:
# Per-split cross-validation scores (10 shuffle splits, fixed seed) for
# BayesianRidge on the current X, y, sorted ascending for display.
np.sort(cross_val_score(linear_model.BayesianRidge(), X, y, cv=ShuffleSplit(n_splits=10, random_state=42)))

## Model analysis

Perform cross-validation on the estimators using all the benchmarks to find the best estimator for the prediction problem.

In [None]:
# Candidate regressors compared in the cross-validation and feature-selection
# cells. Every estimator that accepts a seed here is pinned to random_state=42.
estimators = (
    linear_model.LinearRegression(),
    linear_model.Lasso(),
    linear_model.ElasticNet(),
    # Module-qualified like its siblings (and like the regularization sweep), so
    # the cell does not depend on a bare `Ridge` name being imported somewhere.
    linear_model.Ridge(),
    svm.LinearSVR(random_state=42),
    svm.SVR(kernel='linear'),
    tree.DecisionTreeRegressor(random_state=42),
    ensemble.RandomForestRegressor(random_state=42),
    ensemble.AdaBoostRegressor(random_state=42),
    ensemble.GradientBoostingRegressor(random_state=42),
)

In [None]:
%%time
# Cross-validate every candidate estimator on each benchmark's own data, then
# summarise per estimator as (mean, std) of the per-benchmark mean scores.
#
# Previously the inner loop never used `benchmark`: it scored the same leftover
# global X, y once per benchmark, so the reported std was always 0.

# Prepare each benchmark's features / target once; they do not depend on the
# estimator. Dedicated names keep the notebook-global X / y untouched.
xy_by_benchmark = {}
for benchmark in benchmarks:
    data_bench = ML.fix_missing_data(gd.data_v4[benchmark][features])
    X_bench, y_bench = ML.separate_feature_label(data_bench, invalid_features=invalid_features, label=label)
    xy_by_benchmark[benchmark] = (ML.feature_scaling(X_bench), y_bench)

# A fixed random_state gives every estimator the same 10 splits.
cv = ShuffleSplit(n_splits=10, random_state=42)

scores_cv = dict()
for estimator in estimators:
    scores = list()
    for benchmark in benchmarks:
        X_bench, y_bench = xy_by_benchmark[benchmark]
        score_cv = cross_val_score(estimator, X_bench, y_bench, cv=cv)
        scores.append(np.mean(score_cv))
    # Keyed by estimator class; each class appears once in `estimators`.
    scores_cv[estimator.__class__] = (np.mean(scores), np.std(scores))

In [None]:
# Display the mapping {estimator class: (np.mean(scores), np.std(scores))}
# built by the cross-validation cell.
scores_cv

## Feature selection

In [None]:
%%time
# Recursive feature elimination with 10-fold cross-validation, run for every
# candidate estimator on every benchmark. Prints the names of the features each
# fitted selector keeps.
feature_names = np.array(valid_features)
for estimator in estimators:
    # To try a single model, replace the loop variable with e.g.:
    # estimator = ensemble.GradientBoostingRegressor(random_state=42)
    print(estimator.__class__)
    selector = RFECV(estimator, step=1, cv=10)
    for benchmark in benchmarks:
        data = ML.fix_missing_data(gd.data_v4[benchmark][features])
        X, y = ML.separate_feature_label(data, invalid_features=invalid_features, label=label)
        X = ML.feature_scaling(X)
        # fit() returns the selector itself, so there is no need to rebind it.
        selector.fit(X, y)
        # Benchmark name, kept feature names, then a blank separator line.
        print(benchmark, feature_names[selector.support_], '', sep='\n')