In [1]:
import pandas as pd
import numpy as np
import spacy

from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
data_train = pd.read_csv('cve_train.csv')
data_test = pd.read_csv('cve_test.csv')

description_train = data_train['description']
description_test = data_test['description']
cvss_train = data_train['base_score']
cvss_test = data_test['base_score']


In [4]:
%store -r description_train_vectors

In [5]:
sc_desc = StandardScaler()
description_train_vectors = sc_desc.fit_transform(description_train_vectors, cvss_train)


In [7]:
from sklearn.linear_model import SGDRegressor
print(SGDRegressor().get_params())

{'alpha': 0.0001, 'average': False, 'early_stopping': False, 'epsilon': 0.1, 'eta0': 0.01, 'fit_intercept': True, 'l1_ratio': 0.15, 'learning_rate': 'invscaling', 'loss': 'squared_error', 'max_iter': 1000, 'n_iter_no_change': 5, 'penalty': 'l2', 'power_t': 0.25, 'random_state': None, 'shuffle': True, 'tol': 0.001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}


In [8]:
from skopt import BayesSearchCV

opt = BayesSearchCV(
    SGDRegressor(max_iter=10000),
    {
        'alpha': (1e-8,1e-2, 'log-uniform'),
        'loss': (['squared_error', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']),
        
        'epsilon': (1e-4, 1e+1, 'log-uniform'),  
        'tol': (1e-7, 1e-1, 'log-uniform'),

    },
    n_iter=16,
    cv=3,
    verbose=10,
)

opt.fit(description_train_vectors, cvss_train)

print("val. score: %s" % opt.best_score_)
print("best params: %s" % str(opt.best_params_))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START alpha=0.002692356745318716, epsilon=0.010383500991809656, loss=huber, tol=0.0003973595125077402
[CV 1/3; 1/1] END alpha=0.002692356745318716, epsilon=0.010383500991809656, loss=huber, tol=0.0003973595125077402;, score=-0.163 total time=   3.0s
[CV 2/3; 1/1] START alpha=0.002692356745318716, epsilon=0.010383500991809656, loss=huber, tol=0.0003973595125077402
[CV 2/3; 1/1] END alpha=0.002692356745318716, epsilon=0.010383500991809656, loss=huber, tol=0.0003973595125077402;, score=-0.156 total time=   2.5s
[CV 3/3; 1/1] START alpha=0.002692356745318716, epsilon=0.010383500991809656, loss=huber, tol=0.0003973595125077402
[CV 3/3; 1/1] END alpha=0.002692356745318716, epsilon=0.010383500991809656, loss=huber, tol=0.0003973595125077402;, score=-0.160 total time=   2.5s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START alpha=5.143131340991635e-05, epsilon=0.3713609179348028, loss=huber, t

In [9]:

sgdreg = SGDRegressor(alpha=5.143131340991635e-05, epsilon=0.3713609179348028, loss='huber', tol=1.5938951157910062e-07, max_iter= 10000)

In [10]:
sgdreg.fit(description_train_vectors, cvss_train)

SGDRegressor(alpha=5.143131340991635e-05, epsilon=0.3713609179348028,
             loss='huber', max_iter=10000, tol=1.5938951157910062e-07)

In [11]:
%store -r list_description_vec

In [12]:
#scaling test data
list_description_vec = sc_desc.fit_transform(list_description_vec, cvss_test)


In [14]:
cvss_pred = []
for desc_vec in list_description_vec:
    cvss_pred.append(sgdreg.predict([desc_vec]))

In [15]:
cvss_true = []
for i in range(len(cvss_test)):
    cvss_true.append(cvss_test.iloc[i])

In [18]:
#evaluation
from sklearn import metrics
print("Mean squared error: %.3f" %metrics.mean_squared_error(cvss_true, cvss_pred))
print("Explained variance score (1 is the best): %.3f" %metrics.explained_variance_score(cvss_true, cvss_pred))
print("Max error: %.3f" %metrics.max_error(cvss_true, cvss_pred))
print("Mean absolute percentage error: %.3f" %metrics.mean_absolute_percentage_error(cvss_true, cvss_pred))

R2 = sgdreg.score(list_description_vec, cvss_test)
print("R squared: %.3f" %R2)


Mean squared error: 1.952
Explained variance score (1 is the best): 0.279
Max error: 14.952
Mean absolute percentage error: 0.169
R squared: 0.278
