In [1]:
import pandas as pd
import numpy as np
import spacy

from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
data_train = pd.read_csv('cve_train.csv')
data_test = pd.read_csv('cve_test.csv')

description_train = data_train['description']
description_test = data_test['description']
cvss_train = data_train['base_score']
cvss_test = data_test['base_score']


In [4]:
%store -r description_train_vectors

In [6]:
sc_desc = StandardScaler()
description_train_vectors = sc_desc.fit_transform(description_train_vectors, cvss_train)


In [14]:
from skopt import BayesSearchCV
opt = BayesSearchCV(
    LinearSVR(dual=False, loss='squared_epsilon_insensitive', max_iter=10000),
    {
        'C': (1e-2, 1e+2, 'log-uniform'),
        'epsilon': (1e-6, 1e+1, 'log-uniform'),  
        'tol': (1e-8, 1e-2, 'log-uniform'),

    },
    n_iter=16,
    cv=3,
    verbose=10,
)

opt.fit(description_train_vectors, cvss_train)

print("val. score: %s" % opt.best_score_)
print("best params: %s" % str(opt.best_params_))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START C=1.6424840082639252, epsilon=9.480102676571966e-06, tol=0.00010565389076571371
[CV 1/3; 1/1] END C=1.6424840082639252, epsilon=9.480102676571966e-06, tol=0.00010565389076571371;, score=0.298 total time=   7.7s
[CV 2/3; 1/1] START C=1.6424840082639252, epsilon=9.480102676571966e-06, tol=0.00010565389076571371
[CV 2/3; 1/1] END C=1.6424840082639252, epsilon=9.480102676571966e-06, tol=0.00010565389076571371;, score=0.287 total time=   8.4s
[CV 3/3; 1/1] START C=1.6424840082639252, epsilon=9.480102676571966e-06, tol=0.00010565389076571371
[CV 3/3; 1/1] END C=1.6424840082639252, epsilon=9.480102676571966e-06, tol=0.00010565389076571371;, score=0.284 total time=   7.6s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START C=23.775476182928614, epsilon=5.635875323709142e-06, tol=0.003847716189917322
[CV 1/3; 1/1] END C=23.775476182928614, epsilon=5.635875323709142e-06, tol=0.00384771618991

In [15]:
from sklearn.svm import LinearSVR

linreg = LinearSVR(C= 0.06518930262914055, epsilon=2.1581620202214426e-06, tol=0.00015386857022078144, max_iter= 10000, dual=False, loss='squared_epsilon_insensitive')

In [16]:
linreg.fit(description_train_vectors, cvss_train)

LinearSVR(C=0.06518930262914055, dual=False, epsilon=2.1581620202214426e-06,
          loss='squared_epsilon_insensitive', max_iter=10000,
          tol=0.00015386857022078144)

In [9]:
%store -r list_description_vec

In [10]:
#scaling test data
list_description_vec = sc_desc.fit_transform(list_description_vec, cvss_test)


In [17]:
cvss_pred = []
for desc_vec in list_description_vec:
    cvss_pred.append(linreg.predict([desc_vec]))

In [18]:
cvss_true = []
for i in range(len(cvss_test)):
    cvss_true.append(cvss_test.iloc[i])

In [21]:
#evaluation
from sklearn import metrics
print("Mean squared error: %.3f" %metrics.mean_squared_error(cvss_true, cvss_pred))
print("Explained variance score (1 is the best): %.3f" %metrics.explained_variance_score(cvss_true, cvss_pred))
print("Max error: %.3f" %metrics.max_error(cvss_true, cvss_pred))
print("Mean absolute percentage error: %.3f" %metrics.mean_absolute_percentage_error(cvss_true, cvss_pred))

R2 = linreg.score(list_description_vec, cvss_test)
print("R squared: %.3f" %R2)


Mean squared error: 1.888
Explained variance score (1 is the best): 0.302
Max error: 9.786
Mean absolute percentage error: 0.169
R squared: 0.302


In [20]:
for i in range(0, 20):
    print(cvss_pred[i])
    print(cvss_true[i])
    print()

[7.69888219]
7.5

[7.58447715]
7.8

[7.91776061]
8.8

[3.8901966]
4.3

[5.51097977]
5.5

[7.07508075]
7.0

[7.99662996]
4.7

[7.46021527]
9.8

[6.27587311]
5.5

[7.51795919]
9.1

[8.09886167]
9.8

[7.15293171]
7.2

[5.39200976]
4.7

[7.95976935]
7.8

[5.82185106]
4.8

[7.66412367]
8.8

[6.96079965]
7.8

[7.15364847]
8.4

[5.79968342]
8.7

[6.27082359]
7.3

