In [1]:
import os
import sys

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import spacy
import seaborn as sns

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler



In [2]:
nlp = spacy.load("en_core_web_lg")

In [3]:
data_train = pd.read_csv('cve_train.csv')
data_test = pd.read_csv('cve_test.csv')

description_train = data_train['description']
description_test = data_test['description']
cvss_train = data_train['base_score']
cvss_test = data_test['base_score']


In [8]:
#transform tokens of a description and summarize them
vectors = description_train.apply(lambda row: sum([token.vector for token in nlp(row)]))
print(vectors.shape)

(61616,)


In [9]:
description_train_vectors = list(vectors)


In [10]:
%store description_train_vectors

Stored 'description_train_vectors' (list)


In [5]:
%store -r description_train_vectors

In [6]:
sc_desc = StandardScaler()
description_train_vectors = sc_desc.fit_transform(description_train_vectors, cvss_train)

print(description_train_vectors[0])

[ 0.14649557 -0.55227798  0.90057199 -1.2803768   0.0576298  -0.47833431
 -0.22835618  0.36439333  0.00328053 -0.85903832  0.62636007  0.00451147
 -0.42435703 -0.73285399  0.77436626  0.67126756  0.96820716 -0.84549552
  0.56878143  0.3666353   0.580081   -0.31789413  0.72521085  0.29677413
 -0.15000359 -0.31159218  0.24030109  0.95618542 -0.23835819  0.39491722
 -0.24082956 -0.0341159   0.75534603 -0.52700404 -0.55173798  0.46733022
  0.40960478 -0.29781466 -1.07355784  0.60805827  0.11033828  0.10402227
  0.4670759   0.67419648  0.89649253 -0.749397    0.32171477 -0.56424338
  0.22245114 -0.08143247 -0.39916558  0.34185232  0.49286773 -0.07949017
  0.7608605   0.00264919  0.47645816  0.72439171  0.54586855  1.0372079
 -0.06666585 -0.82707155  0.41600241 -0.39324226 -0.81003632  0.0590296
 -0.64041     0.28927566 -0.62875011 -0.6278614  -1.08536251 -0.53952493
 -0.91877433  0.63108185 -0.94669088  0.47865593 -0.90043977  0.90849993
  0.82597314 -0.63455542  0.66652547 -0.1929344   0.8

In [27]:
from sklearn.svm import LinearSVR

linreg = LinearSVR(C= 0.03850588924849597, epsilon=0.031167867862421057, random_state=0, tol=4.305645458379762e-06, max_iter= 10000, dual=False, loss='squared_epsilon_insensitive')

In [28]:
linreg.fit(description_train_vectors, cvss_train)

LinearSVR(C=0.03850588924849597, dual=False, epsilon=0.031167867862421057,
          loss='squared_epsilon_insensitive', max_iter=10000, random_state=0,
          tol=4.305645458379762e-06)

In [17]:
from sklearn.linear_model import SGDRegressor
sgdreg = SGDRegressor(max_iter=1000, tol=1e-3)
sgdreg.fit(description_train_vectors, cvss_train)

SGDRegressor()

In [41]:
print(SVR().get_params())

{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [24]:
print(LinearSVR().get_params())

{'C': 1.0, 'dual': True, 'epsilon': 0.0, 'fit_intercept': True, 'intercept_scaling': 1.0, 'loss': 'epsilon_insensitive', 'max_iter': 1000, 'random_state': None, 'tol': 0.0001, 'verbose': 0}


In [7]:
subset_train_X = description_train_vectors[:10000]
subset_train_y = cvss_train[:10000]

In [40]:
from skopt import BayesSearchCV
opt = BayesSearchCV(
    SVR(),
    {
        'kernel': (['linear', 'rbf']),
        'C': (1e-2, 1e+2, 'log-uniform'),
        'epsilon': (1e-6, 1e+1, 'log-uniform'),  
        'tol': (1e-6, 1e-2, 'log-uniform'),

    },
    n_iter=10,
    cv=3,
    verbose=10,
)

opt.fit(subset_train_X, subset_train_y)

print("val. score: %s" % opt.best_score_)
print("best params: %s" % str(opt.best_params_))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START C=87.34956711284099, epsilon=0.46954582109726767, kernel=rbf, tol=0.0025029997077590125
[CV 1/3; 1/1] END C=87.34956711284099, epsilon=0.46954582109726767, kernel=rbf, tol=0.0025029997077590125;, score=0.309 total time=  15.1s
[CV 2/3; 1/1] START C=87.34956711284099, epsilon=0.46954582109726767, kernel=rbf, tol=0.0025029997077590125
[CV 2/3; 1/1] END C=87.34956711284099, epsilon=0.46954582109726767, kernel=rbf, tol=0.0025029997077590125;, score=0.332 total time=  14.8s
[CV 3/3; 1/1] START C=87.34956711284099, epsilon=0.46954582109726767, kernel=rbf, tol=0.0025029997077590125
[CV 3/3; 1/1] END C=87.34956711284099, epsilon=0.46954582109726767, kernel=rbf, tol=0.0025029997077590125;, score=0.347 total time=  14.0s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START C=0.24386711948308082, epsilon=0.010369041327770417, kernel=rbf, tol=0.00025608351102980955
[CV 1/3; 1/1] END C=0.2438671

In [9]:
reg = SVR(C=87.34956711284099, epsilon=0.46954582109726767, kernel='rbf', tol= 0.0025029997077590125)

In [36]:
subset_train_X_fit = description_train_vectors[:45000]
subset_train_y_fit = cvss_train[:45000]

In [37]:
reg.fit(subset_train_X_fit, subset_train_y_fit)

In [14]:
#put test data into a list
list_description = []
for i in range(len(description_test)):
    list_description.append(description_test.iloc[i])

In [15]:
list_description_vec = []
for desc in list_description:
    list_description_vec.append(sum([token.vector for token in nlp(desc)]))

In [16]:
%store list_description_vec

Stored 'list_description_vec' (list)


In [28]:
%store -r list_description_vec

In [29]:
#scaling test data
list_description_vec = sc_desc.fit_transform(list_description_vec, cvss_test)


In [30]:
subset_test_X = list_description_vec[:6000] 

In [31]:
cvss_pred = []
for desc_vec in subset_test_X:
    cvss_pred.append(reg.predict([desc_vec]))

In [32]:
subset_test_y = cvss_test[:6000]

In [33]:
cvss_true = []
for i in range(len(subset_test_y)):
    cvss_true.append(subset_test_y.iloc[i])

In [34]:
#evaluation
from sklearn import metrics
print("Mean squared error: %.3f" %metrics.mean_squared_error(cvss_true, cvss_pred))
print("Explained variance score (1 is the best): %.3f" %metrics.explained_variance_score(cvss_true, cvss_pred))
print("Max error: %.3f" %metrics.max_error(cvss_true, cvss_pred))
print("Mean absolute percentage error: %.3f" %metrics.mean_absolute_percentage_error(cvss_true, cvss_pred))

R2 = reg.score(subset_test_X, subset_test_y)
print("R squared: %.3f" %R2)


Mean squared error: 1.511
Explained variance score (1 is the best): 0.444
Max error: 5.117
Mean absolute percentage error: 0.139
R squared: 0.444


In [35]:
for i in range(0, 20):
    print(cvss_pred[i])
    print(cvss_true[i])
    print()

[7.62601486]
7.5

[8.03640233]
7.8

[8.91833507]
8.8

[4.38993263]
4.3

[4.88315988]
5.5

[7.11599151]
7.0

[7.60036377]
4.7

[6.66141597]
9.8

[5.78100234]
5.5

[8.66549537]
9.1

[9.32605834]
9.8

[5.61558254]
7.2

[4.21811144]
4.7

[8.32114055]
7.8

[6.91011473]
4.8

[8.41492193]
8.8

[7.17709375]
7.8

[5.63582432]
8.4

[6.8942281]
8.7

[5.85222489]
7.3



In [None]:
import pickle
pickle.dump(reg, open('svm_model', 'wb'))