In [1]:
import os
import sys

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import spacy
import seaborn as sns

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler



In [3]:
nlp = spacy.load("en_core_web_lg")

In [7]:
data_train = pd.read_csv('cve_train.csv')
data_test = pd.read_csv('cve_test.csv')

In [8]:
#60000 too much data for SVR

#taking 10000 sample for parameter tuning
#taking 30000 sample for the model


#stratifying subset 

from scipy.stats import gaussian_kde

def samplestrat(df, stratifying_column_name, num_to_sample, maxrows_to_est = 10000, bw_per_range = 50, eval_points = 1000 ):
    '''Take a sample of dataframe df stratified by stratifying_column_name
    '''
    strat_col_values = df[stratifying_column_name].values
    samplcol = (df.sample(maxrows_to_est)  if df.shape[0] > maxrows_to_est else df  )[stratifying_column_name].values
    vmin, vmax = min(samplcol), max(samplcol)
    pts = np.linspace(vmin,vmax  ,eval_points) 
    kernel = gaussian_kde( samplcol , bw_method = float(  (vmax - vmin)/bw_per_range  )   )
    density_estim_full = np.interp(strat_col_values, pts , kernel.evaluate(pts) )
    return df.sample(n=num_to_sample, weights = 1/(density_estim_full))

df_stratified_sample_par = samplestrat(data_train, 'base_score', 10000)

df_stratified_sample_train = samplestrat(data_train, 'base_score', 30000)
df_stratified_sample_test = samplestrat(data_test, 'base_score', 6000)

In [9]:
data_par_desc = df_stratified_sample_par['description']
data_par_label = df_stratified_sample_par['base_score']

description_train = df_stratified_sample_train['description']
description_test = df_stratified_sample_test['description']
cvss_train = df_stratified_sample_train['base_score']
cvss_test = df_stratified_sample_test['base_score']


In [10]:
#transform tokens of a description and summarize them
vectors = description_train.apply(lambda row: sum([token.vector for token in nlp(row)]))
print(vectors.shape)

(30000,)


In [11]:
description_train_vectors = list(vectors)


In [12]:
%store description_train_vectors

Stored 'description_train_vectors' (list)


In [21]:
%store -r description_train_vectors

In [13]:
sc_desc = StandardScaler()
description_train_vectors = sc_desc.fit_transform(description_train_vectors, cvss_train)

print(description_train_vectors[0])

[ 0.1245122  -0.25801587 -0.10209522 -0.01510228  0.04310671 -0.19506345
 -0.61113676  0.20186753  0.04483638 -0.31447435  0.4451127  -0.07482078
 -0.38495869 -0.06195115  0.42013075 -1.08013211  0.67138999 -0.53271895
 -0.2137922   0.62954027 -0.06912114  0.05011227  0.10429723 -0.41640463
 -0.68656238  0.18970473 -0.10256787 -0.1896575   0.03612746 -0.92586813
  0.40986534 -0.26668716  0.53808125  0.07132054  0.08055102  0.04999981
  0.33535083 -1.04572235 -0.79976425  0.76547732  0.63722697 -1.22760789
 -0.18567674  0.65270683 -0.04576949 -0.56779373 -0.02167229 -1.33686556
  0.66972635 -0.61200699  0.3391031  -0.77737214 -0.05992034 -0.0271695
 -0.12678932  0.55201463  0.18100628 -0.37521987 -1.01813239  0.64420233
  0.31927977 -0.28655715  0.5099606  -0.06361728 -0.77188526 -0.0103316
  0.35185937  0.40612761  0.38422565 -0.3934119  -0.17132857  0.0831471
 -0.26456515  0.24031414 -0.06194476 -0.2786562   0.44771887  0.85489662
  0.37155195 -0.581639    0.34777811  0.13862769  0.21

In [15]:
vectors_par = data_par_desc.apply(lambda row: sum([token.vector for token in nlp(row)]))
data_par_desc = list(vectors_par)

In [16]:
sc_desc = StandardScaler()
data_par_desc = sc_desc.fit_transform(data_par_desc, data_par_label)

In [24]:
print(SVR().get_params())

{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [17]:
from skopt import BayesSearchCV
opt = BayesSearchCV(
    SVR(kernel='rbf'),
    {
        #'kernel': (['linear', 'rbf']),
        'C': (1e-2, 1e+2, 'log-uniform'),
        'epsilon': (1e-6, 1e+1, 'log-uniform'),  
        'tol': (1e-6, 1e-2, 'log-uniform'),

    },
    n_iter=10,
    cv=3,
    verbose=10,
)

opt.fit(data_par_desc, data_par_label)

print("val. score: %s" % opt.best_score_)
print("best params: %s" % str(opt.best_params_))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START C=0.3644681309334306, epsilon=0.00016794683823041705, tol=0.0036783764167289982
[CV 1/3; 1/1] END C=0.3644681309334306, epsilon=0.00016794683823041705, tol=0.0036783764167289982;, score=0.296 total time=  15.9s
[CV 2/3; 1/1] START C=0.3644681309334306, epsilon=0.00016794683823041705, tol=0.0036783764167289982
[CV 2/3; 1/1] END C=0.3644681309334306, epsilon=0.00016794683823041705, tol=0.0036783764167289982;, score=0.330 total time=  17.0s
[CV 3/3; 1/1] START C=0.3644681309334306, epsilon=0.00016794683823041705, tol=0.0036783764167289982
[CV 3/3; 1/1] END C=0.3644681309334306, epsilon=0.00016794683823041705, tol=0.0036783764167289982;, score=0.245 total time=  15.1s
Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV 1/3; 1/1] START C=0.08022049287843402, epsilon=5.2194449863061625e-06, tol=5.410355643682597e-05
[CV 1/3; 1/1] END C=0.08022049287843402, epsilon=5.2194449863061625e-06, tol=5.410355643

In [18]:
reg = SVR(C=31.66676109489223, epsilon=0.7320953088587447, kernel='rbf', tol= 0.0031250754673674357)

In [19]:
reg.fit(description_train_vectors, cvss_train)

SVR(C=31.66676109489223, epsilon=0.7320953088587447, tol=0.0031250754673674357)

In [20]:
#put test data into a list
list_description = []
for i in range(len(description_test)):
    list_description.append(description_test.iloc[i])

In [21]:
list_description_vec = []
for desc in list_description:
    list_description_vec.append(sum([token.vector for token in nlp(desc)]))

In [22]:
%store list_description_vec

Stored 'list_description_vec' (list)


In [None]:
%store -r list_description_vec

In [23]:
#scaling test data
list_description_vec = sc_desc.fit_transform(list_description_vec, cvss_test)


In [24]:
cvss_pred = []
for desc_vec in list_description_vec:
    cvss_pred.append(reg.predict([desc_vec]))

In [25]:
cvss_true = []
for i in range(len(cvss_test)):
    cvss_true.append(cvss_test.iloc[i])

In [27]:
#evaluation
from sklearn import metrics
print("Mean squared error: %.3f" %metrics.mean_squared_error(cvss_true, cvss_pred))
print("Explained variance score (1 is the best): %.3f" %metrics.explained_variance_score(cvss_true, cvss_pred))
print("Max error: %.3f" %metrics.max_error(cvss_true, cvss_pred))
print("Mean absolute percentage error: %.3f" %metrics.mean_absolute_percentage_error(cvss_true, cvss_pred))

R2 = reg.score(list_description_vec, cvss_test)
print("R squared: %.3f" %R2)


Mean squared error: 1.711
Explained variance score (1 is the best): 0.504
Max error: 6.186
Mean absolute percentage error: 0.167
R squared: 0.503


In [28]:
for i in range(0, 20):
    print(cvss_pred[i])
    print(cvss_true[i])
    print()

[5.5857803]
4.0

[4.2722534]
3.1

[5.53628132]
4.8

[4.5160291]
2.4

[6.8411017]
9.8

[1.64159088]
2.4

[5.99479267]
4.3

[4.78684343]
5.4

[4.98412064]
4.7

[7.09561064]
3.3

[4.88469674]
5.5

[7.42438029]
6.5

[5.89568625]
8.1

[7.38185366]
7.5

[6.34007356]
3.3

[8.11896706]
3.5

[4.6089984]
2.7

[6.81428832]
4.3

[6.96021133]
6.5

[8.78866624]
8.8



In [29]:
import pickle
pickle.dump(reg, open('svm_model', 'wb'))