In [3]:
import os
import sys

In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import spacy
import seaborn as sns

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler



In [5]:
nlp = spacy.load("en_core_web_lg")

In [6]:
data_train = pd.read_csv('cve_train.csv')
data_test = pd.read_csv('cve_test.csv')

description_train = data_train['description']
description_test = data_test['description']
cvss_train = data_train['base_score']
cvss_test = data_test['base_score']


In [7]:
print(cvss_train)

0        8.8
1        9.1
2        5.5
3        6.1
4        7.3
        ... 
61611    8.2
61612    6.1
61613    7.8
61614    3.3
61615    9.8
Name: base_score, Length: 61616, dtype: float64


In [8]:
print(description_train.head())

0    A remote code exection vulnerability was ident...
1    An information disclosure vulnerability in the...
2    The JPEGSetupEncode function in tiff_jpeg.c in...
3    Cloud Foundry UAA, versions prior to 74.0.0, i...
4    This affects the package ini before 1.3.6. If ...
Name: description, dtype: object


In [10]:
#transform tokens of a description and summarize them
vectors = description_train.apply(lambda row: sum([token.vector for token in nlp(row)]))
print(vectors.shape)

(61616,)


In [None]:
print(vectors[1])

[-1.15818858e+00  3.20513463e+00 -2.11440849e+00 -1.35695076e+00
 -1.70707002e-01  3.36606598e+00  2.38366127e+00 -6.12833452e+00
  2.37718487e+00  3.37264366e+01 -8.45949936e+00  1.67045796e+00
  2.18603706e+00 -1.85747015e+00 -1.66831899e+00 -9.96696055e-01
 -2.89692450e+00  4.62275085e+01 -7.20108032e+00 -3.54545259e+00
 -3.95664597e+00 -1.00073904e-01 -2.85759354e+00  1.18697608e+00
 -1.19508982e-01  1.40146601e+00 -9.38791692e-01 -2.52932727e-01
  5.30392599e+00 -3.85707200e-01  1.48543942e+00 -5.33185959e+00
 -8.91161621e-01  1.92931247e+00  2.21282196e+00 -3.64784098e+00
 -1.25901508e+00  5.12619352e+00 -2.66949296e+00 -6.00011230e-01
 -1.02207020e-01 -1.68748403e+00  2.04021215e+00 -2.34481692e+00
 -5.99267054e+00  3.61351180e+00 -6.34684944e+00  5.09514928e-01
 -1.67792702e+00  1.32149184e+00  4.26762193e-01 -1.94802547e+00
 -3.08140063e+00 -3.79134274e+00 -3.32561493e+00  1.86886084e+00
 -3.88159013e+00  8.42651069e-01 -9.82628644e-01  9.58293796e-01
 -4.85893250e-01 -4.97421

In [11]:
description_train_vectors = list(vectors)


In [12]:
%store description_train_vectors

Stored 'description_train_vectors' (list)


In [None]:
print(description_train_vectors[0])

[-2.9097399e-01  1.6305345e+00 -3.1637299e-01 -3.2254272e+00
  1.0255990e+00  3.1177318e+00  6.3263583e-01 -4.0955658e+00
  9.0866482e-01  2.7703457e+01 -3.0052614e+00  2.5117311e-01
  1.3151779e+00 -3.4891067e+00 -1.3455660e+00  7.6646185e-01
 -4.5220292e-01  2.4408628e+01 -2.6345510e+00 -3.1180274e+00
 -2.1593940e+00 -3.3411272e+00 -2.3462613e+00  8.7407589e-01
  4.2166245e-01 -3.5875309e-01 -4.5307434e-01  9.0078807e-01
  1.7367808e+00  6.7267936e-01 -2.0033400e+00 -1.3464687e+00
  2.0223699e+00  7.4084783e-01 -1.0947891e+00 -1.8976130e+00
 -3.1543201e-01  1.9982188e+00 -3.3499620e+00 -1.8089513e+00
 -5.3849779e-03  3.0492699e+00  3.2745681e+00 -1.0925552e+00
 -3.8164452e-01 -4.8307502e-01 -1.8578640e+00  1.4051197e-01
 -8.4399778e-01  5.4060000e-01 -2.4459698e+00  4.6786949e-02
 -1.9553985e-01 -1.1665798e+00  1.1857218e+00  1.7931049e+00
 -8.5360998e-01 -6.9249791e-01  2.0617735e+00  1.4248120e+00
  6.8658864e-01 -1.9014518e-01 -1.4674100e+00  3.4258080e+00
  1.1413754e+00 -7.70668

In [13]:
sc_desc = StandardScaler()
description_train_vectors = sc_desc.fit_transform(description_train_vectors, cvss_train)

print(description_train_vectors[0])

[ 0.14649557 -0.55227798  0.90057199 -1.2803768   0.0576298  -0.47833431
 -0.22835618  0.36439333  0.00328053 -0.85903832  0.62636007  0.00451147
 -0.42435703 -0.73285399  0.77436626  0.67126756  0.96820716 -0.84549552
  0.56878143  0.3666353   0.580081   -0.31789413  0.72521085  0.29677413
 -0.15000359 -0.31159218  0.24030109  0.95618542 -0.23835819  0.39491722
 -0.24082956 -0.0341159   0.75534603 -0.52700404 -0.55173798  0.46733022
  0.40960478 -0.29781466 -1.07355784  0.60805827  0.11033828  0.10402227
  0.4670759   0.67419648  0.89649253 -0.749397    0.32171477 -0.56424338
  0.22245114 -0.08143247 -0.39916558  0.34185232  0.49286773 -0.07949017
  0.7608605   0.00264919  0.47645816  0.72439171  0.54586855  1.0372079
 -0.06666585 -0.82707155  0.41600241 -0.39324226 -0.81003632  0.0590296
 -0.64041     0.28927566 -0.62875011 -0.6278614  -1.08536251 -0.53952493
 -0.91877433  0.63108185 -0.94669088  0.47865593 -0.90043977  0.90849993
  0.82597314 -0.63455542  0.66652547 -0.1929344   0.8

In [14]:
reg = SVR(kernel='rbf')


In [15]:
from sklearn.model_selection import GridSearchCV
parameters = [{'C': [1, 10], 'kernel': ['linear'], 'epsilon' : [0.05, 0.1, 0.5]},
              {'C': [1, 10], 'kernel': ['rbf'], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'epsilon' : [0.05, 0.1, 0.5]}]
grid_search = GridSearchCV(estimator = reg,
                           param_grid = parameters,
                           cv = 10,
                           n_jobs = -1)
grid_search = grid_search.fit(description_train_vectors, cvss_train)

In [None]:
grid_search.best_params_

In [None]:
reg.fit(description_train_vectors, cvss_train)

In [None]:
#put test data into a list
list_description = []
for i in range(len(description_test)):
    list_description.append(description_test.iloc[i])

In [None]:
list_description_vec = []
for desc in list_description:
    list_description_vec.append(sum([token.vector for token in nlp(desc)]))

In [None]:
cvss_pred = []
for desc_vec in list_description_vec:
    cvss_pred.append(reg.predict([desc_vec]))

In [None]:
cvss_true = []
for i in range(len(cvss_test)):
    cvss_true.append(cvss_test.iloc[i])

In [None]:
#visualization
#How to visualize the vectors?

plt.scatter(description_train_vectors, cvss_train)
plt.plot(description_train_vectors, cvss_pred, color="green")
plt.xlabel('Description vector sum')
plt.ylabel('CVSS score')
plt.show()

NameError: name 'description_train_vectors' is not defined

In [None]:
#evaluation
from sklearn import metrics
print("Mean squared error: %.3f" %metrics.mean_squared_error(cvss_true, cvss_pred))
print("Explained variance score (1 is the best): %.3f" %metrics.explained_variance_score(cvss_true, cvss_pred))
print("Max error: %.3f" %metrics.max_error(cvss_true, cvss_pred))
print("Mean absolute percentage error: %.3f" %metrics.mean_absolute_percentage_error(cvss_true, cvss_pred))
print("Correlation coefficient: %.3f" %metrics.r2_score(cvss_true, cvss_pred))

R2 = reg.score(description_train_vectors, cvss_train)
print("R squared: %.3f" %R2)


NameError: name 'cvss_true' is not defined

In [None]:
for i in range(0, 20):
    print(cvss_pred[i])
    print(cvss_true[i])
    print()

[7.4661249]
7.5

[7.63278205]
7.8

[8.22439322]
8.8

[4.14713951]
4.3

[5.15562947]
5.5

[6.89972806]
7.0

[7.88468669]
4.7

[7.53229076]
9.8

[5.87283644]
5.5

[7.77672928]
9.1

[8.50735935]
9.8

[6.63063347]
7.2

[5.19119846]
4.7

[7.59984634]
7.8

[6.01000603]
4.8

[8.70402406]
8.8

[7.59104949]
7.8

[7.3477621]
8.4

[5.61356849]
8.7

[6.19843648]
7.3



In [None]:
import pickle
pickle.dump(reg, open('svm_model', 'wb'))