In [1]:
import os
import sys

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import spacy
import seaborn as sns

from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler



In [3]:
nlp = spacy.load("en_core_web_lg")

In [23]:
data_train = pd.read_csv('cve_train.csv')
data_test = pd.read_csv('cve_test.csv')

description_train = data_train['description']
description_test = data_test['description']
cvss_train = data_train['base_score']
cvss_test = data_test['base_score']


In [25]:
print(cvss_train)

0        8.8
1        9.1
2        5.5
3        6.1
4        7.3
        ... 
61611    8.2
61612    6.1
61613    7.8
61614    3.3
61615    9.8
Name: base_score, Length: 61616, dtype: float64


In [12]:
print(description_train.head())

0    A remote code exection vulnerability was ident...
1    An information disclosure vulnerability in the...
2    The JPEGSetupEncode function in tiff_jpeg.c in...
3    Cloud Foundry UAA, versions prior to 74.0.0, i...
4    This affects the package ini before 1.3.6. If ...
Name: description, dtype: object


In [26]:
#transform tokens of a description and summarize them
vectors = description_train.apply(lambda row: sum([token.vector for token in nlp(row)]))
print(vectors.shape)

In [27]:
print(vectors)
print(cvss_train)

0        [-0.290974, 1.6305345, -0.316373, -3.2254272, ...
1        [-1.1581886, 3.2051346, -2.1144085, -1.3569508...
2        [-1.6946248, 3.195875, -2.814146, 1.3719728, -...
3        [-1.231077, 0.8149179, -1.9220165, 1.7865016, ...
4        [-0.4985961, 2.9195516, -4.0118856, 1.028444, ...
                               ...                        
61611    [-10.701544, 14.247082, -19.5394, 3.019741, -0...
61612    [-0.79379004, 9.049169, -2.4394453, -0.6517568...
61613    [-0.023308024, 0.8243772, 0.19929908, -1.43010...
61614    [-7.1296678, 1.413902, -7.289835, 4.026337, -4...
61615    [-4.5847507, 3.6791933, -0.65691996, 2.024829,...
Name: description, Length: 61616, dtype: object
0        8.8
1        9.1
2        5.5
3        6.1
4        7.3
        ... 
61611    8.2
61612    6.1
61613    7.8
61614    3.3
61615    9.8
Name: base_score, Length: 61616, dtype: float64


In [30]:
description_train_vectors = list(vectors)

sc_desc = StandardScaler()
#sc_cvss = StandardScaler()
description_train_vectors = sc_desc.fit_transform(description_train_vectors)

#cvss_train = sc_cvss.fit_transform(cvss_train.reshape(-1,1))

reg = SVR(kernel='rbf')
reg.fit(description_train_vectors, cvss_train)

SVR()

In [31]:
#put test data into a list
list_description = []
for i in range(len(description_test)):
    list_description.append(description_test.iloc[i])

In [32]:
list_description_vec = []
for desc in list_description:
    list_description_vec.append(sum([token.vector for token in nlp(desc)]))

In [33]:
cvss_pred = []
for desc_vec in list_description_vec:
    cvss_pred.append(reg.predict([desc_vec]))

In [34]:
cvss_true = []
for i in range(len(cvss_test)):
    cvss_true.append(cvss_test.iloc[i])

In [3]:
#visualization
#How to visualize the vectors?

plt.scatter(description_train_vectors, cvss_train)
plt.plot(description_train_vectors, cvss_pred, color="green")
plt.xlabel('Description vector sum')
plt.ylabel('CVSS score')
plt.show()

NameError: name 'description_train_vectors' is not defined

In [4]:
#evaluation
from sklearn import metrics
print("Mean squared error: %.3f" %metrics.mean_squared_error(cvss_true, cvss_pred))
print("Explained variance score (1 is the best): %.3f" %metrics.explained_variance_score(cvss_true, cvss_pred))
print("Max error: %.3f" %metrics.max_error(cvss_true, cvss_pred))
print("Mean absolute percentage error: %.3f" %metrics.mean_absolute_percentage_error(cvss_true, cvss_pred))
print("Correlation coefficient: %.3f" %metrics.r2_score(cvss_true, cvss_pred))

R2 = reg.score(description_train_vectors, cvss_train)
print("R squared: %.3f" %R2)


NameError: name 'cvss_true' is not defined

In [33]:
for i in range(0, 20):
    print(cvss_pred[i])
    print(cvss_true[i])
    print()

[7.4661249]
7.5

[7.63278205]
7.8

[8.22439322]
8.8

[4.14713951]
4.3

[5.15562947]
5.5

[6.89972806]
7.0

[7.88468669]
4.7

[7.53229076]
9.8

[5.87283644]
5.5

[7.77672928]
9.1

[8.50735935]
9.8

[6.63063347]
7.2

[5.19119846]
4.7

[7.59984634]
7.8

[6.01000603]
4.8

[8.70402406]
8.8

[7.59104949]
7.8

[7.3477621]
8.4

[5.61356849]
8.7

[6.19843648]
7.3



In [None]:
import pickle
pickle.dump(reg, open('svm_model', 'wb'))