In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import os
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score
import json
import numpy as np
from tqdm import tqdm
import xgboost as xgb
from collections import defaultdict

# Data prep

In [20]:
labels = ['attackComplexity', 'attackVector', 'availabilityImpact', 'confidentialityImpact',
          'integrityImpact', 'privilegesRequired', 'scope', 'userInteraction']

In [5]:
with open(f'data/{labels[1]}/label_mapping.txt') as f:
    content = f.read().replace("'",'"')
    class_names = list(json.loads(content).keys())

In [6]:
train = pd.read_csv(f'data/{labels[1]}/train.csv')
test = pd.read_csv(f'data/{labels[1]}/test.csv')

x_train, y_train = train['text'], train['labels']
x_valid, y_valid   = test['text'], test['labels']

## Option 1: TFIDFVectorizer

In [18]:
vec = TfidfVectorizer(decode_error='ignore',stop_words='english', max_df=0.8, max_features=10000)

x_train_vec = vec.fit_transform(x_train).todense()
x_train_vec = pd.DataFrame(x_train_vec, columns=vec.get_feature_names_out())

x_valid_vec = vec.transform(x_valid).todense()
x_valid_vec = pd.DataFrame(x_valid_vec, columns=vec.get_feature_names_out())

In [19]:
x_train_vec.to_parquet("data/x_train_vec.parquet",index=False)

In [20]:
x_valid_vec.to_parquet("data/x_valid_vec.parquet",index=False)

## Option 2: CountVectorizer

In [106]:
# from sklearn.feature_extraction.text import CountVectorizer
# vec = CountVectorizer(decode_error='ignore', stop_words='english',max_df=0.8, max_features=10000)

# x_train_vec = vec.fit_transform(x_train).todense()
# x_train_vec = pd.DataFrame(x_train_vec, columns=vec.get_feature_names_out())

# x_valid_vec = vec.transform(x_valid).todense()
# x_valid_vec = pd.DataFrame(x_valid_vec, columns=vec.get_feature_names_out())

# Step 2: One Hot encoding

In [200]:
# x_train_dummies = pd.DataFrame(0, index=np.arange(len(x_train)), columns=to_keep)
# x_valid_dummies = pd.DataFrame(0, index=np.arange(len(x_valid)), columns=to_keep)

# to_keep = list(vec.vocabulary_.keys())

# for i in tqdm(range(len(x_train))):
#     x_train[i] = ' '.join([x for x in x_train[i].lower().split() if x in to_keep])
    
    
# for i in tqdm(range(len(x_valid))):
#     x_valid[i] = ' '.join([x for x in x_valid[i].lower().split() if x in to_keep])

    
# for i,row in tqdm(x_train_dummies.iterrows()):
#     for word in x_train[i].lower().split():
#         x_train_dummies.at[i,word] = 1


# for i,row in tqdm(x_valid_dummies.iterrows()):
#     for word in x_valid[i].lower().split():
#         x_valid_dummies.at[i,word] = 1
        
        
# x_train_dummies.drop(columns=x_train_dummies.columns[-39:], axis=1, inplace=True)

  0%|                                                                                                      | 0/65511 [00:00<?, ?it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  7%|██████▏                                                                                  | 4523/65511 [00:01<00:25, 2406.78it/s]


KeyboardInterrupt: 

# Modeling

In [19]:
%%time
model = XGBClassifier(learning_rate=0.5, random_state=1,
                          tree_method='gpu_hist', gpu_id=0, 
                          predictor="gpu_predictor",  
                          n_estimators=300)

model.fit(x_train_vec, y_train)

preds = model.predict(x_valid_vec)
print(i)
f1 = f1_score(preds, y_valid,average="weighted")
print("F1 Score: %.4f" %f1)
acc = accuracy_score(preds, y_valid)
print("Accuracy: %.4f" %acc)

0.5
F1 Score: 0.9233
Accuracy: 0.9211
0.6
F1 Score: 0.9231
Accuracy: 0.9210
0.7
F1 Score: 0.9230
Accuracy: 0.9209
CPU times: total: 12min 21s
Wall time: 6min 43s


In [13]:
preds = model.predict(x_valid_vec)

f1 = f1_score(preds, y_valid,average="weighted")
print("F1 Score: %.4f" %f1)
acc = accuracy_score(preds, y_valid)
print("Accuracy: %.4f" %acc)

F1 Score: 0.9131
Accuracy: 0.9100


In [11]:
model.save_model("model.json")

In [14]:
model_xgb_2 = xgb.Booster()
model_xgb_2.load_model("model.json")

# Pipeline

In [21]:
# raja3 'attackVector'
labels = ['attackComplexity', 'availabilityImpact', 'confidentialityImpact',
          'integrityImpact', 'privilegesRequired', 'scope', 'userInteraction']

In [31]:
scores = defaultdict(dict)

for label in tqdm(labels):
    scores[label] = {}
    with open(f'data/{label}/label_mapping.txt') as f:
        content = f.read().replace("'",'"')
        class_names = list(json.loads(content).keys())

    train = pd.read_csv(f'data/{label}/train.csv')
    test = pd.read_csv(f'data/{label}/test.csv')

    x_train, y_train = train['text'], train['labels']
    x_valid, y_valid   = test['text'], test['labels']

    vec = TfidfVectorizer(decode_error='ignore',stop_words='english', max_df=0.8, max_features=10000)

    x_train_vec = vec.fit_transform(x_train).todense()
    x_train_vec = pd.DataFrame(x_train_vec, columns=vec.get_feature_names_out())

    x_valid_vec = vec.transform(x_valid).todense()
    x_valid_vec = pd.DataFrame(x_valid_vec, columns=vec.get_feature_names_out())

    for i in [0.5]:
        model = XGBClassifier(learning_rate=i, random_state=1,
                              tree_method='gpu_hist', gpu_id=0, 
                              predictor="gpu_predictor",  
                              n_estimators=300)

        model.fit(x_train_vec, y_train)

        preds = model.predict(x_valid_vec)
        
        scores[label][i] = {}
        scores[label][i]['Accuracy'] = round(accuracy_score(preds, y_valid),3)
        scores[label][i]['f1'] = round(f1_score(preds, y_valid,average="weighted"),3)
    
    #model.save_model(f"models/xgb/{labTel}.json")
    print (label, scores[label])

 12%|██████████▍                                                                        | 1/8 [02:20<16:24, 140.67s/it]

attackComplexity {0.5: {'Accuracy': 0.972, 'f1': 0.975}, 0.6: {'Accuracy': 0.97, 'f1': 0.973}, 0.7: {'Accuracy': 0.971, 'f1': 0.974}}


 25%|████████████████████▊                                                              | 2/8 [09:07<29:41, 296.99s/it]

attackVector {0.5: {'Accuracy': 0.921, 'f1': 0.923}, 0.6: {'Accuracy': 0.921, 'f1': 0.923}, 0.7: {'Accuracy': 0.921, 'f1': 0.923}}


 38%|███████████████████████████████▏                                                   | 3/8 [14:32<25:50, 310.19s/it]

availabilityImpact {0.5: {'Accuracy': 0.898, 'f1': 0.901}, 0.6: {'Accuracy': 0.896, 'f1': 0.9}, 0.7: {'Accuracy': 0.898, 'f1': 0.901}}


 50%|█████████████████████████████████████████▌                                         | 4/8 [20:09<21:22, 320.75s/it]

confidentialityImpact {0.5: {'Accuracy': 0.872, 'f1': 0.874}, 0.6: {'Accuracy': 0.87, 'f1': 0.871}, 0.7: {'Accuracy': 0.868, 'f1': 0.87}}


 62%|███████████████████████████████████████████████████▉                               | 5/8 [25:40<16:12, 324.22s/it]

integrityImpact {0.5: {'Accuracy': 0.881, 'f1': 0.881}, 0.6: {'Accuracy': 0.88, 'f1': 0.881}, 0.7: {'Accuracy': 0.879, 'f1': 0.88}}


 75%|██████████████████████████████████████████████████████████████▎                    | 6/8 [31:05<10:49, 324.61s/it]

privilegesRequired {0.5: {'Accuracy': 0.838, 'f1': 0.843}, 0.6: {'Accuracy': 0.838, 'f1': 0.843}, 0.7: {'Accuracy': 0.837, 'f1': 0.842}}


 88%|████████████████████████████████████████████████████████████████████████▋          | 7/8 [33:35<04:27, 267.36s/it]

scope {0.5: {'Accuracy': 0.963, 'f1': 0.964}, 0.6: {'Accuracy': 0.963, 'f1': 0.964}, 0.7: {'Accuracy': 0.963, 'f1': 0.963}}


100%|███████████████████████████████████████████████████████████████████████████████████| 8/8 [36:07<00:00, 270.94s/it]

userInteraction {0.5: {'Accuracy': 0.941, 'f1': 0.941}, 0.6: {'Accuracy': 0.94, 'f1': 0.94}, 0.7: {'Accuracy': 0.94, 'f1': 0.941}}





In [10]:
with open('models/xgb/scores.txt','w') as f:
    f.write(json.dumps(scores))

In [11]:
scores

{'integrityImpact': {'Accuracy': 0.874, 'F1': 0.875},
 'privilegesRequired': {'Accuracy': 0.826, 'F1': 0.834},
 'scope': {'Accuracy': 0.965, 'F1': 0.966},
 'attackComplexity': {'Accuracy': 0.973, 'F1': 0.977},
 'confidentialityImpact': {'Accuracy': 0.862, 'F1': 0.866},
 'availabilityImpact': {'Accuracy': 0.89, 'F1': 0.894},
 'userInteraction': {'Accuracy': 0.934, 'F1': 0.935},
 'attackVector': {'Accuracy': 0.908, 'F1': 0.912}}

### Scope

In [20]:
import shap
S_samples = valid_texts
S_explainer, S_shap_values, S_expected_value, S_class_names = build_shap_values("scope", PR_samples)
S_class_names

['CHANGED', 'UNCHANGED']

In [65]:
shap.initjs()
shap.force_plot(S_expected_value, S_shap_values[0],feature_names=valid_texts.columns)