## Step 1: Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

# Optional step: mount Google Drive when the notebook runs on Google Colab.
# If `google.colab` cannot be imported (e.g. a local Jupyter kernel), the mount
# is skipped with a message instead of aborting the run with an ImportError.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping the Google Drive mount.")
else:
    drive.mount('/content/drive')

## Step 2: Loading Data
This step reads the CSV file, shuffles the data, and saves the shuffled dataframe in a new CSV file.

In [None]:
# Read the source CSV; the file is decoded as latin-1.
data = pd.read_csv("/content/drive/MyDrive/projects/finetuning_LLMs_with_MIRTE_data/data/MITRE_Tactic_and_Techniques_Descriptions.csv", encoding='latin-1')

# Shuffle every row (frac=1) and rebuild a clean 0..n-1 index.
# A fixed random_state makes the shuffle, and therefore the saved file,
# reproducible after a kernel restart.
shuffled_data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Persist the shuffled rows without the index column, then display the frame.
shuffled_data.to_csv('/content/drive/MyDrive/projects/finetuning_LLMs_with_MIRTE_data/data/shuffled_new_MITRE.csv', index=False)
shuffled_data

## Step 3: Cleaning the Dataset for Training
The shuffled CSV is read into a pandas DataFrame named `data`. The columns `URL` and `Technique` are dropped, and the column `Technique_ID` is renamed to `ID`. The code then collects the unique values found in columns `Tactic1` to `Tactic4` and builds a list, `my_list`, that keeps only the non-null entries (the tactic names). Finally, the list `Tactic_column` stores the column names `Tactic1` to `Tactic4`. Together, these operations prepare the data by removing unnecessary columns, renaming columns, extracting the unique tactic names, and defining the relevant columns for further processing or analysis.

In [None]:
# Reload the shuffled CSV, drop the columns that are not needed, and shorten the
# identifier column name. The chained calls return a new frame rather than
# mutating one in place, so re-running the cell always starts from the file.
data = (
    pd.read_csv("/content/drive/MyDrive/projects/finetuning_LLMs_with_MIRTE_data/data/shuffled_new_MITRE.csv")
    .drop(columns=['URL', 'Technique'])
    .rename(columns={'Technique_ID': 'ID'})
)

# Names of the columns that hold the tactic labels.
Tactic_column = ['Tactic1', 'Tactic2', 'Tactic3', 'Tactic4']

# Distinct values across the tactic columns, scanned column by column
# (all of Tactic1, then Tactic2, ...). A missing-value marker may still be present.
unique_tactic_names = pd.concat(
    [data[column] for column in Tactic_column], ignore_index=True
).unique()

# Keep only the non-missing entries. pd.notna handles NaN, None and pd.NA alike,
# whereas math.isnan raises TypeError for anything that is not a real number.
my_list = [name for name in unique_tactic_names if pd.notna(name)]
data

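We have two options for saving the data. Option 1 keeps all the data for training, while Option 2 splits the data into train, validation, and test sets for when evaluation is needed. As written, the cell below runs both options; comment out the one you do not need.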

In [None]:
# Option 1: write the full cleaned dataset to a single CSV (no hold-out data).
data.to_csv('/content/drive/MyDrive/cleaned_shuffled_new_MITRE.csv', index=False)


# Option 2: carve out evaluation data.
# First hold out 10% of all rows as the test set, then hold out 10% of the
# remaining rows as the validation set. Both splits use the same fixed seed.
train_val_df, test_df = train_test_split(data, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# Write each split to its own CSV, without the index column.
for split_df, csv_path in (
    (train_df, '/content/drive/MyDrive/projects/cleaned_MITRE_data_trainset.csv'),
    (test_df, '/content/drive/MyDrive/projects/cleaned_MITRE_data_testset.csv'),
    (val_df, '/content/drive/MyDrive/projects/cleaned_MITRE_data_valset.csv'),
):
    split_df.to_csv(csv_path, index=False)