<a href="https://colab.research.google.com/github/cmikke97/Automatic-Malware-Signature-Generation/blob/main/src/JointEmbedding/JointEmbedding_Github.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Train and Evaluate Joint Embedding ML model**

The code used in this Colab is available at https://github.com/cmikke97/Automatic-Malware-Signature-Generation

## **Setup**

### **Install needed packages**

In [None]:
!pip install boto3
!pip install baker
!pip install -U logzero
!pip install lmdb
!pip install waiting
!pip install --upgrade scikit-learn

### **Import modules**

In [None]:
import json
import os

### **Set up Drive**

In [None]:
from google.colab import drive

# Directory inside the Colab instance where Google Drive gets mounted.
# It is reused later in the notebook to build the Drive-based `base_path`.
drive_path = "/content/drive"

# Mount Google Drive at `drive_path`.
drive.mount(drive_path)

### **Clone git repository**

In [None]:
# remove previously cloned (if present) git repository from local Colab instance
!rm -r /content/Automatic-Malware-Signature-Generation

In [None]:
# clone code git repository onto the local Colab instance
!git clone https://github.com/cmikke97/Automatic-Malware-Signature-Generation.git

# set current working directory (needed for correctly importing tool modules)
os.chdir("/content/Automatic-Malware-Signature-Generation/src/JointEmbedding")

### **Set Run variables**

In [None]:
import config
from dataset import Dataset

# Base folder on the mounted Google Drive; later cells look up the
# "shas_missing_ember_features.json" file under it.
base_path = os.path.join(drive_path, "MyDrive/thesis")

# Local directory passed to the SOREL20M downloader script as destination.
dataset_dir = "/content/Dataset"

# Base directory for checkpoints, taken from the tool configuration; each run
# uses its own numbered sub-directory (0, 1, ...) inside it.
checkpoint_base_dir = config.checkpoint_dir

# Base directory for results, taken from the tool configuration; each run
# uses its own numbered sub-directory (0, 1, ...) inside it.
results_base_dir = config.results_dir

# Number of times to train/evaluate the model. The mean-and-confidence plots
# produced later need at least 2 runs.
runs = 2

### **Download SOREL20M dataset**

In [None]:
# download SOREL 20M dataset onto the local Colab instance
!python /content/Automatic-Malware-Signature-Generation/src/DatasetDownloader/sorel20mDownloader.py sorel20m_download $dataset_dir

### **Configuration**

To change the tool configuration, edit the values in the local copy of "config.py" located at "/content/Automatic-Malware-Signature-Generation/src/JointEmbedding/config.py".

## **Train Network**

In [None]:
# for the number of configured runs
for i in range(runs):
    checkpoint_dir = os.path.join(checkpoint_base_dir, str(i))
    remove_missing_features = os.path.join(base_path, "Dataset/09-DEC-2020/processed-data/shas_missing_ember_features.json")
    loss_history_filename = os.path.join(checkpoint_dir, "loss_history.json")

    # execute train.py script
    !python train.py train_network --checkpoint_dir $checkpoint_dir --remove_missing_features $remove_missing_features --loss_history_filename $loss_history_filename --use_malicious_labels --use_count_labels

    # execute train.py script starting from saved model state at epoch 5
    #!python train.py train_network --checkpoint_dir $checkpoint_dir --remove_missing_features $remove_missing_features --starting_from_epoch 5 --loss_history_filename $loss_history_filename --use_malicious_labels --use_count_labels

## **Evaluate Network**

In [None]:
#instantiate results_files dictionary
results_files = {}

# for the number of configured runs
for i in range(runs):
    # add file path to results_files dictionary (used for plotting results)
    results_files["run_id_" + str(i)] = os.path.join(results_base_dir, str(i), "results.csv");

    results_dir = os.path.join(results_base_dir, str(i))
    checkpoint_file = os.path.join(checkpoint_base_dir, str(i), "epoch_10.pt")
    remove_missing_features = os.path.join(base_path, "Dataset/09-DEC-2020/processed-data/shas_missing_ember_features.json")

    # execute evaluate.py script
    !python evaluate.py evaluate_network --results_dir $results_dir --checkpoint_file $checkpoint_file --remove_missing_features $remove_missing_features --evaluate_malware --evaluate_count
    
# create and open the results.json file in write mode
with open(os.path.join(results_base_dir, "results.json"), "w") as output_file:
    # save results_files dictionary as a json file
    json.dump(results_files, output_file)

## **Compute and plot results**

### **Plot Training Loss trend**

In [None]:
# for the number of configured runs
for i in range(runs):
    loss_history_path = os.path.join(checkpoint_base_dir, str(i), "loss_history.json")
    output_filename = os.path.join(checkpoint_base_dir, str(i), "loss_trend.png")

    # execute plot.py to plot the model mean loss trend
    !python plots.py plot_loss_trend --loss_history_path $loss_history_path --output_filename $output_filename

### **Plot Results**

In [None]:
# for the number of configured runs
for i in range(runs):
    results_file = os.path.join(results_base_dir, str(i), "results.csv")
    output_filename = os.path.join(results_base_dir, str(i), "results.png")

    # execute plot.py to plot per-tag results for the single run
    !python plots.py plot_tag_result --results_file $results_file --output_filename $output_filename


run_to_filename_json = os.path.join(results_base_dir, "results.json")
output_filename = os.path.join(results_base_dir, "results.png")
tag_to_plot = 'malware'

# execute plot.py to plot the model results mean and confidence (at least 2 runs are needed)
!python plots.py plot_roc_distribution_for_tag --run_to_filename_json $run_to_filename_json --output_filename $output_filename --tag_to_plot $tag_to_plot

for tag in Dataset.tags:
    output_filename = os.path.join(results_base_dir, tag + "_tag_results.png")
    tag_to_plot = tag + "_tag"

    # execute plot.py to plot the model results mean and confidence for the specified tag (at least 2 runs are needed)
    !python plots.py plot_roc_distribution_for_tag --run_to_filename_json $run_to_filename_json --output_filename $output_filename --tag_to_plot $tag_to_plot --tag_to_plot $tag_to_plot

### **Compute scores**

In [None]:
# for the number of configured runs
for i in range(runs):
    results_file = os.path.join(results_base_dir, str(i), "results.csv")

    tag = "malware"
    output_filename = os.path.join(results_base_dir, str(i), tag + "_scores.csv")

    # execute plot.py to compute various scores for the specified tag
    !python plots.py compute_scores --results_file $results_file --output_filename $output_filename --tag $tag

    for tag in Dataset.tags:
        tag = tag + "_tag"
        output_filename = os.path.join(results_base_dir, str(i), tag + "_scores.csv")

        # execute plot.py to compute various scores for the specified tag
        !python plots.py compute_scores --results_file $results_file --output_filename $output_filename --tag $tag
    
    output_filename = os.path.join(results_base_dir, str(i), "mean_per_sample_scores.csv")

    !python plots.py compute_jaccard_similarity_score --results_file $results_file --output_filename $output_filename