# 16.1. Data Cleaning - Malware Bazaar

In [1]:
from datetime import datetime
import os
import time
import pandas as pd
import requests
import urllib3
import json
import sys
import numpy as np

pd.options.display.max_colwidth = 1000
pd.set_option('display.max_columns', None)

In [5]:
path_MB_Entries     = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\DataAnalysis\\malware_bazaar\\Entries.csv"
path_MB_YaraRules   = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\DataAnalysis\\malware_bazaar\\Yara_Rules.csv"
path_MB_TV_Analysis = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\DataAnalysis\\malware_bazaar\\TV_Analysis.csv"
path_MB_Behaviours  = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\DataAnalysis\\malware_bazaar\\Behaviours.csv"
path_MB_Tags        = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\DataAnalysis\\malware_bazaar\\Tags.csv"

# Entries

In [6]:
entries = pd.read_csv(path_MB_Entries, low_memory = False)
entries.head(3)


Unnamed: 0,sha1_hash,sha256_hash,md5_hash,first_seen,last_seen,file_type,delivery_method
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,2021-11-09 10:55:58,2021-11-09 11:47:31,exe,email_attachment
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,2023-07-19 07:41:13,2023-08-25 17:28:03,exe,web_download
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,2022-01-04 20:46:14,2022-02-04 16:00:00,exe,other


## Data Wrangling

#### Column Renaming

In [7]:
entries.rename(columns=
               {   'sha1_hash'                            : 'sha1', 
                   'sha256_hash'                          : 'sha256', 
                   'md5_hash'                             : 'md5', }
          , inplace=True)
entries

Unnamed: 0,sha1,sha256,md5,first_seen,last_seen,file_type,delivery_method
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,2021-11-09 10:55:58,2021-11-09 11:47:31,exe,email_attachment
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,2023-07-19 07:41:13,2023-08-25 17:28:03,exe,web_download
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,2022-01-04 20:46:14,2022-02-04 16:00:00,exe,other
3,55cbf8dae95d350ad7e5a63b418f6cd203488f51,8b7ae9f195b075a789d6d8277d500d27754bfa3c53ecca8db7beac8ccd07884f,97d9d216f2627cb238ca6637580b1739,2022-02-09 15:18:45,,exe,other
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,2021-02-11 09:58:16,,exe,web_download
...,...,...,...,...,...,...,...
619,45c4340c4bfcf6145638b6a9556c42b25a07db76,eff2bda9797c042cbae44c8aed29b31b853733c0676a687dc62676752197c05d,7ebd8264cdecb8f522b51b0490a3f901,2020-11-10 07:24:48,2020-11-10 08:40:04,exe,web_download
620,dc040262f90dd233b35fe922ec98834eb9f44f9b,7f7670c31d0f9eb80f8839e8e063aa01d44c77712d90066323fe155480f5f69c,6d7e39aeeb28c8352269a6357f583c11,2020-12-23 18:50:37,2020-12-23 20:34:50,exe,email_attachment
621,b70e2199703c316465af96540561bcd002321045,0252937e5af74369499c21efe832306dccf718402b08c67b2d338061ba1e67a0,69b64df678f3e397ab18b4e3db830d96,2021-02-28 07:22:14,2021-02-28 08:59:44,exe,
622,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,2021-06-21 15:19:58,,exe,other


## Data Cleaning

#### Datetype Conversion


- first_seen field : from string to date
- last_seen  field : from string to date

In [8]:
entries['first_seen'] = pd.to_datetime(entries['first_seen']).dt.strftime('%Y-%m-%d')

entries['last_seen']  = pd.to_datetime(entries['last_seen']).dt.strftime('%Y-%m-%d')


In [7]:
entries

Unnamed: 0,sha1,sha256,md5,first_seen,last_seen,originalFileName,filesize,origin_country,delivery_method,reporter
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,2021-11-09 10:55:58,2021-11-09 11:47:31,"SOMECO Nov Acil PO_76565,pdf.exe",852480,,email_attachment,GovCERT_CH
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,2023-07-19 07:41:13,2023-08-25 17:28:03,da9534900ee0d11c9b30cf33152ea03c,721920,FR,web_download,zbetcheckin
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,2022-01-04 20:46:14,2022-02-04 16:00:00,rat.exe,474112,,other,James_inthe_box
3,55cbf8dae95d350ad7e5a63b418f6cd203488f51,8b7ae9f195b075a789d6d8277d500d27754bfa3c53ecca8db7beac8ccd07884f,97d9d216f2627cb238ca6637580b1739,2022-02-09 15:18:45,NaT,Xhsjvezljjvyeubpsyvkjosttfnghnibfs.exe,986624,,other,James_inthe_box
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,2021-02-11 09:58:16,NaT,1b09de36dfe5850d7e3fbd6b39c89a43.exe,4538880,,web_download,abuse_ch
...,...,...,...,...,...,...,...,...,...,...
619,45c4340c4bfcf6145638b6a9556c42b25a07db76,eff2bda9797c042cbae44c8aed29b31b853733c0676a687dc62676752197c05d,7ebd8264cdecb8f522b51b0490a3f901,2020-11-10 07:24:48,2020-11-10 08:40:04,7ebd8264cdecb8f522b51b0490a3f901.exe,926720,,web_download,abuse_ch
620,dc040262f90dd233b35fe922ec98834eb9f44f9b,7f7670c31d0f9eb80f8839e8e063aa01d44c77712d90066323fe155480f5f69c,6d7e39aeeb28c8352269a6357f583c11,2020-12-23 18:50:37,2020-12-23 20:34:50,PGHT2012023 (Invoice & Packing).exe,1354752,,email_attachment,abuse_ch
621,b70e2199703c316465af96540561bcd002321045,0252937e5af74369499c21efe832306dccf718402b08c67b2d338061ba1e67a0,69b64df678f3e397ab18b4e3db830d96,2021-02-28 07:22:14,2021-02-28 08:59:44,0252937e5af74369499c21efe832306dccf718402b08c67b2d338061ba1e67a0,814592,,,JAMESWT_MHT
622,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,2021-06-21 15:19:58,NaT,stub.exe,48640,,other,James_inthe_box


#### Discretization

- substitution of '_' with ' ' in the field 'delivery_method'


In [9]:
entries['delivery_method'] = entries['delivery_method'].replace('_', ' ', regex=True)

#### Missing Values
- Imputation : the term 'other' in missing values is set to NaN or 'other' because it's interpretated as unknown 

In [10]:
entries['delivery_method'] = entries['delivery_method'].replace('other', np.nan)
entries['delivery_method'] = entries['delivery_method'].replace(np.nan, 'unknown')

In [11]:
entries

Unnamed: 0,sha1,sha256,md5,first_seen,last_seen,file_type,delivery_method
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,2021-11-09,2021-11-09,exe,email attachment
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,2023-07-19,2023-08-25,exe,web download
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,2022-01-04,2022-02-04,exe,unknown
3,55cbf8dae95d350ad7e5a63b418f6cd203488f51,8b7ae9f195b075a789d6d8277d500d27754bfa3c53ecca8db7beac8ccd07884f,97d9d216f2627cb238ca6637580b1739,2022-02-09,,exe,unknown
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,2021-02-11,,exe,web download
...,...,...,...,...,...,...,...
619,45c4340c4bfcf6145638b6a9556c42b25a07db76,eff2bda9797c042cbae44c8aed29b31b853733c0676a687dc62676752197c05d,7ebd8264cdecb8f522b51b0490a3f901,2020-11-10,2020-11-10,exe,web download
620,dc040262f90dd233b35fe922ec98834eb9f44f9b,7f7670c31d0f9eb80f8839e8e063aa01d44c77712d90066323fe155480f5f69c,6d7e39aeeb28c8352269a6357f583c11,2020-12-23,2020-12-23,exe,email attachment
621,b70e2199703c316465af96540561bcd002321045,0252937e5af74369499c21efe832306dccf718402b08c67b2d338061ba1e67a0,69b64df678f3e397ab18b4e3db830d96,2021-02-28,2021-02-28,exe,unknown
622,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,2021-06-21,,exe,unknown


- Imputation  based on dependency between 'first_seen' and 'last_seen'

In [12]:
from sklearn.linear_model import LinearRegression

columns_selected = ['first_seen', 'last_seen']
entries_predicted = entries[columns_selected]

entries_predicted['first_seen'] = pd.to_datetime(entries_predicted['first_seen'])
entries_predicted['last_seen'] = pd.to_datetime(entries_predicted['last_seen'])


entries_predicted['first_seen_timestamp'] = entries_predicted['first_seen'].astype('int64')
entries_predicted['last_seen_timestamp'] = entries_predicted['last_seen'].astype('int64') 

# Separate data into two sets: one with missing values and one without
df_missing = entries_predicted[entries_predicted.isna().any(axis=1)]
df_not_missing = entries_predicted.dropna()

# Prepare data for linear regression
X_train = df_not_missing['first_seen_timestamp'].values.reshape(-1, 1)
y_train = df_not_missing['last_seen_timestamp'].values.reshape(-1, 1)

# Fit linear regression model
model = LinearRegression()
model.fit(X_train, y_train)

X_missing = df_missing['first_seen_timestamp'].values.reshape(-1, 1)
df_missing['last_seen_timestamp'] = pd.to_datetime(model.predict(X_missing), unit='ns')

# Drop the temporary timestamp column
df_missing = df_missing.drop(columns=['first_seen_timestamp'])

# Combine datasets
df_imputed = pd.concat([df_not_missing, df_missing]).reset_index(drop=True)

mask = df_imputed['last_seen'].isna()
df_imputed.loc[mask, 'last_seen'] = df_imputed.loc[mask, 'last_seen_timestamp'].apply(lambda x: x[0].strftime('%Y-%m-%d'))

entries[columns_selected] = df_imputed[columns_selected]

entries

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entries_predicted['first_seen'] = pd.to_datetime(entries_predicted['first_seen'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entries_predicted['last_seen'] = pd.to_datetime(entries_predicted['last_seen'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entries_predicted['first_seen_timestamp'] =

Unnamed: 0,sha1,sha256,md5,first_seen,last_seen,file_type,delivery_method
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,2021-11-09,2021-11-09,exe,email attachment
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,2023-07-19,2023-08-25,exe,web download
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,2022-01-04,2022-02-04,exe,unknown
3,55cbf8dae95d350ad7e5a63b418f6cd203488f51,8b7ae9f195b075a789d6d8277d500d27754bfa3c53ecca8db7beac8ccd07884f,97d9d216f2627cb238ca6637580b1739,2023-06-13,2023-06-13,exe,unknown
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,2022-03-27,2022-04-06,exe,web download
...,...,...,...,...,...,...,...
619,45c4340c4bfcf6145638b6a9556c42b25a07db76,eff2bda9797c042cbae44c8aed29b31b853733c0676a687dc62676752197c05d,7ebd8264cdecb8f522b51b0490a3f901,2022-12-23,2023-01-05,exe,web download
620,dc040262f90dd233b35fe922ec98834eb9f44f9b,7f7670c31d0f9eb80f8839e8e063aa01d44c77712d90066323fe155480f5f69c,6d7e39aeeb28c8352269a6357f583c11,2022-11-21,2022-12-03,exe,email attachment
621,b70e2199703c316465af96540561bcd002321045,0252937e5af74369499c21efe832306dccf718402b08c67b2d338061ba1e67a0,69b64df678f3e397ab18b4e3db830d96,2020-11-21,2020-11-23,exe,unknown
622,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,2022-09-08,2022-09-19,exe,unknown



- drop rows with missing values related to sha1 sha256, md5 fields because they are the keys of the entity Entries

In [13]:
entries.dropna(subset=['sha1', 'sha256', 'md5'], inplace=True)
entries.reset_index(drop=True, inplace=True)

entries


Unnamed: 0,sha1,sha256,md5,first_seen,last_seen,file_type,delivery_method
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,2021-11-09,2021-11-09,exe,email attachment
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,2023-07-19,2023-08-25,exe,web download
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,2022-01-04,2022-02-04,exe,unknown
3,55cbf8dae95d350ad7e5a63b418f6cd203488f51,8b7ae9f195b075a789d6d8277d500d27754bfa3c53ecca8db7beac8ccd07884f,97d9d216f2627cb238ca6637580b1739,2023-06-13,2023-06-13,exe,unknown
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,2022-03-27,2022-04-06,exe,web download
...,...,...,...,...,...,...,...
619,45c4340c4bfcf6145638b6a9556c42b25a07db76,eff2bda9797c042cbae44c8aed29b31b853733c0676a687dc62676752197c05d,7ebd8264cdecb8f522b51b0490a3f901,2022-12-23,2023-01-05,exe,web download
620,dc040262f90dd233b35fe922ec98834eb9f44f9b,7f7670c31d0f9eb80f8839e8e063aa01d44c77712d90066323fe155480f5f69c,6d7e39aeeb28c8352269a6357f583c11,2022-11-21,2022-12-03,exe,email attachment
621,b70e2199703c316465af96540561bcd002321045,0252937e5af74369499c21efe832306dccf718402b08c67b2d338061ba1e67a0,69b64df678f3e397ab18b4e3db830d96,2020-11-21,2020-11-23,exe,unknown
622,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,2022-09-08,2022-09-19,exe,unknown


# Yara Rules 

In [46]:
yaraRules = pd.read_csv(path_MB_YaraRules, low_memory = False)
yaraRules.head(3)


Unnamed: 0,sha1,sha256,md5,rule_name,description,author,reference
0,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,iexplorer_remcos,Detect iexplorer being taken over by Remcos,iam-py-test,
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,INDICATOR_SUSPICIOUS_EXE_UACBypass_CMSTPCOM,Detects Windows exceutables bypassing UAC using CMSTP COM interfaces. MITRE (T1218.003),ditekSHen,
2,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,pe_imphash,,,


## Data Wrangling

#### Column Renaming

In [47]:
yaraRules.rename(columns= { 'rule_name' : 'yara_rule_name', 'description' : 'yara_rule_description', 'author' : 'yara_rule_author', 'reference' : 'yara_rule_reference' }, inplace=True)
yaraRules

Unnamed: 0,sha1,sha256,md5,yara_rule_name,yara_rule_description,yara_rule_author,yara_rule_reference
0,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,iexplorer_remcos,Detect iexplorer being taken over by Remcos,iam-py-test,
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,INDICATOR_SUSPICIOUS_EXE_UACBypass_CMSTPCOM,Detects Windows exceutables bypassing UAC using CMSTP COM interfaces. MITRE (T1218.003),ditekSHen,
2,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,pe_imphash,,,
3,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,Remcos,Remcos Payload,kevoreilly,
4,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,REMCOS_RAT_variants,,,
...,...,...,...,...,...,...,...
3433,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,Skystars_Malware_Imphash,imphash,Skystars LightDefender,
3434,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,win_asyncrat_j1,detects AsyncRAT,Johannes Bader @viql,
3435,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,win_asyncrat_w0,detect AsyncRat in memory,JPCERT/CC Incident Response Group,internal research
3436,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,linux_generic_ipv6_catcher,ELF samples using IPv6 addresses,@_lubiedo,


## Data Cleaning

#### Missing Values

- drop rows with missing values related to sha1 sha256, md5 and yaraRules field because they are the keys of that row

In [48]:
yaraRules.dropna(subset=['sha1', 'sha256', 'md5', 'yara_rule_name'], inplace=True)
yaraRules.reset_index(drop=True, inplace=True)

yaraRules


Unnamed: 0,sha1,sha256,md5,yara_rule_name,yara_rule_description,yara_rule_author,yara_rule_reference
0,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,iexplorer_remcos,Detect iexplorer being taken over by Remcos,iam-py-test,
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,INDICATOR_SUSPICIOUS_EXE_UACBypass_CMSTPCOM,Detects Windows exceutables bypassing UAC using CMSTP COM interfaces. MITRE (T1218.003),ditekSHen,
2,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,pe_imphash,,,
3,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,Remcos,Remcos Payload,kevoreilly,
4,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,REMCOS_RAT_variants,,,
...,...,...,...,...,...,...,...
3433,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,Skystars_Malware_Imphash,imphash,Skystars LightDefender,
3434,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,win_asyncrat_j1,detects AsyncRAT,Johannes Bader @viql,
3435,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,win_asyncrat_w0,detect AsyncRat in memory,JPCERT/CC Incident Response Group,internal research
3436,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,linux_generic_ipv6_catcher,ELF samples using IPv6 addresses,@_lubiedo,


- drop column 'reference' due to high percentage of missing values

In [49]:
yaraRules = yaraRules.drop(columns=['yara_rule_reference', ])

yaraRules


Unnamed: 0,sha1,sha256,md5,yara_rule_name,yara_rule_description,yara_rule_author
0,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,iexplorer_remcos,Detect iexplorer being taken over by Remcos,iam-py-test
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,INDICATOR_SUSPICIOUS_EXE_UACBypass_CMSTPCOM,Detects Windows exceutables bypassing UAC using CMSTP COM interfaces. MITRE (T1218.003),ditekSHen
2,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,pe_imphash,,
3,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,Remcos,Remcos Payload,kevoreilly
4,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,REMCOS_RAT_variants,,
...,...,...,...,...,...,...
3433,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,Skystars_Malware_Imphash,imphash,Skystars LightDefender
3434,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,win_asyncrat_j1,detects AsyncRAT,Johannes Bader @viql
3435,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,win_asyncrat_w0,detect AsyncRat in memory,JPCERT/CC Incident Response Group
3436,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,linux_generic_ipv6_catcher,ELF samples using IPv6 addresses,@_lubiedo


#### Standardization
- the columns yara_rule_name and yara_rule_description are all converted to lower case and _ are replaces with spaces

In [61]:
yaraRules['yara_rule_name'] = yaraRules['yara_rule_name'].replace('_', ' ', regex=True).str.lower()
yaraRules['yara_rule_description'] = yaraRules['yara_rule_description'].replace('_', ' ', regex=True).str.lower()
yaraRules

Unnamed: 0,sha1,sha256,md5,yara_rule_name,yara_rule_description,yara_rule_author
0,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,iexplorer remcos,detect iexplorer being taken over by remcos,iam-py-test
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,indicator suspicious exe uacbypass cmstpcom,detects windows exceutables bypassing uac using cmstp com interfaces. mitre (t1218.003),ditekSHen
2,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,pe imphash,,
3,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,remcos,remcos payload,kevoreilly
4,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,remcos rat variants,,
...,...,...,...,...,...,...
3433,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,skystars malware imphash,imphash,Skystars LightDefender
3434,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,win asyncrat j1,detects asyncrat,Johannes Bader @viql
3435,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,win asyncrat w0,detect asyncrat in memory,JPCERT/CC Incident Response Group
3436,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,linux generic ipv6 catcher,elf samples using ipv6 addresses,@_lubiedo


# Behaviours

In [63]:
behaviours = pd.read_csv(path_MB_Behaviours, low_memory = False)
behaviours

Unnamed: 0,sha1,sha256,md5,threat_name,threat_level,score
0,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Searching for the window,neutral,
1,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Сreating synchronization primitives,neutral,
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Unauthorized injection to a recently created process,neutral,
3,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Reading critical registry keys,neutral,
4,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Creating a file in the %temp% directory,neutral,
...,...,...,...,...,...,...
9547,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,AsyncRat,,10.0
9548,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Contacts a large (345195) amount of remote hosts,,9.0
9549,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Creates a large amount of network flows,,9.0
9550,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Modifies the Watchdog daemon,,9.0


#### Note : There is a dependency between the 2 fields threat_level and threat_score

## Data Cleaning

#### Column Renaming

In [64]:
behaviours.rename(columns= { 'score' : 'action_maliciousness_score', 'threat_level' : 'action_maliciousness_level', 'threat_name' : 'action_name' }, inplace=True)
behaviours

Unnamed: 0,sha1,sha256,md5,action_name,action_maliciousness_level,action_maliciousness_score
0,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Searching for the window,neutral,
1,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Сreating synchronization primitives,neutral,
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Unauthorized injection to a recently created process,neutral,
3,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Reading critical registry keys,neutral,
4,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Creating a file in the %temp% directory,neutral,
...,...,...,...,...,...,...
9547,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,AsyncRat,,10.0
9548,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Contacts a large (345195) amount of remote hosts,,9.0
9549,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Creates a large amount of network flows,,9.0
9550,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Modifies the Watchdog daemon,,9.0


#### Domain-specific transformations
Malicious attribute is not a score but is a threat_level ->  if i found the threat levele empty i use malicious as threat level

In [65]:
behaviours['action_maliciousness_level'].unique()

array(['neutral', 'suspicious', 'malicious', nan], dtype=object)

In [66]:
behaviours['action_maliciousness_score'].unique()

array([nan, 10.,  6.,  5.,  8.,  7.,  4.,  3.,  9.,  1.])

Both the fields 'threat_level' and 'threat_score' try to identify the maliciousness of a threat but they are not present at the same time, so:
- replace 'neutral' as 'threat_level' to 5
- replace 'suspicious' as 'threat_level' to 7.5
- replace 'malicious' as 'threat_level' to 10


In [67]:
completeness_t = behaviours['action_maliciousness_score'].count() / len(behaviours['action_maliciousness_score']) * 100
print(f"Completeness della colonna 'action_maliciousness_score': {completeness_t:.2f}%")
completeness_l = behaviours['action_maliciousness_level'].count() / len(behaviours['action_maliciousness_level']) * 100
print(f"Completeness della colonna 'action_maliciousness_level': {completeness_l:.2f}%")

Completeness della colonna 'action_maliciousness_score': 30.62%
Completeness della colonna 'action_maliciousness_level': 47.45%


In [68]:
all_nan_rows = behaviours[['action_maliciousness_score', 'action_maliciousness_level']].apply(lambda row: any(row.isna()), axis=1)

# Check if it's true for all rows
is_true_for_all_rows = all_nan_rows.all()

# Display the result
print(f"Are 'behaviours.action_maliciousness_score' or 'behaviours.action_maliciousness_level' disjonted?  {is_true_for_all_rows}")

Are 'behaviours.action_maliciousness_score' or 'behaviours.action_maliciousness_level' disjonted?  True


Due to the fact that the 2 attribute are disjonted, so i cannot predict the missing values of a column from the values of the other column. I have imposed a simple rule on values of action_maliciousness_score on which i have assigned a action_maliciousness_level

If found :
- 'threat_score' > 8 , set 'threat_level' to 'malicious'
- 'threat_score' <= 6 , set 'threat_level' to 'neutral'
- 'threat_score' > 6 and <= 8 , set 'threat_level' to 'neutral'

In [69]:
behaviours.loc[(behaviours['action_maliciousness_level'].isna()) & (behaviours['action_maliciousness_score'] > 8), 'action_maliciousness_level'] = 'malicious'
behaviours.loc[(behaviours['action_maliciousness_level'].isna()) & (behaviours['action_maliciousness_score'] <= 8 ) & (behaviours['action_maliciousness_score'] > 6 ), 'action_maliciousness_level'] = 'suspicious'
behaviours.loc[(behaviours['action_maliciousness_level'].isna()) & (behaviours['action_maliciousness_score'] <= 6), 'action_maliciousness_level'] = 'neutral'
behaviours.loc[(behaviours['action_maliciousness_level'].isna()) & behaviours['action_maliciousness_level'].isna(), 'action_maliciousness_level'] = 'unknown'

behaviours

Unnamed: 0,sha1,sha256,md5,action_name,action_maliciousness_level,action_maliciousness_score
0,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Searching for the window,neutral,
1,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Сreating synchronization primitives,neutral,
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Unauthorized injection to a recently created process,neutral,
3,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Reading critical registry keys,neutral,
4,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Creating a file in the %temp% directory,neutral,
...,...,...,...,...,...,...
9547,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,AsyncRat,malicious,10.0
9548,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Contacts a large (345195) amount of remote hosts,malicious,9.0
9549,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Creates a large amount of network flows,malicious,9.0
9550,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Modifies the Watchdog daemon,malicious,9.0


In [70]:
completeness_t = behaviours['action_maliciousness_score'].count() / len(behaviours['action_maliciousness_score']) * 100
print(f"Completeness della colonna 'action_maliciousness_score': {completeness_t:.2f}%")
completeness_l = behaviours['action_maliciousness_level'].count() / len(behaviours['action_maliciousness_level']) * 100
print(f"Completeness della colonna 'action_maliciousness_level': {completeness_l:.2f}%")

Completeness della colonna 'action_maliciousness_score': 30.62%
Completeness della colonna 'action_maliciousness_level': 100.00%


#### Missing Values

- drop rows with missing values related to sha1 sha256 and md5 field because they cannot be associate to any file
- drop the action_maliciousness_score column due to high missings 

In [71]:
behaviours.dropna(subset=['sha1', 'sha256', 'md5', 'action_name'], inplace=True)
behaviours.reset_index(drop=True, inplace=True)

behaviours

Unnamed: 0,sha1,sha256,md5,action_name,action_maliciousness_level,action_maliciousness_score
0,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Searching for the window,neutral,
1,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Сreating synchronization primitives,neutral,
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Unauthorized injection to a recently created process,neutral,
3,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Reading critical registry keys,neutral,
4,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Creating a file in the %temp% directory,neutral,
...,...,...,...,...,...,...
9547,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,AsyncRat,malicious,10.0
9548,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Contacts a large (345195) amount of remote hosts,malicious,9.0
9549,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Creates a large amount of network flows,malicious,9.0
9550,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Modifies the Watchdog daemon,malicious,9.0


In [72]:
behaviours = behaviours.drop(columns=['action_maliciousness_score'])
behaviours

Unnamed: 0,sha1,sha256,md5,action_name,action_maliciousness_level
0,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Searching for the window,neutral
1,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Сreating synchronization primitives,neutral
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Unauthorized injection to a recently created process,neutral
3,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Reading critical registry keys,neutral
4,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Creating a file in the %temp% directory,neutral
...,...,...,...,...,...
9547,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,AsyncRat,malicious
9548,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Contacts a large (345195) amount of remote hosts,malicious
9549,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Creates a large amount of network flows,malicious
9550,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,Modifies the Watchdog daemon,malicious


#### Standardization
- the column action_name is converted to lower case and _ are replaces with spaces

In [74]:
behaviours['action_name'] = behaviours['action_name'].replace('_', ' ', regex=True).str.lower()
behaviours

Unnamed: 0,sha1,sha256,md5,action_name,action_maliciousness_level
0,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,searching for the window,neutral
1,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,сreating synchronization primitives,neutral
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,unauthorized injection to a recently created process,neutral
3,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,reading critical registry keys,neutral
4,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,creating a file in the %temp% directory,neutral
...,...,...,...,...,...
9547,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,asyncrat,malicious
9548,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,contacts a large (345195) amount of remote hosts,malicious
9549,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,creates a large amount of network flows,malicious
9550,4b3f7c7b3d8f6748505115fc291b6ac02e1b99a0,c3217157c0ab2d83047f679a073d19d296a03306072ef9636a8c8be52858347d,aff1636c687b809a106aadf5eb0e40ee,modifies the watchdog daemon,malicious


# Threat Vendor Analysis

In [82]:
tv_analysis = pd.read_csv(path_MB_TV_Analysis, low_memory = False)
tv_analysis

Unnamed: 0,sha1,sha256,md5,malware,verdict,threat_vendor_name
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,PUA.Win.Packer.BorlandDelphi-15,,clamav
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,SecuriteInfo.com.Trojan.GenericKD.68242337.20401.3632.UNOFFICIAL,,clamav
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Win.Trojan.Remcos-9753190-0,,clamav
3,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Win.Trojan.Remcos-9841897-0,,clamav
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,SecuriteInfo.com.Generic.mg.1b09de36dfe5850d.24204.UNOFFICIAL,,clamav
...,...,...,...,...,...,...
7169,45c4340c4bfcf6145638b6a9556c42b25a07db76,eff2bda9797c042cbae44c8aed29b31b853733c0676a687dc62676752197c05d,7ebd8264cdecb8f522b51b0490a3f901,,,VMRay
7170,dc040262f90dd233b35fe922ec98834eb9f44f9b,7f7670c31d0f9eb80f8839e8e063aa01d44c77712d90066323fe155480f5f69c,6d7e39aeeb28c8352269a6357f583c11,,,VMRay
7171,b70e2199703c316465af96540561bcd002321045,0252937e5af74369499c21efe832306dccf718402b08c67b2d338061ba1e67a0,69b64df678f3e397ab18b4e3db830d96,,,VMRay
7172,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,,,VMRay


## Data Cleaning

#### Missing Values
- The keys 'sha1', 'sha256', 'md5', 'threat_vendor_name' must be always different from NaN: drop
- The rows with malware and verdict Nan are dropped

In [83]:
tv_analysis.dropna(subset=['sha1', 'sha256', 'md5'], inplace=True)
tv_analysis.dropna(subset=['malware', 'verdict'], how = 'all', inplace=True)

tv_analysis.reset_index(drop=True, inplace=True)

tv_analysis

Unnamed: 0,sha1,sha256,md5,malware,verdict,threat_vendor_name
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,PUA.Win.Packer.BorlandDelphi-15,,clamav
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,SecuriteInfo.com.Trojan.GenericKD.68242337.20401.3632.UNOFFICIAL,,clamav
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Win.Trojan.Remcos-9753190-0,,clamav
3,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,Win.Trojan.Remcos-9841897-0,,clamav
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,SecuriteInfo.com.Generic.mg.1b09de36dfe5850d.24204.UNOFFICIAL,,clamav
...,...,...,...,...,...,...
5240,591a3cecce37189369faf4c127ba28f7f44b2ff4,fcdb09f9b63c86b92059cb0c636da21a2daee289a120cff346a7d73b02bdff4e,a311cef429085f54e95b32fd836c56b6,Remcos,malicious,VMRay
5241,891e8e84555f3fc08bff5e9ba5dc128685c87b40,99e8dfa23cef1d5d67c765df3de3bc6e750a2d8fa4628a9442d08fc40aaaa656,1804fd17264130f722df702777fa9e9c,Netwire,malicious,VMRay
5242,51c778339d04ae3436b22c42ecc3a0031a8fe9ad,9e2504ae76b162ef7eee19d01cb5892510d8f27732e93ac4a7e91b1b16ba18c9,1738c41acc50acec984a336fa0ecbab1,"xRAT,QuasarRAT",malicious,VMRay
5243,b1e6a13c57c8e28bbcd25b851475c60116302589,81d77842292a2b26a9340665963f782bab88f4e3f2091cc8a60826e9f3648669,ac96eb9b8ff366bcda004060171dd2f1,FormBook,malicious,VMRay


#### Standardization
In the Field verdict_prediction is converted to lower case

In [85]:
tv_analysis['malware'] = tv_analysis['malware'].str.lower()
tv_analysis['verdict'] = tv_analysis['verdict'].str.lower()
tv_analysis['threat_vendor_name'] = tv_analysis['threat_vendor_name'].str.lower()


tv_analysis

Unnamed: 0,sha1,sha256,md5,malware,verdict,threat_vendor_name
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,pua.win.packer.borlanddelphi-15,,clamav
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,securiteinfo.com.trojan.generickd.68242337.20401.3632.unofficial,,clamav
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,win.trojan.remcos-9753190-0,,clamav
3,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,win.trojan.remcos-9841897-0,,clamav
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,securiteinfo.com.generic.mg.1b09de36dfe5850d.24204.unofficial,,clamav
...,...,...,...,...,...,...
5240,591a3cecce37189369faf4c127ba28f7f44b2ff4,fcdb09f9b63c86b92059cb0c636da21a2daee289a120cff346a7d73b02bdff4e,a311cef429085f54e95b32fd836c56b6,remcos,malicious,vmray
5241,891e8e84555f3fc08bff5e9ba5dc128685c87b40,99e8dfa23cef1d5d67c765df3de3bc6e750a2d8fa4628a9442d08fc40aaaa656,1804fd17264130f722df702777fa9e9c,netwire,malicious,vmray
5242,51c778339d04ae3436b22c42ecc3a0031a8fe9ad,9e2504ae76b162ef7eee19d01cb5892510d8f27732e93ac4a7e91b1b16ba18c9,1738c41acc50acec984a336fa0ecbab1,"xrat,quasarrat",malicious,vmray
5243,b1e6a13c57c8e28bbcd25b851475c60116302589,81d77842292a2b26a9340665963f782bab88f4e3f2091cc8a60826e9f3648669,ac96eb9b8ff366bcda004060171dd2f1,formbook,malicious,vmray


#### Discretization
In the Field verdict_prediction:
- terms 'Malicious activity', 'malicious', 'MALICIOUS' are set to 'malicious'
- terms 'suspicious', 'SUSPICIOUS' are set to 'suspicious'

In [86]:
tv_analysis['verdict'].unique()

array([nan, 'malicious activity', 'no threats detected',
       'suspicious activity', 'malicious file', 'remcos', 'kryptik',
       'njrat', 'suspicious file', 'ispy keylogger', 'legit file',
       'backdoor', 'eldorado', 'agenttesla', 'bashlite', 'gamarue',
       'redline', 'farheyt', 'unknown', 'nanocore',
       'netsupportmanager rat', 'mirai', 'emotet', 'vjw0rm', 'dropper',
       'tinba', 'revengerat', 'alphacrypt', 'trojan', 'password stealer',
       'starter', 'ave maria', 'legal file', 'formbook', 'malicious',
       'suspicious', 'trusted', 'known', 'likely_malicious',
       'informational'], dtype=object)

In [93]:
# MALICIOUS
tv_analysis.loc[tv_analysis['verdict'].str.contains('malicious', case=False, na=False), 'verdict'] = 'malicious'

# SUSPICIOUS
tv_analysis.loc[tv_analysis['verdict'].str.contains('suspicious', case=False, na=False), 'verdict'] = 'suspicious'

# UNKNOWN
tv_analysis.loc[tv_analysis['verdict'].str.contains('unknown', case=False, na=False), 'verdict'] = 'unknown'

# LEGIT
tv_analysis.loc[tv_analysis['verdict'].str.contains('legit', case=False, na=False), 'verdict'] = 'legit'
tv_analysis.loc[tv_analysis['verdict'].str.contains('no threat', case=False, na=False), 'verdict'] = 'legit'
tv_analysis.loc[tv_analysis['verdict'].str.contains('legal', case=False, na=False), 'verdict'] = 'legit'
tv_analysis.loc[tv_analysis['verdict'].str.contains('trusted', case=False, na=False), 'verdict'] = 'legit'
tv_analysis.loc[tv_analysis['verdict'].str.contains('known', case=False, na=False), 'verdict'] = 'legit'
tv_analysis.loc[tv_analysis['verdict'].str.contains('informational', case=False, na=False), 'verdict'] = 'legit'


In [94]:
tv_analysis.loc[~tv_analysis['verdict'].isin(['malicious', 'legit', 'suspicious', np.nan]), 'verdict'] = 'malicious'


In [95]:
tv_analysis['verdict'].unique()

array([nan, 'malicious', 'legit', 'suspicious'], dtype=object)

# Tags

In [96]:
tags = pd.read_csv(path_MB_Tags, low_memory = False)
tags

Unnamed: 0,sha1,sha256,md5,tag_name
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,exe
1,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,RemcosRAT
2,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,32
3,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,exe
4,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,RemcosRAT
...,...,...,...,...
1620,dc040262f90dd233b35fe922ec98834eb9f44f9b,7f7670c31d0f9eb80f8839e8e063aa01d44c77712d90066323fe155480f5f69c,6d7e39aeeb28c8352269a6357f583c11,RAT
1621,b70e2199703c316465af96540561bcd002321045,0252937e5af74369499c21efe832306dccf718402b08c67b2d338061ba1e67a0,69b64df678f3e397ab18b4e3db830d96,RemcosRAT
1622,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,AsyncRAT
1623,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,exe


## Data Cleaning

#### Missing Values

- drop rows with missing values related to sha1 sha256 and md5 field because they cannot be associate to any file
- drop rows with missing values related to 'tag name'

In [97]:
tags.dropna(subset=['sha1', 'sha256', 'md5'], inplace=True)
tags.dropna(subset=['tag_name'], inplace=True)

tags.reset_index(drop=True, inplace=True)

tags

Unnamed: 0,sha1,sha256,md5,tag_name
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,exe
1,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,RemcosRAT
2,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,32
3,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,exe
4,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,RemcosRAT
...,...,...,...,...
1620,dc040262f90dd233b35fe922ec98834eb9f44f9b,7f7670c31d0f9eb80f8839e8e063aa01d44c77712d90066323fe155480f5f69c,6d7e39aeeb28c8352269a6357f583c11,RAT
1621,b70e2199703c316465af96540561bcd002321045,0252937e5af74369499c21efe832306dccf718402b08c67b2d338061ba1e67a0,69b64df678f3e397ab18b4e3db830d96,RemcosRAT
1622,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,AsyncRAT
1623,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,exe


#### Standardization
In the Field tag_name is converted to lower case

In [100]:
tags['tag_name'] = tags['tag_name'].str.lower()

tags

Unnamed: 0,sha1,sha256,md5,tag_name
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,exe
1,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,remcosrat
2,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,32
3,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,exe
4,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,remcosrat
...,...,...,...,...
1620,dc040262f90dd233b35fe922ec98834eb9f44f9b,7f7670c31d0f9eb80f8839e8e063aa01d44c77712d90066323fe155480f5f69c,6d7e39aeeb28c8352269a6357f583c11,rat
1621,b70e2199703c316465af96540561bcd002321045,0252937e5af74369499c21efe832306dccf718402b08c67b2d338061ba1e67a0,69b64df678f3e397ab18b4e3db830d96,remcosrat
1622,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,asyncrat
1623,5d0cf2954a43c613f270ee9c6ea2e74798787725,eaf087b8fe0cf69bed568070ae460d63483525842d4c84c7750ef87c4985e622,7b25379caa4340a714b210dfd01fd5f9,exe


## Saving

In [101]:
dataframes_dict = {}

# Add the DataFrames to the dictionary with keys
dataframes_dict['Entries'] = entries
dataframes_dict['TV_Analysis'] = tv_analysis
dataframes_dict['Behaviours'] = behaviours
dataframes_dict['Tags'] = tags
dataframes_dict['YaraRules'] = yaraRules

In [102]:
save_directory = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataCleaning\\Malware_Bazaar\\"
for key, df in dataframes_dict.items():
    file_name = f"{key}.csv"
    file_path = save_directory + file_name
    df.to_csv(file_path, index=False)
    print(f'{key} added')

Entries added
TV_Analysis added
Behaviours added
Tags added
YaraRules added
