# 16.0. Data Cleaning - OTX Alien Vault

In [1]:
from datetime import datetime
import os
import time
import pandas as pd
import requests
import urllib3
import json
import sys
import numpy as np

# Display settings: allow up to 1000 characters per cell and never truncate the column list.
pd.set_option('display.max_colwidth', 1000)
pd.options.display.max_columns = None

In [2]:
# Folder holding the OTX AlienVault CSV exports.
# NOTE: the separators are doubled backslashes inside raw strings, so every
# resulting path literally contains "\\" between its components.
_OTX_DATA_DIR = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\DataAnalysis\\otx_alien_vault"
_OTX_SEP = r"\\"


def _otx_csv(file_name):
    """Return the full path of the CSV export called `file_name` inside the OTX folder."""
    return _OTX_SEP.join([_OTX_DATA_DIR, file_name])


path_OTX_Entries     = _otx_csv("Entries.csv")
path_OTX_Anomalies   = _otx_csv("Anomalies.csv")
path_OTX_Attacks     = _otx_csv("Attacks.csv")
path_OTX_Behaviours  = _otx_csv("Behaviours.csv")
path_OTX_TV_Analysis = _otx_csv("TV_Analysis.csv")

# Entries

In [3]:
# Load the raw Entries export and preview its first three rows.
entries = pd.read_csv(path_OTX_Entries, low_memory=False)
entries.head(n=3)


Unnamed: 0,analysis.info.results.sha1,analysis.info.results.sha256,analysis.info.results.md5,page_type,analysis.datetime_int
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,PEXE,2022-02-25T20:30:34
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,PEXE,2018-11-21T00:58:28
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,PEXE,2022-07-07T00:45:04


## Data Wrangling

#### Column Renaming

In [4]:
# Map the verbose OTX column names to the short names used in the rest of the notebook.
entries_column_names = {
    'analysis.info.results.sha1': 'sha1',
    'analysis.info.results.sha256': 'sha256',
    'analysis.info.results.md5': 'md5',
    'analysis.datetime_int': 'analysis_date',
    'page_type': 'file_type',
}
entries = entries.rename(columns=entries_column_names)
entries

Unnamed: 0,sha1,sha256,md5,file_type,analysis_date
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,PEXE,2022-02-25T20:30:34
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,PEXE,2018-11-21T00:58:28
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,PEXE,2022-07-07T00:45:04
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,PEXE,2021-12-20T02:27:16
4,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758,PEXE,2020-10-09T19:29:05
...,...,...,...,...,...
2653,3ab53037e2360c42b57d213234ced58717844a71,5de4932cb0ccab3fec711135cdd1af5a09c26a870bfb2d2adc650d0fdbf16488,e0fea510c784eb0e440e99e53d83d414,PEXE,2021-07-14T04:25:36
2654,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,PEXE,2020-09-08T06:08:36
2655,ba5e50a39d1502e38196724ec9d844ee4bf8d002,f3fcbb0fedb1e3b732185aebbf845ca185c950ca3635026d8a754312220577c9,0550342e66698384f232ac39b700dd14,PEXE,2020-11-21T16:28:28
2656,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,PEXE,2021-12-20T19:52:42


## Data Cleaning

#### Standardization
- the `file_type` column is converted to lower case

In [8]:
# Lower-case every file-type label.
lowercase_file_type = entries['file_type'].str.lower()
entries['file_type'] = lowercase_file_type


In [10]:
# Normalise the case first and only then map 'pexe' to 'exe'. The pattern is
# lower-case, so replacing before lower-casing would leave raw labels such as
# 'PEXE' as 'pexe' whenever this cell runs on labels that are still upper-case.
entries['file_type'] = entries['file_type'].str.lower().replace('pexe', 'exe', regex=True)

# List the distinct file types that remain after normalisation.
entries.file_type.unique()

array(['exe', 'elf', nan, 'sh', 'vbs', 'lnk', 'js', 'ooxml', 'android',
       'generic', 'ps', 'msi', 'rtf', 'cdf', 'hta', 'bat', 'pdf'],
      dtype=object)

#### Data type Conversion


- analysis_date field : from timestamp string to date string (`YYYY-MM-DD`, the time of day is dropped)

In [11]:
# Parse the timestamps, then keep only the calendar date formatted as 'YYYY-MM-DD'.
# The column holds formatted strings afterwards, not datetime values.
analysis_timestamps = pd.to_datetime(entries['analysis_date'])
entries['analysis_date'] = analysis_timestamps.dt.strftime('%Y-%m-%d')
entries

Unnamed: 0,sha1,sha256,md5,file_type,analysis_date
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,exe,2022-02-25
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,exe,2018-11-21
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,exe,2022-07-07
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,exe,2021-12-20
4,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758,exe,2020-10-09
...,...,...,...,...,...
2653,3ab53037e2360c42b57d213234ced58717844a71,5de4932cb0ccab3fec711135cdd1af5a09c26a870bfb2d2adc650d0fdbf16488,e0fea510c784eb0e440e99e53d83d414,exe,2021-07-14
2654,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,exe,2020-09-08
2655,ba5e50a39d1502e38196724ec9d844ee4bf8d002,f3fcbb0fedb1e3b732185aebbf845ca185c950ca3635026d8a754312220577c9,0550342e66698384f232ac39b700dd14,exe,2020-11-21
2656,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,exe,2021-12-20


#### Missing Values

- drop rows with a missing `sha1`, `sha256` or `md5` value, because these hashes are the keys of the Entries entity

In [12]:
# Discard rows lacking any of the three hashes, then renumber the remaining rows from 0.
entries = (
    entries
    .dropna(subset=['sha1', 'sha256', 'md5'])
    .reset_index(drop=True)
)

entries


Unnamed: 0,sha1,sha256,md5,file_type,analysis_date
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,exe,2022-02-25
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,exe,2018-11-21
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,exe,2022-07-07
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,exe,2021-12-20
4,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758,exe,2020-10-09
...,...,...,...,...,...
2641,3ab53037e2360c42b57d213234ced58717844a71,5de4932cb0ccab3fec711135cdd1af5a09c26a870bfb2d2adc650d0fdbf16488,e0fea510c784eb0e440e99e53d83d414,exe,2021-07-14
2642,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,exe,2020-09-08
2643,ba5e50a39d1502e38196724ec9d844ee4bf8d002,f3fcbb0fedb1e3b732185aebbf845ca185c950ca3635026d8a754312220577c9,0550342e66698384f232ac39b700dd14,exe,2020-11-21
2644,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,exe,2021-12-20


In [14]:
# Completeness = share of non-missing values, reported as a percentage for each column.
for column in ['sha1', 'sha256', 'md5', 'analysis_date', 'file_type']:
    completeness = 1 - entries[column].isna().mean()
    print(f"Completeness for '{column}': {completeness * 100:.2f}%")

Completeness for 'sha1': 100.00%
Completeness for 'sha256': 100.00%
Completeness for 'md5': 100.00%
Completeness for 'analysis_date': 100.00%
Completeness for 'file_type': 100.00%


# Anomalies 

In [15]:
# Load the raw Anomalies export and preview its first three rows.
anomalies = pd.read_csv(path_OTX_Anomalies, low_memory=False)
anomalies.head(n=3)


Unnamed: 0,name,sha1,sha256,md5
0,checksum_header_zero,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38
1,checksum_header_zero,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860
2,entropy_based,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135


In [16]:
# True as soon as at least one cell of the frame is missing.
has_missing_values = anomalies.isna().any().any()
print("Any NaN values in the DataFrame:", has_missing_values)


Any NaN values in the DataFrame: False


## Data Wrangling

#### Column Renaming

In [17]:
# Give the generic 'name' column an entity-specific name.
anomalies = anomalies.rename(columns={'name': 'anomaly_name'})
anomalies

Unnamed: 0,anomaly_name,sha1,sha256,md5
0,checksum_header_zero,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38
1,checksum_header_zero,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860
2,entropy_based,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135
3,checksum_header_zero,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135
4,checksum_header_zero,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758
...,...,...,...,...
3233,entropy_based,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29
3234,checksum_header_zero,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29
3235,checksum_header_zero,ba5e50a39d1502e38196724ec9d844ee4bf8d002,f3fcbb0fedb1e3b732185aebbf845ca185c950ca3635026d8a754312220577c9,0550342e66698384f232ac39b700dd14
3236,checksum_header_zero,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5


In [18]:
# Keep only the three hashes and the anomaly name, with the hashes first.
anomaly_columns = ['sha1', 'sha256', 'md5', 'anomaly_name']
# .copy() makes the selection an independent frame: the cleaning cells below
# modify `anomalies` in place, and doing that on a column subset of another
# frame can raise SettingWithCopyWarning.
anomalies = anomalies[anomaly_columns].copy()
anomalies

Unnamed: 0,sha1,sha256,md5,anomaly_name
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checksum_header_zero
1,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,checksum_header_zero
2,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,entropy_based
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,checksum_header_zero
4,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758,checksum_header_zero
...,...,...,...,...
3233,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,entropy_based
3234,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,checksum_header_zero
3235,ba5e50a39d1502e38196724ec9d844ee4bf8d002,f3fcbb0fedb1e3b732185aebbf845ca185c950ca3635026d8a754312220577c9,0550342e66698384f232ac39b700dd14,checksum_header_zero
3236,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,checksum_header_zero


## Data Cleaning

#### Missing Values

- drop rows with a missing `sha1`, `sha256`, `md5` or `anomaly_name` value, because together these fields are the key of the row

In [19]:
# Discard rows missing any key field, then renumber the remaining rows from 0.
# The result is rebound instead of using inplace=True: `anomalies` was built as a
# column subset of the loaded frame, and mutating such a subset in place can
# raise SettingWithCopyWarning.
anomalies = (
    anomalies
    .dropna(subset=['sha1', 'sha256', 'md5', 'anomaly_name'])
    .reset_index(drop=True)
)

anomalies


Unnamed: 0,sha1,sha256,md5,anomaly_name
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checksum_header_zero
1,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,checksum_header_zero
2,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,entropy_based
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,checksum_header_zero
4,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758,checksum_header_zero
...,...,...,...,...
3233,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,entropy_based
3234,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,checksum_header_zero
3235,ba5e50a39d1502e38196724ec9d844ee4bf8d002,f3fcbb0fedb1e3b732185aebbf845ca185c950ca3635026d8a754312220577c9,0550342e66698384f232ac39b700dd14,checksum_header_zero
3236,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,checksum_header_zero


#### Standardization
- the `anomaly_name` column is converted to lower case and underscores (`_`) are replaced with spaces

In [20]:
# Replace every underscore with a space, then lower-case the anomaly names.
anomaly_labels = anomalies['anomaly_name'].replace('_', ' ', regex=True)
anomalies['anomaly_name'] = anomaly_labels.str.lower()
anomalies

Unnamed: 0,sha1,sha256,md5,anomaly_name
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checksum header zero
1,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,checksum header zero
2,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,entropy based
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,checksum header zero
4,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758,checksum header zero
...,...,...,...,...
3233,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,entropy based
3234,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,checksum header zero
3235,ba5e50a39d1502e38196724ec9d844ee4bf8d002,f3fcbb0fedb1e3b732185aebbf845ca185c950ca3635026d8a754312220577c9,0550342e66698384f232ac39b700dd14,checksum header zero
3236,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,checksum header zero


# Attacks

In [21]:
# Load the raw Attacks export and preview its first three rows.
attacks = pd.read_csv(path_OTX_Attacks, low_memory=False)
attacks.head(n=3)


Unnamed: 0,attack_type,attack_name,attack_id,sha1,sha256,md5
0,Installation,Registry Run Keys / Start Folder,T1060,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31
1,Objectives,Input Capture,T1056,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31
2,Installation,Registry Run Keys / Start Folder,T1060,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860


In [22]:
# Keep the three hashes followed by the attack id, name and type.
attack_columns = ['sha1', 'sha256', 'md5', 'attack_id', 'attack_name', 'attack_type']
# .copy() makes the selection an independent frame: the cleaning cell below
# modifies `attacks` in place, and doing that on a column subset of another
# frame can raise SettingWithCopyWarning.
attacks = attacks[attack_columns].copy()
attacks

Unnamed: 0,sha1,sha256,md5,attack_id,attack_name,attack_type
0,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,T1060,Registry Run Keys / Start Folder,Installation
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,T1056,Input Capture,Objectives
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,T1060,Registry Run Keys / Start Folder,Installation
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,T1119,Automated Collection,Objectives
4,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,T1045,Software Packing,Installation
...,...,...,...,...,...,...
5391,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,T1055,"Defense Evasion, Privilege Escalation",Installation
5392,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,T1060,Registry Run Keys / Start Folder,Installation
5393,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,T1045,Software Packing,Installation
5394,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,T1055,"Defense Evasion, Privilege Escalation",Installation


#### Missing Values

- drop rows with a missing `sha1`, `sha256` or `md5` value, because they cannot be associated with any file

In [23]:
# Discard rows lacking any of the three hashes, then renumber the remaining rows from 0.
# The result is rebound instead of using inplace=True: `attacks` was built as a
# column subset of the loaded frame, and mutating such a subset in place can
# raise SettingWithCopyWarning.
attacks = (
    attacks
    .dropna(subset=['sha1', 'sha256', 'md5'])
    .reset_index(drop=True)
)

attacks

Unnamed: 0,sha1,sha256,md5,attack_id,attack_name,attack_type
0,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,T1060,Registry Run Keys / Start Folder,Installation
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,T1056,Input Capture,Objectives
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,T1060,Registry Run Keys / Start Folder,Installation
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,T1119,Automated Collection,Objectives
4,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,T1045,Software Packing,Installation
...,...,...,...,...,...,...
5391,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,T1055,"Defense Evasion, Privilege Escalation",Installation
5392,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,T1060,Registry Run Keys / Start Folder,Installation
5393,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,T1045,Software Packing,Installation
5394,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,T1055,"Defense Evasion, Privilege Escalation",Installation


# Behaviours

In [24]:
# Load the raw Behaviours export (kept in `actions`) and preview its first three rows.
actions = pd.read_csv(path_OTX_Behaviours, low_memory=False)
actions.head(n=3)

Unnamed: 0,sha1,sha256,md5,name,description,severity,markcount
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checks_debugger,Checks if process is being debugged by a debugger,1.0,2.0
1,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,generates_crypto_key,Uses Windows APIs to generate a cryptographic key,1.0,3.0
2,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm_memory_available,"Checks amount of memory in system, this can be used to detect virtual machines that have a low amount of memory available",1.0,1.0


## Data Wrangling

#### Column Renaming

In [25]:
# Prefix the generic column names with 'action_'; 'markcount' becomes 'action_counter'.
action_column_names = {
    'name': 'action_name',
    'description': 'action_description',
    'severity': 'action_severity',
    'markcount': 'action_counter',
}
actions = actions.rename(columns=action_column_names)
actions

Unnamed: 0,sha1,sha256,md5,action_name,action_description,action_severity,action_counter
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checks_debugger,Checks if process is being debugged by a debugger,1.0,2.0
1,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,generates_crypto_key,Uses Windows APIs to generate a cryptographic key,1.0,3.0
2,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm_memory_available,"Checks amount of memory in system, this can be used to detect virtual machines that have a low amount of memory available",1.0,1.0
3,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,allocates_rwx,Allocates read-write-execute memory (usually to unpack itself),2.0,28.0
4,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm_network_adapters,Checks adapter addresses which can be used to detect virtual network interfaces,2.0,1.0
...,...,...,...,...,...,...,...
44962,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection_ntsetcontextthread,Used NtSetContextThread to modify a thread in a remote process indicative of process injection,3.0,2.0
44963,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection_resumethread,Resumed a suspended thread in a remote process potentially indicative of process injection,3.0,2.0
44964,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,network_icmp,Generates some ICMP traffic,4.0,0.0
44965,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,dead_host,Connects to an IP address that is no longer responding to requests (legitimate services will remain up-and-running usually),5.0,1.0


In [26]:
# Completeness = share of non-missing values, reported as a percentage for each column.
for column in ['sha1', 'sha256', 'md5', 'action_counter',
               'action_severity', 'action_name', 'action_description']:
    completeness = 1 - actions[column].isna().mean()
    print(f"Completeness for '{column}': {completeness * 100:.2f}%")

Completeness for 'sha1': 100.00%
Completeness for 'sha256': 100.00%
Completeness for 'md5': 100.00%
Completeness for 'action_counter': 84.35%
Completeness for 'action_severity': 100.00%
Completeness for 'action_name': 100.00%
Completeness for 'action_description': 100.00%


## Data Cleaning

#### Data Type Conversion

- action_counter field : from float to int


In [27]:
# Cast to pandas' nullable integer dtype, which can hold the missing counters.
actions['action_counter'] = actions['action_counter'].astype(pd.Int64Dtype())

- action_severity field : from float to int


In [28]:
# Cast to pandas' nullable integer dtype, the same dtype used for action_counter.
actions['action_severity'] = actions['action_severity'].astype(pd.Int64Dtype())

In [29]:
# Display the frame to check the result of the two integer conversions.
actions

Unnamed: 0,sha1,sha256,md5,action_name,action_description,action_severity,action_counter
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checks_debugger,Checks if process is being debugged by a debugger,1,2
1,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,generates_crypto_key,Uses Windows APIs to generate a cryptographic key,1,3
2,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm_memory_available,"Checks amount of memory in system, this can be used to detect virtual machines that have a low amount of memory available",1,1
3,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,allocates_rwx,Allocates read-write-execute memory (usually to unpack itself),2,28
4,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm_network_adapters,Checks adapter addresses which can be used to detect virtual network interfaces,2,1
...,...,...,...,...,...,...,...
44962,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection_ntsetcontextthread,Used NtSetContextThread to modify a thread in a remote process indicative of process injection,3,2
44963,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection_resumethread,Resumed a suspended thread in a remote process potentially indicative of process injection,3,2
44964,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,network_icmp,Generates some ICMP traffic,4,0
44965,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,dead_host,Connects to an IP address that is no longer responding to requests (legitimate services will remain up-and-running usually),5,1


## Error Correction

#### Missing Values

- drop rows with a missing `sha1`, `sha256` or `md5` value, because they cannot be associated with any file
- drop the `action_counter` column, because it has missing values and is of little interest to organizations

In [30]:
# Discard rows lacking any hash, remove the action_counter column and renumber the rows from 0.
actions = (
    actions
    .dropna(subset=['sha1', 'sha256', 'md5'])
    .drop(columns=['action_counter'])
    .reset_index(drop=True)
)

actions

Unnamed: 0,sha1,sha256,md5,action_name,action_description,action_severity
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checks_debugger,Checks if process is being debugged by a debugger,1
1,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,generates_crypto_key,Uses Windows APIs to generate a cryptographic key,1
2,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm_memory_available,"Checks amount of memory in system, this can be used to detect virtual machines that have a low amount of memory available",1
3,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,allocates_rwx,Allocates read-write-execute memory (usually to unpack itself),2
4,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm_network_adapters,Checks adapter addresses which can be used to detect virtual network interfaces,2
...,...,...,...,...,...,...
44962,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection_ntsetcontextthread,Used NtSetContextThread to modify a thread in a remote process indicative of process injection,3
44963,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection_resumethread,Resumed a suspended thread in a remote process potentially indicative of process injection,3
44964,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,network_icmp,Generates some ICMP traffic,4
44965,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,dead_host,Connects to an IP address that is no longer responding to requests (legitimate services will remain up-and-running usually),5


#### Standardization

The format of the columns 'action_name' and 'action_description' is standardized: both are converted to lower case, and in 'action_name' underscores (_) are replaced with spaces.

In [31]:
# action_name: underscores become spaces, then lower case.
# action_description: lower case only.
actions = actions.assign(
    action_name=actions['action_name'].replace('_', ' ', regex=True).str.lower(),
    action_description=actions['action_description'].str.lower(),
)

actions

Unnamed: 0,sha1,sha256,md5,action_name,action_description,action_severity
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checks debugger,checks if process is being debugged by a debugger,1
1,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,generates crypto key,uses windows apis to generate a cryptographic key,1
2,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm memory available,"checks amount of memory in system, this can be used to detect virtual machines that have a low amount of memory available",1
3,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,allocates rwx,allocates read-write-execute memory (usually to unpack itself),2
4,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm network adapters,checks adapter addresses which can be used to detect virtual network interfaces,2
...,...,...,...,...,...,...
44962,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection ntsetcontextthread,used ntsetcontextthread to modify a thread in a remote process indicative of process injection,3
44963,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection resumethread,resumed a suspended thread in a remote process potentially indicative of process injection,3
44964,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,network icmp,generates some icmp traffic,4
44965,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,dead host,connects to an ip address that is no longer responding to requests (legitimate services will remain up-and-running usually),5


## Data Enrichment
#### Preparation for Data Integration
A field 'action_level' is added: by looking at the distribution of 'action_severity', the following rule is used to classify actions as malicious, suspicious or neutral.
- 'action_severity' > 4 , set 'action_level' to 'malicious'
- 'action_severity' <= 2 , set 'action_level' to 'neutral'
- 'action_severity' > 2 and <= 4 , set 'action_level' to 'suspicious'

Finally, the 'action_severity' column is dropped.

In [32]:
# Map each severity band to its label; rows matching no band are left unset
severity = actions['action_severity']
level_masks = {
    'malicious': severity > 4,
    'suspicious': (severity <= 4) & (severity > 2),
    'neutral': severity <= 2,
}
for level, mask in level_masks.items():
    actions.loc[mask, 'action_level'] = level

actions

Unnamed: 0,sha1,sha256,md5,action_name,action_description,action_severity,action_level
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checks debugger,checks if process is being debugged by a debugger,1,neutral
1,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,generates crypto key,uses windows apis to generate a cryptographic key,1,neutral
2,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm memory available,"checks amount of memory in system, this can be used to detect virtual machines that have a low amount of memory available",1,neutral
3,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,allocates rwx,allocates read-write-execute memory (usually to unpack itself),2,neutral
4,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm network adapters,checks adapter addresses which can be used to detect virtual network interfaces,2,neutral
...,...,...,...,...,...,...,...
44962,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection ntsetcontextthread,used ntsetcontextthread to modify a thread in a remote process indicative of process injection,3,suspicious
44963,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection resumethread,resumed a suspended thread in a remote process potentially indicative of process injection,3,suspicious
44964,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,network icmp,generates some icmp traffic,4,suspicious
44965,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,dead host,connects to an ip address that is no longer responding to requests (legitimate services will remain up-and-running usually),5,malicious


In [33]:
# The numeric severity is no longer needed once 'action_level' has been derived
actions = actions.drop(columns='action_severity')
actions

Unnamed: 0,sha1,sha256,md5,action_name,action_description,action_level
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checks debugger,checks if process is being debugged by a debugger,neutral
1,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,generates crypto key,uses windows apis to generate a cryptographic key,neutral
2,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm memory available,"checks amount of memory in system, this can be used to detect virtual machines that have a low amount of memory available",neutral
3,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,allocates rwx,allocates read-write-execute memory (usually to unpack itself),neutral
4,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm network adapters,checks adapter addresses which can be used to detect virtual network interfaces,neutral
...,...,...,...,...,...,...
44962,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection ntsetcontextthread,used ntsetcontextthread to modify a thread in a remote process indicative of process injection,suspicious
44963,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,injection resumethread,resumed a suspended thread in a remote process potentially indicative of process injection,suspicious
44964,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,network icmp,generates some icmp traffic,suspicious
44965,98837feaac2ed17b3709e5183b660dd41e7a1e0e,5b37ddd18800dc91071db31b340e8f9997bb89d42d5a194d7ff8c980a54abe89,1b47215513edd622ba825dced07c8969,dead host,connects to an ip address that is no longer responding to requests (legitimate services will remain up-and-running usually),malicious


# Threat Vendor Analysis

In [34]:
# Load the threat-vendor analysis table and preview its first three rows
tv_analysis = pd.read_csv(path_OTX_TV_Analysis, low_memory=False)
tv_analysis.head(n=3)


Unnamed: 0,sha1,sha256,md5,malware,verdict,threat_vendor_name
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,Win32:PWSX-gen\ [Trj],['Malware infection'],avast
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,Win32:Malware-gen,['Malware infection'],avast
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,Win32:BackdoorX-gen\ [Trj],['Malware infection'],avast


## Data Manipulation

- 'verdict' field: converted from the string representation of a list of strings to a plain string

In [35]:
import re

In [36]:
# Inspect the distinct raw values of 'verdict' before cleaning them
pd.unique(tv_analysis['verdict'])

array(["['Malware infection']", nan, "['Malware detected']"], dtype=object)

In [37]:
def replace_value(x):
    """Turn the string form of a list of labels into a plain string.

    Values such as "['Malware infection']" are reduced to the label itself
    (brackets and quotes removed). When the bracketed list holds several
    quoted labels they are joined with ", " so that no stray quotes leak
    into the result.

    Parameters
    ----------
    x : str or missing value
        Raw cell value.

    Returns
    -------
    The extracted label(s) as a string, or `x` unchanged when it is missing
    or contains no ['...'] pattern.
    """
    # Missing values are passed through untouched
    if pd.isna(x):
        return x
    # Lazily capture everything between the opening [' and the first closing ']
    match = re.search(r"\['(.*?)'\]", x)
    if not match:
        return x
    # Split on the quote-comma-quote separators between items;
    # a single-item list yields exactly one piece, i.e. the label itself
    labels = re.split(r"'\s*,\s*'", match.group(1))
    return ", ".join(labels)

def cleaning_tv_analysis(df):
    """Clean the 'verdict' column of a threat-vendor analysis frame.

    Every value of df['verdict'] is passed through `replace_value`; the
    column is overwritten on the given frame, which is then returned.
    """
    cleaned_verdicts = df['verdict'].apply(replace_value)
    df['verdict'] = cleaned_verdicts
    return df
        

In [38]:
# Run the verdict clean-up on the loaded table
tv_analysis = tv_analysis.pipe(cleaning_tv_analysis)
tv_analysis

Unnamed: 0,sha1,sha256,md5,malware,verdict,threat_vendor_name
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,Win32:PWSX-gen\ [Trj],Malware infection,avast
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,Win32:Malware-gen,Malware infection,avast
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,Win32:BackdoorX-gen\ [Trj],Malware infection,avast
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,MSIL:NanoCore-B\ [Trj],Malware infection,avast
4,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758,Win32:RemcosRAT-A\ [Trj],Malware infection,avast
...,...,...,...,...,...,...
7969,3ab53037e2360c42b57d213234ced58717844a71,5de4932cb0ccab3fec711135cdd1af5a09c26a870bfb2d2adc650d0fdbf16488,e0fea510c784eb0e440e99e53d83d414,Win.Malware.Generic-6623004-0,Malware infection,clamav
7970,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,Win.Malware.Dlmc-7395626-0,Malware infection,clamav
7971,ba5e50a39d1502e38196724ec9d844ee4bf8d002,f3fcbb0fedb1e3b732185aebbf845ca185c950ca3635026d8a754312220577c9,0550342e66698384f232ac39b700dd14,Win.Packed.Samas-7998113-0,Malware infection,clamav
7972,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,,,clamav


## Data Cleaning

#### Standardization
Convert the 'malware' and 'verdict' fields to lower case

In [39]:
# Lower-case both free-text columns so equal labels compare equal
for text_column in ('malware', 'verdict'):
    tv_analysis[text_column] = tv_analysis[text_column].str.lower()

tv_analysis

Unnamed: 0,sha1,sha256,md5,malware,verdict,threat_vendor_name
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,win32:pwsx-gen\ [trj],malware infection,avast
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,win32:malware-gen,malware infection,avast
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,win32:backdoorx-gen\ [trj],malware infection,avast
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,msil:nanocore-b\ [trj],malware infection,avast
4,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758,win32:remcosrat-a\ [trj],malware infection,avast
...,...,...,...,...,...,...
7969,3ab53037e2360c42b57d213234ced58717844a71,5de4932cb0ccab3fec711135cdd1af5a09c26a870bfb2d2adc650d0fdbf16488,e0fea510c784eb0e440e99e53d83d414,win.malware.generic-6623004-0,malware infection,clamav
7970,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,win.malware.dlmc-7395626-0,malware infection,clamav
7971,ba5e50a39d1502e38196724ec9d844ee4bf8d002,f3fcbb0fedb1e3b732185aebbf845ca185c950ca3635026d8a754312220577c9,0550342e66698384f232ac39b700dd14,win.packed.samas-7998113-0,malware infection,clamav
7972,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,,,clamav


#### Missing Values

- Drop rows with a missing value in the 'sha1', 'sha256' or 'md5' field, because they cannot be associated with any file

In [40]:
# A row is kept only when all three file hashes are present
hash_columns = ['sha1', 'sha256', 'md5']
tv_analysis = tv_analysis.dropna(subset=hash_columns)
tv_analysis

Unnamed: 0,sha1,sha256,md5,malware,verdict,threat_vendor_name
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,win32:pwsx-gen\ [trj],malware infection,avast
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,win32:malware-gen,malware infection,avast
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,win32:backdoorx-gen\ [trj],malware infection,avast
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,msil:nanocore-b\ [trj],malware infection,avast
4,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758,win32:remcosrat-a\ [trj],malware infection,avast
...,...,...,...,...,...,...
7969,3ab53037e2360c42b57d213234ced58717844a71,5de4932cb0ccab3fec711135cdd1af5a09c26a870bfb2d2adc650d0fdbf16488,e0fea510c784eb0e440e99e53d83d414,win.malware.generic-6623004-0,malware infection,clamav
7970,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,win.malware.dlmc-7395626-0,malware infection,clamav
7971,ba5e50a39d1502e38196724ec9d844ee4bf8d002,f3fcbb0fedb1e3b732185aebbf845ca185c950ca3635026d8a754312220577c9,0550342e66698384f232ac39b700dd14,win.packed.samas-7998113-0,malware infection,clamav
7972,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,,,clamav


- Drop a row if both the 'verdict' and 'malware' fields are missing: no information can be obtained from such a row, so it is useless

In [41]:
# Remove rows where 'malware' and 'verdict' are both missing, then renumber the index from 0
tv_analysis = (
    tv_analysis
    .dropna(subset=['malware', 'verdict'], how='all')
    .reset_index(drop=True)
)
tv_analysis

Unnamed: 0,sha1,sha256,md5,malware,verdict,threat_vendor_name
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,win32:pwsx-gen\ [trj],malware infection,avast
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,win32:malware-gen,malware infection,avast
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,win32:backdoorx-gen\ [trj],malware infection,avast
3,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,msil:nanocore-b\ [trj],malware infection,avast
4,18a703514e47e1c3b59cf12de95f1ea5c7475083,28b6bbc6b1d6ee21ccdc25c5eefccad5e21556954542cd19a0a87f7dde91df78,260bf0106c006d4e988a2685df71a758,win32:remcosrat-a\ [trj],malware infection,avast
...,...,...,...,...,...,...
4463,0d7b506cdc8e0228a8c50f1adf3b1b3224fc114d,a48ab00c2a748d146264d9ceebed2346013a730bde2ee91bc03de76f8d262aa1,1e53278bd1d1ee7c84ffc2d44dce4117,win.packed.zusy-6860439-0,malware infection,clamav
4464,a3447ba9b83f30284c6d3effb45c31ad9d5f258f,bbd3ecd9e9671d94e8897980c4eb9391ae9cb444615ed9a93b8221ae8fa66790,f6eec1317ece3ffb7c4916e224d9734d,win.trojan.revengerat-6344273-0,malware infection,clamav
4465,3ab53037e2360c42b57d213234ced58717844a71,5de4932cb0ccab3fec711135cdd1af5a09c26a870bfb2d2adc650d0fdbf16488,e0fea510c784eb0e440e99e53d83d414,win.malware.generic-6623004-0,malware infection,clamav
4466,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,win.malware.dlmc-7395626-0,malware infection,clamav


## Saving

In [42]:
# Cleaned DataFrames keyed by the base name of the CSV file each one is saved to
dataframes_dict = {
    'Entries': entries,
    'TV_Analysis': tv_analysis,
    'Behaviours': actions,
    'Anomalies': anomalies,
    'Attacks': attacks,
}

In [43]:
# Destination folder for the cleaned tables (the trailing separator is part of the string)
save_directory = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataCleaning\\OTX_Alien_Vault\\"
# Each DataFrame is written to "<key>.csv" without its index, and reported once saved
for key, df in dataframes_dict.items():
    file_path = save_directory + f"{key}.csv"
    df.to_csv(file_path, index=False)
    print(f'{key} added')

Entries added
TV_Analysis added
Behaviours added
Anomalies added
Attacks added
