# 3.1. Data Profiling - Threat Fox

In [1]:
import datetime
import os
import time
import pandas as pd 
import requests
import urllib3
import json
import sys
import numpy as np

pd.options.display.max_colwidth = 1000
pd.set_option('display.max_columns', None)

In [2]:
entries  = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\IocSegnalations\\PostDataStructurization\\Threat_Fox\\Entries.csv"
tags     = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\IocSegnalations\\PostDataStructurization\\Threat_Fox\\Tags.csv"
malwares = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\IocSegnalations\\PostDataStructurization\\Threat_Fox\\Malwares.csv"
alias    = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\IocSegnalations\\PostDataStructurization\\Threat_Fox\\Alias.csv"

## Data Profiling - Entries

In [3]:
entries_df = pd.read_csv(entries, low_memory = False )
entries_df

Unnamed: 0,id,ioc,threat_type,ioc_type,malware_printable,malware_malpedia,first_seen,last_seen,reference,reporter
0,1163856,146.56.118.82:443,botnet_cc,ip:port,Cobalt Strike,https://malpedia.caad.fkie.fraunhofer.de/details/win.cobalt_strike,2023-09-15 12:17:38 UTC,,,malpulse
1,1163857,47.94.206.253:8090,botnet_cc,ip:port,Cobalt Strike,https://malpedia.caad.fkie.fraunhofer.de/details/win.cobalt_strike,2023-09-15 12:17:37 UTC,,,malpulse
2,1163858,80.96.156.43:444,botnet_cc,ip:port,Cobalt Strike,https://malpedia.caad.fkie.fraunhofer.de/details/win.cobalt_strike,2023-09-15 12:17:37 UTC,,,malpulse
3,1163859,91.193.75.135:7245,botnet_cc,ip:port,Nanocore RAT,https://malpedia.caad.fkie.fraunhofer.de/details/win.nanocore,2023-09-15 12:17:37 UTC,,,SarlackLab
4,1163860,7245.ddns.net,botnet_cc,domain,Nanocore RAT,https://malpedia.caad.fkie.fraunhofer.de/details/win.nanocore,2023-09-15 12:17:36 UTC,,,SarlackLab
...,...,...,...,...,...,...,...,...,...,...
8186,1155702,41.68.165.218:1177,botnet_cc,ip:port,NjRAT,https://malpedia.caad.fkie.fraunhofer.de/details/win.njrat,2023-09-08 14:45:14 UTC,,,abuse_ch
8187,1155701,41.216.188.29:5200,botnet_cc,ip:port,Ave Maria,https://malpedia.caad.fkie.fraunhofer.de/details/win.ave_maria,2023-09-08 14:00:16 UTC,,,abuse_ch
8188,1155700,193.31.28.123:4444,botnet_cc,ip:port,Meterpreter,https://malpedia.caad.fkie.fraunhofer.de/details/win.meterpreter,2023-09-08 13:35:13 UTC,,,abuse_ch
8189,1155698,162.33.179.240:433,botnet_cc,ip:port,IcedID,https://malpedia.caad.fkie.fraunhofer.de/details/win.icedid,2023-09-08 13:33:51 UTC,,,abuse_ch


In [4]:
print("Duplicates: ",entries_df.duplicated().any())
#total number of not null values
not_null_values = entries_df.count().sum()
print("# not null values: ",not_null_values)
#total number of null values
null_values = entries_df.isnull().sum().sum()
print("# null values: ",null_values)
#total number of cells
n_values = entries_df.shape[0]*entries_df.shape[1]
print("# values: ",n_values)
#we added to the set of missing values also 'na' and '--'
missing_values = ['na','--', 'NaN']
ds_property_nan = pd.read_csv(entries, na_values = missing_values)
print("# missings: ",ds_property_nan.isnull().sum().sum())

Duplicates:  False
# not null values:  67235
# null values:  14675
# values:  81910
# missings:  14675


## Completeness

In [5]:
#COMPLETENESS evaluation
completeness = not_null_values / n_values
completeness = '{0:.1f}%'.format(completeness*100)
print("Completeness: ",completeness)

Completeness:  82.1%


## Timeliness

In [6]:
from datetime import datetime

temp_columns = ['first_seen']

dateparse = lambda x: datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S UTC')
ds_property_t = pd.read_csv(entries, parse_dates= temp_columns, date_parser=dateparse)

volatility = 20
today = datetime(2023, 9, 15)
ds_property_t['currency'] = (today-ds_property_t['first_seen']).dt.days

#adding a column with the timeliness computation
#if volatility is greater than currency the timeliness is equal to 1-currency/volatility, otherwise is 0
ds_property_t['Timeliness'] = np.where(volatility>ds_property_t['currency'],
                                       1-(ds_property_t['currency']/volatility),
                                       0)

  ds_property_t = pd.read_csv(entries, parse_dates= temp_columns, date_parser=dateparse)


In [7]:
print("Average Timeliness: ", ds_property_t['Timeliness'].mean())
print("Maximum Timeliness:", ds_property_t['Timeliness'].max())
print("Minimum Timeliness:", ds_property_t['Timeliness'].min())

Average Timeliness:  0.8165852765230133
Maximum Timeliness: 1.05
Minimum Timeliness: 0.7


### Profiling Auto-report

In [8]:
pip install pandas-profiling

Collecting visions[type_image_path]==0.7.4 (from pandas-profiling)
  Using cached visions-0.7.4-py3-none-any.whl (102 kB)
Installing collected packages: visions
  Attempting uninstall: visions
    Found existing installation: visions 0.7.5
    Uninstalling visions-0.7.5:
      Successfully uninstalled visions-0.7.5
Successfully installed visions-0.7.4
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.6.3 requires visions[type_image_path]==0.7.5, but you have visions 0.7.4 which is incompatible.


In [9]:
pip install --upgrade numba

Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install visions==0.7.4

Note: you may need to restart the kernel to use updated packages.


In [11]:
pip install --upgrade pydantic pydantic-settings

Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install ydata-profiling

Collecting visions[type_image_path]==0.7.5 (from ydata-profiling)
  Using cached visions-0.7.5-py3-none-any.whl (102 kB)
Installing collected packages: visions
  Attempting uninstall: visions
    Found existing installation: visions 0.7.4
    Uninstalling visions-0.7.4:
      Successfully uninstalled visions-0.7.4
Successfully installed visions-0.7.5
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.2.0 requires visions[type_image_path]==0.7.4, but you have visions 0.7.5 which is incompatible.


In [13]:
from ydata_profiling import ProfileReport

In [14]:
profile = ProfileReport(entries_df, title="Threat Fox - Entries - Pandas Profiling Report")

profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Threat_Fox\\Entries.html")

# As a JSON string
json_data = profile.to_json()

# As a file
profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Threat_Fox\\Entries.json")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Value Distribution

#### Distribution of malwares

In [15]:
#import libraries and set up
import seaborn as sns
import matplotlib
matplotlib.use('tkagg')

import matplotlib.pyplot as plt


sns.set(color_codes=True)
sns.set_palette(sns.color_palette("muted"))

In [16]:
entries_df['first_seen'] = pd.to_datetime(entries_df['first_seen'], utc=True)


In [17]:
def distibution_plot(df, field):
    
    unique_malwares = df[field].unique()
    
    new = pd.DataFrame()
    
    s = 0
    
    nunique = len(unique_malwares)
    
    for i in range(nunique):
        
        malware = unique_malwares[i]
        
        s = s + len(df[df[field] == malware ])
    
    avg = ( s / nunique )
    
    for i in range(nunique):
        
        malware = unique_malwares[i]
        
        tmp = df[df[field] == malware ]
    
        if(len(tmp) > avg):
            
            new = pd.concat([new, tmp])       
            
    return new       

In [18]:
distibution_plot(entries_df, 'malware_printable')['malware_printable'].hist()


<Axes: >

#### Distribution entries over time

In [19]:
# Create a time series plot with different colors for each category
plt.figure(figsize=(10, 6))

# Plot Category A with a blue color
plt.plot(entries_df['first_seen'], label='Threat Fox', marker='o', linestyle='-', color='blue')

# Customize the plot
plt.title('Time Series Data Threat Fox')
plt.xlabel('Timestamp')
plt.grid(True)

# Add a legend to distinguish between categories
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()

## Data Profiling - Malwares

In [20]:
malwares_df = pd.read_csv(malwares, low_memory = False )
malwares_df

Unnamed: 0,malware,type
0,Cobalt Strike,unknown
1,Nanocore RAT,unknown
2,Remcos,unknown
3,Loki Password Stealer (PWS),unknown
4,DCRat,unknown
...,...,...
63,Loda,unknown
64,SpyNote,unknown
65,SectopRAT,unknown
66,Ghost RAT,unknown


In [21]:
print("Duplicates: ",malwares_df.duplicated().any())
#total number of not null values
not_null_values = malwares_df.count().sum()
print("# not null values: ",not_null_values)
#total number of null values
null_values = malwares_df.isnull().sum().sum()
print("# null values: ",null_values)
#total number of cells
n_values = malwares_df.shape[0]*malwares_df.shape[1]
print("# values: ",n_values)
#we added to the set of missing values also 'na' and '--'
missing_values = ['na','--', 'NaN']
ds_property_nan = pd.read_csv(malwares, na_values = missing_values)
print("# missings: ",ds_property_nan.isnull().sum().sum())

Duplicates:  False
# not null values:  136
# null values:  0
# values:  136
# missings:  0


## Completeness

In [22]:
#COMPLETENESS evaluation
completeness = not_null_values / n_values
completeness = '{0:.1f}%'.format(completeness*100)
print("Completeness: ",completeness)

Completeness:  100.0%


### Profiling Auto-report

In [23]:
profile = ProfileReport(entries_df, title="Threat Fox - Malwares - Pandas Profiling Report")

profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Threat_Fox\\Malwares.html")

# As a JSON string
json_data = profile.to_json()

# As a file
profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Threat_Fox\\Malwares.json")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Data Profiling - Alias

In [24]:
alias_df = pd.read_csv(alias, low_memory = False )
alias_df

Unnamed: 0,malware,alias
0,Cobalt Strike,Agentemis
1,Nanocore RAT,Nancrat
2,Remcos,RemcosRAT
3,Loki Password Stealer (PWS),Burkina
4,DCRat,DarkCrystal RAT
...,...,...
171,Houdini,WSHRAT
172,SmokeLoader,
173,Ave Maria,
174,Houdini,dinihou


In [25]:
print("Duplicates: ",alias_df.duplicated().any())
#total number of not null values
not_null_values = alias_df.count().sum()
print("# not null values: ",not_null_values)
#total number of null values
null_values = alias_df.isnull().sum().sum()
print("# null values: ",null_values)
#total number of cells
n_values = alias_df.shape[0]*alias_df.shape[1]
print("# values: ",n_values)
#we added to the set of missing values also 'na' and '--'
missing_values = ['na','--', 'NaN']
ds_property_nan = pd.read_csv(alias, na_values = missing_values)
print("# missings: ",ds_property_nan.isnull().sum().sum())

Duplicates:  True
# not null values:  252
# null values:  100
# values:  352
# missings:  100


## Completeness

In [26]:
#COMPLETENESS evaluation
completeness = not_null_values / n_values
completeness = '{0:.1f}%'.format(completeness*100)
print("Completeness: ",completeness)

Completeness:  71.6%


### Profiling Auto-report

In [27]:
profile = ProfileReport(entries_df, title="Threat Fox - Alias - Pandas Profiling Report")

profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Threat_Fox\\Alias.html")

# As a JSON string
json_data = profile.to_json()

# As a file
profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Threat_Fox\\Alias.json")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Data Profiling - Tags

In [28]:
tags_df = pd.read_csv(tags, low_memory = False )
tags_df

Unnamed: 0,ID_ENTRY,source,tag
0,1155537,Threat Fox,redline_stealer
1,1155537,Threat Fox,infostealer
2,1155537,Threat Fox,stealer
3,1155538,Threat Fox,redline_stealer
4,1155538,Threat Fox,RAT
...,...,...,...
25034,1163859,Threat Fox,RAT
25035,1163860,Threat Fox,RAT
25036,1163860,Threat Fox,NanoCore
25037,1163861,Threat Fox,RemcosRAT


In [29]:
print("Duplicates: ",tags_df.duplicated().any())
#total number of not null values
not_null_values = tags_df.count().sum()
print("# not null values: ",not_null_values)
#total number of null values
null_values = tags_df.isnull().sum().sum()
print("# null values: ",null_values)
#total number of cells
n_values = tags_df.shape[0]*alias_df.shape[1]
print("# values: ",n_values)
#we added to the set of missing values also 'na' and '--'
missing_values = ['na','--', 'NaN']
ds_property_nan = pd.read_csv(tags, na_values = missing_values)
print("# missings: ",ds_property_nan.isnull().sum().sum())

Duplicates:  False
# not null values:  74825
# null values:  292
# values:  50078
# missings:  292


## Completeness

In [30]:
#COMPLETENESS evaluation
completeness = not_null_values / n_values
completeness = '{0:.1f}%'.format(completeness*100)
print("Completeness: ",completeness)

Completeness:  149.4%


### Profiling Auto-report

In [31]:
profile = ProfileReport(entries_df, title="Threat Fox - Tags - Pandas Profiling Report")

profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Threat_Fox\\Tags.html")

# As a JSON string
json_data = profile.to_json()

# As a file
profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Threat_Fox\\Tags.json")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]