# 5.1 Data Profiling - Integrated Dataset - Entries

In [1]:
import datetime
import os
import time
import pandas as pd 
import requests
import urllib3
import json
import sys
import numpy as np

pd.options.display.max_colwidth = 1000
pd.set_option('display.max_columns', None)

In [2]:
file = r'C:\Users\leona\OneDrive\Desktop\Tesi\Pipeline\Datasets\Pipeline\IocSegnalations\PostDataIntegration\Entries.csv'

In [3]:
df = pd.read_csv(file, low_memory = False )
df

Unnamed: 0,ID_ENTRY,source,ioc,ioc_type,threat_type,malware,first_seen,last_seen,reporter,reference,status
0,0,Feodo Tracker,89.101.97.139:443,ip:port,botnet,qakbot,2021-09-29 08:42:47,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/,
1,1,Feodo Tracker,41.228.22.180:443,ip:port,botnet,qakbot,2021-09-29 08:42:51,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/,
2,2,Feodo Tracker,144.139.47.206:443,ip:port,botnet,qakbot,2021-09-30 21:25:38,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/,
3,3,Feodo Tracker,41.86.42.158:995,ip:port,botnet,qakbot,2021-10-08 14:48:30,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/,
4,4,Feodo Tracker,63.143.92.99:995,ip:port,botnet,qakbot,2021-10-08 14:48:31,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/,
...,...,...,...,...,...,...,...,...,...,...,...
15580,2704832,URL Haus,http://219.157.141.108:59808/Mozi.m,,payload_delivery,,2023-08-16 00:03:35,2023-08-16 00:03:35,Gandylyan1,https://urlhaus.abuse.ch/url/2704832/,offline
15581,2704830,URL Haus,http://175.107.13.143:38812/Mozi.m,,payload_delivery,,2023-08-16 00:03:34,2023-08-16 00:03:34,Gandylyan1,https://urlhaus.abuse.ch/url/2704830/,offline
15582,2704831,URL Haus,http://102.33.46.237:54730/Mozi.m,,payload_delivery,,2023-08-16 00:03:34,2023-08-16 00:03:34,Gandylyan1,https://urlhaus.abuse.ch/url/2704831/,offline
15583,2704829,URL Haus,http://158.255.82.182:48683/Mozi.m,,payload_delivery,,2023-08-16 00:03:06,2023-08-18 09:12:32,Gandylyan1,https://urlhaus.abuse.ch/url/2704829/,offline


In [4]:
print("Duplicates: ",df.duplicated().any())
#total number of not null values
not_null_values = df.count().sum()
print("# not null values: ",not_null_values)
#total number of null values
null_values = df.isnull().sum().sum()
print("# null values: ",null_values)
#total number of cells
n_values = df.shape[0]*df.shape[1]
print("# values: ",n_values)
#we added to the set of missing values also 'na' and '--'
missing_values = ['na','--', 'NaN']
ds_property_nan = pd.read_csv(file, na_values = missing_values)
print("# missings: ",ds_property_nan.isnull().sum().sum())

Duplicates:  False
# not null values:  148728
# null values:  22707
# values:  171435
# missings:  22707


## Completeness

In [5]:
#COMPLETENESS evaluation
completeness = not_null_values / n_values
completeness = '{0:.1f}%'.format(completeness*100)
print(completeness)

86.8%


# Single column analysis

# 1 : ioc

## CARDINALITIES

In [6]:
#len command counts the number of rows
n_rows = len(df['ioc'])
print("# rows: ", n_rows)

# rows:  15585


In [7]:
#number of non—null observation in "ioc" column
count_values = df['ioc'].count()
print("# non-null: ", n_rows)

# non-null:  15585


In [8]:
#value counts is the same of a group by (attribute "ibu")
df['ioc'].value_counts()

ioc
gapi-node.io                                                           3
blockall-my.xyz                                                        2
mycollection-my.xyz                                                    2
coinflore-my.xyz                                                       2
https://scauditora.cl/absorbability                                    2
                                                                      ..
rmlkin.duckdns.org                                                     1
autobasecars4040.ddns.net                                              1
icando.ug                                                              1
marioddns.hopto.org                                                    1
https://esteticalocarno.com/wp-content/uploads/2016/11/PadNoter.zip    1
Name: count, Length: 15547, dtype: int64

## Cardinality

In [9]:
#nunique is the number of distict values for an attribute (attribute "ibu")
distinct_values = (df['ioc'].nunique())
print("Cardinality: ", distinct_values)

Cardinality:  15547


## Uniqueness

In [10]:
#Uniqueness evaluation (attribute "ioc")
uniqueness = distinct_values / n_rows
print("Uniqueness: ", uniqueness)

Uniqueness:  0.9975617581007379


## Distinctness

In [11]:
#Distinctness evaluation (attribute "ioc")
distinctness = distinct_values / count_values
print("Distinctness: ", distinctness)

Distinctness:  0.9975617581007379


# 2 : threat_type


## CARDINALITIES


In [12]:
col = 'threat_type'

In [13]:
n_rows = len(df[col])
print("# rows: ", n_rows)

# rows:  15585


In [14]:
count_values = df[col].count()
print("# non-null: ", n_rows)

# non-null:  15585


In [15]:
df[col].value_counts()

threat_type
botnet              8355
payload_delivery    7213
payload               17
Name: count, dtype: int64

## Cardinality

In [16]:
distinct_values = (df[col].nunique())
print("Cardinality: ", distinct_values)

Cardinality:  3


## Uniqueness

In [17]:
uniqueness = distinct_values / n_rows
print("Uniqueness: ", uniqueness)

Uniqueness:  0.00019249278152069297


## Distinctness

In [18]:
distinctness = distinct_values / count_values
print("Distinctness: ", distinctness)

Distinctness:  0.00019249278152069297


# 2 : ioc_type


## CARDINALITIES


In [19]:
col = 'ioc_type'

In [20]:
n_rows = len(df[col])
print("# rows: ", n_rows)

# rows:  15585


In [21]:
count_values = df[col].count()
print("# non-null: ", n_rows)

# non-null:  15585


In [22]:
df[col].value_counts()

ioc_type
domain         4857
ip:port        3067
url             588
sha256_hash       9
md5_hash          8
Name: count, dtype: int64

## Cardinality

In [23]:
distinct_values = (df[col].nunique())
print("Cardinality: ", distinct_values)

Cardinality:  5


## Uniqueness

In [24]:
uniqueness = distinct_values / n_rows
print("Uniqueness: ", uniqueness)

Uniqueness:  0.0003208213025344883


## Distinctness

In [25]:
distinctness = distinct_values / count_values
print("Distinctness: ", distinctness)

Distinctness:  0.0005862351975612616


# 3 : malware


## CARDINALITIES


In [26]:
col = 'malware'

In [27]:
n_rows = len(df[col])
print("# rows: ", n_rows)

# rows:  15585


In [28]:
count_values = df[col].count()
print("# non-null: ", n_rows)

# non-null:  15585


In [29]:
df[col].value_counts()

malware
remcos            2537
asyncrat          2252
quasar rat        1120
cobalt strike      682
bitrat             355
                  ... 
mystic stealer       1
kimsuky              1
supremebot           1
sectoprat            1
ghost rat            1
Name: count, Length: 69, dtype: int64

## Cardinality

In [30]:
distinct_values = (df[col].nunique())
print("Cardinality: ", distinct_values)

Cardinality:  69


## Uniqueness

In [31]:
uniqueness = distinct_values / n_rows
print("Uniqueness: ", uniqueness)

Uniqueness:  0.004427333974975939


## Distinctness

In [32]:
distinctness = distinct_values / count_values
print("Distinctness: ", distinctness)

Distinctness:  0.00815313718539525


# 4 : reporter

## CARDINALITIES


In [33]:
col = 'reporter'

In [34]:
n_rows = len(df[col])
print("# rows: ", n_rows)

# rows:  15585


In [35]:
count_values = df[col].count()
print("# non-null: ", n_rows)

# non-null:  15585


In [36]:
df[col].value_counts()

reporter
Gi7w0rm         6496
tolisec         2343
geenensp         884
abuse_ch         875
zbetcheckin      756
                ... 
Chengetai          1
teamcymru_S2       1
sicehicetf         1
ACVaessen          1
stoerchl           1
Name: count, Length: 62, dtype: int64

## Cardinality

In [37]:
distinct_values = (df[col].nunique())
print("Cardinality: ", distinct_values)

Cardinality:  62


## Uniqueness

In [38]:
uniqueness = distinct_values / n_rows
print("Uniqueness: ", uniqueness)

Uniqueness:  0.003978184151427655


## Distinctness

In [39]:
distinctness = distinct_values / count_values
print("Distinctness: ", distinctness)

Distinctness:  0.003978184151427655


# Profiling Auto-report

In [40]:
pip install pandas-profiling

Collecting visions[type_image_path]==0.7.4 (from pandas-profiling)
  Using cached visions-0.7.4-py3-none-any.whl (102 kB)
Installing collected packages: visions
  Attempting uninstall: visions
    Found existing installation: visions 0.7.5
    Uninstalling visions-0.7.5:
      Successfully uninstalled visions-0.7.5
Successfully installed visions-0.7.4
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ydata-profiling 4.6.3 requires visions[type_image_path]==0.7.5, but you have visions 0.7.4 which is incompatible.


In [41]:
pip install --upgrade numba

Note: you may need to restart the kernel to use updated packages.


In [42]:
pip install visions==0.7.4

Note: you may need to restart the kernel to use updated packages.


In [43]:
pip install --upgrade pydantic pydantic-settings

Note: you may need to restart the kernel to use updated packages.


In [44]:
pip install ydata-profiling

Collecting visions[type_image_path]==0.7.5 (from ydata-profiling)
  Using cached visions-0.7.5-py3-none-any.whl (102 kB)
Installing collected packages: visions
  Attempting uninstall: visions
    Found existing installation: visions 0.7.4
    Uninstalling visions-0.7.4:
      Successfully uninstalled visions-0.7.4
Successfully installed visions-0.7.5
Note: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pandas-profiling 3.2.0 requires visions[type_image_path]==0.7.4, but you have visions 0.7.5 which is incompatible.


In [45]:
from ydata_profiling import ProfileReport

In [53]:
df['first_seen'] = pd.to_datetime(df['first_seen'], format='mixed', utc = True)
df['last_seen'] = pd.to_datetime(df['last_seen'], format='mixed', utc = True)

df = df.dropna(subset=['first_seen', 'last_seen'])

profile = ProfileReport(df, title="Integrated Dataset - Entries - Pandas Profiling Report")

profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 5\\IntegratedDataset\\Entries.html")

# As a JSON string
json_data = profile.to_json()

# As a file
profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 5\\IntegratedDataset\\Entries.json")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Value Distribution

Histograms representing the source and the timeline of the data

#### Distribution of malwares

In [59]:
#import libraries and set up
import seaborn as sns
import matplotlib
matplotlib.use('tkagg')
import matplotlib.pyplot as plt


sns.set(color_codes=True)
sns.set_palette(sns.color_palette("muted"))

In [60]:
df['first_seen'] = pd.to_datetime(df['first_seen'], utc=True, format='mixed')

threat_fox_df = df[ df['source'] == 'Threat Fox' ]
url_haus_df = df[ df['source'] == 'URL Haus' ]
feodo_tracker_df = df[ df['source'] == 'Feodo Tracker' ]

In [61]:
def distibution_plot(df, field):
    
    unique_malwares = df[field].unique()
    
    new = pd.DataFrame()
    
    s = 0
    
    nunique = len(unique_malwares)
    
    for i in range(nunique):
        
        malware = unique_malwares[i]
        
        s = s + len(df[df[field] == malware ])
    
    avg = ( s / nunique )
    
    for i in range(nunique):
        
        malware = unique_malwares[i]
        
        tmp = df[df[field] == malware ]
    
        if(len(tmp) > avg):
            
            new = pd.concat([new, tmp])       
            
    return new       

In [62]:
distibution_plot(df, 'malware')['malware'].hist()

<Axes: >

#### Distribution of entries over time

In [63]:
# Create a time series plot with different colors for each category
plt.figure(figsize=(10, 6))

# Plot Category A with a blue color
plt.plot(threat_fox_df['first_seen'], label='Threat Fox', marker='o', linestyle='-', color='blue')

# Plot Category B with a green color
plt.plot(url_haus_df['first_seen'], label='URL Haus', marker='x', linestyle='-', color='green')

# Plot Category C with a red color
plt.plot(feodo_tracker_df['first_seen'], label='Feodo Tracker', marker='s', linestyle='-', color='red')

# Customize the plot
plt.title('Time Series Data by Category')
plt.xlabel('Timestamp')
plt.grid(True)

# Add a legend to distinguish between categories
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()
