## 2.3.0 Data Quality assessment


In [1]:
import datetime
import os
import time
import pandas as pd 
import requests
import urllib3
import json
import numpy as np
import sys
pd.options.display.max_colwidth = 1000
pd.set_option('display.max_columns', None)

In [2]:
entries  = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\IocSegnalations\\PostDataStructurization\\Feodo_Tracker\\Entries.csv"
malwares = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\IocSegnalations\\PostDataStructurization\\Feodo_Tracker\\Malwares.csv"

## Entries

In [3]:
entries_df = pd.read_csv(entries, low_memory = False )
entries_df

Unnamed: 0,ID_ENTRY,source,ioc,ioc_type,threat_type,malware,first_seen,last_seen,reporter,reference
0,0,Feodo Tracker,89.101.97.139:443,ip:port,botnet,QakBot,2021-09-29 08:42:47,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/
1,1,Feodo Tracker,41.228.22.180:443,ip:port,botnet,QakBot,2021-09-29 08:42:51,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/
2,2,Feodo Tracker,144.139.47.206:443,ip:port,botnet,QakBot,2021-09-30 21:25:38,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/
3,3,Feodo Tracker,41.86.42.158:995,ip:port,botnet,QakBot,2021-10-08 14:48:30,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/
4,4,Feodo Tracker,63.143.92.99:995,ip:port,botnet,QakBot,2021-10-08 14:48:31,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/
...,...,...,...,...,...,...,...,...,...,...
333,333,Feodo Tracker,41.97.47.7:443,ip:port,botnet,QakBot,2023-08-24 23:29:43,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/
334,334,Feodo Tracker,92.9.44.234:2222,ip:port,botnet,QakBot,2023-08-25 01:29:36,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/
335,335,Feodo Tracker,78.152.198.132:443,ip:port,botnet,QakBot,2023-08-25 10:08:52,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/
336,336,Feodo Tracker,113.193.95.229:443,ip:port,botnet,QakBot,2023-08-25 11:24:40,2023-08-25,Feodo Tracker blocklist,https://feodotracker.abuse.ch/


In [4]:

print("Duplicates: ",entries_df.duplicated().any())

#total number of not null values
not_null_values = entries_df.count().sum()
print("# not null values: ",not_null_values)

#total number of null values
null_values = entries_df.isnull().sum().sum()
print("# null values: ",null_values)

#total number of cells
n_values = entries_df.shape[0]*entries_df.shape[1]
print("# values: ",n_values)

#we added to the set of missing values also 'na' and '--'
missing_values = ['na','--', 'NaN']
ds_property_nan = pd.read_csv(entries, na_values = missing_values)
print("# missings: ",ds_property_nan.isnull().sum().sum())


Duplicates:  False
# not null values:  3380
# null values:  0
# values:  3380
# missings:  0


## Completeness

In [5]:
#COMPLETENESS evaluation
completeness = not_null_values / n_values
completeness = '{0:.1f}%'.format(completeness*100)
print(completeness)

100.0%


## Timeliness

In [7]:
temp_columns = 'first_seen'
from datetime import datetime
dateparse = lambda x: datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
ds_property_t = pd.read_csv(entries, parse_dates= [temp_columns], date_parser=dateparse)
volatility = 60
today = datetime(2023, 9, 15)
ds_property_t['currency'] = (today-ds_property_t['first_seen']).dt.days
#adding a column with the timeliness computation
#if volatility is greater than currency the timeliness is equal to 1-currency/volatility, otherwise is 0
ds_property_t['Timeliness'] = np.where(volatility>ds_property_t['currency'],
                                       1-(ds_property_t['currency']/volatility),
                                       0)
print("Average Timeliness: ", ds_property_t['Timeliness'].mean())
print("Maximum Timeliness:", ds_property_t['Timeliness'].max())
print("Minimum Timeliness:", ds_property_t['Timeliness'].min())


Average Timeliness:  0.21114398422090724
Maximum Timeliness: 0.6666666666666667
Minimum Timeliness: 0.0


  ds_property_t = pd.read_csv(entries, parse_dates= [temp_columns], date_parser=dateparse)


## Auto-report

In [8]:
from ydata_profiling import ProfileReport

In [9]:
profile = ProfileReport(entries_df, title="Feodo Tracker Entries - Pandas Profiling Report")

profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Feodo_Tracker\\Entries.html")

# As a JSON string
json_data = profile.to_json()

# As a file
profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Feodo_Tracker\\Entries.json")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Value Distribution

#### Distribution entries over time

In [10]:
# Create a time series plot with different colors for each category
plt.figure(figsize=(10, 6))

# Plot Category A with a blue color
plt.plot(entries_df['first_seen'], label='Feodo Tracker', marker='o', linestyle='-', color='green')

# Customize the plot
plt.title('Time Series Data Feodo Tracker')
plt.xlabel('Timestamp')
plt.grid(True)

# Add a legend to distinguish between categories
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined

## Data Profiling - Malwares

In [None]:
malwares_df = pd.read_csv(malwares, low_memory = False )
malwares_df

In [None]:

print("Duplicates: ",malwares_df.duplicated().any())

#total number of not null values
not_null_values = malwares_df.count().sum()
print("# not null values: ",not_null_values)

#total number of null values
null_values = malwares_df.isnull().sum().sum()
print("# null values: ",null_values)

#total number of cells
n_values = malwares_df.shape[0]*malwares_df.shape[1]
print("# values: ",n_values)

#we added to the set of missing values also 'na' and '--'
missing_values = ['na','--', 'NaN']
ds_property_nan = pd.read_csv(malwares, na_values = missing_values)
print("# missings: ",ds_property_nan.isnull().sum().sum())


## Completeness

In [None]:
#COMPLETENESS evaluation
completeness = not_null_values / n_values
completeness = '{0:.1f}%'.format(completeness*100)
print(completeness)

## Auto-report

In [None]:
profile = ProfileReport(malwares_df, title="Feodo Tracker Malwares - Pandas Profiling Report")

profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Feodo_Tracker\\Malwares.html")

# As a JSON string
json_data = profile.to_json()

# As a file
profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\Feodo_Tracker\\Malwares.json")