# 3.2.0 Data Profiling


In [1]:
import datetime
import os
import time
import pandas as pd 
import requests
import urllib3
import json
import numpy as np
import sys

In [2]:
pd.options.display.max_colwidth = 1000
pd.set_option('display.max_columns', None)

In [3]:
entries  = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\IocSegnalations\\PostDataStructurization\\URL_Haus\\Entries.csv"
tags     = r"C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\IocSegnalations\\PostDataStructurization\\URL_Haus\\Tags.csv"

## Data Profiling - Entries

In [5]:
entries_df = pd.read_csv(entries, low_memory = False )
print("Duplicates: ",entries_df.duplicated().any())

Duplicates:  False


In [6]:
#total number of not null values
not_null_values = entries_df.count().sum()
print("# not null values: ",not_null_values)

# not null values:  62352


In [7]:
#total number of null values
null_values = entries_df.isnull().sum().sum()
print("# null values: ",null_values)

# null values:  1152


In [8]:
#total number of cells
n_values = entries_df.shape[0]*entries_df.shape[1]
print("# values: ",n_values)

# values:  63504


In [9]:
#we added to the set of missing values also 'na' and '--'
missing_values = ['na','--', 'NaN']
ds_property_nan = pd.read_csv(entries, na_values = missing_values)
print("# missings: ",ds_property_nan.isnull().sum().sum())

# missings:  1152


## Completeness

In [10]:
#COMPLETENESS evaluation
completeness = not_null_values / n_values
completeness = '{0:.1f}%'.format(completeness*100)
print(completeness)

98.2%


## Timeliness

In [13]:
temp_columns = ['first_seen']
from datetime import datetime
dateparse = lambda x: datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S')
ds_property_t = pd.read_csv(entries, parse_dates= temp_columns, date_parser=dateparse)
volatility = 60
today = datetime(2023, 9, 15)
ds_property_t['currency'] = (today-ds_property_t['first_seen']).dt.days
#adding a column with the timeliness computation
#if volatility is greater than currency the timeliness is equal to 1-currency/volatility, otherwise is 0
ds_property_t['Timeliness'] = np.where(volatility>ds_property_t['currency'],
                                       1-(ds_property_t['currency']/volatility),
                                       0)
print("Average Timeliness: ", ds_property_t['Timeliness'].mean())
print("Maximum Timeliness:", ds_property_t['Timeliness'].max())
print("Minimum Timeliness:", ds_property_t['Timeliness'].min())

Average Timeliness:  0.7573105631141346
Maximum Timeliness: 1.0166666666666666
Minimum Timeliness: 0.5166666666666666


  ds_property_t = pd.read_csv(entries, parse_dates= temp_columns, date_parser=dateparse)


## Auto-report

In [14]:
from ydata_profiling import ProfileReport

In [15]:
profile = ProfileReport(entries_df, title="URL Haus - Entries - Pandas Profiling Report")

profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\URL_Haus\\Entries.html")

# As a JSON string
json_data = profile.to_json()

# As a file
profile.to_file("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 3\\URL_Haus\\Entries.json")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Value Distribution

#### Distribution entries over time

In [16]:
import seaborn as sns
import matplotlib
matplotlib.use('tkagg')
import matplotlib.pyplot as plt

from datetime import datetime



sns.set(color_codes=True)
sns.set_palette(sns.color_palette("muted"))

In [18]:
for i in range(len(entries_df)):
    
    val = str(entries_df.at[i,'first_seen'])
    parsed_datetime = datetime.strptime(val, '%Y-%m-%d %H:%M:%S')
    date_part = parsed_datetime.strftime('%Y%m%d')
    
    entries_df.loc[i,'first_seen'] = date_part

In [19]:
# Create a time series plot with different colors for each category
plt.figure(figsize=(10, 6))

# Plot Category A with a blue color
plt.plot(entries_df['first_seen'], label='URL Haus', marker='o', linestyle='-', color='blue')

# Customize the plot
plt.title('Time Series Data URL HAUS')
plt.xlabel('Timestamp')
plt.grid(True)

# Add a legend to distinguish between categories
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()

## Data Profiling - Tags

In [20]:
tags_df = pd.read_csv(tags, low_memory = False )
tags_df

Unnamed: 0,ID_ENTRY,source,tag
0,2711900,URL Haus,hajime
1,2711899,URL Haus,SocGholish
2,2711898,URL Haus,Mozi
3,2711897,URL Haus,dropped-by-SmokeLoader
4,2711896,URL Haus,NanoCore
...,...,...,...
7051,2704832,URL Haus,Mozi
7052,2704830,URL Haus,Mozi
7053,2704831,URL Haus,Mozi
7054,2704829,URL Haus,Mozi


In [21]:
print("Duplicates: ",tags_df.duplicated().any())
#total number of not null values
not_null_values = tags_df.count().sum()
print("# not null values: ",not_null_values)
#total number of null values
null_values = tags_df.isnull().sum().sum()
print("# null values: ",null_values)
#total number of cells
n_values = tags_df.shape[0]*tags_df.shape[1]
print("# values: ",n_values)
#we added to the set of missing values also 'na' and '--'
missing_values = ['na','--', 'NaN']
ds_property_nan = pd.read_csv(tags, na_values = missing_values)
print("# missings: ",ds_property_nan.isnull().sum().sum())

Duplicates:  False
# not null values:  20917
# null values:  251
# values:  21168
# missings:  251


### Completeness

In [22]:
#COMPLETENESS evaluation
completeness = not_null_values / n_values
completeness = '{0:.1f}%'.format(completeness*100)
print("Completeness: ",completeness)

Completeness:  98.8%


### Profiling Auto-report