# 18.0. Data Profiling - Integrated Dataset 

In [1]:
from datetime import datetime
import os
import time
import pandas as pd 
import requests
import urllib3
import json
import sys
import numpy as np

# Notebook-wide display settings: show up to 1000 characters per cell
# and never truncate the list of columns.
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', None)

In [2]:
from ydata_profiling import ProfileReport

### Data Profiling - ENTRIES

In [3]:
# Entries CSV inside the PostDataIntegration folder.
# Plain (non-raw) literal: every "\\" is a single backslash. With the former
# r"..." prefix both characters of each "\\" were kept, so the path contained
# doubled separators and did not match the style of the report paths below.
path_Entries = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataIntegration\\Entries.csv"

In [4]:
# Load the Entries table. low_memory=False asks pandas to parse the file in one
# pass instead of in chunks, which avoids mixed-type inference within a column.
entries = pd.read_csv(path_Entries, low_memory = False)
# Bare last expression: the frame is rendered as the cell output.
entries

Unnamed: 0,sha1,sha256,md5,first_seen,last_seen,analysis_date,file_type,delivery_method
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,2021-11-09,2021-11-09,2021-11-09,exe,email attachment
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,2023-07-19,2023-08-25,2023-07-19,exe,web download
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,2022-01-04,2022-02-04,2022-01-04,exe,unknown
3,55cbf8dae95d350ad7e5a63b418f6cd203488f51,8b7ae9f195b075a789d6d8277d500d27754bfa3c53ecca8db7beac8ccd07884f,97d9d216f2627cb238ca6637580b1739,2023-06-13,2023-06-13,2022-02-09,exe,unknown
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,2022-03-27,2022-04-06,2021-02-11,exe,web download
...,...,...,...,...,...,...,...,...
2657,6b6955af591154d3f6e2db35ca9904cce7e264b1,3a9d8c693b7dc6a9c2bf876c5626ebd0923e35bd5f698baed248df7f51f67275,03872a722df0b073a84fd1aa45eb9376,,,2020-02-20,elf,
2658,3ab53037e2360c42b57d213234ced58717844a71,5de4932cb0ccab3fec711135cdd1af5a09c26a870bfb2d2adc650d0fdbf16488,e0fea510c784eb0e440e99e53d83d414,,,2021-07-14,exe,
2659,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,,,2020-09-08,exe,
2660,d14540087dd6d350fb49e644e53ccefc0e4c1582,dba8a4717d516772b8cc06bd258265f21f7db6ddab62328917524c5a243c67a1,fe783ff4bccd76cb215ee1f7738d9ed5,,,2021-12-20,exe,


In [5]:
# Table-level quality figures for `entries`: duplicated rows and cell counts.
has_duplicates = entries.duplicated().any()      # True if any full row repeats
not_null_values = entries.notna().sum().sum()    # populated cells
null_values = entries.isna().sum().sum()         # empty cells
n_rows, n_cols = entries.shape
n_values = n_rows * n_cols                       # total number of cells

print("Duplicates: ",has_duplicates)
print("# not null values: ",not_null_values)
print("# null values: ",null_values)
print("# values: ",n_values)

# Re-read the file with extra tokens ('na', '--', ...) passed as na_values, so
# they are parsed as missing on top of pandas' built-in NA markers, and count
# the empty cells again.
missing_values = ['na','--', 'NaN', 'None', '']
ds_property_nan = pd.read_csv(path_Entries, na_values = missing_values)
print("# missings: ",ds_property_nan.isna().sum().sum())

Duplicates:  True
# not null values:  15206
# null values:  6090
# values:  21296
# missings:  6090


In [6]:
# Overall completeness: populated cells divided by all cells,
# reported as a percentage with one decimal.
completeness_ratio = not_null_values / n_values
completeness = f"{completeness_ratio * 100:.1f}%"
print("Completeness: ",completeness)

Completeness:  71.4%


In [8]:
def analyse_columns(df):
    """Print the completeness of every column of ``df``.

    Columns are visited in the frame's own column order and each one is
    handed to ``compute_completeness``. Nothing is returned.
    """
    for column_name in df.columns:
        compute_completeness(df, column_name)

def compute_completeness(df, col):
    """Print the share of non-missing values in column ``col`` of ``df``.

    The share is one minus the mean of the column's ``isna()`` mask; it is
    printed as a percentage with two decimals. Nothing is returned.
    """
    missing_share = df[col].isna().mean()
    completeness = 1 - missing_share
    percentage = completeness * 100

    print(f"Completeness for '{col}': {percentage:.2f}%")

In [9]:
# Per-column completeness of the Entries table (one printed line per column).
analyse_columns(entries)

Completeness for 'sha1': 100.00%
Completeness for 'sha256': 100.00%
Completeness for 'md5': 100.00%
Completeness for 'first_seen': 23.82%
Completeness for 'last_seen': 23.82%
Completeness for 'analysis_date': 99.77%
Completeness for 'file_type': 100.00%
Completeness for 'delivery_method': 23.82%


In [10]:
# Output locations of the two exports of the Entries profiling report.
html_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Entries_Report.html"
json_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Entries_Report.json"

# Build the report once in minimal mode, then export it.
profile = ProfileReport(
    entries,
    title="Entries - Pandas Profiling Report",
    minimal=True,
)

# HTML export
profile.to_file(html_report_path)

# JSON kept in memory as a string
json_data = profile.to_json()

# JSON written to disk
profile.to_file(json_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Data Profiling - ANOMALIES

In [11]:
# Anomalies CSV inside the PostDataIntegration folder.
# Plain (non-raw) literal: every "\\" is a single backslash. With the former
# r"..." prefix both characters of each "\\" were kept, so the path contained
# doubled separators.
path_Anomalies = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataIntegration\\Anomalies.csv"

In [12]:
# Load the Anomalies table. low_memory=False asks pandas to parse the file in
# one pass instead of in chunks.
anomalies = pd.read_csv(path_Anomalies, low_memory = False)
# Preview the first three rows as the cell output.
anomalies.head(3)

Unnamed: 0,sha1,sha256,md5,anomaly_name
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checksum header zero
1,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,checksum header zero
2,12d263832d7f08122fbfc4431da22016e0823539,cf1260b1d0686dbee4a5c9e3bcd09eff07bfae0a53d17356b72150f59ad1a497,497d3c4d08b80e03f622064dced10135,entropy based


In [13]:
# Table-level quality figures for `anomalies`: duplicated rows and cell counts.
has_duplicates = anomalies.duplicated().any()      # True if any full row repeats
not_null_values = anomalies.notna().sum().sum()    # populated cells
null_values = anomalies.isna().sum().sum()         # empty cells
n_rows, n_cols = anomalies.shape
n_values = n_rows * n_cols                         # total number of cells

print("Duplicates: ",has_duplicates)
print("# not null values: ",not_null_values)
print("# null values: ",null_values)
print("# values: ",n_values)

# Re-read the file with extra tokens ('na', '--', ...) passed as na_values, so
# they are parsed as missing on top of pandas' built-in NA markers, and count
# the empty cells again.
missing_values = ['na','--', 'NaN', 'None', '']
ds_property_nan = pd.read_csv(path_Anomalies, na_values = missing_values)
print("# missings: ",ds_property_nan.isna().sum().sum())

Duplicates:  False
# not null values:  12952
# null values:  0
# values:  12952
# missings:  0


In [14]:
# Overall completeness: populated cells divided by all cells,
# reported as a percentage with one decimal.
completeness_ratio = not_null_values / n_values
completeness = f"{completeness_ratio * 100:.1f}%"
print("Completeness: ",completeness)

Completeness:  100.0%


In [15]:
# Per-column completeness of the Anomalies table (one printed line per column).
analyse_columns(anomalies)

Completeness for 'sha1': 100.00%
Completeness for 'sha256': 100.00%
Completeness for 'md5': 100.00%
Completeness for 'anomaly_name': 100.00%


In [16]:
# Output locations of the two exports of the Anomalies profiling report.
html_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Anomalies_Report.html"
json_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Anomalies_Report.json"

# Build the report once in minimal mode, then export it.
profile = ProfileReport(
    anomalies,
    title="Anomalies - Pandas Profiling Report",
    minimal=True,
)

# HTML export
profile.to_file(html_report_path)

# JSON kept in memory as a string
json_data = profile.to_json()

# JSON written to disk
profile.to_file(json_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Data Profiling - ATTACKS

In [17]:
# Attacks CSV inside the PostDataIntegration folder.
# Plain (non-raw) literal: every "\\" is a single backslash. With the former
# r"..." prefix both characters of each "\\" were kept, so the path contained
# doubled separators.
path_Attacks = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataIntegration\\Attacks.csv"

In [18]:
# Load the Attacks table. low_memory=False asks pandas to parse the file in
# one pass instead of in chunks.
attacks = pd.read_csv(path_Attacks, low_memory = False)
# Preview the first three rows as the cell output.
attacks.head(3)

Unnamed: 0,sha1,sha256,md5,attack_id,attack_name,attack_type
0,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,T1060,Registry Run Keys / Start Folder,Installation
1,713a00b13a87c52b43404cd32d0f4645844709d9,3b96a384f9989f28315eb374eae991e70ed07b88665cbf831f24bd574502feac,97ad0714b0245825407dd25d7d168d31,T1056,Input Capture,Objectives
2,e900de5aea86567fadc7507a6811596eb4b86a14,3363270a2950c8518e7883ea147b3f947b3a96aa31d37a990c73f975946335ce,e076afd88fe6a708d91a3f771754e860,T1060,Registry Run Keys / Start Folder,Installation


In [19]:
# Table-level quality figures for `attacks`: duplicated rows and cell counts.
has_duplicates = attacks.duplicated().any()      # True if any full row repeats
not_null_values = attacks.notna().sum().sum()    # populated cells
null_values = attacks.isna().sum().sum()         # empty cells
n_rows, n_cols = attacks.shape
n_values = n_rows * n_cols                       # total number of cells

print("Duplicates: ",has_duplicates)
print("# not null values: ",not_null_values)
print("# null values: ",null_values)
print("# values: ",n_values)

# Re-read the file with extra tokens ('na', '--', ...) passed as na_values, so
# they are parsed as missing on top of pandas' built-in NA markers, and count
# the empty cells again.
missing_values = ['na','--', 'NaN', 'None', '']
ds_property_nan = pd.read_csv(path_Attacks, na_values = missing_values)
print("# missings: ",ds_property_nan.isna().sum().sum())

Duplicates:  False
# not null values:  32376
# null values:  0
# values:  32376
# missings:  0


In [20]:
# Overall completeness: populated cells divided by all cells,
# reported as a percentage with one decimal.
completeness_ratio = not_null_values / n_values
completeness = f"{completeness_ratio * 100:.1f}%"
print("Completeness: ",completeness)

Completeness:  100.0%


In [21]:
# Per-column completeness of the Attacks table (one printed line per column).
analyse_columns(attacks)

Completeness for 'sha1': 100.00%
Completeness for 'sha256': 100.00%
Completeness for 'md5': 100.00%
Completeness for 'attack_id': 100.00%
Completeness for 'attack_name': 100.00%
Completeness for 'attack_type': 100.00%


In [22]:
# Output locations of the two exports of the Attacks profiling report.
html_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Attacks_Report.html"
json_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Attacks_Report.json"

# Build the report once in minimal mode, then export it.
profile = ProfileReport(
    attacks,
    title="Attacks - Pandas Profiling Report",
    minimal=True,
)

# HTML export
profile.to_file(html_report_path)

# JSON kept in memory as a string
json_data = profile.to_json()

# JSON written to disk
profile.to_file(json_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Data Profiling - BEHAVIOURS

In [23]:
# Behaviours CSV inside the PostDataIntegration folder.
# Plain (non-raw) literal: every "\\" is a single backslash. With the former
# r"..." prefix both characters of each "\\" were kept, so the path contained
# doubled separators.
path_Behaviours = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataIntegration\\Behaviours.csv"

In [24]:
# Load the Behaviours table. low_memory=False asks pandas to parse the file in
# one pass instead of in chunks.
behaviours = pd.read_csv(path_Behaviours, low_memory = False)
# Preview the first three rows as the cell output.
behaviours.head(3)

Unnamed: 0,sha1,sha256,md5,action_name,action_description,maliciousness_level
0,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,checks debugger,checks if process is being debugged by a debugger,neutral
1,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,generates crypto key,uses windows apis to generate a cryptographic key,neutral
2,b701b181de8e595225668cb65d5b975e5b300a39,58edde581177c6f96caa4da4cd73396afee160b9152692a664b28aa67dd0d100,d6cda532ce834b973d5292e05e1d6c38,antivm memory available,"checks amount of memory in system, this can be used to detect virtual machines that have a low amount of memory available",neutral


In [25]:
# Table-level quality figures for `behaviours`: duplicated rows and cell counts.
has_duplicates = behaviours.duplicated().any()      # True if any full row repeats
not_null_values = behaviours.notna().sum().sum()    # populated cells
null_values = behaviours.isna().sum().sum()         # empty cells
n_rows, n_cols = behaviours.shape
n_values = n_rows * n_cols                          # total number of cells

print("Duplicates: ",has_duplicates)
print("# not null values: ",not_null_values)
print("# null values: ",null_values)
print("# values: ",n_values)

# Re-read the file with extra tokens ('na', '--', ...) passed as na_values, so
# they are parsed as missing on top of pandas' built-in NA markers, and count
# the empty cells again.
missing_values = ['na','--', 'NaN', 'None', '']
ds_property_nan = pd.read_csv(path_Behaviours, na_values = missing_values)
print("# missings: ",ds_property_nan.isna().sum().sum())

Duplicates:  True
# not null values:  317562
# null values:  9552
# values:  327114
# missings:  9552


In [26]:
# Overall completeness: populated cells divided by all cells,
# reported as a percentage with one decimal.
completeness_ratio = not_null_values / n_values
completeness = f"{completeness_ratio * 100:.1f}%"
print("Completeness: ",completeness)

Completeness:  97.1%


In [27]:
# Per-column completeness of the Behaviours table (one printed line per column).
analyse_columns(behaviours)

Completeness for 'sha1': 100.00%
Completeness for 'sha256': 100.00%
Completeness for 'md5': 100.00%
Completeness for 'action_name': 100.00%
Completeness for 'action_description': 82.48%
Completeness for 'maliciousness_level': 100.00%


In [28]:
# Output locations of the two exports of the Behaviours profiling report.
html_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Behaviours_Report.html"
json_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Behaviours_Report.json"

# Build the report once in minimal mode, then export it.
profile = ProfileReport(
    behaviours,
    title="Behaviours - Pandas Profiling Report",
    minimal=True,
)

# HTML export
profile.to_file(html_report_path)

# JSON kept in memory as a string
json_data = profile.to_json()

# JSON written to disk
profile.to_file(json_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Data Profiling - THREAT_VENDOR_ANALYSIS

In [29]:
# TV_Analysis CSV inside the PostDataIntegration folder.
# Plain (non-raw) literal: every "\\" is a single backslash. With the former
# r"..." prefix both characters of each "\\" were kept, so the path contained
# doubled separators.
path_TV_Analysis = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataIntegration\\TV_Analysis.csv"

In [30]:
# Load the threat-vendor analysis table. low_memory=False asks pandas to parse
# the file in one pass instead of in chunks.
tv_analysis = pd.read_csv(path_TV_Analysis, low_memory = False)
# Preview the first three rows as the cell output.
tv_analysis.head(3)

Unnamed: 0,sha1,sha256,md5,malware,verdict,threat_vendor_name
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,pua.win.packer.borlanddelphi-15,,clamav
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,securiteinfo.com.trojan.generickd.68242337.20401.3632.unofficial,,clamav
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,win.trojan.remcos-9753190-0,,clamav


In [31]:
# Table-level quality figures for `tv_analysis`: duplicated rows and cell counts.
has_duplicates = tv_analysis.duplicated().any()      # True if any full row repeats
not_null_values = tv_analysis.notna().sum().sum()    # populated cells
null_values = tv_analysis.isna().sum().sum()         # empty cells
n_rows, n_cols = tv_analysis.shape
n_values = n_rows * n_cols                           # total number of cells

print("Duplicates: ",has_duplicates)
print("# not null values: ",not_null_values)
print("# null values: ",null_values)
print("# values: ",n_values)

# Re-read the file with extra tokens ('na', '--', ...) passed as na_values, so
# they are parsed as missing on top of pandas' built-in NA markers, and count
# the empty cells again.
missing_values = ['na','--', 'NaN', 'None', '']
ds_property_nan = pd.read_csv(path_TV_Analysis, na_values = missing_values)
print("# missings: ",ds_property_nan.isna().sum().sum())

# Overall completeness: populated cells divided by all cells,
# reported as a percentage with one decimal.
completeness_ratio = not_null_values / n_values
completeness = f"{completeness_ratio * 100:.1f}%"
print("Completeness: ",completeness)

Duplicates:  True
# not null values:  52641
# null values:  5637
# values:  58278
# missings:  5637
Completeness:  90.3%


In [32]:
# Overall completeness: populated cells divided by all cells,
# reported as a percentage with one decimal.
# NOTE(review): the previous cell already ends with this same computation
# and prints the same figure; this cell repeats it.
completeness_ratio = not_null_values / n_values
completeness = f"{completeness_ratio * 100:.1f}%"
print("Completeness: ",completeness)

Completeness:  90.3%


In [33]:
# Per-column completeness of the threat-vendor analysis table (one printed line per column).
analyse_columns(tv_analysis)

Completeness for 'sha1': 100.00%
Completeness for 'sha256': 100.00%
Completeness for 'md5': 100.00%
Completeness for 'malware': 64.62%
Completeness for 'verdict': 77.34%
Completeness for 'threat_vendor_name': 100.00%


In [34]:
# Output locations of the two exports of the threat-vendor analysis profiling report.
html_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\TV_Analysis_Report.html"
json_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\TV_Analysis_Report.json"

# Build the report once in minimal mode, then export it.
profile = ProfileReport(
    tv_analysis,
    title="Threat Vendor Analysis - Pandas Profiling Report",
    minimal=True,
)

# HTML export
profile.to_file(html_report_path)

# JSON kept in memory as a string
json_data = profile.to_json()

# JSON written to disk
profile.to_file(json_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Data Profiling - TAGS

In [35]:
# Tags CSV inside the PostDataIntegration folder.
# Plain (non-raw) literal: every "\\" is a single backslash. With the former
# r"..." prefix both characters of each "\\" were kept, so the path contained
# doubled separators.
path_Tags = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataIntegration\\Tags.csv"

In [36]:
# Load the Tags table. low_memory=False asks pandas to parse the file in
# one pass instead of in chunks.
tags = pd.read_csv(path_Tags, low_memory = False)
# Preview the first three rows as the cell output.
tags.head(3)

Unnamed: 0,sha1,sha256,md5,tag_name
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,exe
1,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,remcosrat
2,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,32


In [37]:
# Table-level quality figures for `tags`: duplicated rows and cell counts.
has_duplicates = tags.duplicated().any()      # True if any full row repeats
not_null_values = tags.notna().sum().sum()    # populated cells
null_values = tags.isna().sum().sum()         # empty cells
n_rows, n_cols = tags.shape
n_values = n_rows * n_cols                    # total number of cells

print("Duplicates: ",has_duplicates)
print("# not null values: ",not_null_values)
print("# null values: ",null_values)
print("# values: ",n_values)

# Re-read the file with extra tokens ('na', '--', ...) passed as na_values, so
# they are parsed as missing on top of pandas' built-in NA markers, and count
# the empty cells again.
missing_values = ['na','--', 'NaN', 'None', '']
ds_property_nan = pd.read_csv(path_Tags, na_values = missing_values)
print("# missings: ",ds_property_nan.isna().sum().sum())

Duplicates:  True
# not null values:  6500
# null values:  0
# values:  6500
# missings:  0


In [38]:
# Overall completeness: populated cells divided by all cells,
# reported as a percentage with one decimal.
completeness_ratio = not_null_values / n_values
completeness = f"{completeness_ratio * 100:.1f}%"
print("Completeness: ",completeness)

Completeness:  100.0%


In [39]:
# Per-column completeness of the Tags table (one printed line per column).
analyse_columns(tags)

Completeness for 'sha1': 100.00%
Completeness for 'sha256': 100.00%
Completeness for 'md5': 100.00%
Completeness for 'tag_name': 100.00%


In [40]:
# Output locations of the two exports of the Tags profiling report.
html_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Tags_Report.html"
json_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Tags_Report.json"

# Build the report once in minimal mode, then export it.
profile = ProfileReport(
    tags,
    title="Tags - Pandas Profiling Report",
    minimal=True,
)

# HTML export
profile.to_file(html_report_path)

# JSON kept in memory as a string
json_data = profile.to_json()

# JSON written to disk
profile.to_file(json_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Data Profiling - YARA_RULES

In [41]:
# Yara_Rules CSV inside the PostDataIntegration folder.
# Plain (non-raw) literal: every "\\" is a single backslash. With the former
# r"..." prefix both characters of each "\\" were kept, so the path contained
# doubled separators.
path_yaraRules = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataIntegration\\Yara_Rules.csv"

In [42]:
# Load the Yara rules table. low_memory=False asks pandas to parse the file in
# one pass instead of in chunks.
yaraRules = pd.read_csv(path_yaraRules, low_memory = False)
# Preview the first three rows as the cell output.
yaraRules.head(3)

Unnamed: 0,sha1,sha256,md5,yara_rule_name,yara_rule_description,yara_rule_author
0,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,iexplorer remcos,detect iexplorer being taken over by remcos,iam-py-test
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,indicator suspicious exe uacbypass cmstpcom,detects windows exceutables bypassing uac using cmstp com interfaces. mitre (t1218.003),ditekSHen
2,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,pe imphash,,


In [43]:
# Table-level quality figures for `yaraRules`: duplicated rows and cell counts.
has_duplicates = yaraRules.duplicated().any()      # True if any full row repeats
not_null_values = yaraRules.notna().sum().sum()    # populated cells
null_values = yaraRules.isna().sum().sum()         # empty cells
n_rows, n_cols = yaraRules.shape
n_values = n_rows * n_cols                         # total number of cells

print("Duplicates: ",has_duplicates)
print("# not null values: ",not_null_values)
print("# null values: ",null_values)
print("# values: ",n_values)

# Re-read the file with extra tokens ('na', '--', ...) passed as na_values, so
# they are parsed as missing on top of pandas' built-in NA markers, and count
# the empty cells again.
missing_values = ['na','--', 'NaN', 'None', '']
ds_property_nan = pd.read_csv(path_yaraRules, na_values = missing_values)
print("# missings: ",ds_property_nan.isna().sum().sum())

Duplicates:  True
# not null values:  19067
# null values:  1561
# values:  20628
# missings:  1561


In [44]:
# Overall completeness: populated cells divided by all cells,
# reported as a percentage with one decimal.
completeness_ratio = not_null_values / n_values
completeness = f"{completeness_ratio * 100:.1f}%"
print("Completeness: ",completeness)

Completeness:  92.4%


In [45]:
# Per-column completeness of the Yara rules table (one printed line per column).
analyse_columns(yaraRules)

Completeness for 'sha1': 100.00%
Completeness for 'sha256': 100.00%
Completeness for 'md5': 100.00%
Completeness for 'yara_rule_name': 100.00%
Completeness for 'yara_rule_description': 70.39%
Completeness for 'yara_rule_author': 84.21%


In [46]:
# Output locations of the two exports of the Yara rules profiling report.
html_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Yara_Rules_Report.html"
json_report_path = "C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Report\\Profiling step 18\\Integrated_Dataset\\Yara_Rules_Report.json"

# Build the report once in minimal mode, then export it.
profile = ProfileReport(
    yaraRules,
    title="Yara Rules - Pandas Profiling Report",
    minimal=True,
)

# HTML export
profile.to_file(html_report_path)

# JSON kept in memory as a string
json_data = profile.to_json()

# JSON written to disk
profile.to_file(json_report_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Render JSON:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]