# 20.1 Data Enrichment - Threat Vendor Prediction

Predict the malware class and maliciousness from the Threat Vendor Analysis


In [1]:
from datetime import datetime
import os
import time
import pandas as pd 
import requests
import urllib3
import json
import sys
import math

import numpy as np

In [2]:
alias       = r'C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\IocSegnalations\\IntegratedDataset\\PostDataEnrichment\\Alias.csv'
malwares    = r'C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\IocSegnalations\\IntegratedDataset\\PostDataEnrichment\\Malwares.csv'
entries     = r'C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataEnrichment\\Entries.csv'
tv_analysis = r'C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataCleaning\\Integrated_Dataset\\TV_Analysis.csv'


## Enrichment Malware

Select a malware or an alias from the list of malwares table which is the most similiar to the malware assigned by the Threat Vendor

#### Alias 

In [None]:
alias_df = pd.read_csv(alias, low_memory = False)
alias_df

In [4]:
words_list = list(set(list(alias_df[alias_df['alias'].notna()]['alias'])))
len(words_list)


191

#### Malwares from Threat Vendor Analysis

In [None]:
tv_analysis_df = pd.read_csv(tv_analysis, low_memory = False)
tv_analysis_df

In [6]:
tv_analysis_df[['sha1', 'sha256', 'md5']].nunique()

sha1      2458
sha256    2458
md5       2458
dtype: int64

In [None]:
tv_analysis_df_new = tv_analysis_df.copy()

tv_analysis_df_new['worm'] = 0
tv_analysis_df_new['trojan'] = 0
tv_analysis_df_new['ransomware'] = 0
tv_analysis_df_new['rootkit'] = 0
tv_analysis_df_new['spyware'] = 0
tv_analysis_df_new['adware'] = 0
tv_analysis_df_new['botnet'] = 0
tv_analysis_df_new['keylogger'] = 0
tv_analysis_df_new['dropper'] = 0
tv_analysis_df_new['backdoor'] = 0
tv_analysis_df_new['downloader'] = 0


worm_keys       = ['worm', 'w0rm', 'slam']
trojan_keys     = ['rat', 'trj', 'trojan']
ransomware_keys = ['ransom', 'crypt', 'lock', 'ransomware']
keylog_keys     = ['key', 'keylog', ]
adware_keys     = ['adw']
rootkit_keys    = ['root', 'kit']
botnet_keys     = ['bot', 'c2', 'cc']
spyware_keys    = ['spy', 'steal']
backdoor_keys   = ['bd', 'back', 'door']
download_keys   = ['down', 'dl', 'load']
dropper_keys    = ['drop', 'drp']


def verifica_presenza(parole_chiave, stringa):
    for parola_chiave in parole_chiave:
        if parola_chiave.lower() in stringa.lower():
            return True
    return False

for i in range(len(tv_analysis_df_new)):
    
    tag_name = tv_analysis_df_new.at[i, 'malware']
    
    if(pd.notna(tag_name)):
    
        if verifica_presenza(worm_keys, tag_name):
            tv_analysis_df_new.loc[i, 'worm'] = 1

        if verifica_presenza(trojan_keys, tag_name):
            tv_analysis_df_new.loc[i, 'trojan'] = 1  

        if verifica_presenza(ransomware_keys, tag_name):
            tv_analysis_df_new.loc[i, 'ransomware'] = 1

        if verifica_presenza(rootkit_keys, tag_name):
            tv_analysis_df_new.loc[i, 'rootkit'] = 1

        if verifica_presenza(spyware_keys, tag_name):
            tv_analysis_df_new.loc[i, 'spyware'] = 1  

        if verifica_presenza(adware_keys, tag_name):
            tv_analysis_df_new.loc[i, 'adware'] = 1

        if verifica_presenza(botnet_keys, tag_name):
            tv_analysis_df_new.loc[i, 'botnet'] = 1

        if verifica_presenza(keylog_keys, tag_name):
            tv_analysis_df_new.loc[i, 'keylogger'] = 1    

        if verifica_presenza(dropper_keys, tag_name):
            tv_analysis_df_new.loc[i, 'dropper'] = 1    

        if verifica_presenza(backdoor_keys, tag_name):
            tv_analysis_df_new.loc[i, 'backdoor'] = 1   

        if verifica_presenza(download_keys, tag_name):
            tv_analysis_df_new.loc[i, 'downloader'] = 1 
        
tv_analysis_df_new       


In [None]:
from collections import Counter

def most_common_value(group):
    value_counts = Counter(group)
    most_common = max(value_counts, key=value_counts.get)
    return most_common

new = tv_analysis_df_new.dropna(subset=['verdict'])

new = pd.DataFrame(new.groupby(['sha1', 'sha256', 'md5'])['verdict'].agg(most_common_value).reset_index())

new

In [9]:
from collections import Counter

grouped_df = tv_analysis_df_new[['sha1', 'sha256', 'md5', 'worm', 'trojan', 'ransomware', 'rootkit',
       'spyware', 'adware', 'botnet', 'keylogger', 'dropper', 'backdoor',
       'downloader']].groupby(['sha1', 'sha256', 'md5']).sum()
max_column = grouped_df.idxmax(axis=1)
max_is_unique = grouped_df.eq(grouped_df.max(axis=1), axis=0).sum(axis=1) == 1

max_column[grouped_df.sum(axis=1) == 0 | max_is_unique ] = 'unknown'


type_df = pd.DataFrame(max_column).reset_index()
type_df = type_df.rename(columns={0 : 'malware_type'})

type_df 

Unnamed: 0,sha1,sha256,md5,malware_type
0,007416f9ba10d2e2951a9dcd104b7206ad20d433,094b57a07096fb1f866a104c0b14c84aa99815b613587780765c8c6cd6ef1a42,7baa4836bd81113c16960501f679a81b,unknown
1,0076d8c6a11389be49538c5ad7b7d785267d610b,a536dd3a57671b050f8928a41d84eaecb4424055e3fa62bb697e5cefc5effd05,8c92da8deae966909acff4f1703f3707,unknown
2,008f97da0165cb969cf5fd1eca915272be7d3ead,5fbae0ffc5fc830f591197bd52b2523ca7cb51e6fcc41d7f6f85c4bc738719d6,a9ec10648f84a04baeb42bfff166ac62,trojan
3,00deee1467fdb3da4538b92e11702938700c3c18,099562ce3bc1287853f17b20de1a32f043f600a20c430deb090dd4eb6b0e033a,08df5b6cfb6640881ec9bf5064739842,unknown
4,010235d7e9862a99bebfbe6356fa2a1314a60ce5,e276038d763f4304792e616270cd17e327a9cec96911d7c5575fbd29a3db4f71,aa3174796d63fcb60c4ab4a1868bb77c,unknown
...,...,...,...,...
2453,ff91376da89e90d19e7c9f53dd8915be22e83627,d9dcaa71fb3a6eb36123628d089cfa59f1b3f01120626a5e4d791a7adf4116ab,31b76b0b5ab72c603ef54efa63c32b22,trojan
2454,ff945fbb4577b5b8939d6f80367c5e4b6cdef99b,bf9c7574e3ca23a96e317b42385aee11a982ab20649a6954d507e9c76b4044b5,87243804ebf481b95392b3ec64774297,unknown
2455,ffa389bca10a6cc2ce21599a88fb46c1fe74ddae,498bb8801504768164f19943b448561cc59627fccb169a83cc2efd9b44afecc2,ebf038addbae83d5a577c3e0a9bb40d2,trojan
2456,fff8c580c32587b64b62794c35f22ecfe4787ac3,81e2da0121fbcfa18b7bd6ac1b587bf4b36aef521858356a33dd789ba1d345a9,695ec9e465a43e9f64f1a5926325948c,unknown


In [None]:
import re
    
tv_analysis2_df = tv_analysis_df.copy()

for i in range(len(tv_analysis2_df)):
        
    value = tv_analysis2_df.at[i,'malware']
    if (not isinstance(value, float) or not math.isnan(value)):    
        value = value.strip() 
        value = value.lower()
        value = value.replace(':', ' ').replace('.', ' ').replace('-', ' ').replace('\\',' ').replace('[' , ' ').replace( ']', ' ').replace( ',', ' ')
        value = ''.join(char for char in value if not char.isdigit())
        value = value.strip() 
    tv_analysis2_df.at[i,'to_predict'] = value
       
tv_analysis2_df

#### Jaro-Wrinkler Similarity index

In [11]:
# Jaro Winkler Similarity
def jaro_Winkler(s1, s2) :
 
    jaro_dist = jaro_distance(s1, s2);
 
    # If the jaro Similarity is above a threshold
    if (jaro_dist > 0.7) :
 
        # Find the length of common prefix
        prefix = 0;
 
        for i in range(min(len(s1), len(s2))) :
         
            # If the characters match
            if (s1[i] == s2[i]) :
                prefix += 1;
 
            # Else break
            else :
                break;
 
        # Maximum of 4 characters are allowed in prefix
        prefix = min(4, prefix);
 
        # Calculate jaro winkler Similarity
        jaro_dist += 0.1 * prefix * (1 - jaro_dist);
 
    return jaro_dist;


def jaro_distance(s1, s2) :
 
    # If the strings are equal
    if (s1 == s2) :
        return 1.0;
 
    # Length of two strings
    len1 = len(s1);
    len2 = len(s2);
 
    if (len1 == 0 or len2 == 0) :
        return 0.0;
 
    # Maximum distance upto which matching
    # is allowed
    max_dist = (max(len(s1), len(s2)) // 2 ) - 1;
 
    # Count of matches
    match = 0;
 
    # Hash for matches
    hash_s1 = [0] * len(s1) ;
    hash_s2 = [0] * len(s2) ;
 
    # Traverse through the first string
    for i in range(len1) :
 
        # Check if there is any matches
        for j in range( max(0, i - max_dist),
                    min(len2, i + max_dist + 1)) :
             
            # If there is a match
            if (s1[i] == s2[j] and hash_s2[j] == 0) :
                hash_s1[i] = 1;
                hash_s2[j] = 1;
                match += 1;
                break;
         
    # If there is no match
    if (match == 0) :
        return 0.0;
 
    # Number of transpositions
    t = 0;
 
    point = 0;
 
    # Count number of occurrences
    # where two characters match but
    # there is a third matched character
    # in between the indices
    for i in range(len1) :
        if (hash_s1[i]) :
 
            # Find the next matched character
            # in second string
            while (hash_s2[point] == 0) :
                point += 1;
 
            if (s1[i] != s2[point]) :
                point += 1;
                t += 1;
            else :
                point += 1;
                 
        t /= 2;
 
    # Return the Jaro Similarity
    return ((match / len1 + match / len2 +
            (match - t) / match ) / 3.0);

In [12]:
def compute_matrix(lista_riga, lista_colonna):
    
    df = pd.DataFrame()
    
    df.index = lista_riga
    
    df.rename(columns=lista_colonna, inplace=True)  
    
    for i in range(len(lista_riga)):
        for j in range(len(lista_colonna)):
            
            string1 = lista_riga[i]
            string2 = lista_colonna[j]
            
            jaro_wrinkler = jaro_Winkler(string1,string2)
            
            df.loc[string1, string2] = jaro_wrinkler
            
    return df
            
            

In [13]:
def check_similarity(df, words_list):
    
    dataframe = pd.DataFrame()
    
    for i in range(len(df)):
        
        #print(i)
        
        mal = str(df.at[i, 'to_predict'])
        
        malwares_list = mal.split()
        matrix = compute_matrix( malwares_list, words_list)
        
        malwares_df = search_similiar_malware(matrix, mal,  0.7)
        
        dataframe = pd.concat([malwares_df, dataframe])
        
    return dataframe
        
def search_similiar_malware(matrix, malware_tv , alpha):
    
    dataframe = pd.DataFrame()

    indexes = list(matrix.index)
    columns = list(matrix.columns)
    
    for i in range(len(indexes)):
        
        maximum = 0
        temp = pd.DataFrame()

        for j in range(len(columns)):
            
            x = indexes[i]
            y = columns[j]
                        
            element = matrix.at[x, y]
            
            #print(f'maximum : {maximum}, element : { element } , x : { x } , y : { y }')
            
            if isinstance(element, float) and (element > alpha) and (element > maximum) :
                
                maximum = element
                alias = y

                data = {'alias': [alias], 'tv_malware': [malware_tv]}
                
                temp = pd.DataFrame(data)
        
        if(len(temp) != 0):

            dataframe = pd.concat([dataframe, temp], ignore_index= True)
    
    return dataframe        

In [None]:
trial_df = check_similarity(tv_analysis2_df, words_list )

In [15]:
trial_df = trial_df.drop_duplicates().reset_index(drop = True)
trial_df

Unnamed: 0,alias,tv_malware
0,whitesnake stealer,win packed samas
1,iceid,win packed samas
2,scarimson,win packed samas
3,whitesnake stealer,win malware dlmc
4,amadey,win malware dlmc
...,...,...
2564,houdini,securiteinfo com generic mg bdedfed unofficial
2565,pupy,pua win packer borlanddelphi
2566,whitesnake stealer,pua win packer borlanddelphi
2567,darkcomet,pua win packer borlanddelphi


In [16]:
new_trial_df = trial_df.rename(columns = {'alias' : 'malware', 'tv_malware' : 'alias'})
new_aliases_df = pd.concat([alias_df, new_trial_df])

new_aliases_df.to_csv("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataEnrichment\\Alias.csv")

In [None]:
new_tv_analysis = tv_analysis2_df.copy()
new_tv_analysis = new_tv_analysis.sort_values(by=['sha1', 'sha256', 'md5'])
new_tv_analysis.tail(50)

In [18]:
new_tv_analysis = tv_analysis2_df.copy()

for i in range(len(trial_df)):
    
    tv_malware = trial_df.at[i, 'tv_malware']
    alias = trial_df.at[i, 'alias']

    malware = alias_df[alias_df.alias == alias].reset_index(drop = True).at[0,'malware']
    
    new_tv_analysis.loc[new_tv_analysis.to_predict == tv_malware , 'malware_printable'] = malware  
    
new_tv_analysis   

Unnamed: 0,sha1,sha256,md5,malware,verdict,threat_vendor_name,to_predict,malware_printable
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,pua.win.packer.borlanddelphi-15,,clamav,pua win packer borlanddelphi,banload
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,securiteinfo.com.trojan.generickd.68242337.20401.3632.unofficial,,clamav,securiteinfo com trojan generickd unofficial,houdini
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,win.trojan.remcos-9753190-0,,clamav,win trojan remcos,remcos
3,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,win.trojan.remcos-9841897-0,,clamav,win trojan remcos,remcos
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,securiteinfo.com.generic.mg.1b09de36dfe5850d.24204.unofficial,,clamav,securiteinfo com generic mg bdedfed unofficial,houdini
...,...,...,...,...,...,...,...,...
9708,0d7b506cdc8e0228a8c50f1adf3b1b3224fc114d,a48ab00c2a748d146264d9ceebed2346013a730bde2ee91bc03de76f8d262aa1,1e53278bd1d1ee7c84ffc2d44dce4117,win.packed.zusy-6860439-0,malicious,clamav,win packed zusy,icedid
9709,a3447ba9b83f30284c6d3effb45c31ad9d5f258f,bbd3ecd9e9671d94e8897980c4eb9391ae9cb444615ed9a93b8221ae8fa66790,f6eec1317ece3ffb7c4916e224d9734d,win.trojan.revengerat-6344273-0,malicious,clamav,win trojan revengerat,redline stealer
9710,3ab53037e2360c42b57d213234ced58717844a71,5de4932cb0ccab3fec711135cdd1af5a09c26a870bfb2d2adc650d0fdbf16488,e0fea510c784eb0e440e99e53d83d414,win.malware.generic-6623004-0,malicious,clamav,win malware generic,cobalt strike
9711,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,win.malware.dlmc-7395626-0,malicious,clamav,win malware dlmc,amadey


In [19]:
old_malware = new_tv_analysis.copy()
old_malware = pd.DataFrame(old_malware.groupby(['sha1', 'sha256', 'md5'])['malware_printable'].agg(most_common_value).reset_index())

old_malware

Unnamed: 0,sha1,sha256,md5,malware_printable
0,007416f9ba10d2e2951a9dcd104b7206ad20d433,094b57a07096fb1f866a104c0b14c84aa99815b613587780765c8c6cd6ef1a42,7baa4836bd81113c16960501f679a81b,
1,0076d8c6a11389be49538c5ad7b7d785267d610b,a536dd3a57671b050f8928a41d84eaecb4424055e3fa62bb697e5cefc5effd05,8c92da8deae966909acff4f1703f3707,cobalt strike
2,008f97da0165cb969cf5fd1eca915272be7d3ead,5fbae0ffc5fc830f591197bd52b2523ca7cb51e6fcc41d7f6f85c4bc738719d6,a9ec10648f84a04baeb42bfff166ac62,supremebot
3,00deee1467fdb3da4538b92e11702938700c3c18,099562ce3bc1287853f17b20de1a32f043f600a20c430deb090dd4eb6b0e033a,08df5b6cfb6640881ec9bf5064739842,
4,010235d7e9862a99bebfbe6356fa2a1314a60ce5,e276038d763f4304792e616270cd17e327a9cec96911d7c5575fbd29a3db4f71,aa3174796d63fcb60c4ab4a1868bb77c,cobalt strike
...,...,...,...,...
2453,ff91376da89e90d19e7c9f53dd8915be22e83627,d9dcaa71fb3a6eb36123628d089cfa59f1b3f01120626a5e4d791a7adf4116ab,31b76b0b5ab72c603ef54efa63c32b22,nanocore rat
2454,ff945fbb4577b5b8939d6f80367c5e4b6cdef99b,bf9c7574e3ca23a96e317b42385aee11a982ab20649a6954d507e9c76b4044b5,87243804ebf481b95392b3ec64774297,
2455,ffa389bca10a6cc2ce21599a88fb46c1fe74ddae,498bb8801504768164f19943b448561cc59627fccb169a83cc2efd9b44afecc2,ebf038addbae83d5a577c3e0a9bb40d2,cobalt strike
2456,fff8c580c32587b64b62794c35f22ecfe4787ac3,81e2da0121fbcfa18b7bd6ac1b587bf4b36aef521858356a33dd789ba1d345a9,695ec9e465a43e9f64f1a5926325948c,cobalt strike


In [20]:
old_malware['malware_printable'] = old_malware['malware_printable'].fillna('unknown')
old_malware

Unnamed: 0,sha1,sha256,md5,malware_printable
0,007416f9ba10d2e2951a9dcd104b7206ad20d433,094b57a07096fb1f866a104c0b14c84aa99815b613587780765c8c6cd6ef1a42,7baa4836bd81113c16960501f679a81b,unknown
1,0076d8c6a11389be49538c5ad7b7d785267d610b,a536dd3a57671b050f8928a41d84eaecb4424055e3fa62bb697e5cefc5effd05,8c92da8deae966909acff4f1703f3707,cobalt strike
2,008f97da0165cb969cf5fd1eca915272be7d3ead,5fbae0ffc5fc830f591197bd52b2523ca7cb51e6fcc41d7f6f85c4bc738719d6,a9ec10648f84a04baeb42bfff166ac62,supremebot
3,00deee1467fdb3da4538b92e11702938700c3c18,099562ce3bc1287853f17b20de1a32f043f600a20c430deb090dd4eb6b0e033a,08df5b6cfb6640881ec9bf5064739842,unknown
4,010235d7e9862a99bebfbe6356fa2a1314a60ce5,e276038d763f4304792e616270cd17e327a9cec96911d7c5575fbd29a3db4f71,aa3174796d63fcb60c4ab4a1868bb77c,cobalt strike
...,...,...,...,...
2453,ff91376da89e90d19e7c9f53dd8915be22e83627,d9dcaa71fb3a6eb36123628d089cfa59f1b3f01120626a5e4d791a7adf4116ab,31b76b0b5ab72c603ef54efa63c32b22,nanocore rat
2454,ff945fbb4577b5b8939d6f80367c5e4b6cdef99b,bf9c7574e3ca23a96e317b42385aee11a982ab20649a6954d507e9c76b4044b5,87243804ebf481b95392b3ec64774297,unknown
2455,ffa389bca10a6cc2ce21599a88fb46c1fe74ddae,498bb8801504768164f19943b448561cc59627fccb169a83cc2efd9b44afecc2,ebf038addbae83d5a577c3e0a9bb40d2,cobalt strike
2456,fff8c580c32587b64b62794c35f22ecfe4787ac3,81e2da0121fbcfa18b7bd6ac1b587bf4b36aef521858356a33dd789ba1d345a9,695ec9e465a43e9f64f1a5926325948c,cobalt strike


In [21]:
old_malware.malware_printable.value_counts()

malware_printable
cobalt strike                 1002
unknown                        411
remcos                         240
alien                          108
houdini                         98
mirai                           86
darkcomet                       75
asyncrat                        51
njrat                           50
nanocore rat                    33
ave maria                       24
bitrat                          20
ghost rat                       19
redline stealer                 15
xworm                           15
whitesnake stealer              13
darkgate                        11
supremebot                      11
smokeloader                     11
amadey                          11
quasar rat                      10
bumblebee                       10
crimson rat                     10
strrat                          10
loda                             9
kimsuky                          9
hydra                            9
fakeupdates                      9
as

In [22]:
merged_df = pd.merge(old_malware, type_df, on=['sha1', 'sha256', 'md5'])
merged_df = pd.merge(new, merged_df, on=['sha1', 'sha256', 'md5'])
merged_df = merged_df.rename(columns={'malware_printable' : 'malware', })
merged_df

Unnamed: 0,sha1,sha256,md5,verdict,malware,malware_type
0,007416f9ba10d2e2951a9dcd104b7206ad20d433,094b57a07096fb1f866a104c0b14c84aa99815b613587780765c8c6cd6ef1a42,7baa4836bd81113c16960501f679a81b,malicious,unknown,unknown
1,0076d8c6a11389be49538c5ad7b7d785267d610b,a536dd3a57671b050f8928a41d84eaecb4424055e3fa62bb697e5cefc5effd05,8c92da8deae966909acff4f1703f3707,malicious,cobalt strike,unknown
2,008f97da0165cb969cf5fd1eca915272be7d3ead,5fbae0ffc5fc830f591197bd52b2523ca7cb51e6fcc41d7f6f85c4bc738719d6,a9ec10648f84a04baeb42bfff166ac62,malicious,supremebot,trojan
3,00deee1467fdb3da4538b92e11702938700c3c18,099562ce3bc1287853f17b20de1a32f043f600a20c430deb090dd4eb6b0e033a,08df5b6cfb6640881ec9bf5064739842,malicious,unknown,unknown
4,010235d7e9862a99bebfbe6356fa2a1314a60ce5,e276038d763f4304792e616270cd17e327a9cec96911d7c5575fbd29a3db4f71,aa3174796d63fcb60c4ab4a1868bb77c,malicious,cobalt strike,unknown
...,...,...,...,...,...,...
2453,ff91376da89e90d19e7c9f53dd8915be22e83627,d9dcaa71fb3a6eb36123628d089cfa59f1b3f01120626a5e4d791a7adf4116ab,31b76b0b5ab72c603ef54efa63c32b22,malicious,nanocore rat,trojan
2454,ff945fbb4577b5b8939d6f80367c5e4b6cdef99b,bf9c7574e3ca23a96e317b42385aee11a982ab20649a6954d507e9c76b4044b5,87243804ebf481b95392b3ec64774297,malicious,unknown,unknown
2455,ffa389bca10a6cc2ce21599a88fb46c1fe74ddae,498bb8801504768164f19943b448561cc59627fccb169a83cc2efd9b44afecc2,ebf038addbae83d5a577c3e0a9bb40d2,malicious,cobalt strike,trojan
2456,fff8c580c32587b64b62794c35f22ecfe4787ac3,81e2da0121fbcfa18b7bd6ac1b587bf4b36aef521858356a33dd789ba1d345a9,695ec9e465a43e9f64f1a5926325948c,malicious,cobalt strike,unknown


In [23]:
entries_df = pd.read_csv(entries, low_memory = False)
entries_df = entries_df.drop(columns=['Unnamed: 0'])
entries_df = pd.merge(entries_df, merged_df, on=['sha1', 'sha256', 'md5'])
entries_df['#OTX_pulses'].fillna(0, inplace=True)
entries_df['#OTX_pulses'] = entries_df['#OTX_pulses'].astype(int)
entries_df

Unnamed: 0,sha1,sha256,md5,first_seen,last_seen,analysis_date,file_type,delivery_method,#OTX_pulses,verdict_x,malware_x,malware_type_x,verdict_y,malware_y,malware_type_y
0,411535c9548f86034a6a9a603de521a86a3b5466,04c53261b1220a894a02f5ffb39cdfd73f93481c0b5c8106d21c91b20205c62d,5b5a0225a2b7b9a054417662c733168a,2021-11-09,2021-11-09,2021-11-09,exe,email attachment,4,malicious,remcos,trojan,malicious,remcos,trojan
1,1ad9e9761fd6935c0cf5048c9615d0383baac48e,ac901bf5882f14e9e07235b8488b6479b4519addda6dbfb89147401c1e9e6e4f,da9534900ee0d11c9b30cf33152ea03c,2023-07-19,2023-08-25,2023-07-19,exe,web download,2,malicious,remcos,trojan,malicious,remcos,trojan
2,8ab63abfe58fb218cc001581ac0d9fe38c784157,7f514ed5e1ec262953e6252a4089531c519e95d700c5808415b0f049fc59a5f0,855232a26e8afcd0cca5f816a1919dee,2022-01-04,2022-02-04,2022-01-04,exe,unknown,11,malicious,remcos,trojan,malicious,remcos,trojan
3,55cbf8dae95d350ad7e5a63b418f6cd203488f51,8b7ae9f195b075a789d6d8277d500d27754bfa3c53ecca8db7beac8ccd07884f,97d9d216f2627cb238ca6637580b1739,2023-06-13,2023-06-13,2022-02-09,exe,unknown,6,malicious,remcos,trojan,malicious,remcos,trojan
4,a2915c1be9e6134b7bf3ca5ca00eeb0c969bedab,6c2a2251861a6d2701814843fadac940cf4d34db9f446f0698352fd866b31739,1b09de36dfe5850d7e3fbd6b39c89a43,2022-03-27,2022-04-06,2021-02-11,exe,web download,2,malicious,houdini,trojan,malicious,houdini,trojan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2469,cfb22401f0daf38925601e1d6daaaa37e112b440,71df2a2ebf19a75e3f0bfc9b80e4066de9e44cd151ba700e2fe55d1f4be8b17a,284caed6cde33bdb536907a2e51f1311,,,2021-03-11,exe,unknown,0,malicious,cobalt strike,unknown,malicious,cobalt strike,unknown
2470,6b6955af591154d3f6e2db35ca9904cce7e264b1,3a9d8c693b7dc6a9c2bf876c5626ebd0923e35bd5f698baed248df7f51f67275,03872a722df0b073a84fd1aa45eb9376,,,2020-02-20,elf,unknown,0,malicious,cobalt strike,unknown,malicious,cobalt strike,unknown
2471,3ab53037e2360c42b57d213234ced58717844a71,5de4932cb0ccab3fec711135cdd1af5a09c26a870bfb2d2adc650d0fdbf16488,e0fea510c784eb0e440e99e53d83d414,,,2021-07-14,exe,unknown,0,malicious,cobalt strike,unknown,malicious,cobalt strike,unknown
2472,4f9b9859a596a8bfdfee516788587ca0c0fb6ae8,f10dd5ce32d72489dd2696e5b58ee115f3cd065969a61a6613cab27bcd57de70,c9f771be253b6ca1d7a808d7d65c5e29,,,2020-09-08,exe,unknown,2,malicious,unknown,unknown,malicious,unknown,unknown


In [24]:
entries_df.to_csv("C:\\Users\\leona\\OneDrive\\Desktop\\Tesi\\Pipeline\\Datasets\\Pipeline\\FileAnalysis\\PostDataEnrichment\\Entries.csv")

In [25]:
'''
jwm_tv_analysis_old_alias = compute_matrix( tv_words, words_list)
jwm_tv_analysis_old_alias

'''

'\njwm_tv_analysis_old_alias = compute_matrix( tv_words, words_list)\njwm_tv_analysis_old_alias\n\n'

In [26]:
'''
jwm_tv_analysis_tv_analysis = compute_matrix( tv_words, tv_words)
jwm_tv_analysis_tv_analysis
'''



'\njwm_tv_analysis_tv_analysis = compute_matrix( tv_words, tv_words)\njwm_tv_analysis_tv_analysis\n'

In [27]:
'''
def search_similarity(matrix, alias, alpha):
    
    dataframe1 = pd.DataFrame()
    dataframe = pd.DataFrame()

    indexes = list(matrix.index)
    columns = list(matrix.columns)
    
    for i in range(len(indexes)):
        for j in range(len(columns)):
            
            x = indexes[i]
            y = columns[j]
                        
            element = matrix.at[x, y]
            
            if((element > alpha)):

                data = {'alias': [y], 'tv_malware': [x]}
                temp = pd.DataFrame(data)
                
                dataframe1 = pd.concat([dataframe1, temp], ignore_index= True)
    
    dataframe1 = dataframe1.reset_index(drop=True)
    
    for i in range(len(dataframe1)):
        
        tv_malware_element = dataframe1.at[i, 'tv_malware'] 
        alias_element = dataframe1.at[i, 'alias']
        
        malware = alias[ alias.alias == alias_element ].reset_index(drop = True).at[0, 'malware']
        
        data = {'malware': [malware], 'alias': [alias_element]}
        temp1 = pd.DataFrame(data)

        dataframe = pd.concat([dataframe, temp1], ignore_index= True)
        
    dataframe = dataframe.reset_index(drop=True)
    
    return dataframe
'''

"\ndef search_similarity(matrix, alias, alpha):\n    \n    dataframe1 = pd.DataFrame()\n    dataframe = pd.DataFrame()\n\n    indexes = list(matrix.index)\n    columns = list(matrix.columns)\n    \n    for i in range(len(indexes)):\n        for j in range(len(columns)):\n            \n            x = indexes[i]\n            y = columns[j]\n                        \n            element = matrix.at[x, y]\n            \n            if((element > alpha)):\n\n                data = {'alias': [y], 'tv_malware': [x]}\n                temp = pd.DataFrame(data)\n                \n                dataframe1 = pd.concat([dataframe1, temp], ignore_index= True)\n    \n    dataframe1 = dataframe1.reset_index(drop=True)\n    \n    for i in range(len(dataframe1)):\n        \n        tv_malware_element = dataframe1.at[i, 'tv_malware'] \n        alias_element = dataframe1.at[i, 'alias']\n        \n        malware = alias[ alias.alias == alias_element ].reset_index(drop = True).at[0, 'malware']\n        

In [28]:
'''
new_alias_df = pd.concat([search_similarity(jwm_tv_analysis_old_alias, alias_df, 0.80), alias_df]).drop_duplicates().reset_index(drop=True)

new_alias_df
'''



'\nnew_alias_df = pd.concat([search_similarity(jwm_tv_analysis_old_alias, alias_df, 0.80), alias_df]).drop_duplicates().reset_index(drop=True)\n\nnew_alias_df\n'

In [29]:
'''def search_similarity_in_matrix(matrix, df, alpha):
    
    dataframe = pd.DataFrame()

    indexes = list(matrix.index)
    columns = list(matrix.columns)
    
    for i in range(len(indexes)):
        for j in range(len(columns)):
            
            if(i > j):
            
                x = indexes[i]
                y = columns[j]
                        
                element = matrix.at[x, y]
            
                if((element > alpha)):                   
                    
                    data = {'A': [y], 'B': [x]}
                    temp = pd.DataFrame(data)
                
                    dataframe = pd.concat([dataframe, temp], ignore_index= True)
    
    return dataframe'''

"def search_similarity_in_matrix(matrix, df, alpha):\n    \n    dataframe = pd.DataFrame()\n\n    indexes = list(matrix.index)\n    columns = list(matrix.columns)\n    \n    for i in range(len(indexes)):\n        for j in range(len(columns)):\n            \n            if(i > j):\n            \n                x = indexes[i]\n                y = columns[j]\n                        \n                element = matrix.at[x, y]\n            \n                if((element > alpha)):                   \n                    \n                    data = {'A': [y], 'B': [x]}\n                    temp = pd.DataFrame(data)\n                \n                    dataframe = pd.concat([dataframe, temp], ignore_index= True)\n    \n    return dataframe"

In [30]:
'''similarity_matrix_dataframe = search_similarity_in_matrix(jwm_tv_analysis_tv_analysis, new_alias_df, 0.9)
similarity_matrix_dataframe = similarity_matrix_dataframe.drop_duplicates()
similarity_matrix_dataframe'''

'similarity_matrix_dataframe = search_similarity_in_matrix(jwm_tv_analysis_tv_analysis, new_alias_df, 0.9)\nsimilarity_matrix_dataframe = similarity_matrix_dataframe.drop_duplicates()\nsimilarity_matrix_dataframe'

In [31]:
'''def add_alias(new_possible_alias_df, alias_df):
    
    for i in range(len(new_possible_alias_df)):
        
        a_element = new_possible_alias_df.at[i, 'A'] 
        b_element = new_possible_alias_df.at[i, 'B']
        
        malwares = list(alias_df[ ( alias_df.alias == a_element ) | ( alias_df.alias == b_element ) ].reset_index(drop = True)['malware'])
        
        if(len(malwares) == 0):
            
            data = {'malware'}
            
        if(len(malwares) == 1):
            
            
'''        
        

"def add_alias(new_possible_alias_df, alias_df):\n    \n    for i in range(len(new_possible_alias_df)):\n        \n        a_element = new_possible_alias_df.at[i, 'A'] \n        b_element = new_possible_alias_df.at[i, 'B']\n        \n        malwares = list(alias_df[ ( alias_df.alias == a_element ) | ( alias_df.alias == b_element ) ].reset_index(drop = True)['malware'])\n        \n        if(len(malwares) == 0):\n            \n            data = {'malware'}\n            \n        if(len(malwares) == 1):\n            \n            \n"

In [32]:
'''similarity_dataframe.head(50)'''

'similarity_dataframe.head(50)'