In [1]:
import os
import re
import itertools
import pandas as pd
import numpy as np
import glob
import time
import datetime
import multiprocessing
import concurrent.futures

In [2]:
# Extracts evidence from a text
def get_evidence_from_text(text, evi_types=['url']):
    """Find every URL and/or IPv4-like string in ``text``.

    Parameters
    ----------
    text : str
        The text to scan.
    evi_types : list of str, default ['url']
        Evidence types to look for. Each entry must be 'url' or 'ip'
        (matched case-insensitively). The list is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``type``, ``value``, ``start_pos``
        and ``end_pos``. Positions are 0-based offsets into ``text`` and
        ``end_pos`` is inclusive (offset of the last matched character).
        Rows are grouped by type, in the order the types were requested.
        ``start_pos`` and ``end_pos`` are always int64, including when nothing
        matched or when ``evi_types`` is empty, so that concatenating many
        results does not turn the positions into floats.

    Raises
    ------
    AssertionError
        If ``evi_types`` contains an unsupported evidence type.
    """
    patterns = {
        # http:// or https:// followed by a dotted host name and optional path segments.
        'url': r"\b(?:https?://)(?:www\.)?(?:\w+)(?:\.\w+)+(?:/[^/\s\"\'<>{}]*)*\b",
        # Four dot-separated groups of 1-3 digits.
        # NOTE(review): this simple pattern does not restrict each group to
        # 0-255 and is not anchored on word boundaries, so it also reports
        # values such as "1.000.000.000" or the tail of a longer version
        # string. A stricter alternative is:
        # r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
        'ip': r'(\d{1,3}\.){3}\d{1,3}',
    }

    # Normalise once so validation and lookup agree on case.
    wanted = [evi_type.lower() for evi_type in evi_types]
    assert set(wanted).issubset(patterns), "can't find evience types."

    # Collect all matches first and build a single frame; this also covers
    # the "no types requested" and "no matches" cases without pd.concat.
    records = [
        (evi_type, match.group(), match.start(), match.end() - 1)
        for evi_type in wanted
        for match in re.finditer(patterns[evi_type], text)
    ]
    found = pd.DataFrame(records, columns=['type', 'value', 'start_pos', 'end_pos'])
    return found.astype({'start_pos': 'int64', 'end_pos': 'int64'})


# Extracts evidence from a single line of a file
def get_evidence_from_line(file_id, text, line_num, evi_types=['url']):
    """Extract evidence from one line and tag each match with its origin.

    Runs ``get_evidence_from_text`` on ``text`` and adds three columns to the
    resulting frame: ``file_id`` (as given), ``text`` (the line with trailing
    whitespace stripped) and ``line_num`` (as given).
    """
    evidence = get_evidence_from_text(text, evi_types)
    origin_columns = {
        'file_id': file_id,
        'text': text.rstrip(),
        'line_num': line_num,
    }
    # Same column order as the keys above: file_id, text, line_num.
    for column, value in origin_columns.items():
        evidence[column] = value
    return evidence

# Extracts evidence from every line of a single file
def get_evidence_from_file(file, evi_types=['url']):
    """Extract evidence from every line of a UTF-8 text file.

    Returns the concatenation of the per-line frames produced by
    ``get_evidence_from_line`` (line numbers are 0-based, from ``enumerate``).
    Returns None when the file is empty, and also when it cannot be decoded
    as UTF-8, in which case the decoding error is printed.
    """
    try:
        with open(file, 'r', encoding="utf8") as handle:
            content = handle.readlines()
            if len(content) == 0:
                # Nothing to scan in an empty file.
                return None
            per_line_frames = []
            for index, row in enumerate(content):
                per_line_frames.append(get_evidence_from_line(file, row, index, evi_types))
            return pd.concat(per_line_frames)
    except UnicodeDecodeError as e:
        print(f"Error decoding file: {e} : {file}")
    return None
        
# Extracts evidence from every file under a directory (recursively) and optionally saves the results to a CSV file.
# By default the CSV is written to the ./output/ folder.
def get_evidence_from_folder(directory, evi_types=['url'], of_prefix="evi", support_threads=True, save=True):
    """Extract evidence from every file found recursively under ``directory``.

    Parameters
    ----------
    directory : str
        Root folder to scan.
    evi_types : list of str, default ['url']
        Evidence types, passed through to ``get_evidence_from_file``.
    of_prefix : str, default "evi"
        Prefix of the CSV file name used when ``save`` is true.
    support_threads : bool, default True
        If true, files are processed by a thread pool with one worker per
        CPU; otherwise they are processed one after another.
    save : bool, default True
        If true, the result is written to a timestamped CSV in ``./output/``
        (the folder is created when missing).

    Returns
    -------
    pandas.DataFrame
        All evidence rows with a fresh 0..n-1 index. When there are no files,
        or no file produced a frame, an empty frame with the columns
        type, value, start_pos, end_pos, file_id, text and line_num is
        returned instead of raising.
    """
    print(f'processing {directory} and # of files/directories in the directory is {len(os.listdir(directory))}' )
    start_time = time.time()

    # glob returns directories as well; keep only non-directory entries.
    all_files = [f for f in glob.glob(directory+"/**", recursive=True) if not os.path.isdir(f)]

    if support_threads:
        num_threads = multiprocessing.cpu_count()
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Submit everything before waiting on any result; calling
            # .result() right after each submit would run the files one by one.
            futures = [executor.submit(get_evidence_from_file, filename, evi_types)
                       for filename in all_files]
            # Collect in submission order so the row order matches the serial path.
            results = [future.result() for future in futures]
    else:
        results = [get_evidence_from_file(filename, evi_types) for filename in all_files]

    # Files that were empty or could not be decoded yield None.
    frames = [frame for frame in results if frame is not None]
    if frames:
        df = pd.concat(frames).reset_index(drop=True)
    else:
        # pd.concat raises on an empty / all-None list; return an empty result instead.
        df = pd.DataFrame(columns=['type', 'value', 'start_pos', 'end_pos',
                                   'file_id', 'text', 'line_num'])

    end_time = time.time()
    computing_time = end_time - start_time
    print(f"Computing time: {computing_time:.2f} seconds")

    if save:
        # Save the DataFrame to a CSV file
        folder_path = "./output/"
        os.makedirs(folder_path, exist_ok=True)

        current_time = datetime.datetime.now()
        # NOTE(review): there is no separator between the sanitised directory
        # name and the evidence types; kept as-is so file names stay unchanged.
        file_name = of_prefix+"_"+ re.sub(r'[^\w\s]+', '-', directory)+"_".join(evi_types)+\
            "_"+current_time.strftime("%Y-%m-%d_%H-%M-%S.csv")
        of = folder_path + file_name

        df.to_csv(of, encoding="utf8", index=False)

        print(f"The results are saved to a default folder {of}")
    return df


In [13]:
# Scan every file under the bin2text folder for IP-like strings.
# save is left at its default (True), so besides being displayed here the
# result is also written to a timestamped CSV under ./output/ whose name
# starts with "xu_simple".
# NOTE(review): the input is a hard-coded absolute path; point it at your own
# data directory before re-running this cell.
get_evidence_from_folder("E:/vmshared_storage/Kali202004/bin2text/", evi_types=['ip'], of_prefix="xu_simple")

processing E:/vmshared_storage/Kali202004/bin2text/ and # of files/directories in the directory is 6
Error decoding file: 'utf-8' codec can't decode byte 0xe9 in position 37: invalid continuation byte : E:/vmshared_storage/Kali202004/bin2text\txt\01336019a63a42c53f5b898a620f431953fe06c2.txt
Error decoding file: 'utf-8' codec can't decode byte 0x82 in position 7: invalid start byte : E:/vmshared_storage/Kali202004/bin2text\txt\02207641b1193afa96eee2f59b58d86c752f57ee.txt
Error decoding file: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte : E:/vmshared_storage/Kali202004/bin2text\txt\02f84a08618da7b25c3461544a09e1968eeb510f.txt
Error decoding file: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte : E:/vmshared_storage/Kali202004/bin2text\txt\09aa310e3635ced1d6c811f804fff456f74cd146.txt
Error decoding file: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte : E:/vmshared_storage/Kali202004/bin2text\txt\0c5be1c585cf2eeba7909

Unnamed: 0,type,value,start_pos,end_pos,file_id,text,line_num
0,ip,1.000.000.000,9.0,21.0,E:/vmshared_storage/Kali202004/bin2text\Apple_...,\t<string>1.000.000.000+</string>,77
1,ip,1.0.0.1,6.0,12.0,E:/vmshared_storage/Kali202004/bin2text\Apple_...,\t<key>1.0.0.1</key>,4612
2,ip,1.0.0.1,9.0,15.0,E:/vmshared_storage/Kali202004/bin2text\Apple_...,\t<string>1.0.0.1</string>,4613
3,ip,1.1.1.1,6.0,12.0,E:/vmshared_storage/Kali202004/bin2text\Apple_...,\t<key>1.1.1.1</key>,4616
4,ip,1.1.1.1,9.0,15.0,E:/vmshared_storage/Kali202004/bin2text\Apple_...,\t<string>1.1.1.1</string>,4617
...,...,...,...,...,...,...,...
6750,ip,3.4.2.1,20.0,26.0,E:/vmshared_storage/Kali202004/bin2text\txt\3e...,SCP Ver.......... 3.4.2.1.6.0,22
6751,ip,3.4.2.1,20.0,26.0,E:/vmshared_storage/Kali202004/bin2text\txt\3e...,SCP Ver.......... 3.4.2.1.6.0,41
6752,ip,100.0.33.17,35.0,45.0,E:/vmshared_storage/Kali202004/bin2text\txt\e7...,Apple clang version 11.0.0 (clang-1100.0.33.17),40
6753,ip,10.160.94.3,2.0,12.0,E:/vmshared_storage/Kali202004/bin2text\txt\e8...,# 10.160.94.3\tasecretkeygoeshere,1
