#PREPEARING SHODAN

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import ast

DATA_FOLDER = "../../DATA/"

In [2]:
shodan = pd.read_csv(DATA_FOLDER + "RAW/shodan_df_hashed.csv")
shodan

Unnamed: 0,shodan_info,attacker_ip_enum
0,{},5915
1,"{'22/tcp': {'headers_hash': None, 'jarm': None...",3325
2,{},8416
3,{},1213
4,{},9185
...,...,...
197674,"{'1701/udp': {'headers_hash': None, 'jarm': No...",196812
197675,{},191141
197676,"{'80/tcp': {'headers_hash': -282574487, 'jarm'...",195077
197677,{},198002


In [3]:
shodan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197679 entries, 0 to 197678
Data columns (total 2 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   shodan_info       197679 non-null  object
 1   attacker_ip_enum  197679 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.0+ MB


In [4]:
shodan['shodan_info'] = shodan['shodan_info'].apply(ast.literal_eval)
shodan['port_list'] = shodan["shodan_info"].apply(lambda x: [key for key in x.keys()])
shodan

Unnamed: 0,shodan_info,attacker_ip_enum,port_list
0,{},5915,[]
1,"{'22/tcp': {'headers_hash': None, 'jarm': None...",3325,[22/tcp]
2,{},8416,[]
3,{},1213,[]
4,{},9185,[]
...,...,...,...
197674,"{'1701/udp': {'headers_hash': None, 'jarm': No...",196812,"[1701/udp, 8728/tcp]"
197675,{},191141,[]
197676,"{'80/tcp': {'headers_hash': -282574487, 'jarm'...",195077,"[80/tcp, 443/tcp, 541/tcp]"
197677,{},198002,[]


# FEATURE 1 (open ports COUNT)

In [5]:
shodan['port_count'] = shodan["shodan_info"].apply(lambda x: len(x))
shodan

Unnamed: 0,shodan_info,attacker_ip_enum,port_list,port_count
0,{},5915,[],0
1,"{'22/tcp': {'headers_hash': None, 'jarm': None...",3325,[22/tcp],1
2,{},8416,[],0
3,{},1213,[],0
4,{},9185,[],0
...,...,...,...,...
197674,"{'1701/udp': {'headers_hash': None, 'jarm': No...",196812,"[1701/udp, 8728/tcp]",2
197675,{},191141,[],0
197676,"{'80/tcp': {'headers_hash': -282574487, 'jarm'...",195077,"[80/tcp, 443/tcp, 541/tcp]",3
197677,{},198002,[],0


# FEATURE 2 AND 3(PORT TYPE TCP/UDP)

In [6]:
shodan["HAS_TCP?"] = shodan["port_list"].apply(lambda x: any("HAS_TCP?" in item.lower() for item in x))
shodan

Unnamed: 0,shodan_info,attacker_ip_enum,port_list,port_count,HAS_TCP?
0,{},5915,[],0,False
1,"{'22/tcp': {'headers_hash': None, 'jarm': None...",3325,[22/tcp],1,False
2,{},8416,[],0,False
3,{},1213,[],0,False
4,{},9185,[],0,False
...,...,...,...,...,...
197674,"{'1701/udp': {'headers_hash': None, 'jarm': No...",196812,"[1701/udp, 8728/tcp]",2,False
197675,{},191141,[],0,False
197676,"{'80/tcp': {'headers_hash': -282574487, 'jarm'...",195077,"[80/tcp, 443/tcp, 541/tcp]",3,False
197677,{},198002,[],0,False


In [7]:
shodan["HAS_UDP?"] = shodan["port_list"].apply(lambda x: any("HAS_UDP?" in item.lower() for item in x))
shodan

Unnamed: 0,shodan_info,attacker_ip_enum,port_list,port_count,HAS_TCP?,HAS_UDP?
0,{},5915,[],0,False,False
1,"{'22/tcp': {'headers_hash': None, 'jarm': None...",3325,[22/tcp],1,False,False
2,{},8416,[],0,False,False
3,{},1213,[],0,False,False
4,{},9185,[],0,False,False
...,...,...,...,...,...,...
197674,"{'1701/udp': {'headers_hash': None, 'jarm': No...",196812,"[1701/udp, 8728/tcp]",2,False,False
197675,{},191141,[],0,False,False
197676,"{'80/tcp': {'headers_hash': -282574487, 'jarm'...",195077,"[80/tcp, 443/tcp, 541/tcp]",3,False,False
197677,{},198002,[],0,False,False


# FEATURE 4 HAS RISKY PORTS?
based on
https://www.all-about-security.de/identifying-secure-and-unsecured-ports-and-how-to-secure-them/


Ports 137 and 139 (NetBIOS over TCP) and 445 (SMB)
Port 22 (SSH)
Port 53 (DNS)
Port 25 (SMTP)
Port 3389 (remote desktop)
Ports 80, 443, 8080 and 8443 (HTTP and HTTPS)
Ports 20 and 21 (FTP)
Port 23 (Telnet)
Ports 1433, 1434 and 3306 (used by databases)

In [8]:
risky_ports =["137", "139", "445", "22", "53", "25", "3389","80", "443", "8080", "8443", "20", "21", "23", "1433", "1434", "3306"]

In [9]:
def has_common_elements(l_ports):
    for x in l_ports:
        if x.split("/")[0] in risky_ports:
            return True
    return False

shodan['risky_ports'] = shodan['port_list'].apply(lambda x: has_common_elements(x))

del risky_ports

shodan


Unnamed: 0,shodan_info,attacker_ip_enum,port_list,port_count,HAS_TCP?,HAS_UDP?,risky_ports
0,{},5915,[],0,False,False,False
1,"{'22/tcp': {'headers_hash': None, 'jarm': None...",3325,[22/tcp],1,False,False,True
2,{},8416,[],0,False,False,False
3,{},1213,[],0,False,False,False
4,{},9185,[],0,False,False,False
...,...,...,...,...,...,...,...
197674,"{'1701/udp': {'headers_hash': None, 'jarm': No...",196812,"[1701/udp, 8728/tcp]",2,False,False,False
197675,{},191141,[],0,False,False,False
197676,"{'80/tcp': {'headers_hash': -282574487, 'jarm'...",195077,"[80/tcp, 443/tcp, 541/tcp]",3,False,False,True
197677,{},198002,[],0,False,False,False


#SAVE SHODAN

In [10]:
shodan = shodan.drop(['shodan_info', 'port_list'], axis=1)
shodan.to_parquet(DATA_FOLDER + 'PROCESSING/NEW_SHODAN_FEATURES.parq', index=True)
del shodan

#Load train

In [11]:
df = pd.read_parquet(DATA_FOLDER + "RAW/train.parq").dropna().drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40014024 entries, 0 to 61629682
Data columns (total 11 columns):
 #   Column             Dtype              
---  ------             -----              
 0   attack_time        datetime64[us, UTC]
 1   watcher_country    category           
 2   watcher_as_num     float32            
 3   watcher_as_name    category           
 4   attacker_country   category           
 5   attacker_as_num    float32            
 6   attacker_as_name   category           
 7   attack_type        category           
 8   watcher_uuid_enum  int32              
 9   attacker_ip_enum   int32              
 10  label              int8               
dtypes: category(5), datetime64[us, UTC](1), float32(2), int32(2), int8(1)
memory usage: 1.6 GB


#SAVE FOR MODELS

In [12]:
df['attack_time'] = pd.to_datetime(df['attack_time']).dt.day
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40014024 entries, 0 to 61629682
Data columns (total 11 columns):
 #   Column             Dtype   
---  ------             -----   
 0   attack_time        int32   
 1   watcher_country    category
 2   watcher_as_num     float32 
 3   watcher_as_name    category
 4   attacker_country   category
 5   attacker_as_num    float32 
 6   attacker_as_name   category
 7   attack_type        category
 8   watcher_uuid_enum  int32   
 9   attacker_ip_enum   int32   
 10  label              int8    
dtypes: category(5), float32(2), int32(3), int8(1)
memory usage: 1.4 GB


#NORMALIZER


In [13]:
scaler = StandardScaler()
df[['attack_time']] = scaler.fit_transform(df[['attack_time']])
df[['watcher_as_num']] = scaler.fit_transform(df[['watcher_as_num']])
df[['attacker_as_num']] = scaler.fit_transform(df[['attacker_as_num']])
df[['watcher_uuid_enum']] = scaler.fit_transform(df[['watcher_uuid_enum']])
del scaler

# FEATURE 5 and 6 (splitting attack type)

In [14]:
df[["protocol", "attack_type"]] = df['attack_type'].str.split(':', expand=True)
df

Unnamed: 0,attack_time,watcher_country,watcher_as_num,watcher_as_name,attacker_country,attacker_as_num,attacker_as_name,attack_type,watcher_uuid_enum,attacker_ip_enum,label,protocol
0,1.762414,DE,-0.294419,Host Europe GmbH,TR,-0.191146,Murat Aktas,exploit,-0.963938,6466,0,http
1,1.762414,DE,-0.294419,Host Europe GmbH,TR,-0.191146,Murat Aktas,spam,-0.963938,6466,0,http
2,1.762414,DE,-0.424088,bn:t Blatzheim Networks Telecom GmbH,DE,-0.157041,Contabo GmbH,bruteforce,-0.963795,4637,0,http
3,1.762414,DE,-0.424088,bn:t Blatzheim Networks Telecom GmbH,DE,-0.157041,Contabo GmbH,spam,-0.963795,4637,0,http
4,1.762414,DE,-0.424088,bn:t Blatzheim Networks Telecom GmbH,DE,-0.157041,Contabo GmbH,exploit,-0.963795,4637,0,http
...,...,...,...,...,...,...,...,...,...,...,...,...
61629671,0.940076,US,-0.407045,INMOTION,US,-0.593989,ATT-INTERNET4,bruteforce,-0.275084,191439,0,http
61629673,0.940076,US,-0.407045,INMOTION,US,-0.593989,ATT-INTERNET4,scan,-0.275084,191439,0,http
61629674,0.940076,US,-0.407045,INMOTION,US,-0.593989,ATT-INTERNET4,exploit,-0.275084,191439,0,http
61629681,0.940076,US,-0.407036,NAMECHEAP-NET,US,-0.435487,WOW,scan,1.931960,193446,0,http


# FEATURE 7 (joining countries)

In [15]:
df["joined_countries"] = df["attacker_country"].astype(str) + ":" + df["watcher_country"].astype(str)
df

Unnamed: 0,attack_time,watcher_country,watcher_as_num,watcher_as_name,attacker_country,attacker_as_num,attacker_as_name,attack_type,watcher_uuid_enum,attacker_ip_enum,label,protocol,joined_countries
0,1.762414,DE,-0.294419,Host Europe GmbH,TR,-0.191146,Murat Aktas,exploit,-0.963938,6466,0,http,TR:DE
1,1.762414,DE,-0.294419,Host Europe GmbH,TR,-0.191146,Murat Aktas,spam,-0.963938,6466,0,http,TR:DE
2,1.762414,DE,-0.424088,bn:t Blatzheim Networks Telecom GmbH,DE,-0.157041,Contabo GmbH,bruteforce,-0.963795,4637,0,http,DE:DE
3,1.762414,DE,-0.424088,bn:t Blatzheim Networks Telecom GmbH,DE,-0.157041,Contabo GmbH,spam,-0.963795,4637,0,http,DE:DE
4,1.762414,DE,-0.424088,bn:t Blatzheim Networks Telecom GmbH,DE,-0.157041,Contabo GmbH,exploit,-0.963795,4637,0,http,DE:DE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61629671,0.940076,US,-0.407045,INMOTION,US,-0.593989,ATT-INTERNET4,bruteforce,-0.275084,191439,0,http,US:US
61629673,0.940076,US,-0.407045,INMOTION,US,-0.593989,ATT-INTERNET4,scan,-0.275084,191439,0,http,US:US
61629674,0.940076,US,-0.407045,INMOTION,US,-0.593989,ATT-INTERNET4,exploit,-0.275084,191439,0,http,US:US
61629681,0.940076,US,-0.407036,NAMECHEAP-NET,US,-0.435487,WOW,scan,1.931960,193446,0,http,US:US


In [16]:
df['protocol'] = df['protocol'].astype('category')
df['attack_type'] = df['attack_type'].astype('category')
df['joined_countries'] = df['joined_countries'].astype('category')
df["attacker_ip_enum"] = df["attacker_ip_enum"].astype(np.int64)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40014024 entries, 0 to 61629682
Data columns (total 13 columns):
 #   Column             Dtype   
---  ------             -----   
 0   attack_time        float64 
 1   watcher_country    category
 2   watcher_as_num     float32 
 3   watcher_as_name    category
 4   attacker_country   category
 5   attacker_as_num    float32 
 6   attacker_as_name   category
 7   attack_type        category
 8   watcher_uuid_enum  float64 
 9   attacker_ip_enum   int64   
 10  label              int8    
 11  protocol           category
 12  joined_countries   category
dtypes: category(7), float32(2), float64(2), int64(1), int8(1)
memory usage: 2.0 GB


In [17]:
scaler = StandardScaler()
df[['attack_time']] = scaler.fit_transform(df[['attack_time']])
df[['watcher_as_num']] = scaler.fit_transform(df[['watcher_as_num']])
df[['attacker_as_num']] = scaler.fit_transform(df[['attacker_as_num']])
df[['watcher_uuid_enum']] = scaler.fit_transform(df[['watcher_uuid_enum']])

In [18]:
df.to_parquet(DATA_FOLDER + 'PROCESSING/NEW_TRAIN_FEATURES.parq', index=True)