# Preparación de entorno y dataset

In [1]:
! pip install pyarrow
! pip install -q kaggle



In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [3]:
pd.options.display.float_format = '{:20,.4f}'.format # suprimimos la notacion cientifica en los outputs

In [4]:
from google.colab import files

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"hangyeolkoo","key":"4032f7e1f591049aec886c05d7504de2"}'}

In [5]:
! mkdir ~/.kaggle

In [6]:
! cp kaggle.json ~/.kaggle/

In [7]:
! chmod 600 ~/.kaggle/kaggle.json

In [8]:
! kaggle competitions download vpn-classification -f dataset_v2/train.parq

Downloading train.parq.zip to /content
 97% 545M/564M [00:03<00:00, 149MB/s]
100% 564M/564M [00:03<00:00, 149MB/s]


In [9]:
! kaggle competitions download vpn-classification -f dataset_v2/test.parq

Downloading test.parq.zip to /content
 94% 155M/164M [00:01<00:00, 119MB/s] 
100% 164M/164M [00:01<00:00, 101MB/s]


In [10]:
! kaggle competitions download vpn-classification -f dataset_v2/shodan_df_hashed.csv

Downloading shodan_df_hashed.csv.zip to /content
  0% 0.00/2.11M [00:00<?, ?B/s]
100% 2.11M/2.11M [00:00<00:00, 125MB/s]


In [11]:
! unzip train.parq.zip

Archive:  train.parq.zip
  inflating: train.parq              


In [12]:
! unzip test.parq.zip

Archive:  test.parq.zip
  inflating: test.parq               


In [13]:
! unzip shodan_df_hashed.csv.zip

Archive:  shodan_df_hashed.csv.zip
  inflating: shodan_df_hashed.csv    


## Manejo de train dataset

In [14]:
train = pd.read_parquet("train.parq",engine="auto")
test = pd.read_parquet("test.parq",engine="auto")

**Imputo los valores nulos**

Por cuestiones de perfomance se decidió crear un propio Imputer, que rellena los NaNs con los datos más frecuentes. Esto se llevó a cabo sólo en dataset de test, para no utilizar informaciones erróneas para el entrenamiento.

In [15]:
class MostFrequentImputer:
    def __init__(self):
        self.most_frequent_values = None

    def fit(self, df):
        self.most_frequent_values = {col: df[col].mode().iloc[0] for col in df.columns}
        return self

    def transform(self, df):
        for col, value in self.most_frequent_values.items():
            df[col] = df[col].fillna(value)
        return df

In [16]:
imputer = MostFrequentImputer()

test_columns = test.columns

imputer.fit(test)
imputer.transform(test)

test.isna().sum().sum()

0

In [17]:
train.head()

Unnamed: 0,attack_time,watcher_country,watcher_as_num,watcher_as_name,attacker_country,attacker_as_num,attacker_as_name,attack_type,watcher_uuid_enum,attacker_ip_enum,label
0,2023-07-31 07:17:51+00:00,DE,34011.0,Host Europe GmbH,TR,47721.0,Murat Aktas,http:exploit,0,6466,0
1,2023-07-31 07:17:51+00:00,DE,34011.0,Host Europe GmbH,TR,47721.0,Murat Aktas,http:spam,0,6466,0
2,2023-07-31 07:17:49+00:00,DE,20886.0,bn:t Blatzheim Networks Telecom GmbH,DE,51167.0,Contabo GmbH,http:bruteforce,2,4637,0
3,2023-07-31 07:17:49+00:00,DE,20886.0,bn:t Blatzheim Networks Telecom GmbH,DE,51167.0,Contabo GmbH,http:spam,2,4637,0
4,2023-07-31 07:17:49+00:00,DE,20886.0,bn:t Blatzheim Networks Telecom GmbH,DE,51167.0,Contabo GmbH,http:exploit,2,4637,0


In [18]:
train.isna().sum().sum()

822426

In [19]:
train['label'].value_counts()

0    60594448
1     1035237
Name: label, dtype: int64

In [20]:
train = train.dropna()

In [21]:
train['label'].value_counts()

0    60135084
1     1026672
Name: label, dtype: int64

In [22]:
train = train.drop(columns=['watcher_as_name','attacker_as_name'])

test = test.drop(columns=['watcher_as_name','attacker_as_name'])

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61161756 entries, 0 to 61629684
Data columns (total 9 columns):
 #   Column             Dtype              
---  ------             -----              
 0   attack_time        datetime64[ns, UTC]
 1   watcher_country    category           
 2   watcher_as_num     float32            
 3   attacker_country   category           
 4   attacker_as_num    float32            
 5   attack_type        category           
 6   watcher_uuid_enum  int32              
 7   attacker_ip_enum   int32              
 8   label              int8               
dtypes: category(3), datetime64[ns, UTC](1), float32(2), int32(2), int8(1)
memory usage: 2.2 GB


Las columnas que aportan informaciones equivalentes a otra ya existente fueron descartadas.

## Feature engineering

In [23]:
train['day_of_week'] = train['attack_time'].dt.day_name()
train['hour'] = train['attack_time'].dt.hour

test['day_of_week'] = test['attack_time'].dt.day_name()
test['hour'] = test['attack_time'].dt.hour

In [24]:
train[['service', 'threat_type']] = train['attack_type'].str.split(':', expand=True)

test[['service', 'threat_type']] = test['attack_type'].str.split(':', expand=True)

In [25]:
train['same_country'] = (train['attacker_country'].astype('str') == train['watcher_country'].astype('str')).astype(int)

test['same_country'] = (test['attacker_country'].astype('str') == test['watcher_country'].astype('str')).astype(int)

In [26]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61161756 entries, 0 to 61629684
Data columns (total 14 columns):
 #   Column             Dtype              
---  ------             -----              
 0   attack_time        datetime64[ns, UTC]
 1   watcher_country    category           
 2   watcher_as_num     float32            
 3   attacker_country   category           
 4   attacker_as_num    float32            
 5   attack_type        category           
 6   watcher_uuid_enum  int32              
 7   attacker_ip_enum   int32              
 8   label              int8               
 9   day_of_week        object             
 10  hour               int64              
 11  service            object             
 12  threat_type        object             
 13  same_country       int64              
dtypes: category(3), datetime64[ns, UTC](1), float32(2), int32(2), int64(2), int8(1), object(3)
memory usage: 4.4+ GB


In [27]:
train['hour'] = train['hour'].astype('int8')
train['service'] = train['service'].astype('category')
train['threat_type'] = train['threat_type'].astype('category')
train['same_country'] = train['same_country'].astype('category')

test['hour'] = test['hour'].astype('int8')
test['service'] = test['service'].astype('category')
test['threat_type'] = test['threat_type'].astype('category')
test['same_country'] = test['same_country'].astype('category')

In [28]:
def map_day_to_sin(day):
  list_days = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
  for i in range(7):
    if day == list_days[i]:
      return np.sin(2*np.pi*i/7)
  return day

In [29]:
def map_hour_to_sin(hour):
  return np.sin(2*np.pi*hour/7)

In [30]:
train['sin_day_of_week'] = train['day_of_week'].map(map_day_to_sin)
train['sin_hour'] = train['hour'].map(map_hour_to_sin)

test['sin_day_of_week'] = test['day_of_week'].map(map_day_to_sin)
test['sin_hour'] = test['hour'].map(map_hour_to_sin)

In [31]:
train = train.drop(columns=['day_of_week','hour'])

test = test.drop(columns=['day_of_week','hour'])

In [32]:
train['sin_day_of_week'] = train['sin_day_of_week'].astype('float')

test['sin_day_of_week'] = test['sin_day_of_week'].astype('float')

In [33]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61161756 entries, 0 to 61629684
Data columns (total 14 columns):
 #   Column             Dtype              
---  ------             -----              
 0   attack_time        datetime64[ns, UTC]
 1   watcher_country    category           
 2   watcher_as_num     float32            
 3   attacker_country   category           
 4   attacker_as_num    float32            
 5   attack_type        category           
 6   watcher_uuid_enum  int32              
 7   attacker_ip_enum   int32              
 8   label              int8               
 9   service            category           
 10  threat_type        category           
 11  same_country       category           
 12  sin_day_of_week    float64            
 13  sin_hour           float64            
dtypes: category(6), datetime64[ns, UTC](1), float32(2), float64(2), int32(2), int8(1)
memory usage: 3.2 GB


In [34]:
train = train.drop(columns=['attack_time'])

test = test.drop(columns=['attack_time'])

In [35]:
train.head()

Unnamed: 0,watcher_country,watcher_as_num,attacker_country,attacker_as_num,attack_type,watcher_uuid_enum,attacker_ip_enum,label,service,threat_type,same_country,sin_day_of_week,sin_hour
0,DE,34011.0,TR,47721.0,http:exploit,0,6466,0,http,exploit,0,0.0,-0.0
1,DE,34011.0,TR,47721.0,http:spam,0,6466,0,http,spam,0,0.0,-0.0
2,DE,20886.0,DE,51167.0,http:bruteforce,2,4637,0,http,bruteforce,1,0.0,-0.0
3,DE,20886.0,DE,51167.0,http:spam,2,4637,0,http,spam,1,0.0,-0.0
4,DE,20886.0,DE,51167.0,http:exploit,2,4637,0,http,exploit,1,0.0,-0.0


In [36]:
train['attack_count_per_ip'] = train.groupby('attacker_ip_enum')['attacker_ip_enum'].transform('count')

test['attack_count_per_ip'] = test.groupby('attacker_ip_enum')['attacker_ip_enum'].transform('count')

In [37]:
train_grouped_by_ip_enum = train.groupby('attacker_ip_enum').agg({
    'attack_count_per_ip' : 'first',
    'watcher_country': lambda x: x.mode().iat[0],
    'watcher_as_num': lambda x: x.mode().iat[0],
    'attacker_country': lambda x: x.mode().iat[0],
    'attacker_as_num': lambda x: x.mode().iat[0],
    'attack_type': lambda x: x.mode().iat[0],
    'watcher_uuid_enum': lambda x: x.mode().iat[0],
    'label': lambda x: x.mode().iat[0],
    'sin_day_of_week': 'mean',
    'sin_hour': 'mean',
    'service': lambda x: x.mode().iat[0],
    'threat_type': lambda x: x.mode().iat[0],
    'same_country': lambda x: x.mode().iat[0]
}).reset_index()

test_grouped_by_ip_enum = test.groupby('attacker_ip_enum').agg({
    'attack_count_per_ip' : 'first',
    'watcher_country': lambda x: x.mode().iat[0],
    'watcher_as_num': lambda x: x.mode().iat[0],
    'attacker_country': lambda x: x.mode().iat[0],
    'attacker_as_num': lambda x: x.mode().iat[0],
    'attack_type': lambda x: x.mode().iat[0],
    'watcher_uuid_enum': lambda x: x.mode().iat[0],
    'sin_day_of_week': 'mean',
    'sin_hour': 'mean',
    'service': lambda x: x.mode().iat[0],
    'threat_type': lambda x: x.mode().iat[0],
    'same_country': lambda x: x.mode().iat[0]
}).reset_index()

train_grouped_by_ip_enum.head()

Unnamed: 0,attacker_ip_enum,attack_count_per_ip,watcher_country,watcher_as_num,attacker_country,attacker_as_num,attack_type,watcher_uuid_enum,label,sin_day_of_week,sin_hour,service,threat_type,same_country
0,0,52,US,14061.0,US,14618.0,http:scan,18401,0,-0.4339,0.0816,http,scan,0
1,1,30,US,3303.0,BR,27699.0,http:exploit,8646,0,0.7818,0.1567,http,exploit,0
2,2,58,DE,12897.0,DE,3320.0,http:scan,24879,0,0.089,0.0876,http,scan,1
3,3,22,DE,12897.0,VE,21826.0,http:exploit,24879,0,-0.1423,0.2312,http,exploit,0
4,4,68,US,396982.0,SA,25019.0,http:exploit,1828,0,-0.2439,0.2687,http,exploit,0


In [38]:
train_grouped_by_ip_enum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 147649 entries, 0 to 147648
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   attacker_ip_enum     147649 non-null  int64  
 1   attack_count_per_ip  147649 non-null  int64  
 2   watcher_country      147649 non-null  object 
 3   watcher_as_num       147649 non-null  float32
 4   attacker_country     147649 non-null  object 
 5   attacker_as_num      147649 non-null  float32
 6   attack_type          147649 non-null  object 
 7   watcher_uuid_enum    147649 non-null  int32  
 8   label                147649 non-null  int8   
 9   sin_day_of_week      147649 non-null  float64
 10  sin_hour             147649 non-null  float64
 11  service              147649 non-null  object 
 12  threat_type          147649 non-null  object 
 13  same_country         147649 non-null  int64  
dtypes: float32(2), float64(2), int32(1), int64(3), int8(1), object(5)
me

XGBoost necesita que las variables categóricas sean del dtype 'category'.

In [39]:
train_grouped_by_ip_enum['watcher_country'] = train_grouped_by_ip_enum['watcher_country'].astype('category')
train_grouped_by_ip_enum['watcher_as_num'] = train_grouped_by_ip_enum['watcher_as_num'].astype('category')
train_grouped_by_ip_enum['attacker_country'] = train_grouped_by_ip_enum['attacker_country'].astype('category')
train_grouped_by_ip_enum['attacker_as_num'] = train_grouped_by_ip_enum['attacker_as_num'].astype('category')
train_grouped_by_ip_enum['watcher_uuid_enum'] = train_grouped_by_ip_enum['watcher_uuid_enum'].astype('category')
train_grouped_by_ip_enum['attack_type'] = train_grouped_by_ip_enum['attack_type'].astype('category')
train_grouped_by_ip_enum['service'] = train_grouped_by_ip_enum['service'].astype('category')
train_grouped_by_ip_enum['threat_type'] = train_grouped_by_ip_enum['threat_type'].astype('category')
train_grouped_by_ip_enum['same_country'] = train_grouped_by_ip_enum['same_country'].astype('int8')

test_grouped_by_ip_enum['watcher_country'] = test_grouped_by_ip_enum['watcher_country'].astype('category')
test_grouped_by_ip_enum['watcher_as_num'] = test_grouped_by_ip_enum['watcher_as_num'].astype('category')
test_grouped_by_ip_enum['attacker_country'] = test_grouped_by_ip_enum['attacker_country'].astype('category')
test_grouped_by_ip_enum['attacker_as_num'] = test_grouped_by_ip_enum['attacker_as_num'].astype('category')
test_grouped_by_ip_enum['watcher_uuid_enum'] = test_grouped_by_ip_enum['watcher_uuid_enum'].astype('category')
test_grouped_by_ip_enum['attack_type'] = test_grouped_by_ip_enum['attack_type'].astype('category')
test_grouped_by_ip_enum['service'] = test_grouped_by_ip_enum['service'].astype('category')
test_grouped_by_ip_enum['threat_type'] = test_grouped_by_ip_enum['threat_type'].astype('category')
test_grouped_by_ip_enum['same_country'] = test_grouped_by_ip_enum['same_country'].astype('int8')

## Manejo de shodan_info



In [40]:
shodan_df = pd.read_csv('shodan_df_hashed.csv')

In [41]:
shodan_df.head()

Unnamed: 0,shodan_info,attacker_ip_enum
0,{},5915
1,"{'22/tcp': {'headers_hash': None, 'jarm': None...",3325
2,{},8416
3,{},1213
4,{},9185


In [42]:
len(shodan_df) == len(shodan_df['attacker_ip_enum'].unique())

True

In [43]:
import ast
from collections import Counter

port_counter = Counter()
ip_port_list = {}

for index, row in shodan_df.iterrows():
  shodan_info = ast.literal_eval(row['shodan_info'])
  ip_port_list[int(row['attacker_ip_enum'])] = list(shodan_info)
  port_counter.update(shodan_info.keys())

In [44]:
ip_port_count = {}
for ip in ip_port_list:
  ip_port_count[ip] = len(ip_port_list[ip])

In [45]:
len(port_counter.keys())

2297

In [46]:
port_counter.most_common()[10]

('587/tcp', 2300)

In [47]:
top_port = set()
for port in port_counter.most_common(10):
  top_port.add(port[0])

In [48]:
for ip in ip_port_list:
  for port in ip_port_list[ip]:
    if port not in top_port:
      ip_port_list[ip].remove(port)

In [49]:
train_grouped_by_ip_enum['port_count'] = train_grouped_by_ip_enum['attacker_ip_enum'].map(ip_port_count)
train_grouped_by_ip_enum['port_list'] = train_grouped_by_ip_enum['attacker_ip_enum'].map(ip_port_list)

test_grouped_by_ip_enum['port_count'] = test_grouped_by_ip_enum['attacker_ip_enum'].map(ip_port_count)
test_grouped_by_ip_enum['port_list'] = test_grouped_by_ip_enum['attacker_ip_enum'].map(ip_port_list)

In [50]:
for port in top_port:
  train_grouped_by_ip_enum[port] = train_grouped_by_ip_enum['port_list'].map(lambda port_list: int(port in port_list))
  test_grouped_by_ip_enum[port] = test_grouped_by_ip_enum['port_list'].map(lambda port_list: int(port in port_list))

In [51]:
train_grouped_by_ip_enum = train_grouped_by_ip_enum.drop(columns='port_list')
test_grouped_by_ip_enum = test_grouped_by_ip_enum.drop(columns='port_list')

## Dataset resultante

En resumen, se crearon las siguientes nuevas features:
* Cantidad de ataques por attacker_ip_enum
* Seno de día de semana(siendo 0 lunes, 6 domingo)
* Seno de hora de ataque
* Servicio atacado
* Tipo de amenaza (spam, scam, etc.)
* Si el país atacante y atacado coinciden
* Cantidad de puertos

Este último fue creado teniendo en cuenta que en caso de coincidencia, será más probable que el atacante no esté utilizando un VPN, que generalmente cambia la ubicación.

In [52]:
train_grouped_by_ip_enum.head()

Unnamed: 0,attacker_ip_enum,attack_count_per_ip,watcher_country,watcher_as_num,attacker_country,attacker_as_num,attack_type,watcher_uuid_enum,label,sin_day_of_week,...,2000/tcp,53/udp,995/tcp,993/tcp,443/tcp,465/tcp,53/tcp,21/tcp,22/tcp,80/tcp
0,0,52,US,14061.0,US,14618.0,http:scan,18401,0,-0.4339,...,0,0,0,0,0,0,0,0,0,0
1,1,30,US,3303.0,BR,27699.0,http:exploit,8646,0,0.7818,...,0,0,0,0,0,0,0,0,0,0
2,2,58,DE,12897.0,DE,3320.0,http:scan,24879,0,0.089,...,0,0,0,0,0,0,0,0,0,0
3,3,22,DE,12897.0,VE,21826.0,http:exploit,24879,0,-0.1423,...,0,0,0,0,0,0,0,0,0,0
4,4,68,US,396982.0,SA,25019.0,http:exploit,1828,0,-0.2439,...,0,0,0,0,0,0,0,0,0,0


## Split

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
y = train_grouped_by_ip_enum['label']
X = train_grouped_by_ip_enum.drop(columns=['label'])

In [55]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=12)

In [56]:
def common_member(a, b):
    a_set = set(a)
    b_set = set(b)

    if (a_set & b_set):
        print(a_set & b_set)
    else:
        print("No common elements")

common_member(X_train['attacker_ip_enum'],X_valid['attacker_ip_enum'])

No common elements


In [57]:
X_train = X_train.set_index('attacker_ip_enum')
X_valid = X_valid.set_index('attacker_ip_enum')
X_test = test_grouped_by_ip_enum.set_index('attacker_ip_enum')

In [462]:
X_train.head()

Unnamed: 0_level_0,attack_count_per_ip,watcher_country,watcher_as_num,attacker_country,attacker_as_num,attack_type,watcher_uuid_enum,sin_day_of_week,sin_hour,service,...,2000/tcp,53/udp,995/tcp,993/tcp,443/tcp,465/tcp,53/tcp,21/tcp,22/tcp,80/tcp
attacker_ip_enum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
164114,34,AT,8387.0,ID,7713.0,http:exploit,165,-0.9749,-0.2246,http,...,0,0,0,0,0,0,0,0,0,0
86950,159,FR,16276.0,LT,47583.0,ssh:bruteforce,297,-0.7095,-0.1465,ssh,...,0,0,0,0,0,0,0,0,0,0
132929,60,US,20847.0,ID,7713.0,http:exploit,398,0.4204,0.2399,http,...,0,0,0,0,0,0,0,0,0,0
165613,30,US,14061.0,BR,8167.0,http:exploit,2597,0.3494,0.4337,http,...,0,0,0,0,0,0,0,0,0,0
63880,22,JP,131965.0,JP,4721.0,http:scan,846,0.724,-0.4344,http,...,0,0,0,0,0,0,0,0,0,0


In [461]:
X_test.head()

Unnamed: 0_level_0,attack_count_per_ip,watcher_country,watcher_as_num,attacker_country,attacker_as_num,attack_type,watcher_uuid_enum,sin_day_of_week,sin_hour,service,...,2000/tcp,53/udp,995/tcp,993/tcp,443/tcp,465/tcp,53/tcp,21/tcp,22/tcp,80/tcp
attacker_ip_enum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,165,US,396982.0,KH,38235.0,http:spam,24724,0.1401,0.0959,http,...,1,0,0,0,0,0,0,0,0,1
7,200,LU,53667.0,CA,16276.0,ssh:bruteforce,259,-0.7365,-0.1462,ssh,...,0,0,0,0,1,0,0,0,1,1
21,21,DE,12897.0,DE,3209.0,http:scan,24879,0.4339,0.7818,http,...,0,0,0,0,0,0,0,0,0,0
29,16,US,0.0,IN,132996.0,http:exploit,18513,-0.0483,0.4398,http,...,0,0,0,0,0,0,0,0,0,0
33,6,GB,14061.0,CZ,43037.0,http:scan,6570,0.0,-0.8462,http,...,0,0,0,0,0,0,0,0,0,0


# Modelo


## Modelo con parámetros default

In [58]:
! pip install xgboost



In [59]:
from xgboost import XGBClassifier

In [61]:
model_default = XGBClassifier(enable_categorical=True,
                              objective='binary:hinge',
                              seed=12
                              )


In [62]:
model_default.fit(X_train, y_train)

In [63]:
from sklearn.metrics import f1_score

y_pred = model_default.predict(X_valid)

f1_score(y_valid, y_pred)

0.7103448275862069

In [64]:
y_pred_train_default = model_default.predict(X_train)

f1_score(y_train, y_pred_train_default)

0.9794327776575017

In [437]:
model_default.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=True, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='binary:hinge', ...)>

## Modelo con parámetros encontrados mediante Random Search

In [438]:
param_dist = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [None, 3, 4, 5],
    'gamma': [0, 0.1],
    'reg_alpha': [0, 0.1],
    'reg_lambda': [0, 0.1],
    'enable_categorical':[True],
    'objective':['binary:hinge']
}

In [439]:
xgb_classifier = XGBClassifier()

In [453]:
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(
    estimator=xgb_classifier,
    param_distributions=param_dist,
    scoring='f1',
    n_iter=80,
    cv=3,
    random_state=12,
    n_jobs=-1
)

In [454]:
random_search.fit(X_train, y_train)

In [455]:
best_model = random_search.best_estimator_
best_model

In [456]:
best_model.get_params

<bound method XGBModel.get_params of XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=True, eval_metric=None, feature_types=None,
              gamma=0.1, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.3, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=4, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, objective='binary:hinge', ...)>

In [457]:
y_pred = best_model.predict(X_valid)

f1_score(y_valid, y_pred)

0.7595628415300546

In [458]:
y_pred_train = best_model.predict(X_train)

f1_score(y_train, y_pred_train)

0.848112778535698

## Modelo con parámetros ingresados manualmente

In [178]:
y_train.value_counts()

0    115834
1      2285
Name: label, dtype: int64

In [220]:
random_search.best_params_

{'subsample': 1.0,
 'scale_pos_weight': 3,
 'reg_lambda': 0.5,
 'reg_alpha': 1,
 'objective': 'binary:hinge',
 'n_estimators': 400,
 'min_child_weight': 1,
 'max_depth': 3,
 'learning_rate': 0.2,
 'gamma': 0.1,
 'enable_categorical': True,
 'colsample_bytree': 1.0}

In [445]:
model_manual = XGBClassifier(objective='binary:hinge',
                             reg_lambda=0.5,
                             reg_alpha=1.5,
                             max_depth=2,
                             learning_rate=0.2,
                             gamma=0.1,
                             enable_categorical=True,
                             seed=15)

In [446]:
model_manual.fit(X_train, y_train)

In [447]:
y_pred_manual = model_manual.predict(X_valid)

f1_score(y_valid, y_pred_manual)

0.7392497712717292

In [449]:
y_pred_train_manual = model_manual.predict(X_train)

f1_score(y_train, y_pred_train_manual)

0.7566423700917