In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

!pip install category_encoders
from category_encoders.target_encoder import TargetEncoder
import category_encoders as ce



In [3]:
import gc

# Creación features, split de train/validation y encodeado

In [4]:
df = pd.read_parquet('/content/drive/MyDrive/Organización de Datos/TP2/dataset_v2/train.parq', columns=['attack_time', 'watcher_country', 'attacker_country', 'attacker_as_name', 'attack_type',  'watcher_uuid_enum','attacker_ip_enum', 'label' ])

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df['watcher_country'] = df.watcher_country.cat.add_categories('NN').fillna('NN')
df['attacker_as_name'] = df.attacker_as_name.cat.add_categories('NN').fillna('NN')
df['attacker_country'] = df.attacker_country.cat.add_categories('NN').fillna('NN')

In [7]:
df['watcher_uuid_enum'] = df.watcher_uuid_enum.astype('category')

In [8]:
df.rename(columns={'attacker_as_name':'attacker', 'watcher_uuid_enum':'watcher_id'}, inplace=True)

In [9]:
def obtener_rangos_horarios(hora):
    if 0 <= hora < 4:
        return '0-4'
    if 4 <= hora < 8:
        return '4-8'
    if 8 <= hora < 12:
        return '8-12'
    if 12 <= hora < 16:
        return '12-16'
    if 16 <= hora < 20:
        return '16-20'
    else:
        return '20-24'

In [10]:
df['hora'] = df['attack_time'].dt.hour
df['day'] = df['attack_time'].dt.day_name().astype('category')
df['hour_range'] = df['hora'].map(lambda x: obtener_rangos_horarios(x))
df.drop(['hora'], axis=1,  inplace=True)
df.drop(['attack_time'], axis=1, inplace=True)

In [11]:
df['attack_protocol'] = df['attack_type'].map(lambda x: x.split(':')[0]).astype('category')
df['attack_type'] = df['attack_type'].map(lambda x: x.split(':')[1]).astype('category')

In [12]:
df['attack_countries'] = df['attacker_country'].astype(str) + '-' + df['watcher_country'].astype(str)
df['attack_countries'] = df['attack_countries'].astype('category')

df['country_hour_range'] = df['hour_range'] + ' - ' + df['watcher_country'].astype(str)
df['country_hour_range'] = df['country_hour_range'].astype('category')

In [13]:
df.drop(['attacker_country'], axis=1, inplace=True)
df.drop(['watcher_country'], axis=1, inplace=True)
df.drop(['hour_range'], axis=1, inplace=True)

In [14]:
attacker_ips = df['attacker_ip_enum'].unique()

In [15]:
ips_train, ips_validation = train_test_split(attacker_ips, test_size=0.1, random_state=10)

In [16]:
train = df[df['attacker_ip_enum'].isin(ips_train)]
validation = df[df['attacker_ip_enum'].isin(ips_validation)]

In [17]:
train = train.drop(['attacker_ip_enum'], axis=1)
validation = validation.drop(['attacker_ip_enum'], axis=1)

In [18]:
train.drop_duplicates(inplace=True) #se generan repetidas al dividir time en dia/rango_horario que para entrenar no usaré

In [19]:
validation = validation.reset_index(drop=True)

In [20]:
train = train.reset_index(drop=True)

In [21]:
X_train = train.drop(['label'], axis=1)
y_train = train[['label']]

X_validation = validation.drop(['label'], axis=1)
y_validation = validation[['label']]

In [22]:
del df
del train
del validation
del attacker_ips
del ips_train
del ips_validation

In [23]:
gc.collect()

0

In [24]:
me_attack_countries = TargetEncoder()

X_train['attack_countries_mean'] = me_attack_countries.fit_transform(X_train['attack_countries'], y_train)
X_validation['attack_countries_mean'] = me_attack_countries.transform(X_validation['attack_countries'])

X_train = X_train.drop('attack_countries', axis=1)
X_validation = X_validation.drop('attack_countries', axis=1)

In [25]:
me_attacker = TargetEncoder()

X_train['attacker_mean'] = me_attacker.fit_transform(X_train['attacker'], y_train)
X_validation['attacker_mean'] = me_attacker.transform(X_validation['attacker'])

X_train = X_train.drop('attacker', axis=1)
X_validation = X_validation.drop('attacker', axis=1)

In [26]:
me_watcher_id = TargetEncoder()

X_train['watcher_id_mean'] = me_watcher_id.fit_transform(X_train['watcher_id'], y_train)
X_validation['watcher_id_mean'] = me_watcher_id.transform(X_validation['watcher_id'])

X_train = X_train.drop('watcher_id', axis=1)
X_validation = X_validation.drop('watcher_id', axis=1)

In [27]:
me_attack_protocol = TargetEncoder()

X_train['attack_protocol_mean'] = me_attack_protocol.fit_transform(X_train['attack_protocol'], y_train)
X_validation['attack_protocol_mean'] = me_attack_protocol.transform(X_validation['attack_protocol'])

X_train = X_train.drop('attack_protocol', axis=1)
X_validation = X_validation.drop('attack_protocol', axis=1)

In [28]:
me_country_hour_range = TargetEncoder()

X_train['country_hour_range_mean'] = me_country_hour_range.fit_transform(X_train['country_hour_range'], y_train)
X_validation['country_hour_range_mean'] = me_country_hour_range.transform(X_validation['country_hour_range'])

X_train = X_train.drop('country_hour_range', axis=1)
X_validation = X_validation.drop('country_hour_range', axis=1)

In [29]:
me_day = TargetEncoder()

X_train['day_mean'] = me_day.fit_transform(X_train['day'], y_train)
X_validation['day_mean'] = me_day.transform(X_validation['day'])

X_train = X_train.drop('day', axis=1)
X_validation = X_validation.drop('day', axis=1)

In [30]:
ohe_attack_type = OneHotEncoder(drop='first')

encoded_attack_type_train = ohe_attack_type.fit_transform(X_train[['attack_type']]).todense().astype(np.uint8)
nuevas_col_train = pd.DataFrame(encoded_attack_type_train).add_prefix('attack_type_')
del encoded_attack_type_train
X_train = X_train.join(nuevas_col_train)
X_train = X_train.drop('attack_type', axis=1)
del nuevas_col_train


encoded_attack_type_validation = ohe_attack_type.transform(X_validation[['attack_type']]).todense().astype(np.uint8)
nuevas_col_validation = pd.DataFrame(encoded_attack_type_validation).add_prefix('attack_type_')
del encoded_attack_type_validation
X_validation = X_validation.join(nuevas_col_validation)
X_validation = X_validation.drop('attack_type', axis=1)
del nuevas_col_validation

# Decision Tree

In [33]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier

In [34]:
from sklearn.metrics import f1_score

In [39]:
tree = DecisionTreeClassifier(random_state=9)

In [40]:
params = [
    {'max_features': ['sqrt', 'log2', None]},
    {'min_samples_split': [2, 10, 50, 100, 200]},
    {'max_depth': [3,6,9]},
]

In [41]:
rs = RandomizedSearchCV(tree, param_distributions=params, n_iter=10, random_state=10, scoring='f1')

In [42]:
rs.fit(X_train, np.ravel(y_train))

In [43]:
rs.best_params_

{'max_depth': 3}

In [44]:
model = rs.best_estimator_

In [45]:
model.fit(X_train, np.ravel(y_train))

In [46]:
y_pred = model.predict(X_train)

In [47]:
f1_score(y_train, y_pred)

0.7063718567337491

In [48]:
y_pred = model.predict(X_validation)

In [49]:
f1_score(y_validation, y_pred)

0.6166062287933026

In [65]:
X_train.to_pickle("X_train.pkl")
X_validation.to_pickle("X_validation.pkl")
y_train.to_pickle("y_train.pkl")
y_validation.to_pickle("y_validation.pkl")

#unpickled_df = pd.read_pickle("aa.pkl")

In [66]:
del  X_train
del  X_validation
del  y_train
del  y_validation

# Test

In [67]:
test = pd.read_parquet('/content/drive/MyDrive/Organización de Datos/TP2/dataset_v2/test.parq', columns=['attack_time', 'watcher_country', 'attacker_country', 'attacker_as_name', 'attack_type',  'watcher_uuid_enum','attacker_ip_enum'])

In [68]:
test = test.drop_duplicates(subset='attacker_ip_enum').reset_index(drop=True)

In [69]:
ips_test = test[['attacker_ip_enum']]
test = test.drop(['attacker_ip_enum'], axis=1)

In [70]:
test['watcher_country'] = test.watcher_country.cat.add_categories('NN').fillna('NN')
test['attacker_as_name'] = test.attacker_as_name.cat.add_categories('NN').fillna('NN')
test['attacker_country'] = test.attacker_country.cat.add_categories('NN').fillna('NN')

In [71]:
test['watcher_uuid_enum'] = test.watcher_uuid_enum.astype('category')

In [72]:
test.rename(columns={'attacker_as_name':'attacker', 'watcher_uuid_enum':'watcher_id'}, inplace=True)

In [73]:
test['hora'] = test['attack_time'].dt.hour
test['day'] = test['attack_time'].dt.day_name().astype('category')
test['hour_range'] = test['hora'].map(lambda x: obtener_rangos_horarios(x))
test.drop(['hora'], axis=1,  inplace=True)
test.drop(['attack_time'], axis=1, inplace=True)

In [74]:
test['attack_protocol'] = test['attack_type'].map(lambda x: x.split(':')[0]).astype('category')
test['attack_type'] = test['attack_type'].map(lambda x: x.split(':')[1]).astype('category')

In [75]:
test['attack_countries'] = test['attacker_country'].astype(str) + '-' + test['watcher_country'].astype(str)
test['attack_countries'] = test['attack_countries'].astype('category')

test['country_hour_range'] = test['hour_range'] + ' - ' + test['watcher_country'].astype(str)
test['country_hour_range'] = test['country_hour_range'].astype('category')

In [76]:
test.drop(['attacker_country'], axis=1, inplace=True)
test.drop(['watcher_country'], axis=1, inplace=True)
test.drop(['hour_range'], axis=1, inplace=True)

In [77]:
test['attack_countries_mean'] = me_attack_countries.transform(test['attack_countries'])
test = test.drop('attack_countries', axis=1)

test['attacker_mean'] = me_attacker.transform(test['attacker'])
test = test.drop('attacker', axis=1)

test['watcher_id_mean'] = me_watcher_id.transform(test['watcher_id'])
test = test.drop('watcher_id', axis=1)

test['attack_protocol_mean'] = me_attack_protocol.transform(test['attack_protocol'])
test = test.drop('attack_protocol', axis=1)

test['country_hour_range_mean'] = me_country_hour_range.transform(test['country_hour_range'])
test = test.drop('country_hour_range', axis=1)

test['day_mean'] = me_day.transform(test['day'])
test = test.drop('day', axis=1)

encoded_attack_type_test = ohe_attack_type.transform(test[['attack_type']]).todense().astype(np.uint8)
nuevas_col_test = pd.DataFrame(encoded_attack_type_test).add_prefix('attack_type_')
del encoded_attack_type_test
test = test.join(nuevas_col_test)
test = test.drop('attack_type', axis=1)
del nuevas_col_test

In [78]:
y_pred = model.predict(test)

In [79]:
pred_df = pd.concat([ips_test, pd.DataFrame(y_pred, columns = ['label'])], axis=1)

In [80]:
pred_df.to_csv('/content/drive/MyDrive/Organización de Datos/TP2/predDecTree.csv', index=False)

In [81]:
pred_df

Unnamed: 0,attacker_ip_enum,label
0,7696,0
1,7543,0
2,7280,0
3,1538,0
4,285,0
...,...,...
49415,196304,0
49416,199918,0
49417,192446,0
49418,192056,0


0.5098 en Kaggle