# Cyber Security Attacks Model

The model is supposed to predict a cyber attack type based on user input.

Questions:
- What user input? Which fields can they input? Presumably all fields that are going to be used in final dataset.

In [2]:
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

## Data Loading

In [4]:
# Load the raw attack records from a path relative to the notebook, like the other
# input files, instead of an absolute path that only exists on one machine.
# NOTE(review): assumes cybersecurity_attacks.csv is stored in ../data with the other inputs.
df = pd.read_csv("../data/cybersecurity_attacks.csv")
df.head(3)

Unnamed: 0,Timestamp,Source IP Address,Destination IP Address,Source Port,Destination Port,Protocol,Packet Length,Packet Type,Traffic Type,Payload Data,Malware Indicators,Anomaly Scores,Alerts/Warnings,Attack Type,Attack Signature,Action Taken,Severity Level,User Information,Device Information,Network Segment,Geo-location Data,Proxy Information,Firewall Logs,IDS/IPS Alerts,Log Source
0,2023-05-30 06:33:58,103.216.15.12,84.9.164.252,31225,17616,ICMP,503,Data,HTTP,Qui natus odio asperiores nam. Optio nobis ius...,IoC Detected,28.67,,Malware,Known Pattern B,Logged,Low,Reyansh Dugal,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment A,"Jamshedpur, Sikkim",150.9.97.135,Log Data,,Server
1,2020-08-26 07:08:30,78.199.217.198,66.191.137.154,17245,48166,ICMP,1174,Data,HTTP,Aperiam quos modi officiis veritatis rem. Omni...,IoC Detected,51.5,,Malware,Known Pattern A,Blocked,Low,Sumer Rana,Mozilla/5.0 (compatible; MSIE 8.0; Windows NT ...,Segment B,"Bilaspur, Nagaland",,Log Data,,Firewall
2,2022-11-13 08:23:25,63.79.210.48,198.219.82.17,16811,53600,UDP,306,Control,HTTP,Perferendis sapiente vitae soluta. Hic delectu...,IoC Detected,87.42,Alert Triggered,DDoS,Known Pattern B,Ignored,Low,Himmat Karpe,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...,Segment C,"Bokaro, Rajasthan",114.133.48.179,Log Data,Alert Data,Firewall


In [5]:
# Normalise the column names: lower-case, "/" becomes "_and_", spaces become "_".
df.columns = (
    df.columns
    .str.lower()
    .str.replace('/', "_and_")
    .str.replace(' ', '_')
)
df.columns

Index(['timestamp', 'source_ip_address', 'destination_ip_address',
       'source_port', 'destination_port', 'protocol', 'packet_length',
       'packet_type', 'traffic_type', 'payload_data', 'malware_indicators',
       'attack_signature', 'action_taken', 'severity_level',
       'user_information', 'device_information', 'network_segment',
       'geo-location_data', 'proxy_information', 'firewall_logs',
       'ids_and_ips_alerts', 'log_source'],
      dtype='object')

In [None]:
# Source: https://db-ip.com/db/download/ip-to-country-lite
# Only an empty field is treated as missing. With pandas' default NA parsing the
# ISO country code "NA" (Namibia) is read as NaN and would be dropped below.
df_geo = pd.read_csv(
    "../data/dbip-country-lite-2025-03.csv",
    names=['start_ip', 'end_ip', 'country_code'],
    keep_default_na=False,
    na_values=[''],
)
df_geo = df_geo[df_geo['country_code'].notna()]
# removes ipv6 addresses; copy() because new columns are added to this frame later
df_geo = df_geo[~df_geo['start_ip'].str.contains(':')].copy()
df_geo.head(3)

In [None]:
# IP range -> ASN table. Columns are named here because the file is read without a header row.
df_asn = pd.read_csv("../data/dbip-asn-lite-2025-03.csv", names=['start_ip', 'end_ip', 'asn_id', 'asn_desc'])
# removes ipv6 addresses; copy() because new columns are added to this frame later,
# which would otherwise raise SettingWithCopyWarning on the filtered slice
df_asn = df_asn[~df_asn['start_ip'].str.contains(':')].copy()
df_asn.head(3)

In [None]:
# Flatten the header-less port table into one list of port values.
df_ports = pd.read_csv("../data/top-30000-most-popular-tcp-ports-nmap-sorted.csv", header=None)
df_ports = pd.melt(df_ports)
l_ports = df_ports['value'].tolist()
# Preview only: echoing the complete list would flood the cell output.
l_ports[:10]

## Exploration

### Data Extraction

Before exploring the data entirely, there are 4 columns that extra data can be extracted from:
- timestamp
- source_ip_address
- destination_ip_address
- device_information

#### Timestamp

In [None]:
# Parse the timestamp column once and derive every calendar feature from that result
# (previously the same column was re-parsed for each feature).
timestamps = pd.to_datetime(df['timestamp'])
df['hour'] = timestamps.dt.hour
df['day_of_week'] = timestamps.dt.dayofweek
df['month'] = timestamps.dt.month
df['quarter'] = timestamps.dt.quarter
df['year'] = timestamps.dt.year
df.head(3)

#### IP Addresses

The 2 IP address columns can be used to extract more valuable data. According to https://ipinfo.io/blog/ip-address-information, we can get information like location, ISP and network info: the ASN and its type (an ASN identifies a block of IPs owned by an organisation), hostname, number of domains on the IP, and privacy detection (whether the traffic comes from a VPN or proxy).

Most of the data is behind a paywall except for the geolocation data. Although, data like ASN and IP addresses known for attacks could be useful.

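In our case, the IP addresses are compared against a downloaded database with the help of the `ipaddress` module, which is part of the Python 3 standard library (https://pypi.org/project/ipaddress/ is its backport for older Python versions).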

In [None]:
import ipaddress

def ip_to_int(ip_address):
    """Convert a dotted IPv4 string to its integer value.

    Every octet goes through ``int`` first, so zero-padded octets such as
    ``"001"`` are normalised before the address is parsed.
    """
    octets = [str(int(part)) for part in ip_address.split('.')]
    normalised = '.'.join(octets)
    return int(ipaddress.IPv4Address(normalised))

def is_private(ip_address):
    """Return True when `ip_address` lies in a private address range."""
    parsed = ipaddress.ip_address(ip_address)
    return parsed.is_private

# Integer versions of the range bounds of both lookup tables.
for col in ['start_ip', 'end_ip']:
    df_geo[f"{col}_int"] = df_geo[col].apply(ip_to_int)
    df_asn[f"{col}_int"] = df_asn[col].apply(ip_to_int)

# Integer value and private-range flag for the addresses in the attack data.
for col in ['source_ip_address', 'destination_ip_address']:
    df[f"{col}_int"] = df[col].apply(ip_to_int)
    df[f"{col}_is_private"] = df[col].apply(is_private)

# Both tables go through the same positional range lookup, so both are ordered by
# range start (previously only df_geo was sorted and df_asn was left in file order).
df_geo = df_geo.sort_values('start_ip_int').reset_index(drop=True)
df_asn = df_asn.sort_values('start_ip_int').reset_index(drop=True)

In [None]:
# Rows whose source / destination address is private, and how many there are of each.
source_private_ip = df.loc[df['source_ip_address_is_private'].eq(True)]
dest_private_ip = df.loc[df['destination_ip_address_is_private'].eq(True)]

print(source_private_ip['source_ip_address_is_private'].value_counts())
print(dest_private_ip['destination_ip_address_is_private'].value_counts())

In [None]:
def add_ip_info(info_df, df_info_col, start_ips, end_ips, new_col_suffix):
    """Add a looked-up value for the source and destination IPs to the global `df`.

    For each of 'source' and 'destination', the integer addresses in
    `df['{loc}_ip_address_int']` are matched against the ranges
    [start_ips[i], end_ips[i]] and the value `info_df[df_info_col]` of the
    matched range is written to `df['{loc}_{new_col_suffix}']`. Addresses that
    fall outside the selected range get NaN.

    Parameters:
        info_df: table holding the value to look up; indexed by position, in
            the same order as `start_ips` / `end_ips`.
        df_info_col: name of the column of `info_df` to copy from.
        start_ips, end_ips: arrays with the integer bounds of each range.
            NOTE(review): the lookup appears to assume the ranges are ordered
            by start address - confirm for every table passed in.
        new_col_suffix: suffix of the columns created on `df`.
    """

    # Running maximum of the range ends; non-decreasing, so it can be binary-searched.
    cummax_ends = np.maximum.accumulate(end_ips)
    # cummax_indices[i] is the position of the range that produced cummax_ends[i].
    cummax_indices = np.empty_like(cummax_ends, dtype=np.int64)
    cummax_indices[0] = 0

    for i in range(1, len(end_ips)):
        if end_ips[i] > cummax_ends[i-1]:
            cummax_indices[i] = i
        else:
            cummax_indices[i] = cummax_indices[i-1]

    for loc in ['source','destination']:

        # For every address, find the first position whose running-maximum end is >= the address.
        # This yields a candidate range for every row, even when the address is not inside it.
        indeces = np.searchsorted(
            cummax_ends,
            df[f'{loc}_ip_address_int'].values,
            side='left',
        )
        # Addresses above every range end would index past the array; point them at the last range.
        indeces[indeces == len(end_ips)] = len(end_ips) - 1
        ai = cummax_indices[indeces]
        # Keep a candidate only when the address really lies between its start and end (inclusive).
        mask = (df[f'{loc}_ip_address_int'].values >= start_ips[ai]) & (df[f'{loc}_ip_address_int'].values <= end_ips[ai])

        # Rows without a containing range get NaN.
        matched_values = np.where(mask, info_df[df_info_col].values[ai], np.nan)
        df[f'{loc}_{new_col_suffix}'] = matched_values

In [None]:
# Country code per source / destination address.
geo_start_ips = df_geo['start_ip_int'].to_numpy()
geo_end_ips = df_geo['end_ip_int'].to_numpy()
add_ip_info(
    info_df=df_geo,
    df_info_col='country_code',
    start_ips=geo_start_ips,
    end_ips=geo_end_ips,
    new_col_suffix='country',
)

# ASN id per source / destination address.
asn_start_ips = df_asn['start_ip_int'].to_numpy()
asn_end_ips = df_asn['end_ip_int'].to_numpy()
add_ip_info(
    info_df=df_asn,
    df_info_col='asn_id',
    start_ips=asn_start_ips,
    end_ips=asn_end_ips,
    new_col_suffix='asn_id',
)

df.head(3)

#### Device Information

The values in this column hold information in the form of user agents. We can extract info like browser, operating system, device model, etc.

Something to keep in mind is that this type of data can easily be faked, but it can still point towards an anomaly.

There is a Python package that can parse this data: https://pypi.org/project/user-agents/

Initially, the data included the version for each type, which made the values far too specific (too many distinct values per column):
- Browser yielded 5490 distinct values.
- OS yielded 174 distinct values. While more manageable, the versions seemed to be evenly spread within each OS family.
- Device yielded only 8 device types - most were Apple based, but the type "Other None None" made up more than half of the dataset, i.e. > 20000 rows.

This can skew the dataset, but it might be useful to group the Apple devices and use "Other None None" as another category.

To regenerate the data as described above, you can use the following code:
```
df = df.assign(**{"Browser": df["Device Information"].apply(lambda x : parse(x).browser.family + " " + parse(x).browser.version_string)})
df = df.assign(**{"OS": df["Device Information"].apply(lambda x : parse(x).os.family + " " + parse(x).os.version_string)})
df = df.assign(**{'Device': df['Device Information'].apply(lambda x : str(parse(x).device.family) + " " + str(parse(x).device.brand) + " " + str(parse(x).device.model))})
```

In [None]:
from user_agents import parse

# Parse every user-agent string once and reuse the parsed object for both features
# (previously each string was parsed twice, once per derived column).
parsed_agents = [parse(user_agent) for user_agent in df['device_information']]
df = df.assign(
    browser=[agent.browser.family for agent in parsed_agents],
    os=[agent.os.family for agent in parsed_agents],
)

df.head(3)

### Raw Data

In [None]:
# Columns available at this point, including the ones derived during data extraction.
df.columns

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Flag fully duplicated rows and display only the flagged ones (empty output = no duplicates).
dups = df.duplicated()
dups.loc[dups]

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# reusable function
def show_nulls():
    """Print the null count of every column of the global `df` that has at least one null."""
    nulls_per_column = df.isna().sum()
    has_nulls = nulls_per_column > 0
    print(nulls_per_column[has_nulls])

show_nulls()

In [None]:
# Dtypes and non-null counts of every column.
df.info()

## Initial Data Cleaning

Columns that immediately stand out as unnecessary are the ones that were used to extract data:
- timestamp
- source_ip_address
- destination_ip_address
- device_information

Then the columns that aren't as useful, especially those with a high value count that are hard to classify such as:
- user_information

In [None]:
# Raw columns whose information has already been extracted, plus user_information.
columns_to_drop = [
    'timestamp',
    'source_ip_address',
    'source_ip_address_int',
    'destination_ip_address',
    'destination_ip_address_int',
    'device_information',
    'user_information',
]
df = df.drop(columns=columns_to_drop)

## Analysis (almost)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

From the raw stats shown previously, we can determine the following:
- No duplicated rows.

**Need Clarity**

- From whose perspective was the data recorded? The victim's?
- Is timestamp using UTC time or some other uniform time zone?
- Is the action_taken column showing actions done by the system after the attack happened and before human intervention?

**Further Analysis**

- Are countries relevant? Because we don't know if the data collection is concentrated in a certain area. Model results can be skewed.
- Ports above a certain value can be used for anything unlike the ports below that threshold. Can we assume that the ports selected are randomly assigned by the attacker?
- Packet Type has some relation to Protocol. Do some more checks to make sure.
- Checks on Packet Length.
- Checks on payload_data to see if there are any patterns within them. It looks like irrelevant latin text - possibly auto-generated.
- Check which values from some other column are paired with Malware Indicators.
- What is the Anomaly Score? Not sure which part of the process it is in. How does it relate to Attack Signatures?
- Can we group the different types of alerts? Or do they specifically indicate which type of attack there is. Or can we group them into a single column where the value will be true once 1 of the alert columns has a True value?

### Device Information

Upon further analysis:
- another column (bool) can be created to flag whether the device is mobile or not.
  - before the next step, look for discrepancies between os and browser type since we know that the data can be faked.
- the browser column can be simplified by grouping the browsers together like "Chrome" and "Chrome Mobile iOS" as one family.

In [None]:
# Frequency of each operating system family.
df['os'].value_counts()

In [None]:
# Frequency of each browser family.
df['browser'].value_counts()

In [None]:
# Distinct browser names found in the data.
browsers = df['browser'].unique()

def is_mobile_browser(browser_name):
    """Return True when `browser_name` contains at least one whitespace character.

    Multi-word names are treated as mobile browser variants.
    """
    for char in browser_name:
        if char.isspace():
            return True
    return False

# Browser names that are not flagged as mobile variants act as the family names.
browser_families = list(filter(lambda name: not is_mobile_browser(name), browsers))

def is_mobile_os(os_name):
    """Return True when `os_name` is exactly one of the mobile OS families.

    The earlier version tested whether `os_name` was a substring of a mobile
    OS name, so values such as "" or "OS" were wrongly reported as mobile.
    """
    mobile_os = ("iOS", "Android")
    return os_name in mobile_os

def get_browser_family(browser_name):
    """Map a browser name to its family.

    Names not flagged by `is_mobile_browser` are returned unchanged. Flagged
    names are mapped to the first entry of `browser_families` contained in the
    name. When no family matches, the original name is returned (the earlier
    version fell through and returned None, leaving a missing value in the column).
    """
    if not is_mobile_browser(browser_name):
        return browser_name

    for family in browser_families:
        if family in browser_name:
            return family

    # No known family is part of this name: keep the name rather than produce None.
    return browser_name

In [None]:
# OS-based and browser-name-based mobile flags.
df = df.assign(
    is_mobile=df['os'].apply(is_mobile_os),
    is_mobile_browser=df['browser'].apply(is_mobile_browser),
)

In [None]:
# checking all browsers linked to mobile devices
df_mobile = df.loc[df['is_mobile'].eq(True)]
df_mobile['browser'].value_counts()

In [None]:
# Counts for every combination of the OS-based and the browser-based mobile flag.
contingency_table = pd.crosstab(df['is_mobile'], df['is_mobile_browser'])

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(contingency_table, annot=True, cmap='Blues', fmt='g', cbar_kws={'label':"Count"}, ax=ax)
ax.set_title("Heatmap of Mobile and Browser Combinations")
ax.set_xlabel("Is Mobile Browser")
ax.set_ylabel("Is Mobile")
ax.set_xticks([0.5, 1.5])
ax.set_xticklabels(["No", "Yes"], rotation=0)
ax.set_yticks([0.5, 1.5])
ax.set_yticklabels(["No", "Yes"], rotation=0)
plt.show()

In [None]:
# this is done here because this change would affect the results of the cell checking for browsers linked to mobile
# the goal of the previous cell is to see if there are any non-mobile browsers
df['browser'] = df['browser'].map(get_browser_family)
df.head(3)

### Categorical Columns

In [None]:
def get_categorical_vals():
    """Print the distinct values of every column of the global `df` with fewer than 10 unique values.

    For columns with a single unique value, nulls are stripped from the printed
    values and a hint that the column may work as a boolean is appended.
    """
    unique_counts = df.nunique()
    low_cardinality = unique_counts[unique_counts < 10]

    for col_name, val_count in low_cardinality.items():
        col_unique_vals = df[col_name].unique()
        msg = ""

        if val_count == 1:
            col_unique_vals = [value for value in col_unique_vals if pd.notnull(value)]
            msg = "Removed null value. Possible boolean?"

        print(f"{col_name}, Values: {col_unique_vals} {msg}")

get_categorical_vals()

From the data generated above, we can see that some columns have few unique values. These values can indicate categories and therefore, they can be encoded making it easier for the algorithms to understand.
Both ordinal and nominal encoding should be considered.

Possible fields for ordinal encoding: severity_level

In addition to that, there are columns that contain only 1 unique value; usually that single value appears in some rows and the others are null. Those columns can possibly be used as booleans. proxy_information is another good contender for a boolean since we won't need the exact values but rather whether a proxy was detected.

In [None]:
# New boolean column -> raw column whose presence (non-null) it records.
presence_flags = {
    'has_proxy': 'proxy_information',
    'has_malware_indicator': 'malware_indicators',
    'has_alerts_and_warnings': 'alerts_and_warnings',
    'has_firewall_log': 'firewall_logs',
    'has_ids_ips_alert': 'ids_and_ips_alerts',
}
for flag_col, raw_col in presence_flags.items():
    df[flag_col] = df[raw_col].notna()

# drop original
columns_to_drop = ['malware_indicators', 'alerts_and_warnings', 'proxy_information', 'firewall_logs', 'ids_and_ips_alerts']
df = df.drop(columns=columns_to_drop)

show_nulls()
df.head(3)

We can see that 4 out of the 5 new bools are related to alerts, so a new column can be created to combine them - if at least 1 is true, then the new column, has_system_alert, will be true.
Additionally, we can store a count of all alerts in a new column, alert_count.

In [None]:
def has_system_alert(row):
    """Return True when at least one of the four alert flags of `row` is truthy."""
    alert_flags = (
        'has_malware_indicator',
        'has_alerts_and_warnings',
        'has_firewall_log',
        'has_ids_ips_alert',
    )
    return any(row[flag] for flag in alert_flags)

def count_alerts(row):
    """Return how many of the four alert flags of `row` are truthy."""
    alert_flags = (
        'has_malware_indicator',
        'has_alerts_and_warnings',
        'has_firewall_log',
        'has_ids_ips_alert',
    )
    return sum(1 for flag in alert_flags if row[flag])
    
# Row-wise alert summary: any alert present, and how many.
df = df.assign(
    has_system_alert=df.apply(has_system_alert, axis=1),
    alert_count=df.apply(count_alerts, axis=1),
)

print(df['has_system_alert'].value_counts())
df.head(3)

Well... based on the value_counts on has_system_alert, it looks useless, lol.

### Networking Stuff

The columns: protocol, packet_length, packet_type, traffic_type, payload_data can be combined somehow to create one or a few new columns.

In [None]:
def _port_range_bin(port):
    """Classify a port number by range: 0-1023, 1024-49151, or above.

    Ports below 1024 get their own "well_known" label; they were previously
    lumped in with "dynamic".
    """
    if port < 1024:
        return "well_known"
    if port <= 49151:
        return "registered"
    return "dynamic"


def _packet_length_bin(length):
    """Bucket a packet length: <=256 small, <=512 medium, <=1024 large, else very large."""
    if length <= 256:
        return "small"
    if length <= 512:
        return "medium"
    if length <= 1024:
        return "large"
    return "very large"


for loc in ['source', 'destination']:
    df[f'{loc}_port_bin'] = df[f'{loc}_port'].apply(_port_range_bin)

df['packet_length_bin'] = df['packet_length'].apply(_packet_length_bin)

df['packet_length_bin'].value_counts()

In [None]:
# Set copy of the port list: a membership test is a hash lookup instead of a scan
# over the whole list for every row.
_popular_ports = frozenset(l_ports)

def is_popular_port(port):
    """Return True when `port` is one of the ports loaded into `l_ports`."""
    return port in _popular_ports

df['is_popular_source_port'] = df['source_port'].apply(is_popular_port)
df['is_popular_destination_port'] = df['destination_port'].apply(is_popular_port)
# Kept as the strings '0'/'1' because this column is joined with other strings in feature crosses.
df['protocol_uses_ports'] = df['protocol'].apply(lambda x : '0' if x == 'ICMP' else '1') # TODO: check feature crossing with bool

print(df['is_popular_source_port'].value_counts())
print(df['is_popular_destination_port'].value_counts())
print(df['protocol_uses_ports'].value_counts())

In [None]:
# Feature crosses: the string values of two columns joined with "_".
df['protocol_packet_type'] = df[['protocol', 'packet_type']].apply('_'.join, axis=1)
# Fixed: this cross was built from packet_type, which made it a duplicate of protocol_packet_type.
df['protocol_traffic_type'] = df[['protocol', 'traffic_type']].apply('_'.join, axis=1)
df['source_port_bin__uses_port'] = df[['source_port_bin', 'protocol_uses_ports']].apply('_'.join, axis=1)
df['destination_port_bin__uses_port'] = df[['destination_port_bin', 'protocol_uses_ports']].apply('_'.join, axis=1)
df.head(3)

### Payload Data

The data could contain signs of an attack.

In [None]:
# df['payload_data'].tolist()[10000:10010]

### ASN & Countries

There is missing data in the countries and asn columns. Investigate.

In [None]:
# Columns that still contain nulls at this point.
show_nulls()

Since fewer than 10 country values and about 15% of the ASN values are missing, we can fill them with the modal value just to ensure the columns are filled for future tests.

In [None]:
# Fill the gaps with each column's most frequent value. The result is assigned back
# to the column: calling fillna(inplace=True) on a selected column is chained
# assignment, which warns on recent pandas and leaves `df` unchanged under Copy-on-Write.
for col in ['source_country', 'destination_country', 'source_asn_id', 'destination_asn_id']:
    df[col] = df[col].fillna(df[col].mode()[0])
show_nulls()

In [None]:
# the ASN IDs are supposed to be int rather than float
df = df.astype({'source_asn_id': np.int64, 'destination_asn_id': np.int64})
print(df['source_asn_id'].dtype)
print(df['destination_asn_id'].dtype)

### Class Distribution (for target)

In [None]:
# Number of rows per target class.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x=df['attack_type'], ax=ax)
ax.set_title('Attack Type Distribution')
ax.set_xlabel('Attack Type')
ax.set_ylabel('Count')
plt.show()

The graph above shows that the target classes are evenly distributed, i.e. the dataset is balanced.

## Analysis

In [None]:
# Columns available for the analysis after cleaning and feature engineering.
df.columns

In [None]:
# Re-list the low-cardinality columns now that the engineered features exist.
get_categorical_vals()

### Some Preprocessing - Encoding
For now, only columns that can be ordinally encoded will be encoded.

The one that stands out is severity_level. Maybeee action_taken can be considered.

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode severity on an explicit scale: Low -> 0, Medium -> 1, High -> 2.
order = ['Low', 'Medium', 'High']
oe = OrdinalEncoder(categories=[order])
severity_encoded = oe.fit_transform(df[['severity_level']])
df['severity_level'] = severity_encoded

### Feature Correlation with Attack Type

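We can check the categorical features using the chi-squared test; the test for the numerical features is still to be decided (TODO) - the ANOVA F-test discussed in the first link below is one candidate.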

"The Chi-square test is a statistical test that is used to determine whether there is a significant difference between the observed frequency of a categorical variable and the expected frequency based on the assumption of independence. It can be used to select the best categorical features for a classification model."
From: https://datascience.stackexchange.com/questions/117287/are-chi-square-and-anova-f-classif-to-select-best-features

"The Chi-Square test determines whether there is a significant association between two categorical variables. It helps in hypothesis testing to check whether observed frequencies differ from expected ones."
"If the p-value is less than the significance level (typically 0.05), reject the null hypothesis, indicating a significant relationship between the variables.
If the p-value is greater than 0.05, fail to reject the null hypothesis, meaning no significant relationship was found."
From: https://www.simplilearn.com/tutorials/statistics-tutorial/chi-square-test

In [None]:
# Features treated as categorical in the association tests (list order is kept in the results).
categorical_features = [
    # traffic and handling
    'protocol', 'packet_type', 'traffic_type', 'attack_signature', 'action_taken',
    'severity_level', 'network_segment', 'log_source',
    # time
    'hour', 'day_of_week', 'quarter',
    # addresses
    'source_ip_address_is_private', 'destination_ip_address_is_private',
    'source_country', 'destination_country', 'source_asn_id', 'destination_asn_id',
    # device
    'browser', 'os', 'is_mobile', 'is_mobile_browser',
    # alerts
    'has_malware_indicator', 'has_alerts_and_warnings',
    'has_proxy', 'has_firewall_log', 'has_ids_ips_alert', 'has_system_alert',
]

# Features treated as numeric.
numeric_features = [
    'source_port',
    'destination_port',
    'packet_length',
    'anomaly_scores',
    'alert_count',
]

In [None]:
from scipy.stats import chi2_contingency

def chi_square_test(feature):
    """Run a chi-square test of independence between `feature` and `attack_type` on the global `df`.

    Returns a tuple (feature, chi2 statistic, p-value).
    """
    observed = pd.crosstab(df[feature], df['attack_type'])
    statistic, p_value, _dof, _expected = chi2_contingency(observed)
    return feature, statistic, p_value

chi_square_results = list(map(chi_square_test, categorical_features))

chi_square_df = pd.DataFrame(chi_square_results, columns=['feature', 'chi2', 'p'])

In [None]:
# One figure with two panels. The size is set on the figure that owns the axes;
# the earlier separate plt.figure(...) call only opened an extra, empty figure.
fig, axs = plt.subplots(1, 2, figsize=(20, 10))
chi2_df = chi_square_df.sort_values(by='chi2', ascending=False)
pv_df = chi_square_df.sort_values(by='p', ascending=True)
chi2_df.plot.bar(x='feature', y='chi2', ax=axs[0], title='Chi-square statistic per feature')
pv_df.plot.bar(x='feature', y='p', ax=axs[1], title='p-value per feature')
fig.tight_layout()

# features with p-values < 0.05 are statistically significant
chi_square_df[chi_square_df['p'] < 0.05]

## Model

In test stages only. Just to see what happens. Not meant to produce a usable model.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Target plus the features used for this trial model.
cols_model = [
    'attack_type',
    'packet_type',
    'browser',
    'is_mobile_browser',
    'is_mobile',
    'os',
    'protocol',
    'attack_signature',
]
df_model = df[cols_model]

# Everything except the target is one-hot encoded.
X_no_encode = df_model.drop(columns=['attack_type'])

ohe=OneHotEncoder(sparse_output=False, handle_unknown='error')
X_encoded = ohe.fit_transform(X_no_encode)

# One output column name per (input column, category) pair, in encoder order.
categorical_columns = [
    f'{col}_{cat}'
    for col, cats in zip(X_no_encode.columns, ohe.categories_)
    for cat in cats
]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X = pd.DataFrame(X_encoded, columns=categorical_columns)
y = df_model['attack_type']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

# fit() returns the fitted estimator, so creation and training can be chained.
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
import time

# Time the computation of the impurity-based importances and their spread across trees.
start_time = time.time()
importances = clf.feature_importances_
per_tree_importances = np.array([tree.feature_importances_ for tree in clf.estimators_])
std = per_tree_importances.std(axis=0)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

In [None]:
# Generic positional names; the plot in this cell labels the bars with categorical_columns instead.
feature_names = [f"feature {i}" for i in range(len(X.columns))]
forest_importances = pd.Series(importances, index=categorical_columns)

# Bar chart of the importances with the per-tree standard deviation as error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set(title="Feature importances using MDI", ylabel="Mean decrease in impurity")
fig.tight_layout()

In [None]:
# Feature crosses built from the cleaned data frame.
#
# Fixes compared with the earlier draft of this cell:
# - it could not be parsed (one statement had an unclosed parenthesis);
# - it referenced the original CSV headers and a few names that never existed
#   ('Anomaly Score', 'Packet Lenght', 'Alert/Warning', 'Proxy Hidden', ...);
#   the renamed / engineered columns of `df` are used instead;
# - '_'.join only works on strings, so every part is cast to str before joining;
# - trf_tp_anomaly was built from the protocol rather than the traffic type;
# - the destination "..._ids_pckt-ln" column is now named "..._ids_pckt_ln" like its source twin.
#
# NOTE(review): assumed column mapping - please confirm:
#   'Packet Lenght' -> packet_length_bin       'Anomaly Score'  -> anomaly_scores
#   'Alert/Warning' -> has_alerts_and_warnings 'Firewall Alert' -> has_firewall_log
#   'Malware Alert' -> has_malware_indicator   'IDS/IPS Alert'  -> has_ids_ips_alert
#   'Proxy Hidden'  -> has_proxy
# NOTE(review): anomaly_scores is a continuous score, so crosses that include it will
# have very many distinct values; consider binning it first.
# Note carried over from the draft: protocol_uses_ports is assumed to be the column
# created earlier in this notebook ('0' for ICMP, '1' otherwise).


def cross_columns(frame, columns, sep='_'):
    """Return a string Series that joins `columns` of `frame` row by row with `sep`.

    Every column is cast to str first so numeric and boolean columns can be crossed too.
    """
    joined = frame[columns[0]].astype(str)
    for name in columns[1:]:
        joined = joined.str.cat(frame[name].astype(str), sep=sep)
    return joined


# (new column, columns to join). Order matters: later crosses reuse earlier ones.
feature_crosses = [
    # Ports with something else
    ('source_port_anomaly', ['source_port_bin__uses_port', 'anomaly_scores']),
    ('dest_port_anomaly', ['destination_port_bin__uses_port', 'anomaly_scores']),
    ('source_port_action', ['source_port_bin__uses_port', 'action_taken']),
    ('dest_port_action', ['destination_port_bin__uses_port', 'action_taken']),
    ('source_port_traf_type', ['source_port_bin__uses_port', 'traffic_type']),
    ('dest_port_traf_type', ['destination_port_bin__uses_port', 'traffic_type']),

    # Packet length with something else
    ('pckt_len_traf_tp', ['packet_length_bin', 'traffic_type']),
    ('pckt_len_pckt_tp', ['packet_length_bin', 'packet_type']),
    ('pckt_len_Protocol', ['packet_length_bin', 'protocol']),
    ('pckt_len_Source', ['packet_length_bin', 'log_source']),

    # Packet type with something else
    ('pckt_tp_Protocol', ['packet_type', 'protocol']),
    ('pckt_tp_Source', ['packet_type', 'log_source']),
    ('pckt_tp_anomaly', ['packet_type', 'anomaly_scores']),

    # Countries
    ('source_country_destination', ['source_country', 'destination_country']),

    # Traffic type with something else
    ('trf_tp_anomaly', ['traffic_type', 'anomaly_scores']),
    ('trf_tp_protocol', ['traffic_type', 'protocol']),

    # Crossing the new columns with each other
    ('trf_tp_protocol_source_port_action', ['trf_tp_protocol', 'source_port_action']),
    ('trf_tp_protocol_dest_port_action', ['trf_tp_protocol', 'dest_port_action']),

    ('trf_tp_protocol_source_port_action_alert_warning',
     ['trf_tp_protocol_source_port_action', 'has_alerts_and_warnings']),
    ('trf_tp_protocol_dest_port_action_alert_warning',
     ['trf_tp_protocol_dest_port_action', 'has_alerts_and_warnings']),

    ('trf_tp_protocol_source_port_action_alert_warning_firewall',
     ['trf_tp_protocol_source_port_action_alert_warning', 'has_firewall_log']),
    ('trf_tp_protocol_dest_port_action_alert_warning_firewall',
     ['trf_tp_protocol_dest_port_action_alert_warning', 'has_firewall_log']),

    # As in the draft, the *_IOC columns end with the malware flag followed by the action taken.
    ('trf_tp_protocol_source_port_action_alert_warning_firewall_IOC',
     ['trf_tp_protocol_source_port_action_alert_warning_firewall', 'has_malware_indicator', 'action_taken']),
    ('trf_tp_protocol_dest_port_action_alert_warning_firewall_IOC',
     ['trf_tp_protocol_dest_port_action_alert_warning_firewall', 'has_malware_indicator', 'action_taken']),

    ('pckt_tp_anomaly_pckt_tp', ['pckt_tp_anomaly', 'packet_type']),

    # trf_tp_protocol
    ('trf_tp_protocol_anomaly', ['trf_tp_protocol', 'anomaly_scores']),
    ('trf_tp_protocol_pckt_tp', ['trf_tp_protocol', 'packet_type']),
    ('trf_tp_protocol_traf_tp', ['trf_tp_protocol', 'traffic_type']),
    ('trf_tp_protocol_Sever_lv', ['trf_tp_protocol', 'severity_level']),

    # IDS/IPS alerts and packet length
    ('trf_tp_protocol_source_port_action_ids', ['trf_tp_protocol_source_port_action', 'has_ids_ips_alert']),
    ('trf_tp_protocol_dest_port_action_ids', ['trf_tp_protocol_dest_port_action', 'has_ids_ips_alert']),
    ('trf_tp_protocol_source_port_action_ids_pckt_ln',
     ['trf_tp_protocol_source_port_action_ids', 'packet_length_bin']),
    ('trf_tp_protocol_dest_port_action_ids_pckt_ln',
     ['trf_tp_protocol_dest_port_action_ids', 'packet_length_bin']),

    # Country pair with the proxy flag
    ('source_country_destination_proxy', ['source_country_destination', 'has_proxy']),

    ('pckt_tp_Source_traf_tp', ['traffic_type', 'pckt_tp_Source']),
]

for new_col, parts in feature_crosses:
    df[new_col] = cross_columns(df, parts)

df.head(3)


