## Import Library

In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from matplotlib.colors import ListedColormap

## Membaca Dataset

In [21]:
df = pd.read_csv('android_traffic.csv', sep = ';')

## Mengecek Missing Value

In [22]:
df.isnull().sum()

name                       0
tcp_packets                0
dist_port_tcp              0
external_ips               0
vulume_bytes               0
udp_packets                0
tcp_urg_packet             0
source_app_packets         0
remote_app_packets         0
source_app_bytes           0
remote_app_bytes           0
duracion                7845
avg_local_pkt_rate      7845
avg_remote_pkt_rate     7845
source_app_packets.1       0
dns_query_times            0
type                       0
dtype: int64

In [23]:
df_cleaned = df.drop(columns = ['duracion', 'avg_local_pkt_rate', 'avg_remote_pkt_rate'], axis = 1)

In [24]:
df_cleaned.isnull().sum()

name                    0
tcp_packets             0
dist_port_tcp           0
external_ips            0
vulume_bytes            0
udp_packets             0
tcp_urg_packet          0
source_app_packets      0
remote_app_packets      0
source_app_bytes        0
remote_app_bytes        0
source_app_packets.1    0
dns_query_times         0
type                    0
dtype: int64

## Download file Data yang sudah dibersihkan

In [25]:
df_cleaned.to_csv('android_traffic_cleaned.csv', index = False)

In [26]:
df = pd.read_csv('android_traffic_cleaned.csv')

In [27]:
df.head()

Unnamed: 0,name,tcp_packets,dist_port_tcp,external_ips,vulume_bytes,udp_packets,tcp_urg_packet,source_app_packets,remote_app_packets,source_app_bytes,remote_app_bytes,source_app_packets.1,dns_query_times,type
0,AntiVirus,36,6,3,3911,0,0,39,33,5100,4140,39,3,benign
1,AntiVirus,117,0,9,23514,0,0,128,107,26248,24358,128,11,benign
2,AntiVirus,196,0,6,24151,0,0,205,214,163887,24867,205,9,benign
3,AntiVirus,6,0,1,889,0,0,7,6,819,975,7,1,benign
4,AntiVirus,6,0,1,882,0,0,7,6,819,968,7,1,benign


Split Data

In [28]:
x = df.iloc[:, :-1].values
y = df.iloc[:, 13].values

In [29]:
print(x)

[['AntiVirus' 36 6 ... 4140 39 3]
 ['AntiVirus' 117 0 ... 24358 128 11]
 ['AntiVirus' 196 0 ... 24867 205 9]
 ...
 ['Zsone' 0 0 ... 143 2 2]
 ['Zsone' 0 0 ... 143 2 2]
 ['Zsone' 0 0 ... 143 2 2]]


In [33]:
encoder = OneHotEncoder(handle_unknown='ignore')

In [34]:
x = encoder.fit_transform(x).toarray()

In [35]:
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [36]:
print(y)

['benign' 'benign' 'benign' ... 'malicious' 'malicious' 'malicious']


In [37]:
label_mapping = {'benign': 0, 'malicious': 1}

In [38]:
y_numeric = [label_mapping[label] for label in y]

print(y_numeric)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [39]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 50)

In [40]:
print(x_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
