In [85]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, classification_report

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import ExtraTreesClassifier
import lightgbm as lgb

from concurrent.futures import ThreadPoolExecutor

from imblearn.under_sampling import RandomUnderSampler

In [86]:
def convert_to_integer(value): # Birden fazla dbm_antsignal değerlerinin olduğu durumda dönüşüm işlemi için 
    if pd.isna(value) or value == '':
        return None 
    elif isinstance(value, int) or isinstance(value, float):
        return value
    else:
        signal_strengths = [int(v) for v in value.split('-') if v]
        if signal_strengths:
            average_strength = round(sum(signal_strengths) / len(signal_strengths))
            return -average_strength
        else:
            return None

In [87]:
features = ['frame.interface_id',
 'frame.dlt',
 'frame.offset_shift',
 'frame.time_epoch',
 'frame.time_delta',
 'frame.time_delta_displayed',
 'frame.time_relative',
 'frame.len',
 'frame.cap_len',
 'frame.marked',
 'frame.ignored',
 'radiotap.version',
 'radiotap.pad',
 'radiotap.length',
 'radiotap.present.tsft',
 'radiotap.present.flags',
 'radiotap.present.rate',
 'radiotap.present.channel',
 'radiotap.present.fhss',
 'radiotap.present.dbm_antsignal',
 'radiotap.present.dbm_antnoise',
 'radiotap.present.lock_quality',
 'radiotap.present.tx_attenuation',
 'radiotap.present.db_tx_attenuation',
 'radiotap.present.dbm_tx_power',
 'radiotap.present.antenna',
 'radiotap.present.db_antsignal',
 'radiotap.present.db_antnoise',
 'radiotap.present.rxflags',
 'radiotap.present.xchannel',
 'radiotap.present.mcs',
 'radiotap.present.ampdu',
 'radiotap.present.vht',
 'radiotap.present.reserved',
 'radiotap.present.rtap_ns',
 'radiotap.present.vendor_ns',
 'radiotap.present.ext',
 'radiotap.mactime',
 'radiotap.flags.cfp',
 'radiotap.flags.preamble',
 'radiotap.flags.wep',
 'radiotap.flags.frag',
 'radiotap.flags.fcs',
 'radiotap.flags.datapad',
 'radiotap.flags.badfcs',
 'radiotap.flags.shortgi',
 'radiotap.datarate',
 'radiotap.channel.freq',
 'radiotap.channel.type.turbo',
 'radiotap.channel.type.cck',
 'radiotap.channel.type.ofdm',
 'radiotap.channel.type.2ghz',
 'radiotap.channel.type.5ghz',
 'radiotap.channel.type.passive',
 'radiotap.channel.type.dynamic',
 'radiotap.channel.type.gfsk',
 'radiotap.channel.type.gsm',
 'radiotap.channel.type.sturbo',
 'radiotap.channel.type.half',
 'radiotap.channel.type.quarter',
 'radiotap.dbm_antsignal',
 'radiotap.antenna',
 'radiotap.rxflags.badplcp',
 'wlan.fc.type_subtype',
 'wlan.fc.version',
 'wlan.fc.type',
 'wlan.fc.subtype',
 'wlan.fc.ds',
 'wlan.fc.frag',
 'wlan.fc.retry',
 'wlan.fc.pwrmgt',
 'wlan.fc.moredata',
 'wlan.fc.protected',
 'wlan.fc.order',
 'wlan.duration',
 'wlan.ra',
 'wlan.da',
 'wlan.ta',
 'wlan.sa',
 'wlan.bssid',
 'wlan.frag',
 'wlan.seq',
 'wlan.bar.type',
 'wlan.ba.control.ackpolicy',
 'wlan.ba.control.multitid',
 'wlan.ba.control.cbitmap',
 'wlan.bar.compressed.tidinfo',
 'wlan.ba.bm',
 'wlan.fcs_good',
 'wlan_mgt.fixed.capabilities.ess',
 'wlan_mgt.fixed.capabilities.ibss',
 'wlan_mgt.fixed.capabilities.cfpoll.ap',
 'wlan_mgt.fixed.capabilities.privacy',
 'wlan_mgt.fixed.capabilities.preamble',
 'wlan_mgt.fixed.capabilities.pbcc',
 'wlan_mgt.fixed.capabilities.agility',
 'wlan_mgt.fixed.capabilities.spec_man',
 'wlan_mgt.fixed.capabilities.short_slot_time',
 'wlan_mgt.fixed.capabilities.apsd',
 'wlan_mgt.fixed.capabilities.radio_measurement',
 'wlan_mgt.fixed.capabilities.dsss_ofdm',
 'wlan_mgt.fixed.capabilities.del_blk_ack',
 'wlan_mgt.fixed.capabilities.imm_blk_ack',
 'wlan_mgt.fixed.listen_ival',
 'wlan_mgt.fixed.current_ap',
 'wlan_mgt.fixed.status_code',
 'wlan_mgt.fixed.timestamp',
 'wlan_mgt.fixed.beacon',
 'wlan_mgt.fixed.aid',
 'wlan_mgt.fixed.reason_code',
 'wlan_mgt.fixed.auth.alg',
 'wlan_mgt.fixed.auth_seq',
 'wlan_mgt.fixed.category_code',
 'wlan_mgt.fixed.htact',
 'wlan_mgt.fixed.chanwidth',
 'wlan_mgt.fixed.fragment',
 'wlan_mgt.fixed.sequence',
 'wlan_mgt.tagged.all',
 'wlan_mgt.ssid',
 'wlan_mgt.ds.current_channel',
 'wlan_mgt.tim.dtim_count',
 'wlan_mgt.tim.dtim_period',
 'wlan_mgt.tim.bmapctl.multicast',
 'wlan_mgt.tim.bmapctl.offset',
 'wlan_mgt.country_info.environment',
 'wlan_mgt.rsn.version',
 'wlan_mgt.rsn.gcs.type',
 'wlan_mgt.rsn.pcs.count',
 'wlan_mgt.rsn.akms.count',
 'wlan_mgt.rsn.akms.type',
 'wlan_mgt.rsn.capabilities.preauth',
 'wlan_mgt.rsn.capabilities.no_pairwise',
 'wlan_mgt.rsn.capabilities.ptksa_replay_counter',
 'wlan_mgt.rsn.capabilities.gtksa_replay_counter',
 'wlan_mgt.rsn.capabilities.mfpr',
 'wlan_mgt.rsn.capabilities.mfpc',
 'wlan_mgt.rsn.capabilities.peerkey',
 'wlan_mgt.tcprep.trsmt_pow',
 'wlan_mgt.tcprep.link_mrg',
 'wlan.wep.iv',
 'wlan.wep.key',
 'wlan.wep.icv',
 'wlan.tkip.extiv',
 'wlan.ccmp.extiv',
 'wlan.qos.tid',
 'wlan.qos.priority',
 'wlan.qos.eosp',
 'wlan.qos.ack',
 'wlan.qos.amsdupresent',
 'wlan.qos.buf_state_indicated1',
 'wlan.qos.bit4',
 'wlan.qos.txop_dur_req',
 'wlan.qos.buf_state_indicated2',
 'data.len',
 'class']

# tsft ekle
selected_features = ['frame.len', 'radiotap.length', 'radiotap.dbm_antsignal', 'wlan.duration', 'radiotap.present.tsft', 'radiotap.channel.type.cck', 'radiotap.channel.type.ofdm', 'wlan.fc.type', 'wlan.fc.subtype', 'wlan.fc.ds', 'wlan.fc.frag', 'wlan.fc.retry', 'wlan.fc.pwrmgt', 'wlan.fc.moredata', 'wlan.fc.protected']

### Read AWID2 Trn

In [88]:
awid2trn_data = pd.read_csv("./AWID-CLS-R-Trn/AWID-CLS-R-Trn/1", header=None, names=features, low_memory=False)

awid2trn_data = awid2trn_data.loc[:, selected_features + ['class']]
pd.set_option('display.max_columns', None)

awid2trn_data.head(10)

Unnamed: 0,frame.len,radiotap.length,radiotap.dbm_antsignal,wlan.duration,radiotap.present.tsft,radiotap.channel.type.cck,radiotap.channel.type.ofdm,wlan.fc.type,wlan.fc.subtype,wlan.fc.ds,wlan.fc.frag,wlan.fc.retry,wlan.fc.pwrmgt,wlan.fc.moredata,wlan.fc.protected,class
0,261,26,-47,0,1,1,0,0,8,0x00,0,0,0,0,0,normal
1,185,26,-47,0,1,1,0,0,8,0x00,0,0,0,0,0,normal
2,185,26,-64,0,1,1,0,0,8,0x00,0,0,0,0,0,normal
3,159,26,-32,0,1,1,0,0,8,0x00,0,0,0,0,0,normal
4,54,26,-21,44,1,0,1,2,4,0x01,0,0,0,0,0,normal
5,40,26,-24,0,1,0,1,1,13,0x00,0,0,0,0,0,normal
6,261,26,-47,0,1,1,0,0,8,0x00,0,0,0,0,0,normal
7,40,26,-24,0,1,0,1,1,13,0x00,0,0,0,0,0,normal
8,185,26,-52,0,1,1,0,0,8,0x00,0,0,0,0,0,normal
9,185,26,-58,0,1,1,0,0,8,0x00,0,0,0,0,0,normal


### Read AWID2 Tst

In [89]:
awid2tst_data = pd.read_csv("./AWID-CLS-R-Tst/AWID-CLS-R-Tst/awid2test", header=None, names=features, low_memory=False)

awid2tst_data = awid2tst_data.loc[:, selected_features + ['class']]
pd.set_option('display.max_columns', None)

awid2tst_data.head(10)

Unnamed: 0,frame.len,radiotap.length,radiotap.dbm_antsignal,wlan.duration,radiotap.present.tsft,radiotap.channel.type.cck,radiotap.channel.type.ofdm,wlan.fc.type,wlan.fc.subtype,wlan.fc.ds,wlan.fc.frag,wlan.fc.retry,wlan.fc.pwrmgt,wlan.fc.moredata,wlan.fc.protected,class
0,1552,26,-25,44,1,0,1,2,8,0x02,0,0,0,0,1,normal
1,1552,26,-25,44,1,0,1,2,8,0x02,0,1,0,0,1,normal
2,40,26,-26,0,1,0,1,1,13,0x00,0,0,0,0,0,normal
3,40,26,-26,0,1,0,1,1,13,0x00,0,0,0,0,0,normal
4,1552,26,-26,44,1,0,1,2,8,0x02,0,0,0,0,1,normal
5,1552,26,-25,44,1,0,1,2,8,0x02,0,0,0,0,1,normal
6,1552,26,-25,44,1,0,1,2,8,0x02,0,0,0,0,1,normal
7,1552,26,-26,44,1,0,1,2,8,0x02,0,0,0,0,1,normal
8,40,26,-24,0,1,0,1,1,13,0x00,0,0,0,0,0,normal
9,1552,26,-26,44,1,0,1,2,8,0x02,0,0,0,0,1,normal


### Dropping NaN values

In [90]:
awid2trn_data = awid2trn_data.replace(r'^\s*$', pd.NA, regex=True)
awid2trn_data = awid2trn_data.replace('?', pd.NA)
awid2trn_data = awid2trn_data.dropna()

In [91]:
awid2tst_data = awid2tst_data.replace(r'^\s*$', pd.NA, regex=True)
awid2tst_data = awid2tst_data.replace('?', pd.NA)
awid2tst_data = awid2tst_data.dropna()

### Injection rows removed

In [92]:
filter_train = awid2trn_data['class'] != 'injection'
filter_test = awid2tst_data['class'] != 'injection'
awid2trn_data = awid2trn_data[filter_train]
awid2tst_data = awid2tst_data[filter_test]

In [93]:
X_awid2trn = awid2trn_data.loc[:, selected_features]
y_awid2trn = awid2trn_data[['class']]

In [94]:
X_awid2tst = awid2tst_data.loc[:, selected_features]
y_awid2tst = awid2tst_data[['class']]

In [95]:
X_awid2trn

Unnamed: 0,frame.len,radiotap.length,radiotap.dbm_antsignal,wlan.duration,radiotap.present.tsft,radiotap.channel.type.cck,radiotap.channel.type.ofdm,wlan.fc.type,wlan.fc.subtype,wlan.fc.ds,wlan.fc.frag,wlan.fc.retry,wlan.fc.pwrmgt,wlan.fc.moredata,wlan.fc.protected
0,261,26,-47,0,1,1,0,0,8,0x00,0,0,0,0,0
1,185,26,-47,0,1,1,0,0,8,0x00,0,0,0,0,0
2,185,26,-64,0,1,1,0,0,8,0x00,0,0,0,0,0
3,159,26,-32,0,1,1,0,0,8,0x00,0,0,0,0,0
4,54,26,-21,44,1,0,1,2,4,0x01,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795570,40,26,-25,0,1,0,1,1,13,0x00,0,0,0,0,0
1795571,148,26,-58,0,1,1,0,0,8,0x00,0,0,0,0,0
1795572,54,26,-27,44,1,0,1,2,4,0x01,0,0,0,0,0
1795573,40,26,-25,0,1,0,1,1,13,0x00,0,0,0,0,0


In [96]:
print(f"normal:\n{y_awid2trn[y_awid2trn['class']=='normal'].count()}")
print(f"impersonation:\n{y_awid2trn[y_awid2trn['class']=='impersonation'].count()}")
print(f"flooding:\n{y_awid2trn[y_awid2trn['class']=='flooding'].count()}")
print(f"injection:\n{y_awid2trn[y_awid2trn['class']=='injection'].count()}")

normal:
class    1631218
dtype: int64
impersonation:
class    48522
dtype: int64
flooding:
class    48484
dtype: int64
injection:
class    0
dtype: int64


### Undersampling

In [97]:
under_sampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_awid2trn, y_awid2trn = under_sampler.fit_resample(X_awid2trn, y_awid2trn)

In [98]:
# Test verisine undersampling yaptigimizda DT ile 0.67 micro f1 skoru gibi kotu sonuclar aldik.

#under_sampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
#X_awid2tst, y_awid2tst = under_sampler.fit_resample(X_awid2tst, y_awid2tst)

In [99]:
print(f"normal:\n{y_awid2trn[y_awid2trn['class']=='normal'].count()}")
print(f"impersonation:\n{y_awid2trn[y_awid2trn['class']=='impersonation'].count()}")
print(f"flooding:\n{y_awid2trn[y_awid2trn['class']=='flooding'].count()}")
print(f"injection:\n{y_awid2trn[y_awid2trn['class']=='injection'].count()}")

normal:
class    48484
dtype: int64
impersonation:
class    48484
dtype: int64
flooding:
class    48484
dtype: int64
injection:
class    0
dtype: int64


In [100]:
print(f"normal:\n{y_awid2tst[y_awid2tst['class']=='normal'].count()}")
print(f"impersonation:\n{y_awid2tst[y_awid2tst['class']=='impersonation'].count()}")
print(f"flooding:\n{y_awid2tst[y_awid2tst['class']=='flooding'].count()}")
print(f"injection:\n{y_awid2tst[y_awid2tst['class']=='injection'].count()}")

normal:
class    530458
dtype: int64
impersonation:
class    20079
dtype: int64
flooding:
class    8097
dtype: int64
injection:
class    0
dtype: int64


In [101]:
X_awid2trn['radiotap.dbm_antsignal'] = X_awid2trn['radiotap.dbm_antsignal'].apply(convert_to_integer)

In [102]:
X_awid2tst['radiotap.dbm_antsignal'] = X_awid2tst['radiotap.dbm_antsignal'].apply(convert_to_integer)

### Encoding

In [103]:
columns_to_scale = ['frame.len', 'radiotap.length', 'radiotap.dbm_antsignal', 'wlan.duration']
columns_to_one_hot_encode = [col for col in X_awid2trn.columns if col not in columns_to_scale]

# Min-max scaling
scaler = MinMaxScaler()
X_awid2trn[columns_to_scale] = scaler.fit_transform(X_awid2trn[columns_to_scale])

scaler = MinMaxScaler()
X_awid2tst[columns_to_scale] = scaler.fit_transform(X_awid2tst[columns_to_scale])

# One-hot encoding
combined_data = pd.concat([X_awid2trn, X_awid2tst], axis=0) # Combine dataframes
combined_data = pd.get_dummies(combined_data, columns=columns_to_one_hot_encode)
X_awid2trn_encoded = combined_data[:len(X_awid2trn)]
X_awid2tst_encoded = combined_data[len(X_awid2trn):]

In [104]:
def evaluate_classifier(classifier, X_test, y_test):
    # Make predictions on the test set
    y_pred = classifier.predict(X_test)

    # Calculate Precision, Recall, F1, and Accuracy
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='micro')
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Micro F1 Score: {f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

In [105]:
def evaluate_classifier_lightgbm(classifier, X_test, y_test):
    # Make predictions on the test set
    y_pred_proba = classifier.predict(X_test)
    y_pred_class = y_pred_proba.argmax(axis=1)

    # Calculate Precision, Recall, F1, and Accuracy
    precision = precision_score(y_test, y_pred_class, average='weighted')
    recall = recall_score(y_test, y_pred_class, average='weighted')
    f1 = f1_score(y_test, y_pred_class, average='micro')
    accuracy = accuracy_score(y_test, y_pred_class)

    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Micro F1 Score: {f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_class))

### Decision Tree

In [106]:
X_train, y_train = X_awid2trn_encoded, y_awid2trn
X_test, y_test = X_awid2tst_encoded, y_awid2tst

# Convert the target variable to integer codes
y_train = y_train['class'].astype('category').cat.codes
y_test = y_test['class'].astype('category').cat.codes

# Belirli parametrelerle dt modeli
dt_model = DecisionTreeClassifier(
    max_depth=20,
    ccp_alpha=0.001,
    max_leaf_nodes=100,
    min_samples_leaf=2,
    random_state=42
)


dt_model.fit(X_train, y_train)

evaluate_classifier(dt_model, X_test, y_test)

Precision: 0.9507
Recall: 0.9282
Micro F1 Score: 0.9282
Accuracy: 0.9282

Classification Report:
              precision    recall  f1-score   support

           0       0.22      0.69      0.33      8097
           1       0.08      0.07      0.07     20079
           2       1.00      0.96      0.98    530458

    accuracy                           0.93    558634
   macro avg       0.43      0.57      0.46    558634
weighted avg       0.95      0.93      0.94    558634



### LightGBM

In [107]:
X_train, y_train = X_awid2trn_encoded, y_awid2trn
X_test, y_test = X_awid2tst_encoded, y_awid2tst

# Convert the target variable to integer codes
y_train = y_train['class'].astype('category').cat.codes
y_test = y_test['class'].astype('category').cat.codes

# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Belirli parametreler ile lightgbm modeli
params = {
    'objective': 'multiclass',  # Assuming it's a multiclass classification problem
    'num_class': len(y_train.unique()),  # Number of classes
    'metric': 'multi_logloss',  # Evaluation metric
    'boosting_type': 'gbdt',
    'num_leaves': 20,
    'learning_rate': 0.01,
    'max_bin': 20,
    'max_depth': 10,
    'min_child_samples': 30,
    'min_data_in_bin': 10,
    'min_split_gain': 0.1,
    'n_estimators': 80,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'n_jobs': 1,
    'verbose': 0
}


lgb_model = lgb.train(params, train_data, valid_sets=[test_data])

evaluate_classifier_lightgbm(lgb_model, X_test, y_test)



Precision: 0.9246
Recall: 0.9293
Micro F1 Score: 0.9293
Accuracy: 0.9293

Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.63      0.68      8097
           1       0.08      0.07      0.08     20079
           2       0.96      0.97      0.96    530458

    accuracy                           0.93    558634
   macro avg       0.59      0.56      0.57    558634
weighted avg       0.92      0.93      0.93    558634



### Logistic Regression

In [108]:
X_train, y_train = X_awid2trn_encoded, y_awid2trn
X_test, y_test = X_awid2tst_encoded, y_awid2tst

# Convert the target variable to integer codes
y_train = y_train['class'].astype('category').cat.codes
y_test = y_test['class'].astype('category').cat.codes

# Belirli parametreler ile logistic reg modeli
lr_model = LogisticRegression(
    solver='sag',
    max_iter=1000,
    tol=0.01,
    random_state=42,
    multi_class='multinomial'
)

lr_model.fit(X_train, y_train)

evaluate_classifier(lr_model, X_test, y_test)

Precision: 0.9690
Recall: 0.9276
Micro F1 Score: 0.9276
Accuracy: 0.9276

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.62      0.70      8097
           1       0.35      1.00      0.52     20079
           2       0.99      0.93      0.96    530458

    accuracy                           0.93    558634
   macro avg       0.72      0.85      0.73    558634
weighted avg       0.97      0.93      0.94    558634



### SGDClassifier

In [121]:
X_train, y_train = X_awid2trn_encoded, y_awid2trn
X_test, y_test = X_awid2tst_encoded, y_awid2tst

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Convert the target variable to integer codes
y_train = y_train['class'].astype('category').cat.codes
y_test = y_test['class'].astype('category').cat.codes

# Belirli parametreler ile SGD Classifier
sgd_model = SGDClassifier(
    tol=1e-05,
    loss='modified_huber',
    early_stopping=True,
    random_state=42
)

sgd_model.fit(X_train, y_train)

evaluate_classifier(sgd_model, X_test, y_test)

Precision: 0.9245
Recall: 0.8878
Micro F1 Score: 0.8878
Accuracy: 0.8878

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.62      0.73      8097
           1       0.03      0.07      0.05     20079
           2       0.96      0.92      0.94    530458

    accuracy                           0.89    558634
   macro avg       0.63      0.54      0.57    558634
weighted avg       0.92      0.89      0.91    558634



### LinearSVC

In [110]:
X_train, y_train = X_awid2trn_encoded, y_awid2trn
X_test, y_test = X_awid2tst_encoded, y_awid2tst

# Convert the target variable to integer codes
y_train = y_train['class'].astype('category').cat.codes
y_test = y_test['class'].astype('category').cat.codes

# Belirli parametreler ile LinearSVC
linear_svc_model = LinearSVC(
    max_iter=20000,
    C=1.5,
    random_state=42
)

linear_svc_model.fit(X_train, y_train)

evaluate_classifier(linear_svc_model, X_test, y_test)



Precision: 0.9637
Recall: 0.9182
Micro F1 Score: 0.9182
Accuracy: 0.9182

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.62      0.51      8097
           1       0.35      1.00      0.52     20079
           2       0.99      0.92      0.96    530458

    accuracy                           0.92    558634
   macro avg       0.59      0.85      0.66    558634
weighted avg       0.96      0.92      0.93    558634



### Random Forest

In [111]:
X_train, y_train = X_awid2trn_encoded, y_awid2trn
X_test, y_test = X_awid2tst_encoded, y_awid2tst

# Convert the target variable to integer codes
y_train = y_train['class'].astype('category').cat.codes
y_test = y_test['class'].astype('category').cat.codes

# Belirli parametreler ile Random Forest
random_forest_model = RandomForestClassifier(
    max_depth=20,
    ccp_alpha=0.001,
    max_leaf_nodes=100,
    min_samples_leaf=2,
    random_state=42
)

random_forest_model.fit(X_train, y_train)

evaluate_classifier(random_forest_model, X_test, y_test)

Precision: 0.9281
Recall: 0.9311
Micro F1 Score: 0.9311
Accuracy: 0.9311

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.74      0.80      8097
           1       0.08      0.07      0.08     20079
           2       0.96      0.97      0.96    530458

    accuracy                           0.93    558634
   macro avg       0.64      0.59      0.61    558634
weighted avg       0.93      0.93      0.93    558634



### Extra Trees

In [112]:
X_train, y_train = X_awid2trn_encoded, y_awid2trn
X_test, y_test = X_awid2tst_encoded, y_awid2tst

# Convert the target variable to integer codes
y_train = y_train['class'].astype('category').cat.codes
y_test = y_test['class'].astype('category').cat.codes

# Belirli hiperparametreler ile ET model
et_model = ExtraTreesClassifier(
    max_depth=200,
    n_estimators=200,
    ccp_alpha=0.0001,
    max_leaf_nodes=500,
    min_samples_leaf=2,
    min_samples_split=10,
    random_state=42
)

et_model.fit(X_train, y_train)

evaluate_classifier(et_model, X_test, y_test)

Precision: 0.9284
Recall: 0.9310
Micro F1 Score: 0.9310
Accuracy: 0.9310

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.62      0.75      8097
           1       0.08      0.07      0.07     20079
           2       0.96      0.97      0.96    530458

    accuracy                           0.93    558634
   macro avg       0.66      0.55      0.60    558634
weighted avg       0.93      0.93      0.93    558634

