## Wczytujemy dane

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw records from the CSV file.
data = pd.read_csv('DDos.csv')

print(data.columns)
# Separate the target column from the predictors. Note the leading space in
# the ' Label' column name.
y = data[' Label']
X = data.drop(columns=' Label')

# Hold out a test_size=0.2 share of the rows for testing; the fixed
# random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

## Metoda bez iteracji 

### Tworzymy Random Forest

In [5]:
from sklearn.ensemble import RandomForestClassifier 

# Build a 100-estimator random forest with a fixed seed and fit it on the
# training split in one expression (fit() hands back the estimator itself).
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

### Uzyskujemy ważność cech

In [9]:
# Importance scores reported by the fitted forest, and the column names they
# are paired with.
importances = rf.feature_importances_
feature_names = X.columns

# Assemble the feature/importance table and order it from the highest score
# to the lowest.
feature_importances = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Show the ten top-ranked features.
print(feature_importances.head(10))

                        Feature  Importance
6         Fwd Packet Length Max    0.087524
8        Fwd Packet Length Mean    0.081587
66       Init_Win_bytes_forward    0.078397
11        Bwd Packet Length Min    0.061249
53         Avg Fwd Segment Size    0.061098
0              Destination Port    0.060444
4   Total Length of Fwd Packets    0.049032
63            Subflow Fwd Bytes    0.048209
62          Subflow Fwd Packets    0.046814
68             act_data_pkt_fwd    0.037963


In [7]:
# Threshold: the mean of the 'Importance' column.
mean_importance = feature_importances['Importance'].mean()

# Keep only the rows whose importance is strictly above the mean, then pull
# their feature names out as a plain list.
important_features = feature_importances.loc[
    feature_importances['Importance'] > mean_importance
]
important_feature_names = list(important_features['Feature'])

# Feature matrix restricted to the retained columns.
X_important = X.loc[:, important_feature_names]

print(important_feature_names)

[' Fwd Packet Length Max', ' Fwd Packet Length Mean', 'Init_Win_bytes_forward', ' Bwd Packet Length Min', ' Avg Fwd Segment Size', ' Destination Port', 'Total Length of Fwd Packets', ' Subflow Fwd Bytes', 'Subflow Fwd Packets', ' act_data_pkt_fwd', ' Fwd IAT Std', ' Fwd Header Length.1', 'Fwd IAT Total', ' Fwd Header Length', ' Bwd Header Length', ' Fwd IAT Mean', ' Fwd IAT Max', ' Average Packet Size', 'Bwd Packet Length Max', ' Fwd Packet Length Std']


## Metoda Recursive Feature Elimination
#### Bardziej wszechstronna metoda, która daje precyzyjne wyniki, ale wymaga więcej zasobów

In [8]:
from sklearn.feature_selection import RFE

# Recursive feature elimination wrapped around the random forest, configured
# to keep 10 features with step=1 (adjust n_features_to_select as needed).
selector = RFE(estimator=rf, n_features_to_select=10, step=1).fit(X_train, y_train)

# Column names picked by the selector's boolean support mask.
selected_features = X.columns[selector.get_support()]
print(selected_features)

Index([' Destination Port', 'Total Length of Fwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Mean',
       ' Fwd Header Length', ' Avg Fwd Segment Size', ' Fwd Header Length.1',
       ' Subflow Fwd Bytes', 'Init_Win_bytes_forward', ' act_data_pkt_fwd'],
      dtype='object')


### Zostawiamy tylko wybrane kolumny (i Label) w danych 

In [None]:
# Columns to export: the label plus a hand-written list of selected features.
# NOTE(review): the RFE output printed earlier in the notebook also lists
# ' Fwd Header Length.1', which does not appear in this list — presumably left
# out on purpose; confirm.
selected_features = [
    ' Label',
    ' Destination Port',
    'Total Length of Fwd Packets',
    ' Fwd Packet Length Max',
    ' Fwd Packet Length Mean',
    ' Fwd Header Length',
    ' Avg Fwd Segment Size',
    ' Subflow Fwd Bytes',
    'Init_Win_bytes_forward',
    ' act_data_pkt_fwd',
]

# Reduce the dataset to those columns and print one sampled row as a check.
new_data = data.loc[:, selected_features]
print(new_data.sample())

# Write the reduced dataset out as CSV and as Parquet, both without the index.
new_data.to_csv("path.csv", index=False)
new_data.to_parquet("path.parquet", index=False)