In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import (
    RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier,
    VotingClassifier, StackingClassifier
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from scipy.stats import chi2_contingency
from scipy import stats
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

In [4]:
def ip_to_int(ip):
    """Convert a dotted-quad IPv4 string to its 32-bit integer value.

    Parameters
    ----------
    ip : str
        Address in ``"a.b.c.d"`` form.

    Returns
    -------
    int
        ``a * 256**3 + b * 256**2 + c * 256 + d``, or ``0`` when ``ip`` is not
        a string, does not have exactly four numeric octets, or has an octet
        outside ``0..255``.
    """
    try:
        octets = [int(part) for part in ip.split('.')]
    except (AttributeError, TypeError, ValueError):
        # Non-string input (None, NaN, bytes) or a non-numeric octet.
        # Only parsing errors are caught so that KeyboardInterrupt and
        # unrelated bugs are no longer silently turned into 0.
        return 0

    # Reject malformed addresses instead of encoding them as a bogus integer.
    if len(octets) != 4 or any(not 0 <= octet <= 255 for octet in octets):
        return 0

    value = 0
    for octet in octets:
        value = value * 256 + octet
    return value


In [5]:
def create_new_features(df):
    """Append engineered traffic features to ``df`` and return it.

    The frame is modified in place (new columns are assigned directly on
    ``df``) and the same object is returned. Columns read: ``Source_IP``,
    ``Destination_IP``, the forward/backward packet amount and length
    columns, the four TCP flag counters, the mean forward/backward
    inter-arrival times, and every column whose name starts with
    ``first_packet_sizes_`` or ``bandwidth_``.
    """
    def _columns_with_prefix(prefix):
        # Names are looked up at call time, against the frame as it is then.
        return [name for name in df.columns if name.startswith(prefix)]

    # Numeric encoding of both endpoints
    df['Source_IP_num'] = df['Source_IP'].apply(ip_to_int)
    df['Destination_IP_num'] = df['Destination_IP'].apply(ip_to_int)

    # Forward/backward packet counts and byte totals.
    # The "+ 1" in each denominator avoids division by zero.
    fwd_count = df['fwd_packets_amount']
    bwd_count = df['bwd_packets_amount']
    fwd_bytes = df['fwd_packets_length']
    bwd_bytes = df['bwd_packets_length']

    df['packets_ratio'] = fwd_count / (bwd_count + 1)
    df['bytes_ratio'] = fwd_bytes / (bwd_bytes + 1)
    df['total_packets'] = fwd_count + bwd_count
    df['total_bytes'] = fwd_bytes + bwd_bytes
    df['avg_fwd_packet_size'] = fwd_bytes / (fwd_count + 1)
    df['avg_bwd_packet_size'] = bwd_bytes / (bwd_count + 1)

    # Sum of the four TCP flag counters
    df['total_flags'] = (
        df['FIN_count'] + df['SYN_count'] + df['RST_count'] + df['PSH_count']
    )

    # Forward vs. backward mean inter-arrival time
    df['iat_ratio'] = (
        df['mean_fwd_inter_arrival_time'] / (df['mean_bwd_inter_arrival_time'] + 1)
    )

    # Row-wise summary of the first-packet-size columns
    first_sizes = df[_columns_with_prefix('first_packet_sizes_')]
    df['mean_first_packet_size'] = first_sizes.mean(axis=1)
    df['std_first_packet_size'] = first_sizes.std(axis=1)

    # Row-wise summary of the bandwidth columns
    bandwidth = df[_columns_with_prefix('bandwidth_')]
    df['mean_bandwidth'] = bandwidth.mean(axis=1)
    df['std_bandwidth'] = bandwidth.std(axis=1)
    df['max_bandwidth'] = bandwidth.max(axis=1)

    return df

In [6]:
def select_features_mi(X, y, k=30, random_state=42):
    """Select the ``k`` features with the highest mutual information with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features (one column per feature).
    y : array-like
        Class labels aligned with the rows of ``X``.
    k : int or 'all', default 30
        Number of features to keep. Values larger than the number of columns
        are clamped to that number; ``'all'`` keeps every column.
    random_state : int or None, default 42
        Seed forwarded to ``mutual_info_classif``. The estimator is
        randomised, so without a seed the selected feature set can differ
        between runs.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``X`` restricted to the selected columns (ordered by descending
        score), and a frame with one ``Feature``/``Score`` row per input
        column.
    """
    n_features = X.shape[1]
    # Asking SelectKBest for more features than exist is rejected or warned
    # about depending on the scikit-learn version, so clamp up front.
    k = n_features if k == 'all' else min(int(k), n_features)

    def _seeded_mutual_info(features, target):
        # SelectKBest calls score_func(X, y) only; bind the seed here.
        return mutual_info_classif(features, target, random_state=random_state)

    selector = SelectKBest(score_func=_seeded_mutual_info, k=k)
    selector.fit(X, y)

    feature_scores = pd.DataFrame({
        'Feature': X.columns,
        'Score': selector.scores_
    })

    best_features = feature_scores.nlargest(k, 'Score')['Feature'].tolist()
    return X[best_features], feature_scores

In [7]:
def analyze_correlations(X, threshold=0.95):
    """Drop one feature from each highly correlated pair.

    Computes ``X.corr()`` and walks every column pair ``(earlier, later)``
    whose absolute correlation is strictly above ``threshold``, in row-major
    order. The later column is marked for removal unless the earlier one has
    already been marked.

    Returns a tuple ``(X without the marked columns, correlation matrix)``.
    """
    correlation_matrix = X.corr()
    row_names = correlation_matrix.index
    col_names = correlation_matrix.columns

    # The strict upper triangle visits each unordered pair once and skips the diagonal.
    above = np.triu(np.abs(correlation_matrix.to_numpy()) > threshold, k=1)

    features_to_drop = set()
    for row, col in np.argwhere(above):
        if row_names[row] not in features_to_drop:
            features_to_drop.add(col_names[col])

    reduced = X.drop(columns=list(features_to_drop))
    return reduced, correlation_matrix

In [15]:
def train_and_evaluate_models(X_train, X_test, y_train, y_test):
    """Train several classifiers plus voting/stacking ensembles and score them.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit every model.
    X_test, y_test
        Held-out features and labels used for the reported accuracy.

    Returns
    -------
    dict
        Maps a model name to ``{'model', 'accuracy', 'predictions'}``. The
        six base models come first (in definition order), followed by
        ``'Voting'`` and ``'Stacking'``. ``'predictions'`` is always a 1-D
        array with one label per test row.
    """
    def _fit_and_score(model):
        """Fit ``model`` on the training split and score it on the test split."""
        model.fit(X_train, y_train)
        # CatBoost can return multiclass predictions as an (n_samples, 1)
        # column. Flatten so every entry is 1-D and can be compared with
        # y_test or placed in a DataFrame column by callers.
        y_pred = np.ravel(model.predict(X_test))
        return {
            'model': model,
            'accuracy': accuracy_score(y_test, y_pred),
            'predictions': y_pred
        }

    models = {
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Bagging': BaggingClassifier(n_estimators=150, random_state=42),
        'ExtraTrees': ExtraTreesClassifier(n_estimators=150, random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'LightGBM': LGBMClassifier(n_estimators=150, random_state=42),
        'CatBoost': CatBoostClassifier(iterations=150, random_seed=42, verbose=False)
    }

    # Train and evaluate each base model
    results = {}
    for name, model in models.items():
        print(f"\nTraining {name}...")
        results[name] = _fit_and_score(model)
        print(f"{name} Accuracy: {results[name]['accuracy']:.4f}")
        print("Classification Report:")
        print(classification_report(y_test, results[name]['predictions']))

    # Snapshot the base estimators before any ensemble is added to `results`,
    # so both ensembles are built from exactly the six base models.
    base_estimators = [(name, entry['model']) for name, entry in results.items()]

    print("\nTraining Voting Classifier...")
    voting_result = _fit_and_score(
        VotingClassifier(estimators=list(base_estimators), voting='soft')
    )

    print("\nTraining Stacking Classifier...")
    stacking_result = _fit_and_score(
        StackingClassifier(
            estimators=list(base_estimators),
            final_estimator=LogisticRegression(),
            cv=3
        )
    )

    # Add ensemble results
    results['Voting'] = voting_result
    results['Stacking'] = stacking_result

    return results


In [16]:
# Main execution: read both splits, then add the engineered columns to each
print("Loading datasets...")
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../APP-1/train.csv', '../APP-1/test.csv')
)

print("\nCreating new features...")
train_df, test_df = create_new_features(train_df), create_new_features(test_df)

Loading datasets...

Creating new features...


In [17]:
# Separate the target from the features; the raw IP strings are dropped as well
columns_to_drop = ['label', 'Source_IP', 'Destination_IP']
y, test_labels = train_df['label'], test_df['label']
X, test_df_features = (
    train_df.drop(columns=columns_to_drop),
    test_df.drop(columns=columns_to_drop),
)

In [18]:
# Handle categorical columns.
# Each encoder is fitted on the training values only. Test values are mapped
# through the fitted classes, and any category that never appeared in training
# becomes -1 instead of raising "previously unseen labels" from le.transform.
categorical_columns = X.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    # Same class -> integer assignment that le.transform would produce.
    code_by_class = {cls: code for code, cls in enumerate(le.classes_)}
    test_df_features[col] = (
        test_df_features[col].astype(str).map(code_by_class).fillna(-1).astype(int)
    )
    label_encoders[col] = le

In [19]:
# Feature selection: scores come from the training data only, and the test
# frame is then restricted to the same columns in the same order
print("\nSelecting features...")
X_selected, feature_scores = select_features_mi(X, y)
test_df_selected = test_df_features.loc[:, X_selected.columns]


Selecting features...


In [20]:
# Remove highly correlated features (chosen on the training data) and keep
# the same surviving columns in the test frame
X_final, correlation_matrix = analyze_correlations(X_selected)
test_df_final = test_df_selected.loc[:, X_final.columns]

# Scale features: statistics are learned from the training data only and
# then applied to both splits
scaler = StandardScaler().fit(X_final)
X_scaled = scaler.transform(X_final)
test_df_scaled = scaler.transform(test_df_final)

In [None]:
# Fit every model on the scaled training data and score it on the scaled test data
print("\nTraining and evaluating models...")
results = train_and_evaluate_models(
    X_train=X_scaled,
    X_test=test_df_scaled,
    y_train=y,
    y_test=test_labels,
)



Training and evaluating models...

Training RandomForest...
RandomForest Accuracy: 0.6813
Classification Report:
                       precision    recall  f1-score   support

             1inch.io       0.40      0.40      0.40         5
   20thcenturystudios       0.50      0.40      0.44         5
              YouTube       1.00      0.40      0.57         5
        aboutads.info       0.75      0.60      0.67         5
   adventuresbydisney       0.67      0.80      0.73         5
alainelkanninterviews       1.00      1.00      1.00         5
           algemeiner       1.00      0.80      0.89         5
            amazonaws       0.62      1.00      0.77         5
                apple       1.00      0.60      0.75         5
          arstechnica       0.75      0.60      0.67         5
       atlantis-press       1.00      0.80      0.89         5
                axios       0.60      0.60      0.60         5
                  bbc       0.50      0.40      0.44         5
   

In [None]:
# Print final comparison
print("\nFinal Model Comparison:")
for name, result in results.items():
    print(f"{name} Accuracy: {result['accuracy']:.4f}")

# Pick the entry with the highest test accuracy (the first one wins ties)
best_model = max(results.items(), key=lambda x: x[1]['accuracy'])
print(f"\nBest model: {best_model[0]} (Accuracy: {best_model[1]['accuracy']:.4f})")

# Some estimators (e.g. CatBoost on multiclass targets) can return predictions
# as an (n, 1) column. Flatten first so the array is a valid DataFrame column
# and compares element-wise against the true labels.
best_predictions = np.ravel(best_model[1]['predictions'])
predictions_df = pd.DataFrame({
    'True_Label': test_labels,
    'Predicted_Label': best_predictions,
    'Correct': test_labels == best_predictions
})