# In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Locations of the training and test CSV files
train_path = 'C:\\Users\\Deepak DR\\Downloads\\Train_Data.csv'
test_path = 'C:\\Users\\Deepak DR\\Downloads\\Test_Data.csv'

# Read both CSV files into DataFrames (training file first, then test file)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# The 'attack' column of the training file is treated as the label;
# every remaining column is used as a feature.
X = train_df.drop('attack', axis=1)  # Features
y = train_df['attack']  # Target

# Step 2: Preprocessing pipeline
# Separate numerical and categorical columns.
# 'number' matches every numeric dtype rather than only float64/int64, and
# 'category' is accepted next to 'object'. This matters because the
# ColumnTransformer below keeps its default remainder='drop': any column that is
# not listed in one of the two groups is silently left out of the model.
numerical_cols = X.select_dtypes(include='number').columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns

# Surface columns that neither selector picked up (e.g. bool or datetime), so a
# dropped feature is visible instead of disappearing without notice.
assigned_cols = set(numerical_cols) | set(categorical_cols)
unassigned_cols = [col for col in X.columns if col not in assigned_cols]
if unassigned_cols:
    print(f'Warning: columns ignored by preprocessing (unsupported dtype): {unassigned_cols}')

# Pipeline for numerical features: fill gaps with the column median, then
# standardise each column
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Pipeline for categorical features: replace gaps with the literal 'missing',
# then one-hot encode. handle_unknown='ignore' keeps transform from failing on
# categories that were not present when the encoder was fitted.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Step 3: Training the model
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Append classifier to preprocessing pipeline, so the imputers, scaler and
# encoder are fitted on the training split only
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))])

# Fit the model
pipeline.fit(X_train, y_train)

# Score the fitted pipeline on the held-out split so the validation data is
# actually used; for a classifier, score() reports mean accuracy.
validation_accuracy = pipeline.score(X_valid, y_valid)
print(f'Validation accuracy: {validation_accuracy:.4f}')

# Step 4: Prediction on test data
# The test frame is handed to the pipeline as-is: no separate preprocessing is
# done here, the preprocessor step inside the fitted pipeline transforms it.
X_test = test_df  # assumes the test CSV carries feature columns only — TODO confirm

# Predict on test data
predictions = pipeline.predict(X_test)

# Optional: store the predicted labels in a single 'attack' column and write
# them to a CSV file (without the index) as per submission requirements
submission_path = 'C:\\Users\\Deepak DR\\Downloads\\Submission.csv'
submission_df = pd.DataFrame(data={'attack': predictions})
submission_df.to_csv(submission_path, index=False)


