# 🌳 Random Forest Classification Model

## 👽 Load and Summarize

In [229]:
# Required libraries
# import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# !pip install --upgrade pandas
import pandas as pd
# print(pd.__version__) # I am having trouble with pandas and dataframes...
import seaborn as sns
# print(sns.__version__)  # Should be 0.11.0 or higher so that `histplot()` works
# !pip install --upgrade seaborn
from sklearn.feature_selection import VarianceThreshold

# Read the training CSV into `data` and the test CSV into `test_data`
data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train_data.csv', './data/test_data.csv')
)


## 🧯 Suppress the `FutureWarning`

In [195]:
# import warnings
# warnings.simplefilter("ignore", category=FutureWarning)

## 🧼 Clean the Dataset

In [231]:
def preprocess_for_pipeline(i_data, training_columns, drop_cols, categorical_cols):
    """Align a raw frame with the training column layout and drop unusable rows.

    Parameters
    ----------
    i_data : pandas.DataFrame
        Input frame. It is not modified; all work happens on a copy.
    training_columns : list-like
        Column labels (and order) the returned frame must have. Columns not
        present in the input are created and filled with 0; columns not
        listed here are discarded.
    drop_cols : list-like
        Columns removed up front. Names that are absent are ignored.
    categorical_cols : list-like
        Columns cast to the pandas 'string' dtype when present. No one-hot
        encoding is performed here.

    Returns
    -------
    pandas.DataFrame
        The aligned frame with every row containing NaN or +/-inf removed.
        The original index labels of the surviving rows are kept.
    """
    frame = i_data.drop(columns=drop_cols, errors='ignore')

    # Only cast the categorical columns that actually exist in this frame
    present_categoricals = [name for name in categorical_cols if name in frame.columns]
    for name in present_categoricals:
        frame[name] = frame[name].astype('string')

    # Match the training layout; columns missing from the input become 0
    aligned = frame.reindex(columns=training_columns, fill_value=0)

    # Treat infinities as missing so a single dropna() removes both cases
    finite_or_nan = aligned.replace([np.inf, -np.inf], np.nan)
    return finite_or_nan.dropna()

## 🐫 Removing Low-Variance Features

In [208]:
# # Fit the selector
# selector = VarianceThreshold(threshold=0.001)
# selector.fit(data[numeric_cols])  # call .fit() before accessing .variances_

# # Now safe to access .variances_
# low_variance = [col for col, var in zip(numeric_cols, selector.variances_) if var < 0.00001]

# print("Low-variance features:\n" + "-" * 100)

# if low_variance:
#     print("-" * 100)
#     print(data[low_variance].describe())
#     print("-" * 100)
#     print(data.groupby('class')[low_variance].mean())
# else:
#     print("No low-variance features found.")

In [198]:
# Drop these columns because they don't help
# data = data.drop(columns=low_variance) # I am going to keep them for the moment

## 🌲 Build the Random Forest Model

In [232]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Separate the target column from the feature columns
y = data['class']
X = data.drop(columns='class')

# Stratified 75/25 hold-out split with a fixed seed so reruns give the same rows
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42069,  # Nice
)

# print(X)

## 🔫 Apply the Model

### 🔧 Build Pre-Processing Pipeline

In [233]:
# This cell is self-contained so that it runs on a fresh kernel
# (Restart & Run All): the encoder/transformer/pipeline classes and
# `categorical_cols` were previously only imported/defined in a later cell,
# which made this cell raise NameError when executed in order.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Columns that are one-hot encoded before reaching the classifier
categorical_cols = ['protocol_type', 'service', 'flag']

# Preprocessing for categorical columns: drop the first level of every feature
# and do not raise on categories unseen during fit
cat_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

# Combine all into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_transformer, categorical_cols),
    ],
    remainder='passthrough'  # all other columns are kept as-is
)

# Final pipeline: preprocessing + classifier
pipeline = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42069, # Nice
        n_jobs=-1
    ))
])

## 🦇 Define Pre-Processing Logic

In [237]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, r2_score

# Column groups referenced by the preprocessing steps
label_col = 'class'
drop_cols = ['num_outbound_cmds', 'is_host_login']
categorical_cols = ['protocol_type', 'service', 'flag']

# Train the preprocessing + classifier pipeline on the training split only
pipeline.fit(X_train, y_train)

# NOTE(review): `train_y_pred` is computed on the full feature frame `X`
# (training and hold-out rows together), not on `X_train` — confirm this is
# intended before comparing it against `y_train`.
train_y_pred = pipeline.predict(X)

# Predictions for the separately loaded test file
test_y_pred = pipeline.predict(test_data)

