In [1]:
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.3.1

Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Collecting scikit-learn==1.3.1
  Downloading scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
imbalanced-learn 0.13.0 requires scikit-learn<2,>=1.3.2, but you have scikit-learn 1.3.1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.3.1


In [3]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from datetime import datetime

# Number of rows pandas shows when a repr is truncated
pd.set_option("display.min_rows", 200)

In [5]:
# Load the raw training and test tables from the working directory
train = pd.read_csv('train_sys.csv')
test = pd.read_csv('test_sys.csv')

# Pull the label column out of the training table; everything else is a feature
y = train['target']
X = train.drop(columns='target')

# Feature columns stored with object dtype are the ones treated as categorical
categorical_columns = X.select_dtypes(include=['object']).columns
print("Categorical columns:", len(categorical_columns))

Categorical columns: 28


## Feature Engineering

In [6]:
# Encode categorical features, split into train/validation, then impute.
#
# Order matters: the split happens BEFORE the imputer is fitted, so the
# validation rows do not contribute to the imputation means. Fitting the
# imputer on the full training table would leak validation information into
# the training features and bias the validation score.

# Guard against re-running this cell on frames it has already encoded: a
# second pass would stringify the integer codes and silently re-map them.
already_encoded = [
    column for column in categorical_columns
    if pd.api.types.is_numeric_dtype(X[column])
]
if already_encoded:
    raise RuntimeError(
        "Categorical columns are already encoded; re-run the data-loading "
        "cell before running this cell again."
    )

# Create a dictionary to store label encoders (one per categorical column)
label_encoders = {}

# Handle categorical variables for both training and test data together
for column in categorical_columns:
    # Cast to str first so that missing values become the literal category 'nan'
    train_values = X[column].astype(str)
    test_values = test[column].astype(str)

    # Fit on the union of train and test values so that transform() never
    # meets a category it has not seen
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values]).unique())

    # Transform both train and test
    X[column] = le.transform(train_values)
    test[column] = le.transform(test_values)

    # Store the encoder
    label_encoders[column] = le

# Create train-validation split (80-20) before any statistics are learned
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Learn the per-column means on the training split only, then apply the same
# fitted imputer to the validation and test sets. The original indices are
# kept so the feature frames stay aligned with y_train / y_val.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_val = pd.DataFrame(
    imputer.transform(X_val), columns=X_val.columns, index=X_val.index
)
test = pd.DataFrame(
    imputer.transform(test), columns=test.columns, index=test.index
)

# Print shapes
# (the backslash-newline inside the first literal is a line continuation,
# kept exactly as it was)
print("\
Training set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", test.shape)

Training set shape: (80000, 75)
Validation set shape: (20000, 75)
Test set shape: (10000, 75)


In [7]:
# Per-column count of missing values in the training split;
# to_string() renders every row rather than a truncated repr
res = X_train.isna().sum()
print(res.to_string())


MachineID                             0
ProductName                           0
EngineVersion                         0
AppVersion                            0
SignatureVersion                      0
IsBetaUser                            0
RealTimeProtectionState               0
IsPassiveModeEnabled                  0
AntivirusConfigID                     0
NumAntivirusProductsInstalled         0
NumAntivirusProductsEnabled           0
HasTpm                                0
CountryID                             0
CityID                                0
GeoRegionID                           0
LocaleEnglishNameID                   0
PlatformType                          0
Processor                             0
OSVersion                             0
OSBuildNumber                         0
OSProductSuite                        0
OsPlatformSubRelease                  0
OSBuildLab                            0
SKUEditionName                        0
IsSystemProtected                     0


## Create an Ensemble

In [None]:
# Build, fit and evaluate the stacking ensemble.
# Base level: random forest, LightGBM and XGBoost.
# Meta level: logistic regression, combined by StackingClassifier with cv=5.

# LightGBM hyper-parameters, grouped so they are easy to scan
lgbm_params = dict(
    n_estimators=392,
    learning_rate=0.05,
    max_depth=9,
    num_leaves=63,
    min_child_samples=33,
    subsample=0.855,
    colsample_bytree=0.732,
    random_state=0,
    verbose=-1,
)

# Base estimators
random_forest = RandomForestClassifier(n_estimators=200, random_state=0, n_jobs=-1)
lightgbm_model = lgb.LGBMClassifier(**lgbm_params)
xgboost_model = XGBClassifier(n_estimators=200, random_state=0, n_jobs=-1)

base_models = [
    ('rf', random_forest),
    ('lgb', lightgbm_model),
    ('xgb', xgboost_model),
]

# Meta-learner that combines the base models' outputs
meta_learner = LogisticRegression()

# Assemble the stack
stacking = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
    n_jobs=-1,
)

# Fit on the training split
print("Training stacking ensemble...")
stacking.fit(X_train, y_train)

# Score the held-out validation split
val_predictions = stacking.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print("\
Validation Accuracy: {:.4f}".format(val_accuracy))

# Predict the test set (nothing is written to disk in this cell)
print("Making predictions on test set...")
test_predictions = stacking.predict(test)

# Confusion matrix for the validation predictions
val_confusion_matrix = confusion_matrix(y_val, val_predictions)
print("Confusion Matrix for Validation Set:")
print(val_confusion_matrix)


In [None]:
# Build the submission table and, if requested, write it to a timestamped CSV.

# Writing is opt-in so that re-running the cell does not leave a new
# timestamped file behind each time. The status message below reports what
# actually happened instead of always claiming the file was saved.
SAVE_SUBMISSION = False  # set to True to write the CSV

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
submission_filename = f'submission_{timestamp}.csv'

# NOTE(review): ids are synthesized as 0..n-1 in prediction order — confirm
# this matches the id column the submission format expects.
submission = pd.DataFrame({
    'id': range(len(test_predictions)),
    'target': test_predictions
})

# Save predictions (only when enabled) and report the outcome truthfully
if SAVE_SUBMISSION:
    submission.to_csv(submission_filename, index=False)
    print(f"\
Predictions saved to {submission_filename}")
else:
    print(
        "Predictions not saved (SAVE_SUBMISSION is False); "
        f"target file would be {submission_filename}"
    )

# Show first few predictions
print("\
First few predictions:")
print(submission.head())