In [6]:
# Install into the environment of the running kernel (%pip rather than a bare
# `pip`), pinned to the version this notebook was executed with.
%pip install xgboost==2.1.4

Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
    --------------------------------------- 1.8/124.9 MB 12.6 MB/s eta 0:00:10
   -- ------------------------------------- 7.1/124.9 MB 20.8 MB/s eta 0:00:06
   --- ------------------------------------ 12.3/124.9 MB 22.0 MB/s eta 0:00:06
   ----- ---------------------------------- 16.5/124.9 MB 20.8 MB/s eta 0:00:06
   ------- -------------------------------- 23.1/124.9 MB 23.2 MB/s eta 0:00:05
   --------- ------------------------------ 29.4/124.9 MB 24.2 MB/s eta 0:00:04
   ----------- ---------------------------- 34.6/124.9 MB 24.4 MB/s eta 0:00:04
   ------------ --------------------------- 38.5/124.9 MB 24.0 MB/s eta 0:00:04
   -------------- ------------------------- 46.1/124.9 MB 24.7 MB/s eta 0:00:04
   ---------------- ----------------------- 51.1/124.9 MB 24.5 

In [16]:
from pathlib import Path

import pandas as pd

# Locate the download folder through the current user's home directory rather
# than hardcoding one machine's absolute path (and username) into the notebook.
DATA_DIR = Path.home() / "Downloads"

# The bank marketing export is semicolon-separated.
df = pd.read_csv(DATA_DIR / "bank_marketing_full.csv", sep=";")
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [17]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: Data Cleaning and Preprocessing
#
# All cleaning is done on a derived frame (`bank_df`) so the raw `df` loaded
# above is never modified. That keeps this cell idempotent: re-running it always
# starts from the raw data instead of re-mapping an already-encoded target.

# Replace 'unknown' values with NaN (returns a new frame; `df` is untouched)
bank_df = df.replace("unknown", np.nan)

# Fill missing values in categorical columns with mode.
# Assign the result back instead of calling fillna(..., inplace=True) on a
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not update the parent frame once Copy-on-Write is enabled.
categorical_columns = bank_df.select_dtypes(include=["object"]).columns.tolist()
for col in categorical_columns:
    bank_df[col] = bank_df[col].fillna(bank_df[col].mode()[0])

# Encoding categorical features (one fitted encoder is kept per column so the
# integer codes can be mapped back to the original labels later).
# NOTE(review): the mode imputation above and these encoders are fitted on the
# full dataset before the train/test split.
label_encoders = {}
for col in categorical_columns:
    if col == "y":  # Exclude target variable
        continue
    le = LabelEncoder()
    bank_df[col] = le.fit_transform(bank_df[col])
    label_encoders[col] = le

# Encode target variable ('y': yes -> 1, no -> 0)
bank_df["y"] = bank_df["y"].map({"yes": 1, "no": 0})
if bank_df["y"].isna().any():
    # .map() turns every value outside {"yes", "no"} into NaN; fail loudly here
    # rather than passing NaN labels on to the split and the models.
    raise ValueError("Target column 'y' contains values other than 'yes'/'no'.")

# Step 2: Splitting Dataset into Training and Testing Sets
X = bank_df.drop(columns=["y"])  # Features
y = bank_df["y"]  # Target variable

# Stratify on the target so both splits keep the same yes/no proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Define Models
# Every estimator gets the same fixed seed so results are repeatable.
models = {
    "ID3 (Entropy)": DecisionTreeClassifier(criterion="entropy", random_state=42),
    "CART (Gini)": DecisionTreeClassifier(criterion="gini", random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: it is an obsolete XGBoost
    # argument that the installed 2.x release no longer uses (it would only be
    # forwarded to the booster as an unused parameter).
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
}

# Cross-validation folds
cv_folds = 5

# Step 4: Model Training and Evaluation
# Test-set metrics, keyed by the column label they get in the results table.
# precision/recall/F1 use their defaults, i.e. they score the positive class.
test_metrics = {
    "Test Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

results = []
for model_name, model in models.items():
    # Accuracy on each of the `cv_folds` folds of the training split
    cv_scores = cross_val_score(
        model, X_train, y_train, cv=cv_folds, scoring="accuracy"
    )

    # Refit on the whole training split, then predict the held-out test split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # One row per model: name, mean CV accuracy, then the test-set metrics
    row = {
        "Model": model_name,
        "Cross-Validation Accuracy": np.mean(cv_scores),
    }
    for label, metric in test_metrics.items():
        row[label] = metric(y_test, y_pred)
    results.append(row)

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Model,Cross-Validation Accuracy,Test Accuracy,Precision,Recall,F1 Score
0,ID3 (Entropy),0.88868,0.886744,0.497415,0.518319,0.507652
1,CART (Gini),0.885918,0.897305,0.541837,0.572198,0.556604
2,Random Forest,0.911684,0.920248,0.690577,0.529095,0.599146
3,Gradient Boosting,0.914385,0.921462,0.691156,0.547414,0.610944
4,XGBoost,0.91217,0.917577,0.659004,0.556034,0.603156


In [9]:
from pathlib import Path

# Locate the download folder through the current user's home directory rather
# than hardcoding one machine's absolute path (and username) into the notebook.
DATA_DIR = Path.home() / "Downloads"

df = pd.read_csv(DATA_DIR / "covertype_cs5805.csv")
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
0,3351,206,27,726,124,3813,192,252,180,2271,...,0,0,0,0,0,0,1,0,0,0
1,2732,129,7,212,1,1082,231,236,137,912,...,0,0,0,0,0,0,0,0,0,1
2,2572,24,9,201,25,957,216,222,142,2191,...,0,0,0,0,0,0,0,0,0,1
3,2824,69,13,417,39,3223,233,214,110,6478,...,0,0,0,0,0,0,0,0,0,1
4,2529,84,5,120,9,1092,227,231,139,4983,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# Step 1: Splitting Dataset into Training and Testing Sets
target_column = "Cover_Type"
y = df[target_column]  # Target variable
X = df.drop(columns=[target_column])  # Features

# 80/20 split, stratified on the target so both parts keep the class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Step 2: Define Models
# Every estimator gets the same fixed seed so results are repeatable.
models = {
    "ID3 (Entropy)": DecisionTreeClassifier(criterion="entropy", random_state=42),
    "CART (Gini)": DecisionTreeClassifier(criterion="gini", random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: it is an obsolete XGBoost
    # argument that the installed 2.x release no longer uses (it would only be
    # forwarded to the booster as an unused parameter).
    # "mlogloss" is the multi-class log loss, matching the multi-class target.
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=42),
}

# Cross-validation folds
cv_folds = 5

# Step 3: Model Training and Evaluation
# The target is multi-class, so precision/recall/F1 are averaged over the
# classes with "weighted" averaging.
weighted = {"average": "weighted"}

results = []
for model_name, model in models.items():
    # Accuracy on each cross-validation fold of the training split
    cv_scores = cross_val_score(
        model, X_train, y_train, scoring="accuracy", cv=cv_folds
    )

    # Refit on the whole training split and predict the held-out test split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # One summary row per model
    results.append(
        {
            "Model": model_name,
            "Cross-Validation Accuracy": np.mean(cv_scores),
            "Test Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, **weighted),
            "Recall": recall_score(y_test, y_pred, **weighted),
            "F1 Score": f1_score(y_test, y_pred, **weighted),
        }
    )

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
display(results_df)

Unnamed: 0,Model,Cross-Validation Accuracy,Test Accuracy,Precision,Recall,F1 Score
0,ID3 (Entropy),0.81988,0.837019,0.836389,0.837019,0.836579
1,CART (Gini),0.811295,0.832889,0.832441,0.832889,0.832603
2,Random Forest,0.870503,0.88495,0.886331,0.88495,0.882883
3,Gradient Boosting,0.7679,0.77162,0.77121,0.77162,0.767565
4,XGBoost,0.842212,0.848378,0.848697,0.848378,0.847203


For the bank marketing data, I replaced "unknown" entries with each column's most frequent value, label-encoded the categorical features, and mapped the target "y" from "yes"/"no" to 1/0 so that every column is numerical. I chose 5 folds for cross-validation because it provides a good balance between computational efficiency and reliable model evaluation; since the datasets are large, a lower number of folds keeps the computational cost manageable.

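Cross-Validation Accuracy measures how well a machine learning model performs across multiple training and validation splits; the value reported is the mean accuracy over the folds. Test Accuracy is the percentage of correctly classified instances in the test dataset. Precision measures the percentage of positive predictions that are correct. Recall measures how many actual positive instances were correctly identified by the model. Finally, the F1 Score is the harmonic mean of precision and recall, so it indicates how well the model balances the two. For the multi-class cover type data, precision, recall, and F1 are computed per class and averaged with weights proportional to class frequency, which is why the weighted recall equals the test accuracy in that table.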

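Random Forest performed the best on the cover type dataset (test accuracy of about 0.885), likely because that data is extremely complex, which random forests are better suited for. On the bank marketing dataset, Gradient Boosting was marginally ahead of Random Forest (about 0.921 vs. 0.920 test accuracy). ID3 is best suited for classification tasks where you need to identify the most informative attribute on which to split the data in a decision tree.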