In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
import time
from sklearn.preprocessing import LabelEncoder

In [6]:
# Load the dataset
file_path = '../datasets/train_set.csv'
train_df = pd.read_csv(file_path)
train_df = train_df.drop(columns=['Assembly Code', 'Assembly Description', 'Type Name'])

# Start timing
start_time = time.time()
# Define preprocessing for categorical features
categorical_features = ['Family', 'SubFamily', 'ObjectGroup', 'ObjectName', 'Description', 'Type Comments', 'Structural Material', 'Material']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

# Create the model pipeline with a Random Forest Classifier
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', XGBClassifier())])

# Split the data into training and testing sets
X = train_df.drop('Category', axis=1)
y = train_df['Category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Train the model
model.fit(X_train, y_train_encoded)

# Save the trained model to a file
model_filename = 'xgboost.pkl'
joblib.dump(model, model_filename)

# End timing
end_time = time.time()

# Calculate total runtime
total_time = end_time - start_time
print(f"Total runtime: {total_time} seconds")

# Predict and evaluate the model
y_pred = model.predict(X_test)
y_test_encoded = label_encoder.transform(y_test)
print(classification_report(y_test_encoded, y_pred))

Total runtime: 2.6478450298309326 seconds
              precision    recall  f1-score   support

           0       0.78      0.70      0.74        10
           1       1.00      1.00      1.00       107
           3       0.90      0.75      0.82        12
           4       0.00      0.00      0.00         1
           6       1.00      1.00      1.00        65
           7       0.00      0.00      0.00         2
           8       1.00      1.00      1.00         6
          10       1.00      1.00      1.00         4
          11       1.00      0.67      0.80         3
          12       0.92      1.00      0.96        23
          14       1.00      1.00      1.00        45
          15       0.99      0.96      0.97        74
          16       1.00      1.00      1.00         2
          18       0.33      0.71      0.45         7
          19       0.70      1.00      0.82        14
          20       0.86      1.00      0.92       139
          21       1.00      1.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
