In [5]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
import time

In [6]:
# Load the dataset
file_path = '../datasets/train_set.csv'
train_df = pd.read_csv(file_path)
train_df = train_df.drop(columns=['Assembly Code', 'Assembly Description', 'Type Name'])

# Start timing
start_time = time.time()
# Define preprocessing for categorical features
categorical_features = ['Family', 'SubFamily', 'ObjectGroup', 'ObjectName', 'Description', 'Type Comments', 'Structural Material', 'Material']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)])

# Create the model pipeline with a Random Forest Classifier
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier())])

# Split the data into training and testing sets
X = train_df.drop('Category', axis=1)
y = train_df['Category']
class_labels = train_df['Category'].unique()

print("Unique Class Labels:")
print(class_labels)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Save the trained model to a file
model_filename = 'random_forest_model_test.pkl'
joblib.dump(model, model_filename)

# End timing
end_time = time.time()

# Calculate total runtime
total_time = end_time - start_time
print(f"Total runtime: {total_time} seconds")

# Predict and evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

Unique Class Labels:
['Precast Concrete Wall' 'Parapets Wall' 'Structural Wall'
 'Cast In-Situ RC Wall' 'General Wall Item' 'Wall Finish'
 'General Ceiling Item' 'Plaster Ceiling'
 'General Door Item (including Gate, Roller Shutter)'
 'General Signage Item' 'Side-Hung Window' 'Top-Hung Window'
 'Sliding Window' 'General Window Item' 'Adjustable Louvred Window'
 'General Floor Item' 'Floor Finish' 'Precast Floor' 'Ramp'
 'Architectural Column' 'General Roof Item' 'Dry Wall' 'Compartment Wall'
 'Brickwall' 'Concrete Roof' 'Fire Alarm Fixture and Device'
 'Fireman Intercom Fixture and Device' 'Curtain Wall'
 'Lightweight Concrete Panel' 'Metal Roof' 'Wall Skirting'
 'Fixed Louvred Window' 'Hose Reel' 'Speaker' 'Breeching Inlet'
 'Precision Blockwall' 'General Door Item']
Total runtime: 1.243274211883545 seconds
                                                    precision    recall  f1-score   support

                         Adjustable Louvred Window       0.90      1.00      0.95      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
