In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from sklearn.preprocessing import MaxAbsScaler
import time

In [2]:


# Load dataset
file_path = "../datasets/train_set2.csv"
df = pd.read_csv(file_path)

# Define a constant for missing values
EMPTYCONST = "*empty*"

# Concatenate relevant text features
catFamily = df["Family"].fillna(EMPTYCONST)
catSubFamily = df["SubFamily"].fillna(EMPTYCONST)
catObjectGroup = df["ObjectGroup"].fillna(EMPTYCONST)
catObjectName = df["ObjectName"].fillna(EMPTYCONST)
catDescription = df["Description"].fillna(EMPTYCONST)
catTypeComments = df["Type Comments"].fillna(EMPTYCONST)
catStructuralMaterial = df["Structural Material"].fillna(EMPTYCONST)
catMaterial = df["Material"].fillna(EMPTYCONST)

# Create a new feature combining all text features
df["Features"] = (
    catFamily + "|" +
    catSubFamily + "|" +
    catObjectGroup + "|" +
    catObjectName + "|" +
    catDescription + "|" +
    catTypeComments + "|" +
    catStructuralMaterial + "|" +
    catMaterial
)

# Split data into features (X) and target variable (y)
X = df["Features"]
y = df["Category"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Start timing
start_time = time.time()

# Vectorize text data using HashingVectorizer
vectorizer = HashingVectorizer(n_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
# Scale data using MaxAbsScaler (handles sparse input)
scaler = MaxAbsScaler()
X_train_tfidf = scaler.fit_transform(X_train_tfidf)
X_test_tfidf = scaler.transform(X_test_tfidf)
# Check vectorized data


# Initialize Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the model
nb_classifier.fit(X_train_tfidf, y_train)

# End timing
end_time = time.time()

# Calculate total runtime
total_time = end_time - start_time
print(f"Total runtime: {total_time} seconds")
# Make predictions on the test set
predictions = nb_classifier.predict(X_test_tfidf)

# Save the trained model to a file
model_filename = "nb_hv.pkl"
joblib.dump(nb_classifier, model_filename)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, predictions))

ValueError: Negative values in data passed to MultinomialNB (input X)