In [44]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [45]:
df = pd.read_csv('train_data.csv')

In [46]:
X = df['PRODUCTS']
y = df['COMMODITY CODE']

In [47]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [48]:
# Vectorize the text data
vectorizer = TfidfVectorizer(max_features=5000,stop_words='english')
X_vectorized = vectorizer.fit_transform(X)

In [49]:
from collections import Counter
counts = Counter(y_encoded)
rare_classes = [cls for cls, count in counts.items() if count == 1]

In [50]:
X_filtered = X_vectorized[~np.isin(y_encoded, rare_classes)]
y_filtered = y_encoded[~np.isin(y_encoded, rare_classes)]


In [51]:
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_filtered, test_size=0.2, random_state=42, stratify=y_filtered
)

In [56]:
sgd = SGDClassifier(loss='modified_huber', alpha=1e-3, random_state=42)
sgd.fit(X_train, y_train)

In [57]:
y_pred = sgd.predict(X_test)


In [58]:
y_pred_decoded = le.inverse_transform(y_pred)
y_test_decoded = le.inverse_transform(y_test)

In [59]:
print("validation Accuracy:", accuracy_score(y_pred_decoded,y_test_decoded ))
print("Training accuracy Score    : ",sgd.score(X_train,y_train))
print("Classification Report:")
print(classification_report(y_pred_decoded, y_test_decoded,zero_division=0 ))
print("Confusion Matrix:")
print(confusion_matrix(y_pred_decoded, y_test_decoded))


validation Accuracy: 0.7552405907575036
Training accuracy Score    :  0.7736585075337978
Classification Report:
              precision    recall  f1-score   support

  1701910000       1.00      0.91      0.95        11
  1704909999       1.00      1.00      1.00        12
  2106909855       0.50      1.00      0.67         2
  3209100000       1.00      0.75      0.86         8
  3212900000       1.00      1.00      1.00         6
  3215907090       0.80      0.89      0.84         9
  3302109000       0.85      1.00      0.92        11
  3303001000       0.95      0.81      0.87        67
  3304100000       0.10      0.33      0.15         3
  3304200000       0.31      1.00      0.48         5
  3304910000       0.97      0.78      0.87        37
  3304990000       0.84      0.77      0.81       105
  3305100000       0.82      1.00      0.90         9
  3305300000       0.00      0.00      0.00         0
  3305900000       0.85      0.85      0.85        20
  3307100000       0.73

In [60]:
from sklearn.metrics import precision_recall_fscore_support

precision, recall, f1_score, support = precision_recall_fscore_support(
    y_pred_decoded,
    y_test_decoded,
    average='weighted',
    zero_division=0
)

print(f'Total Precision: {precision:.2f}')
print(f'Total Recall: {recall:.2f}')
print(f'Total F1-score: {f1_score:.2f}')



Total Precision: 0.87
Total Recall: 0.76
Total F1-score: 0.80


In [61]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = sgd, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))



Accuracy: 75.51 %
Standard Deviation: 0.54 %


In [62]:
import joblib

In [63]:
joblib.dump(vectorizer, 'vectorizer.joblib')
joblib.dump(le, 'label_encoder.joblib')
joblib.dump(sgd, 'model.joblib')

['model.joblib']

In [64]:
loaded_vectorizer = joblib.load('vectorizer.joblib')
loaded_label_encoder = joblib.load('label_encoder.joblib')
loaded_model = joblib.load('model.joblib')


In [65]:
print(type(loaded_vectorizer))
print(type(loaded_model))
print(type(loaded_label_encoder))


<class 'sklearn.feature_extraction.text.TfidfVectorizer'>
<class 'sklearn.linear_model._stochastic_gradient.SGDClassifier'>
<class 'sklearn.preprocessing._label.LabelEncoder'>


In [66]:
new_products = ["J06 slim-fit, bleached-effect comfort-denim jeans","Ankle Length Jet Black Denim Jeans - 0123045",
                 "Polo By RL ROUND NECK TEE SHIRT- 0122215","Super Team Suede Sneakers",
                 "FastFit Work Glove with Elastic Cuff for Secure Fit, Performance Gloves for Multi-Purpose Use",
                "Simpli-Magic Cotton Bath Towels, 25”x50”, Gray, 6 Pack",
                 "COVERGIRL Easy Breezy Brow Fill Plus Shape Plus Define Powder Eyebrow Makeup, Black, 0.024 Ounce",
                "Continental Grand Prix 5000 S TR Tyre",
                "Schwinn Replacement Bike Tyre, 20 x 1.125 BMX Bike Tyre, Fast Rolling Tread, Steel Bead Construction, Puncture Guard, Fits Internal Rim Widths 15mm-20mm, PSI Range 30 to 40"]
new_sample_vector=loaded_vectorizer.transform(new_products)
new_predicted_class = loaded_model.predict(new_sample_vector)
new_predicted_class_label = loaded_label_encoder.inverse_transform(new_predicted_class)
for i in range(len(new_products)):
    print(f'HS Code for {new_products[i]} : {new_predicted_class_label[i]}\n')

HS Code for J06 slim-fit, bleached-effect comfort-denim jeans : 6204623190

HS Code for Ankle Length Jet Black Denim Jeans - 0123045 : 6204623190

HS Code for Polo By RL ROUND NECK TEE SHIRT- 0122215 : 6109100010

HS Code for Super Team Suede Sneakers : 6404199000

HS Code for FastFit Work Glove with Elastic Cuff for Secure Fit, Performance Gloves for Multi-Purpose Use : 4015190000

HS Code for Simpli-Magic Cotton Bath Towels, 25”x50”, Gray, 6 Pack : 6109100010

HS Code for COVERGIRL Easy Breezy Brow Fill Plus Shape Plus Define Powder Eyebrow Makeup, Black, 0.024 Ounce : 3304910000

HS Code for Continental Grand Prix 5000 S TR Tyre : 4011500000

HS Code for Schwinn Replacement Bike Tyre, 20 x 1.125 BMX Bike Tyre, Fast Rolling Tread, Steel Bead Construction, Puncture Guard, Fits Internal Rim Widths 15mm-20mm, PSI Range 30 to 40 : 4011500000

