In [1]:
import pandas as pd
import numpy as np
import torch
import pickle
from sentence_transformers import util, SentenceTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Getting data from 3 different sources by researching and colating into one.

In [2]:
def get_data():
    df1 = pd.read_json("./data/harmonized-system.json")
    df2 = pd.read_json("./data/subheadings.json")
    df3 = pd.read_json("./data/htsdata.json")
    final_df = df1.append(df2, ignore_index=True).dropna()
    final_df = final_df.append(df3, ignore_index=True)
    return final_df

Module to find best matches between input string and hs code description

In [3]:
def find_best_match(sent_embedd, train_embeddings, df_sentences):
    sentence = ''
    match_score = 0
    for train_item, train_sent in zip(train_embeddings, df_sentences):
        sim = util.cos_sim(sent_embedd, train_item)
        if sim[0][0].item() > match_score:
            match_score = sim[0][0].item()
            sentence = train_sent
    return match_score, sentence

Module to find the corresponding HS code for the obtained matches

In [22]:
def get_hs_code(descriptions, model, sentence_embeddings, df):
    hs_out = []
    for item_desc in descriptions:
        flag_out = {}
        query_vec = model.encode([item_desc])[0]
        match_score, target_sentence = find_best_match(query_vec, sentence_embeddings,df['description'])
        #print(match_score, target_sentence)
        code = df[df['description']==target_sentence]['hscode'].values[0]
        flag_out['input'] = item_desc
        flag_out['score'] = match_score
        flag_out['standard_desc'] = target_sentence
        flag_out['hscode'] = code
        hs_out.append(flag_out)
    return hs_out

Module to calculate success matrix.

In [23]:
def calc_success_matrix(test_df, model, embeddings, df):
    test_df["predict_hs"] = test_df['description'].apply(lambda x: int(get_hs_code([x], model, embeddings, df)[0]["hscode"]))
    test_df.to_csv("./data/test_HS_2_res.csv")
    display(test_df)
    accuracy = accuracy_score(test_df['hscode'],test_df["predict_hs"])
    precision = precision_score(test_df['hscode'],test_df["predict_hs"],average='weighted')
    recall = recall_score(test_df['hscode'],test_df["predict_hs"],average='weighted')
    F1_score = f1_score(test_df['hscode'],test_df["predict_hs"],average='weighted')
    return accuracy, precision, recall, F1_score

In [24]:
def main():
    final_df = get_data()
    sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')
    sentence_embeddings = sbert_model.encode(final_df['description'])
    df_test = pd.read_csv("./data/test_HS.csv")
    accuracy,precision, recall, f1_scr = calc_success_matrix(df_test,sbert_model, sentence_embeddings, final_df)
    print(f"accuracy: {accuracy} | precision: {precision} | recall: {recall} | F1 score: {f1_scr}")

In [25]:
main()

  final_df = df1.append(df2, ignore_index=True).dropna()
  final_df = final_df.append(df3, ignore_index=True)


Unnamed: 0,hscode,description,predict_hs
0,851640,ELECTRIC SMOOTHING IRONS,851640
1,430130,RAW FURSKINS OF (INDIAN-PERSIAN ETC) LAMB WHOL...,430130
2,441911,"BREAD BOARDS, CHOPPING BOARDS AND SIMILAR BOARDS",4419110000
3,853990,PARTS OF ARTICLES OF HEADING 8539,9801001051
4,392310,BOXS CASES CRATES AND SMLR ARTCLS OF PLSTCS,392310
...,...,...,...
230,611300,"GRMNTS,MADE UP OF KNTTD CRCHTD FABRICS OF HDNG...",6210
231,540824,"WOVN FBRICS,PRINTED,CONTAINING 85% OR MOREARTI...",540774
232,846231,"NUMRCLY CONTRLLD SHEARNG MCHNS(INCL PRSSES),EX...",846239
233,420100,SDDLRY AND HRNSS FOR ANY ANML (INCL TRACTSLEAD...,420100


accuracy: 0.825531914893617 | precision: 0.823404255319149 | recall: 0.825531914893617 | F1 score: 0.8241134751773049


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
