In [31]:
import pandas as pd
import numpy as np
import os
from dotenv import dotenv_values, find_dotenv
import re
import getresponse
import importlib
importlib.reload(getresponse)
from getresponse import getresponse

In [2]:
# this looks for your configuration file (.env) and then reads it as a dictionary
config = dotenv_values(find_dotenv())

# set path using the dictionary key for which one you want.
# Joining with '' appends the platform's own path separator, so the later
# `concordancedata + <filename>` concatenations give a valid path on both
# Windows and POSIX systems (a hard-coded '\\' only works on Windows).
concordancedata = os.path.join(os.path.abspath(config["DATA_CONCORDANCE"]), '')

In [4]:
# import data

# Rauch classification of SITC codes
# https://econweb.ucsd.edu/~jrauch/rauch_classification.html
rauch = pd.read_csv(concordancedata + 'Rauch_classification_revised.csv')

# HS 2002 -> SITC Rev. 3 concordance
# https://unstats.un.org/unsd/trade/hs2_2sitc3.htm
sitc_hs6 = (
    pd.read_csv(concordancedata + 'HS6 to STIC rev3 concordance - unstats.csv')[['HS02', 'SITC, R3']]
    # rename columns for merging later
    .rename(columns={'HS02': 'hs6', 'SITC, R3': 'sitc4'})
    # switch to string datatype for easier data cleaning
    # NOTE(review): the codes are converted to str only after pandas has parsed
    # them, so any trailing/leading zeros dropped by numeric parsing are not
    # restored here -- confirm this is acceptable for downstream users of hs6.
    .astype({'hs6': str, 'sitc4': str})
)
rauch['sitc4'] = rauch['sitc4'].astype(str)

# SITC Rev. 3 code descriptions
# https://econweb.ucsd.edu/~jrauch/rauchclass/SITC_Rev_3_english_structure.txt
# errors='ignore' silently drops bytes that cannot be decoded with the
# platform's default encoding.
with open(concordancedata + 'SITC_Rev_3_english_structure.txt', 'r', errors='ignore') as file:
    lines = file.readlines()

# define a regex pattern for splitting the lines: the first run of
# non-whitespace characters is the code, the rest of the line the description
pattern = re.compile(r'(\S+)\s+(.+)')

# extract (code, description) from every line that matches the pattern
parsed_lines = [found.groups() for found in map(pattern.match, lines) if found]

# df columns
code_list = [code for code, _ in parsed_lines]
description_list = [description for _, description in parsed_lines]

# the first parsed line is skipped (it is not a code/description record)
sitc_desc = pd.DataFrame({'sitc4': code_list[1:], 'desc': description_list[1:]})

# keep sitc 4 digit long codes
# regex=False removes only literal dots. With regex=True the pattern '.' is a
# wildcard on pandas >= 2.0 and would delete every character of the code.
sitc_desc['sitc4'] = sitc_desc['sitc4'].str.replace('.', '', regex=False)
sitc_desc = sitc_desc[sitc_desc['sitc4'].str.len() == 4]

In [15]:
# restrict to our product categories (HS codes beginning with 85, 61 or 62).
# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a slice of the unfiltered frame.
sitc_hs6 = sitc_hs6.loc[sitc_hs6['hs6'].str[:2].isin(['85', '61', '62'])].copy()

# fix formatting for sitc codes in order to merge with rauch codes file
# regex=False removes only literal dots. With regex=True the pattern '.' is a
# wildcard on pandas >= 2.0 and would delete every character of the code.
sitc_hs6['sitc4'] = sitc_hs6['sitc4'].str.replace('.', '', regex=False)
# 5-character codes are cut back to their first 4 characters; others are left as-is
sitc_hs6['sitc4'] = sitc_hs6['sitc4'].apply(lambda x: x[:-1] if len(x) == 5 else x)

In [16]:
# left-join the Rauch classification, and then the SITC descriptions, onto the
# HS6 -> SITC mapping; rows without a match on sitc4 are kept with NaN
concordance = (
    sitc_hs6
    .merge(rauch, how='left', on='sitc4')
    .merge(sitc_desc, how='left', on='sitc4')
)

In [89]:
# preview the merged concordance (HS6 -> SITC mapping with the Rauch columns and SITC description)
concordance

Unnamed: 0,hs6,sitc4,con,lib,desc
0,6101.1,8431,n,n,"Overcoats, car coats, capes, cloaks, anoraks (..."
1,6101.2,8431,n,n,"Overcoats, car coats, capes, cloaks, anoraks (..."
2,6101.3,8431,n,n,"Overcoats, car coats, capes, cloaks, anoraks (..."
3,6101.9,8431,n,n,"Overcoats, car coats, capes, cloaks, anoraks (..."
4,6102.1,8441,n,n,"Overcoats, car coats, capes, cloaks, anoraks (..."
...,...,...,...,...,...
517,8547.1,7732,n,n,Electrical insulating equipment
518,8547.2,7732,n,n,Electrical insulating equipment
519,8547.9,7732,n,n,Electrical insulating equipment
520,8548.1,7781,n,r,"Batteries and electric accumulators, and parts..."


In [28]:
# SITC codes whose Rauch 'lib' classification is missing, mapped to their
# description (one row per distinct sitc4/con/lib/desc combination)
unmatched = (
    concordance
    .loc[concordance['lib'].isna()]
    .drop_duplicates(subset=['sitc4', 'con', 'lib', 'desc'])
)
sitc_unmatched = {
    sitc_code: sitc_description
    for sitc_code, sitc_description in zip(unmatched['sitc4'], unmatched['desc'])
}

In [30]:
# there are 9 SITC codes that don't have a Rauch classification (let ChatGPT fill in the matching?)
# display them as {sitc4 code: description}
sitc_unmatched

{'7169': 'Parts, n.e.s., suitable for use solely or principally with the machines falling within group 716',
 '8138': 'Parts of the portable electric lamps of heading 813.12 (excluding storage batteries)',
 '8985': 'Other prepared unrecorded media for sound recording or similar recording of other phenomena (excluding products of group 883)',
 '8987': 'Records and other recorded media (excluding magnetic tapes) for sound or other similarly recorded phenomena (including matrices and masters for the production of records, but excluding products of group 883)',
 '7723': 'Electrical resistors (including rheostats and potentiometers), other than heating resistors; parts thereof',
 '7724': 'Electrical apparatus for switching or protecting electrical circuits or for making connections to or in electrical circuits (e.g., switches, fuses, lightning arresters, voltage limiters, surge suppressors, plugs, junction boxes), for a voltage exceeding 1',
 '7726': 'Boards, panels (including numerical con

In [81]:
# Ask ChatGPT for the conservative and liberal Rauch classification of every
# unmatched commodity and print the raw answers.
# (this is the prompt that provides relatively consistent responses but not perfect)
for code, desc in sitc_unmatched.items():
    prompt = "".join((
        "James Rauch classifies SITC Rev. 2 commodities into organized exchange, reference priced, and differentiated products. Because ambiguities arose that were sometimes sufficiently important to affect the classification, both conservative and liberal classifications were made, with the former minimizing the number of commodities that are classified as either organized exchange or reference priced and the latter maximizing those numbers. Determine the most likely conservative and liberal classification of the commodity with SITC rev. 3 code ",
        code,
        ", whose description is as follows: ",
        desc,
        ". Considering this description, state only the single most likely conservative classification followed by the single most likely liberal classification. State the response in this format: Conservative classification (either organized exchange, reference priced, or differentiated), Liberal classification (either organized exchange, reference priced, or differentiated)",
    ))
    print(getresponse(prompt))

Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated


In [95]:
# another instance where one of the products shows up as reference priced

# Same query as before, run again: ask ChatGPT to classify every unmatched
# commodity and print the raw answers.
# (this is the prompt that provides relatively consistent responses but not perfect)
for code, desc in sitc_unmatched.items():
    question = "James Rauch classifies SITC Rev. 2 commodities into organized exchange, reference priced, and differentiated products. Because ambiguities arose that were sometimes sufficiently important to affect the classification, both conservative and liberal classifications were made, with the former minimizing the number of commodities that are classified as either organized exchange or reference priced and the latter maximizing those numbers. Determine the most likely conservative and liberal classification of the commodity with SITC rev. 3 code " + code
    question += ", whose description is as follows: " + desc
    question += ". Considering this description, state only the single most likely conservative classification followed by the single most likely liberal classification. State the response in this format: Conservative classification (either organized exchange, reference priced, or differentiated), Liberal classification (either organized exchange, reference priced, or differentiated)"
    print(getresponse(question))

Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Reference priced
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Reference priced
Liberal classification: Reference priced
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated


In [94]:
# see if results are consistent with some other products

# (SITC code, description) pairs to query; the comment above each pair records
# the classification the author expected for that product.
known_products = [
    # 8431 (conservative = differentiated, liberal = differentiated)
    ('8431', 'Overcoats, car coats, capes, cloaks, anoraks (including ski jackets), windcheaters, wind jackets and similar articles (other than those of heading 843.23)'),
    # 7781 (conservative = differentiated, liberal = reference priced)
    ('7781', 'Batteries and electric accumulators, and parts thereof'),
    # 7786 (conservative = differentiated, liberal = reference priced)
    ('7786', 'Electrical capacitors, fixed, variable or adjustable (pre-set)'),
    # 8424 (conservative = differentiated, liberal = differentiated)
    ('8424', 'Dresses'),
]

# One prompt per product, identical to the prompt used for the unmatched codes,
# so the answers can be compared with the expected classifications above.
for known_code, known_desc in known_products:
    print(getresponse("James Rauch classifies SITC Rev. 2 commodities into organized exchange, reference priced, and differentiated products. Because ambiguities arose that were sometimes sufficiently important to affect the classification, both conservative and liberal classifications were made, with the former minimizing the number of commodities that are classified as either organized exchange or reference priced and the latter maximizing those numbers. Determine the most likely conservative and liberal classification of the commodity with SITC rev. 3 code "
                      + known_code + ", whose description is as follows: "
                      + known_desc
                      + ". Considering this description, state only the single most likely conservative classification followed by the single most likely liberal classification. State the response in this format: Conservative classification (either organized exchange, reference priced, or differentiated), Liberal classification (either organized exchange, reference priced, or differentiated)"))



Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated
Conservative classification: Differentiated
Liberal classification: Differentiated


In [84]:
# create a version of concordance with chatgpt fillins
concordance_chatgptfilled = concordance.copy()
# create dummy to indicate whether rauch classification was already provided (0) or was filled in (1)
concordance_chatgptfilled['chatgpt_classified'] = 0

# dictionary used to translate chatgpt results into the naming system from the csv
acronyms = {'differentiated': 'n',
            'reference priced': 'r',
            'organized exchange': 'w'}

# regex alternation of the three accepted labels
_label_alternatives = '|'.join(re.escape(label) for label in acronyms)


def _extract_classification(response, kind):
    """Return the label ChatGPT gave for one classification kind.

    Parameters
    ----------
    response : str
        Raw text returned by ``getresponse``.
    kind : str
        ``'conservative'`` or ``'liberal'``.

    Returns
    -------
    str
        The matched label as written in the response (e.g. ``'Differentiated'``);
        lower-casing it gives a key of ``acronyms``.

    Raises
    ------
    ValueError
        If no accepted label follows ``kind`` in the response.
    """
    # Looks for "<kind> [classification]<punctuation/whitespace><label>", ignoring
    # case, so the answer is found whether the two classifications are on
    # separate lines, on one line, or followed by trailing punctuation.
    found = re.search(
        kind + r'(?:\s+classification)?\W*(' + _label_alternatives + r')\b',
        response,
        flags=re.IGNORECASE,
    )
    if found is None:
        raise ValueError(
            'Could not find a ' + kind + ' classification in the response: ' + repr(response)
        )
    return found.group(1)


# run prompt through all unmatched products
# NOTE: the answers are not guaranteed to be the same on every run, so
# re-running this cell can change the filled-in values.
for code, desc in sitc_unmatched.items():
    response = getresponse("James Rauch classifies SITC Rev. 2 commodities into organized exchange, reference priced, and differentiated products. Because ambiguities arose that were sometimes sufficiently important to affect the classification, both conservative and liberal classifications were made, with the former minimizing the number of commodities that are classified as either organized exchange or reference priced and the latter maximizing those numbers. Determine the most likely conservative and liberal classification of the commodity with SITC rev. 3 code "
                      + code + ", whose description is as follows: "
                      + desc + ". Considering this description, state only the single most likely conservative classification followed by the single most likely liberal classification. State the response in this format: Conservative classification (either organized exchange, reference priced, or differentiated), Liberal classification (either organized exchange, reference priced, or differentiated)")

    # Extract conservative and liberal classifications
    conservative_value = _extract_classification(response, 'conservative')
    liberal_value = _extract_classification(response, 'liberal')

    # Update the corresponding rows in the DataFrame (row mask computed once)
    code_rows = concordance_chatgptfilled['sitc4'] == code
    concordance_chatgptfilled.loc[code_rows, 'con'] = acronyms[conservative_value.lower()]
    concordance_chatgptfilled.loc[code_rows, 'lib'] = acronyms[liberal_value.lower()]
    concordance_chatgptfilled.loc[code_rows, 'chatgpt_classified'] = 1

In [85]:
# inspect the rows flagged as filled in by ChatGPT (the previously unclassified commodities)
concordance_chatgptfilled.loc[concordance_chatgptfilled['chatgpt_classified'].eq(1)]

Unnamed: 0,hs6,sitc4,con,lib,desc,chatgpt_classified
256,8503.0,7169,n,n,"Parts, n.e.s., suitable for use solely or prin...",1
309,8513.9,8138,n,n,Parts of the portable electric lamps of headin...,1
375,8523.2,8985,n,n,Other prepared unrecorded media for sound reco...,1
376,8523.3,8985,n,n,Other prepared unrecorded media for sound reco...,1
377,8523.9,8985,n,n,Other prepared unrecorded media for sound reco...,1
378,8524.1,8987,n,n,Records and other recorded media (excluding ma...,1
379,8524.31,8987,n,n,Records and other recorded media (excluding ma...,1
380,8524.32,8987,n,n,Records and other recorded media (excluding ma...,1
381,8524.39,8987,n,n,Records and other recorded media (excluding ma...,1
386,8524.6,8987,n,n,Records and other recorded media (excluding ma...,1


In [86]:
# products whose "liberal" classification is present but differs from the
# conservative one (one row per distinct sitc4/con/lib/desc combination),
# and the list of their descriptions
conflicting_matches = (
    concordance
    .loc[concordance['lib'].notna() & concordance['lib'].ne(concordance['con'])]
    .drop_duplicates(subset=['sitc4', 'con', 'lib', 'desc'])
)
sitc_conflicting_matches_desc = conflicting_matches['desc'].tolist()

In [88]:
# write the ChatGPT-completed concordance into the concordance data folder
# (to_csv is called with its defaults, so the index is written as well)
output_csv = concordancedata + 'rauch_hs6_sitc.csv'
concordance_chatgptfilled.to_csv(output_csv)